From 8e92c9ce5dd62932065aede14b0870965a3919cb Mon Sep 17 00:00:00 2001
From: Sergey <hello@sergey.fyi>
Date: Sun, 21 Apr 2024 21:53:43 -0600
Subject: [PATCH 1/7] infra

---
 .gitignore                                   |  1 +
 pyproject.toml                               | 17 +++-
 src/cookbooks/llama_index.ipynb              |  0
 src/openparse/__init__.py                    |  3 +-
 src/openparse/doc_parser.py                  |  4 +
 src/openparse/embeddings/__init__.py         | 96 ++++++++++++++++++++
 src/openparse/pdf.py                         | 40 +++++++-
 src/openparse/processing/basic_transforms.py | 15 +--
 src/openparse/schemas.py                     | 46 +++++++---
 9 files changed, 198 insertions(+), 24 deletions(-)
 create mode 100644 src/cookbooks/llama_index.ipynb
 create mode 100644 src/openparse/embeddings/__init__.py

diff --git a/.gitignore b/.gitignore
index 9279baa..ff8db49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,4 +36,5 @@ test-output.pdf
 notebooks/
 sample-docs/
 weights/
+.env

diff --git a/pyproject.toml b/pyproject.toml
index c28c68e..4dceebd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,8 +16,8 @@ dependencies = [
     "pypdf >= 4.0.0",
     "pdfminer.six >= 20200401",
     "tiktoken >= 0.3",
-    "openai >= 1.0.0",
     "numpy",
+    "llama-index-embeddings-openai",
 ]
 
 [project.urls]
@@ -32,6 +32,21 @@ ml = [
     "transformers",
     "tokenizers",
 ]
+embeddings-azure-openai = [
+    "llama-index-embeddings-azure-openai",
+]
+embeddings-cohere = [
+    "llama-index-embeddings-cohere",
+]
+embeddings-huggingface = [
+    "llama-index-embeddings-huggingface",
+]
+embeddings-huggingface-optimum = [
+    "llama-index-embeddings-huggingface-optimum",
+]
+embeddings-text-embeddings-inference = [
+    "llama-index-embeddings-text-embeddings-inference",
+]
 
 [project.scripts]
 openparse-download = "openparse.cli:download_unitable_weights"

diff --git a/src/cookbooks/llama_index.ipynb b/src/cookbooks/llama_index.ipynb
new file mode 100644
index 0000000..e69de29

diff --git a/src/openparse/__init__.py b/src/openparse/__init__.py
index fa61a23..1ec4ba0 100644
--- a/src/openparse/__init__.py
+++ b/src/openparse/__init__.py
@@ -2,7 +2,7 @@ from openparse.doc_parser import (
     DocumentParser,
 )
-from openparse import processing, version
+from openparse import processing, version, embeddings
 from openparse.config import config
 from openparse.schemas import (
     Bbox,
@@ -28,4 +28,5 @@
     "processing",
     "version",
     "config",
+    "embeddings",
 ]

diff --git a/src/openparse/doc_parser.py b/src/openparse/doc_parser.py
index feefa89..e3774fb 100644
--- a/src/openparse/doc_parser.py
+++ b/src/openparse/doc_parser.py
@@ -117,6 +117,10 @@ def parse(
             table_parsing_kwargs=(
                 table_args_obj.model_dump() if table_args_obj else None
             ),
+            creation_date=doc.file_metadata.get("creation_date"),
+            last_modified_date=doc.file_metadata.get("last_modified_date"),
+            last_accessed_date=doc.file_metadata.get("last_accessed_date"),
+            file_size=doc.file_metadata.get("file_size"),
         )
 
         return parsed_doc

diff --git a/src/openparse/embeddings/__init__.py b/src/openparse/embeddings/__init__.py
new file mode 100644
index 0000000..2ec446b
--- /dev/null
+++ b/src/openparse/embeddings/__init__.py
@@ -0,0 +1,96 @@
+"""
+This is meant to provide a simple wrapper around llama_index's embeddings classes.
+"""
+
+from typing import Dict, Type
+
+from llama_index.core.embeddings import BaseEmbedding
+
+
+class ImportErrorProxy:
+    """
+    Used to raise an ImportError when an attribute or method is accessed on a class that failed to import.
+    """
+
+    def __init__(self, class_name, install_command):
+        self.class_name = class_name
+        self.install_command = install_command
+        self.error_message = (
+            f"Missing optional dependency for '{class_name}'. "
" + f"Please install it by running: '{install_command}'." + ) + + def __getattr__(self, name): + raise ImportError( + f"{self.error_message} The attribute '{name}' cannot be used." + ) + + def __call__(self, *args, **kwargs): + raise ImportError(self.error_message) + + +try: + from llama_index.embeddings.openai import ( + OpenAIEmbedding, + ) + +except ImportError: + OpenAIEmbedding = ImportErrorProxy( + "OpenAIEmbedding", + "pip install openparse[embeddings-openai]", + ) + +try: + from llama_index.embeddings.azure_openai import ( + AzureOpenAIEmbedding, + ) + +except ImportError: + AzureOpenAIEmbedding = ImportErrorProxy( + "AzureOpenAIEmbedding", + "pip install openparse[embeddings-azure-openai]", + ) + +try: + from llama_index.embeddings.huggingface import ( + HuggingFaceInferenceAPIEmbedding, + ) + +except ImportError: + HuggingFaceInferenceAPIEmbedding = ImportErrorProxy( + "HuggingFaceInferenceAPIEmbedding", + "pip install openparse[embeddings-huggingface]", + ) + + +try: + from llama_index.embeddings.huggingface_optimum import ( + OptimumEmbedding, + ) + +except ImportError: + OptimumEmbedding = ImportErrorProxy( + "OptimumEmbedding", + "pip install openparse[embeddings-huggingface-optimum]", + ) + +try: + from llama_index.embeddings.cohere import CohereEmbedding + +except ImportError: + CohereEmbedding = ImportErrorProxy( + "CohereEmbedding", + "pip install openparse[embeddings-cohere]", + ) + + +try: + from llama_index.embeddings.text_embeddings_inference import ( + TextEmbeddingsInference, + ) + +except ImportError: + TextEmbeddingsInference = ImportErrorProxy( + "TextEmbeddingsInference", + "pip install openparse[embeddings-text-embeddings-inference]", + ) diff --git a/src/openparse/pdf.py b/src/openparse/pdf.py index bbed5cc..1813b78 100644 --- a/src/openparse/pdf.py +++ b/src/openparse/pdf.py @@ -1,7 +1,10 @@ +import os +import mimetypes +import datetime as dt import random import tempfile from pathlib import Path -from typing import Iterator, List, Literal, Optional, Union, Tuple, Any +from typing import Iterator, List, Literal, Optional, Union, Tuple, Any, Dict from pydantic import BaseModel from pdfminer.high_level import extract_pages @@ -9,7 +12,6 @@ from pypdf import PdfReader, PdfWriter from openparse.schemas import Bbox, Node -from openparse import consts class _BboxWithColor(BaseModel): @@ -60,13 +62,41 @@ def _prepare_bboxes_for_drawing( return res +def file_metadata(file_path: Union[str, Path]) -> Dict: + """Get some handy metadate from filesystem. + + Args: + file_path: str: file path in str + """ + return { + "file_path": file_path, + "file_name": os.path.basename(file_path), + "file_type": mimetypes.guess_type(file_path)[0], + "file_size": os.path.getsize(file_path), + "creation_date": dt.datetime.fromtimestamp( + Path(file_path).stat().st_ctime + ).strftime("%Y-%m-%d"), + "last_modified_date": dt.datetime.fromtimestamp( + Path(file_path).stat().st_mtime + ).strftime("%Y-%m-%d"), + "last_accessed_date": dt.datetime.fromtimestamp( + Path(file_path).stat().st_atime + ).strftime("%Y-%m-%d"), + } + + class Pdf: """ Simple utility class for working with PDF files. This class wraps the PdfReader and PdfWriter classes from pypdf. 
""" def __init__(self, file: Union[str, Path, PdfReader]): - self.file_path = str(file) if isinstance(file, (str, Path)) else None + self.file_path = None + self.file_metadata = dict() + if isinstance(file, (str, Path)): + self.file_path = str(file) + self.file_metadata = file_metadata(file) + self.reader = PdfReader(file) if isinstance(file, (str, Path)) else file self.writer = PdfWriter() for page in self.reader.pages: @@ -175,7 +205,7 @@ def display_with_bboxes( bboxes = [node.bbox for node in nodes] flattened_bboxes = _prepare_bboxes_for_drawing(bboxes, annotations) - marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0]._coordinates) + marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0].coordinates) if not page_nums: page_nums = list(range(marked_up_doc.page_count)) for page_num in page_nums: @@ -193,7 +223,7 @@ def export_with_bboxes( bboxes = [node.bbox for node in nodes] flattened_bboxes = _prepare_bboxes_for_drawing(bboxes, annotations) - marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0]._coordinates) + marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0].coordinates) marked_up_doc.save(str(output_pdf)) def _flip_coordinates(self, bbox: Bbox) -> Bbox: diff --git a/src/openparse/processing/basic_transforms.py b/src/openparse/processing/basic_transforms.py index d2ba0a4..a055c0f 100644 --- a/src/openparse/processing/basic_transforms.py +++ b/src/openparse/processing/basic_transforms.py @@ -33,18 +33,21 @@ def process(self, nodes: List[Node]) -> List[Node]: updated_nodes.append(node) continue - new_elements = [ - element - for element in node.elements - if not ( + new_elements = [] + for element in node.elements: + should_include = not ( isinstance(element, TextElement) and self.intersects_any_table( element.bbox, tables_by_page[element.page] ) ) - ] - if new_elements: + if should_include: + new_elements.append(element) + + if new_elements and len(new_elements) != len(node.elements): updated_nodes.append(Node(elements=tuple(new_elements))) + elif len(new_elements) == len(node.elements): + updated_nodes.append(node) return updated_nodes diff --git a/src/openparse/schemas.py b/src/openparse/schemas.py index de07c64..065cb1d 100644 --- a/src/openparse/schemas.py +++ b/src/openparse/schemas.py @@ -1,6 +1,8 @@ import re from collections import defaultdict, namedtuple from enum import Enum +import datetime as dt +import uuid from functools import cached_property from typing import Any, List, Literal, Optional, Tuple, Union, Set @@ -355,12 +357,25 @@ def _determine_relationship( class Node(BaseModel): - elements: Tuple[Union[TextElement, TableElement], ...] = Field(exclude=True) - _tokenization_lower_limit: int = consts.TOKENIZATION_LOWER_LIMIT - _tokenization_upper_limit: int = consts.TOKENIZATION_UPPER_LIMIT - _coordinates: Literal["top-left", "bottom-left"] = ( - consts.COORDINATE_SYSTEM + elements: Tuple[Union[TextElement, TableElement], ...] = Field( + exclude=True, frozen=True + ) + tokenization_lower_limit: int = Field( + default=consts.TOKENIZATION_LOWER_LIMIT, frozen=True, exclude=True + ) + tokenization_upper_limit: int = Field( + default=consts.TOKENIZATION_UPPER_LIMIT, frozen=True, exclude=True + ) + coordinates: Literal["top-left", "bottom-left"] = Field( + default=consts.COORDINATE_SYSTEM, frozen=True, exclude=True ) # controlled globally for now, should be moved into elements + embedding: Optional[List[float]] = Field( + default=None, description="Embedding of the node." 
+    )
+
+    id_: str = Field(
+        default_factory=lambda: str(uuid.uuid4()), description="Unique ID of the node."
+    )
 
     @computed_field  # type: ignore
     @cached_property
     def variant(self) -> Set[Literal["text", "table"]]:
@@ -464,11 +479,11 @@ def is_stub(self) -> bool:
 
     @cached_property
     def is_small(self) -> bool:
-        return self.tokens < self._tokenization_lower_limit
+        return self.tokens < self.tokenization_lower_limit
 
     @cached_property
     def is_large(self) -> bool:
-        return self.tokens > self._tokenization_upper_limit
+        return self.tokens > self.tokenization_upper_limit
 
     @cached_property
     def num_pages(self) -> int:
@@ -494,7 +509,7 @@ def reading_order(self) -> ReadingOrder:
         min_page = min(element.bbox.page for element in self.elements)
         min_x0 = min(element.bbox.x0 for element in self.elements)
 
-        if self._coordinates == "bottom-left":
+        if self.coordinates == "bottom-left":
             y_position = -min(element.bbox.y0 for element in self.elements)
         else:
             raise NotImplementedError(
@@ -531,7 +546,7 @@ def __lt__(self, other: "Node") -> bool:
         if not isinstance(other, Node):
             return NotImplemented
 
-        assert self._coordinates == other._coordinates, "Coordinate systems must match."
+        assert self.coordinates == other.coordinates, "Coordinate systems must match."
 
         return self.reading_order < other.reading_order
 
@@ -552,8 +567,6 @@ def __add__(self, other: "Node") -> "Node":
         new_elems = self.elements + other.elements
         return Node(elements=new_elems)
 
-    model_config = ConfigDict(frozen=True)
-
 
 #######################
 ### PARSED DOCUMENT ###
 #######################
@@ -561,8 +574,19 @@
 
 
 class ParsedDocument(BaseModel):
+    id_: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique ID of the node.",
+    )
     nodes: List[Node]
     filename: str
     num_pages: int
     coordinate_system: Literal["top-left", "bottom-left"] = "bottom-left"
     table_parsing_kwargs: Optional[dict] = None
+    last_modified_date: Optional[dt.date] = None
+    last_accessed_date: Optional[dt.date] = None
+    creation_date: Optional[dt.date] = None
+    file_size: Optional[int] = None
+
+    def to_llama_index(self):
+        raise NotImplementedError("Not implemented yet.")

From fe57262ea66c11cc0345ad67d286987b4cb81487 Mon Sep 17 00:00:00 2001
From: Sergey <hello@sergey.fyi>
Date: Wed, 1 May 2024 15:36:49 -0600
Subject: [PATCH 2/7] clean checkpoint

---
 pyproject.toml            | 20 +++-----------------
 src/openparse/__init__.py |  3 +--
 2 files changed, 4 insertions(+), 19 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4dceebd..ed6bd55 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,7 +7,8 @@ name = "openparse"
 description = "Streamlines the process of preparing documents for LLM's."
 readme = "README.md"
 requires-python = ">=3.8"
-version = "0.5.2"
+license = "MIT"
+version = "0.5.5"
 authors = [{name = "Sergey Filimonov", email = "hello@sergey.fyi"}]
 dependencies = [
     "PyMuPDF >= 1.23.2",
@@ -16,8 +17,8 @@ dependencies = [
     "pypdf >= 4.0.0",
     "pdfminer.six >= 20200401",
     "tiktoken >= 0.3",
+    "openai >= 1.0.0",
     "numpy",
-    "llama-index-embeddings-openai",
 ]
 
 [project.urls]
@@ -32,21 +33,6 @@ ml = [
     "transformers",
     "tokenizers",
 ]
-embeddings-azure-openai = [
-    "llama-index-embeddings-azure-openai",
-]
-embeddings-cohere = [
-    "llama-index-embeddings-cohere",
-]
-embeddings-huggingface = [
-    "llama-index-embeddings-huggingface",
-]
-embeddings-huggingface-optimum = [
-    "llama-index-embeddings-huggingface-optimum",
-]
-embeddings-text-embeddings-inference = [
-    "llama-index-embeddings-text-embeddings-inference",
-]
 
 [project.scripts]
 openparse-download = "openparse.cli:download_unitable_weights"

diff --git a/src/openparse/__init__.py b/src/openparse/__init__.py
index 1ec4ba0..fa61a23 100644
--- a/src/openparse/__init__.py
+++ b/src/openparse/__init__.py
@@ -2,7 +2,7 @@ from openparse.doc_parser import (
     DocumentParser,
 )
-from openparse import processing, version, embeddings
+from openparse import processing, version
 from openparse.config import config
 from openparse.schemas import (
     Bbox,
@@ -28,5 +28,4 @@
     "processing",
     "version",
     "config",
-    "embeddings",
 ]

From b93df6ab10fffd65958c497cfb8273d0099232dc Mon Sep 17 00:00:00 2001
From: Sergey <hello@sergey.fyi>
Date: Wed, 1 May 2024 17:12:24 -0600
Subject: [PATCH 3/7] serialization func done

---
 requirements-dev.txt                  |   1 +
 src/cookbooks/llama_index.ipynb       | 225 ++++++++++++++++++
 src/openparse/pdf.py                  |  16 +-
 .../processing/semantic_transforms.py |   4 +-
 src/openparse/schemas.py              |  97 +++++++-
 src/tests/test_doc_parser.py          |  10 +-
 6 files changed, 336 insertions(+), 17 deletions(-)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index fa133dd..0efca34 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -9,3 +9,4 @@ beautifulsoup4
 twine
 packaging
 wheel
+llama-index

diff --git a/src/cookbooks/llama_index.ipynb b/src/cookbooks/llama_index.ipynb
index e69de29..59673bc 100644
--- a/src/cookbooks/llama_index.ipynb
+++ b/src/cookbooks/llama_index.ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2024-05-01 17:04:54--  https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/lyft_2021-first-20-pages.pdf\n",
+      "Resolving sergey-filimonov.nyc3.digitaloceanspaces.com (sergey-filimonov.nyc3.digitaloceanspaces.com)... 162.243.189.2\n",
+      "Connecting to sergey-filimonov.nyc3.digitaloceanspaces.com (sergey-filimonov.nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 379188 (370K) [application/pdf]\n",
+      "Saving to: ‘sample-docs/lyft-10k.pdf’\n",
+      "\n",
+      "sample-docs/lyft-10 100%[===================>] 370.30K  1.99MB/s    in 0.2s    \n",
+      "\n",
+      "2024-05-01 17:04:57 (1.99 MB/s) - ‘sample-docs/lyft-10k.pdf’ saved [379188/379188]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "\n",
+    "sys.path.append(\"..\")\n",
+    "\n",
+    "!wget https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/lyft_2021-first-20-pages.pdf -O sample-docs/lyft-10k.pdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make sure llama-index is installed, it's not a formal dependency of open-parse\n",
+    "# %pip install llama-index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Finished parsing\n"
+     ]
+    }
+   ],
+   "source": [
+    "import openparse\n",
+    "\n",
+    "doc_path = \"./sample-docs/lyft-10k.pdf\"\n",
+    "parser = openparse.DocumentParser()\n",
+    "parsed_doc = parser.parse(doc_path)\n",
+    "\n",
+    "print(\"Finished parsing\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Node ID: 33747e2d-0478-4628-b112-d733b1fc5039\n",
+      "Text: Securities registered pursuant to Section 12(g) of the\n",
+      "Act:**None** Indicate by check mark if the Registrant is a well-known\n",
+      "seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒\n",
+      "No ☐ Indicate by check mark if the Registrant is not required to file\n",
+      "reports pursuant to Section 13 or 15(d) of the Act. Yes ☐ No ☒\n",
+      "Indicate by check ma...\n"
+     ]
+    }
+   ],
+   "source": [
+    "nodes = parsed_doc.to_llama_index_nodes()\n",
+    "\n",
+    "print(nodes[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'id_': '33747e2d-0478-4628-b112-d733b1fc5039',\n",
+       " 'embedding': None,\n",
+       " 'metadata': {'bbox': [{'page': 0,\n",
+       "    'page_height': 792.0,\n",
+       "    'page_width': 612.0,\n",
+       "    'x0': 17.31,\n",
+       "    'y0': 332.53,\n",
+       "    'x1': 586.25,\n",
+       "    'y1': 424.21}]},\n",
+       " 'excluded_embed_metadata_keys': ['bbox'],\n",
+       " 'excluded_llm_metadata_keys': ['bbox'],\n",
+       " 'relationships': {<NodeRelationship.PREVIOUS: '2'>: {'node_id': '59644551-d995-4d0a-88f5-49bbf22f0617',\n",
+       "   'node_type': <ObjectType.TEXT: '1'>,\n",
+       "   'metadata': {'bbox': [{'page': 0,\n",
+       "      'page_height': 792.0,\n",
+       "      'page_width': 612.0,\n",
+       "      'x0': 17.31,\n",
+       "      'y0': 457.56,\n",
+       "      'x1': 590.92,\n",
+       "      'y1': 743.41}]},\n",
+       "   'hash': '77baa9ef95633b4c77c243ed3db29b4555c4f1f78c5b68f620eb8c4ff7f0a480',\n",
+       "   'class_name': 'RelatedNodeInfo'},\n",
+       "  <NodeRelationship.NEXT: '3'>: {'node_id': '50744a8a-4ccb-4efa-a625-a1e7e3feec0c',\n",
+       "   'node_type': <ObjectType.TEXT: '1'>,\n",
+       "   'metadata': {'bbox': [{'page': 0,\n",
+       "      'page_height': 792.0,\n",
+       "      'page_width': 612.0,\n",
+       "      'x0': 17.31,\n",
+       "      'y0': 211.34,\n",
+       "      'x1': 586.62,\n",
+       "      'y1': 290.85}]},\n",
+       "   'hash': '965f5304799146fde0d2bb8fb5726c0646ddf46df3ada30a902adee9005c2333',\n",
+       "   'class_name': 'RelatedNodeInfo'},\n",
+       "  <NodeRelationship.PARENT: '4'>: {'node_id': 'dc94d72c-ec16-41b7-9e01-f359284464d2',\n",
+       "   'node_type': <ObjectType.DOCUMENT: '4'>,\n",
+       "   'metadata': {'file_name': 'lyft-10k.pdf',\n",
+       "    'file_size': 379188,\n",
+       "    'creation_date': '2024-05-01',\n",
+       "    'last_modified_date': '2024-04-07'},\n",
+       "   'hash': '60b974c64ec56d53a58cfe7703901cd049f6f11c39af6612861193672cc07bd9',\n",
+       "   'class_name': 'RelatedNodeInfo'}},\n",
+       " 'text': 'Securities registered pursuant to Section 12(g) of the Act:**None**\\nIndicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒ No ☐\\nIndicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or 15(d) of the Act. Yes ☐ No ☒\\nIndicate by check mark whether the Registrant: (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such\\nshorter period that the Registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. Yes ☒ No ☐\\nIndicate by check mark whether the Registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§232.405 of this chapter) during\\nthe preceding 12 months (or for such shorter period that the Registrant was required to submit such files). Yes ☒ No ☐\\nIndicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, smaller reporting company, or an emerging growth company. See the definitions of\\n“large accelerated filer,” “accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.',\n",
+       " 'start_char_idx': None,\n",
+       " 'end_char_idx': None,\n",
+       " 'text_template': '{metadata_str}\\n\\n{content}',\n",
+       " 'metadata_template': '{key}: {value}',\n",
+       " 'metadata_seperator': '\\n',\n",
+       " 'class_name': 'TextNode'}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nodes[1].dict()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Let's add the nodes to a vector store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import VectorStoreIndex\n",
+    "\n",
+    "index = VectorStoreIndex(nodes=nodes)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Now let's query our index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "They generate revenue primarily from service fees and commissions collected from drivers for their use of the ridesharing marketplace. Additionally, they earn revenue from riders renting Light Vehicles, drivers renting vehicles through Express Drive, Lyft Rentals renters, Lyft Driver Center and Lyft Auto Care users, and by providing their ridesharing marketplace to organizations through Lyft Business offerings. In the second quarter of 2021, they also started generating revenues from licensing and data access agreements with third-party autonomous vehicle companies.\n"
+     ]
+    }
+   ],
+   "source": [
+    "query_engine = index.as_query_engine()\n",
+    "response = query_engine.query(\"What do they do to make money?\")\n",
+    "print(response)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "open-parse-notebooks",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

diff --git a/src/openparse/pdf.py b/src/openparse/pdf.py
index 1813b78..d4177f9 100644
--- a/src/openparse/pdf.py
+++ b/src/openparse/pdf.py
@@ -137,10 +137,10 @@ def to_pymupdf_doc(self):
         """
         try:
             import fitz  # type: ignore
-        except ImportError:
+        except ImportError as err:
             raise ImportError(
                 "PyMuPDF (fitz) is not installed. This method requires PyMuPDF."
-            )
+            ) from err
 
         if not self.writer.pages:
             return fitz.open(self.file_path)
@@ -156,10 +156,10 @@ def _draw_bboxes(
     ):
         try:
             import fitz
-        except ImportError:
+        except ImportError as err:
             raise ImportError(
                 "PyMuPDF (fitz) is not installed. This method requires PyMuPDF."
-            )
+            ) from err
 
         pdf = self.to_pymupdf_doc()
 
@@ -197,15 +197,15 @@ def display_with_bboxes(
         """
         try:
             from IPython.display import Image, display  # type: ignore
-        except ImportError:
+        except ImportError as err:
             raise ImportError(
                 "IPython is required to display PDFs. Please install it with `pip install ipython`."
-            )
+            ) from err
 
         assert nodes, "At least one node is required."
         bboxes = [node.bbox for node in nodes]
         flattened_bboxes = _prepare_bboxes_for_drawing(bboxes, annotations)
-        marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0].coordinates)
+        marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0].coordinate_system)
         if not page_nums:
             page_nums = list(range(marked_up_doc.page_count))
         for page_num in page_nums:
@@ -223,7 +223,7 @@ def export_with_bboxes(
 
         bboxes = [node.bbox for node in nodes]
         flattened_bboxes = _prepare_bboxes_for_drawing(bboxes, annotations)
-        marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0].coordinates)
+        marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0].coordinate_system)
         marked_up_doc.save(str(output_pdf))
 
     def _flip_coordinates(self, bbox: Bbox) -> Bbox:

diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py
index d01d085..2a0f28d 100644
--- a/src/openparse/processing/semantic_transforms.py
+++ b/src/openparse/processing/semantic_transforms.py
@@ -61,10 +61,10 @@ def embed_many(self, texts: List[str]) -> List[List[float]]:
     def _create_client(self):
         try:
             from openai import OpenAI
-        except ImportError:
+        except ImportError as err:
             raise ImportError(
                 "You need to install the openai package to use this feature."
-            )
+            ) from err
 
         return OpenAI(api_key=self.api_key)

diff --git a/src/openparse/schemas.py b/src/openparse/schemas.py
index 065cb1d..4c3d6a9 100644
--- a/src/openparse/schemas.py
+++ b/src/openparse/schemas.py
@@ -366,7 +366,7 @@ class Node(BaseModel):
     tokenization_upper_limit: int = Field(
         default=consts.TOKENIZATION_UPPER_LIMIT, frozen=True, exclude=True
     )
-    coordinates: Literal["top-left", "bottom-left"] = Field(
+    coordinate_system: Literal["top-left", "bottom-left"] = Field(
         default=consts.COORDINATE_SYSTEM, frozen=True, exclude=True
     )  # controlled globally for now, should be moved into elements
     embedding: Optional[List[float]] = Field(
@@ -374,9 +374,16 @@ class Node(BaseModel):
     )
 
     id_: str = Field(
-        default_factory=lambda: str(uuid.uuid4()), description="Unique ID of the node."
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique ID of the node.",
+        exclude=True,
     )
 
+    @computed_field  # type: ignore
+    @cached_property
+    def node_id(self) -> str:
+        return self.id_
+
     @computed_field  # type: ignore
     @cached_property
     def variant(self) -> Set[Literal["text", "table"]]:
@@ -509,7 +516,7 @@ def reading_order(self) -> ReadingOrder:
         min_page = min(element.bbox.page for element in self.elements)
         min_x0 = min(element.bbox.x0 for element in self.elements)
 
-        if self.coordinates == "bottom-left":
+        if self.coordinate_system == "bottom-left":
             y_position = -min(element.bbox.y0 for element in self.elements)
         else:
             raise NotImplementedError(
@@ -542,11 +549,29 @@ def overlaps(
 
         return False
 
+    def to_llama_index(self):
+        try:
+            from llama_index.core.schema import TextNode as LlamaIndexTextNode
+        except ImportError as err:
+            raise ImportError(
+                "llama_index is not installed. Please install it with `pip install llama_index`."
+            ) from err
+        return LlamaIndexTextNode(
+            id_=self.id_,
+            text=self.text,
+            embedding=self.embedding,
+            metadata={"bbox": [b.model_dump(mode="json") for b in self.bbox]},
+            excluded_embed_metadata_keys=["bbox"],
+            excluded_llm_metadata_keys=["bbox"],
+        )
+
     def __lt__(self, other: "Node") -> bool:
         if not isinstance(other, Node):
             return NotImplemented
 
-        assert self.coordinates == other.coordinates, "Coordinate systems must match."
+        assert (
+            self.coordinate_system == other.coordinate_system
+        ), "Coordinate systems must match."
 
         return self.reading_order < other.reading_order
 
@@ -577,6 +602,7 @@ class ParsedDocument(BaseModel):
     id_: str = Field(
         default_factory=lambda: str(uuid.uuid4()),
         description="Unique ID of the node.",
+        exclude=True,
     )
     nodes: List[Node]
     filename: str
@@ -588,5 +614,64 @@ class ParsedDocument(BaseModel):
     creation_date: Optional[dt.date] = None
     file_size: Optional[int] = None
 
-    def to_llama_index(self):
-        raise NotImplementedError("Not implemented yet.")
+    @computed_field  # type: ignore
+    @cached_property
+    def doc_id(self) -> str:
+        return self.id_
+
+    def to_llama_index_nodes(self):
+        try:
+            from llama_index.core.schema import Document as LlamaIndexDocument
+        except ImportError as err:
+            raise ImportError(
+                "llama_index is not installed. Please install it with `pip install llama_index`."
+            ) from err
+
+        li_doc = LlamaIndexDocument(
+            id_=self.id_,
+            metadata={
+                "file_name": self.filename,
+                "file_size": self.file_size,
+                "creation_date": self.creation_date.isoformat(),
+                "last_modified_date": self.last_modified_date.isoformat(),
+            },
+            excluded_embed_metadata_keys=[
+                "file_size",
+                "creation_date",
+                "last_modified_date",
+            ],
+            excluded_llm_metadata_keys=[
+                "file_name",
+                "file_size",
+                "creation_date",
+                "last_modified_date",
+            ],
+        )
+        li_nodes = self._nodes_to_llama_index(li_doc)
+
+        return li_nodes
+
+    def _nodes_to_llama_index(self, llama_index_doc):
+        try:
+            from llama_index.core.schema import NodeRelationship
+        except ImportError as err:
+            raise ImportError(
+                "llama_index is not installed. Please install it with `pip install llama_index`."
+            ) from err
+
+        li_nodes = [node.to_llama_index() for node in sorted(self.nodes)]
+        for i in range(len(li_nodes) - 1):
+            li_nodes[i].relationships[NodeRelationship.NEXT] = li_nodes[
+                i + 1
+            ].as_related_node_info()
+
+            li_nodes[i + 1].relationships[NodeRelationship.PREVIOUS] = li_nodes[
+                i
+            ].as_related_node_info()
+
+        for li_node in li_nodes:
+            li_node.relationships[NodeRelationship.PARENT] = (
+                llama_index_doc.as_related_node_info()
+            )
+
+        return li_nodes

diff --git a/src/tests/test_doc_parser.py b/src/tests/test_doc_parser.py
index 6b90de8..ddb126a 100644
--- a/src/tests/test_doc_parser.py
+++ b/src/tests/test_doc_parser.py
@@ -3,7 +3,6 @@
 
 
 def test_parse_doc():
-
     basic_doc_path = "src/evals/data/full-pdfs/mock-1-page-lease.pdf"
     parser = openparse.DocumentParser()
     parsed_basic_doc = parser.parse(basic_doc_path)
@@ -49,3 +48,12 @@ def test_parse_tables_with_pymupdf():
     parsed_doc2 = parser.parse(doc_with_tables_path)
     assert len(parsed_doc2.nodes) >= 1
     assert parsed_doc2.nodes[-1].text.startswith("= 1

From 5d6fe821427731344774ea7c64e8e9c3db5c1eb4 Mon Sep 17 00:00:00 2001
From: Sergey <hello@sergey.fyi>
Date: Wed, 1 May 2024 17:15:03 -0600
Subject: [PATCH 4/7] removed embedding stuff

---
 src/openparse/embeddings/__init__.py | 96 ----------------------------
 1 file changed, 96 deletions(-)
 delete mode 100644 src/openparse/embeddings/__init__.py

diff --git a/src/openparse/embeddings/__init__.py b/src/openparse/embeddings/__init__.py
deleted file mode 100644
index 2ec446b..0000000
--- a/src/openparse/embeddings/__init__.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""
-This is meant to provide a simple wrapper around llama_index's embeddings classes.
-"""
-
-from typing import Dict, Type
-
-from llama_index.core.embeddings import BaseEmbedding
-
-
-class ImportErrorProxy:
-    """
-    Used to raise an ImportError when an attribute or method is accessed on a class that failed to import.
-    """
-
-    def __init__(self, class_name, install_command):
-        self.class_name = class_name
-        self.install_command = install_command
-        self.error_message = (
-            f"Missing optional dependency for '{class_name}'. "
-            f"Please install it by running: '{install_command}'."
-        )
-
-    def __getattr__(self, name):
-        raise ImportError(
-            f"{self.error_message} The attribute '{name}' cannot be used."
-        )
-
-    def __call__(self, *args, **kwargs):
-        raise ImportError(self.error_message)
-
-
-try:
-    from llama_index.embeddings.openai import (
-        OpenAIEmbedding,
-    )
-
-except ImportError:
-    OpenAIEmbedding = ImportErrorProxy(
-        "OpenAIEmbedding",
-        "pip install openparse[embeddings-openai]",
-    )
-
-try:
-    from llama_index.embeddings.azure_openai import (
-        AzureOpenAIEmbedding,
-    )
-
-except ImportError:
-    AzureOpenAIEmbedding = ImportErrorProxy(
-        "AzureOpenAIEmbedding",
-        "pip install openparse[embeddings-azure-openai]",
-    )
-
-try:
-    from llama_index.embeddings.huggingface import (
-        HuggingFaceInferenceAPIEmbedding,
-    )
-
-except ImportError:
-    HuggingFaceInferenceAPIEmbedding = ImportErrorProxy(
-        "HuggingFaceInferenceAPIEmbedding",
-        "pip install openparse[embeddings-huggingface]",
-    )
-
-
-try:
-    from llama_index.embeddings.huggingface_optimum import (
-        OptimumEmbedding,
-    )
-
-except ImportError:
-    OptimumEmbedding = ImportErrorProxy(
-        "OptimumEmbedding",
-        "pip install openparse[embeddings-huggingface-optimum]",
-    )
-
-try:
-    from llama_index.embeddings.cohere import CohereEmbedding
-
-except ImportError:
-    CohereEmbedding = ImportErrorProxy(
-        "CohereEmbedding",
-        "pip install openparse[embeddings-cohere]",
-    )
-
-
-try:
-    from llama_index.embeddings.text_embeddings_inference import (
-        TextEmbeddingsInference,
-    )
-
-except ImportError:
-    TextEmbeddingsInference = ImportErrorProxy(
-        "TextEmbeddingsInference",
-        "pip install openparse[embeddings-text-embeddings-inference]",
-    )

From 08ea9d913404c02f2308331e4927d06dcba06b62 Mon Sep 17 00:00:00 2001
From: Sergey <hello@sergey.fyi>
Date: Wed, 1 May 2024 17:16:59 -0600
Subject: [PATCH 5/7] bump version

---
 pyproject.toml           | 2 +-
 src/openparse/version.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ed6bd55..f0b1447 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ description = "Streamlines the process of preparing documents for LLM's."
 readme = "README.md"
 requires-python = ">=3.8"
 license = "MIT"
-version = "0.5.5"
+version = "0.5.6"
 authors = [{name = "Sergey Filimonov", email = "hello@sergey.fyi"}]
 dependencies = [
     "PyMuPDF >= 1.23.2",

diff --git a/src/openparse/version.py b/src/openparse/version.py
index d690686..cd0bc57 100644
--- a/src/openparse/version.py
+++ b/src/openparse/version.py
@@ -1,4 +1,4 @@
-OPEN_PARSE_VERSION = "0.5.5"
+OPEN_PARSE_VERSION = "0.5.6"
 
 
 def version_info() -> str:

From 289345daa43621dbab593987265465cf640ce4fb Mon Sep 17 00:00:00 2001
From: Sergey <hello@sergey.fyi>
Date: Wed, 1 May 2024 17:44:29 -0600
Subject: [PATCH 6/7] updated error messages

---
 src/openparse/schemas.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/openparse/schemas.py b/src/openparse/schemas.py
index 4c3d6a9..0d0f45f 100644
--- a/src/openparse/schemas.py
+++ b/src/openparse/schemas.py
@@ -357,6 +357,11 @@ def _determine_relationship(
 
 
 class Node(BaseModel):
+    id_: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique ID of the node.",
+        exclude=True,
+    )
     elements: Tuple[Union[TextElement, TableElement], ...] = Field(
         exclude=True, frozen=True
     )
@@ -373,12 +378,6 @@ class Node(BaseModel):
         default=None, description="Embedding of the node."
     )
 
-    id_: str = Field(
-        default_factory=lambda: str(uuid.uuid4()),
-        description="Unique ID of the node.",
-        exclude=True,
-    )
-
     @computed_field  # type: ignore
     @cached_property
     def node_id(self) -> str:
         return self.id_
@@ -554,7 +553,7 @@ def to_llama_index(self):
             from llama_index.core.schema import TextNode as LlamaIndexTextNode
         except ImportError as err:
             raise ImportError(
-                "llama_index is not installed. Please install it with `pip install llama_index`."
+                "llama_index is not installed. Please install it with `pip install llama-index`."
             ) from err
         return LlamaIndexTextNode(
             id_=self.id_,
@@ -624,7 +623,7 @@ def to_llama_index_nodes(self):
             from llama_index.core.schema import Document as LlamaIndexDocument
         except ImportError as err:
             raise ImportError(
-                "llama_index is not installed. Please install it with `pip install llama_index`."
+                "llama_index is not installed. Please install it with `pip install llama-index`."
             ) from err
 
         li_doc = LlamaIndexDocument(
@@ -656,7 +655,7 @@ def _nodes_to_llama_index(self, llama_index_doc):
             from llama_index.core.schema import NodeRelationship
         except ImportError as err:
             raise ImportError(
-                "llama_index is not installed. Please install it with `pip install llama_index`."
+                "llama_index is not installed. Please install it with `pip install llama-index`."
             ) from err
 
         li_nodes = [node.to_llama_index() for node in sorted(self.nodes)]

From 7faea1db17cf4c0a75bf43eb588d8abe3e047bf5 Mon Sep 17 00:00:00 2001
From: Sergey <hello@sergey.fyi>
Date: Wed, 1 May 2024 17:58:26 -0600
Subject: [PATCH 7/7] updated docs

---
 docs/integrations.md | 29 +++++++++++++++++++++++++++++
 mkdocs.yml           |  1 +
 2 files changed, 30 insertions(+)
 create mode 100644 docs/integrations.md

diff --git a/docs/integrations.md b/docs/integrations.md
new file mode 100644
index 0000000..ebad0cd
--- /dev/null
+++ b/docs/integrations.md
@@ -0,0 +1,29 @@
+## Llama Index
+
+We have a simple integration with Llama Index. You can convert the parsed document to Llama Index nodes and then create an index from those nodes.
+
+```py
+import openparse
+from llama_index.core import VectorStoreIndex
+
+doc_path = "./sample-docs/lyft-10k.pdf"
+parser = openparse.DocumentParser()
+parsed_doc = parser.parse(doc_path)
+
+nodes = parsed_doc.to_llama_index_nodes()
+index = VectorStoreIndex(nodes=nodes)
+```
+
+Now you can query the index
+
+```py
+query_engine = index.as_query_engine()
+response = query_engine.query("What do they do to make money?")
+print(response)
+```
+
+You can also add nodes to an existing index
+
+```py
+existing_index.insert_nodes(nodes)
+```

diff --git a/mkdocs.yml b/mkdocs.yml
index eeb71c9..93da1ae 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -78,6 +78,7 @@ nav:
   - Advanced:
       - Customization: processing/customization.md
       - Serializing Results: serialization.md
+      - Integrations: integrations.md
   - Visualization: visualization.md
   - Config: config.md