diff --git a/.gitignore b/.gitignore
index 9279baa..ff8db49 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,4 +36,5 @@ test-output.pdf
 notebooks/
 sample-docs/
 weights/
+.env
diff --git a/docs/integrations.md b/docs/integrations.md
new file mode 100644
index 0000000..ebad0cd
--- /dev/null
+++ b/docs/integrations.md
@@ -0,0 +1,29 @@
+## Llama Index
+
+We have a simple integration with Llama Index. You can convert the parsed document to Llama Index nodes and then create an index from those nodes.
+
+```py
+import openparse
+from llama_index.core import VectorStoreIndex
+
+doc_path = "./sample-docs/lyft-10k.pdf"
+parser = openparse.DocumentParser()
+parsed_doc = parser.parse(doc_path)
+
+nodes = parsed_doc.to_llama_index_nodes()
+index = VectorStoreIndex(nodes=nodes)
+```
+
+Now you can query the index:
+
+```py
+query_engine = index.as_query_engine()
+response = query_engine.query("What do they do to make money?")
+print(response)
+```
+
+You can also add the nodes to an existing index:
+
+```py
+existing_index.insert_nodes(nodes)
+```
diff --git a/mkdocs.yml b/mkdocs.yml
index eeb71c9..93da1ae 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -78,6 +78,7 @@ nav:
   - Advanced:
       - Customization: processing/customization.md
       - Serializing Results: serialization.md
+      - Integrations: integrations.md
       - Visualization: visualization.md
       - Config: config.md
diff --git a/pyproject.toml b/pyproject.toml
index ed6bd55..f0b1447 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ description = "Streamlines the process of preparing documents for LLM's."
 readme = "README.md"
 requires-python = ">=3.8"
 license = "MIT"
-version = "0.5.5"
+version = "0.5.6"
 authors = [{name = "Sergey Filimonov", email = "hello@sergey.fyi"}]
 dependencies = [
     "PyMuPDF >= 1.23.2",
diff --git a/requirements-dev.txt b/requirements-dev.txt
index fa133dd..0efca34 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -9,3 +9,4 @@ beautifulsoup4
 twine
 packaging
 wheel
+llama-index
diff --git a/src/cookbooks/llama_index.ipynb b/src/cookbooks/llama_index.ipynb
new file mode 100644
index 0000000..59673bc
--- /dev/null
+++ b/src/cookbooks/llama_index.ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2024-05-01 17:04:54--  https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/lyft_2021-first-20-pages.pdf\n",
+      "Resolving sergey-filimonov.nyc3.digitaloceanspaces.com (sergey-filimonov.nyc3.digitaloceanspaces.com)... 162.243.189.2\n",
+      "Connecting to sergey-filimonov.nyc3.digitaloceanspaces.com (sergey-filimonov.nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 379188 (370K) [application/pdf]\n",
+      "Saving to: ‘sample-docs/lyft-10k.pdf’\n",
+      "\n",
+      "sample-docs/lyft-10 100%[===================>] 370.30K  1.99MB/s    in 0.2s    \n",
+      "\n",
+      "2024-05-01 17:04:57 (1.99 MB/s) - ‘sample-docs/lyft-10k.pdf’ saved [379188/379188]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "\n",
+    "sys.path.append(\"..\")\n",
+    "\n",
+    "!wget https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/lyft_2021-first-20-pages.pdf -O sample-docs/lyft-10k.pdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make sure llama-index is installed; it's not a formal dependency of open-parse\n",
+    "# %pip install llama-index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Finished parsing\n"
+     ]
+    }
+   ],
+   "source": [
+    "import openparse\n",
+    "\n",
+    "doc_path = \"./sample-docs/lyft-10k.pdf\"\n",
+    "parser = openparse.DocumentParser()\n",
+    "parsed_doc = parser.parse(doc_path)\n",
+    "\n",
+    "print(\"Finished parsing\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Node ID: 33747e2d-0478-4628-b112-d733b1fc5039\n",
+      "Text: Securities registered pursuant to Section 12(g) of the\n",
+      "Act:**None** Indicate by check mark if the Registrant is a well-known\n",
+      "seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒\n",
+      "No ☐ Indicate by check mark if the Registrant is not required to file\n",
+      "reports pursuant to Section 13 or 15(d) of the Act. Yes ☐ No ☒\n",
+      "Indicate by check ma...\n"
+     ]
+    }
+   ],
+   "source": [
+    "nodes = parsed_doc.to_llama_index_nodes()\n",
+    "\n",
+    "print(nodes[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'id_': '33747e2d-0478-4628-b112-d733b1fc5039',\n",
+       " 'embedding': None,\n",
+       " 'metadata': {'bbox': [{'page': 0,\n",
+       "    'page_height': 792.0,\n",
+       "    'page_width': 612.0,\n",
+       "    'x0': 17.31,\n",
+       "    'y0': 332.53,\n",
+       "    'x1': 586.25,\n",
+       "    'y1': 424.21}]},\n",
+       " 'excluded_embed_metadata_keys': ['bbox'],\n",
+       " 'excluded_llm_metadata_keys': ['bbox'],\n",
+       " 'relationships': {<NodeRelationship.PREVIOUS: '2'>: {'node_id': '59644551-d995-4d0a-88f5-49bbf22f0617',\n",
+       "   'node_type': <ObjectType.TEXT: '1'>,\n",
+       "   'metadata': {'bbox': [{'page': 0,\n",
+       "      'page_height': 792.0,\n",
+       "      'page_width': 612.0,\n",
+       "      'x0': 17.31,\n",
+       "      'y0': 457.56,\n",
+       "      'x1': 590.92,\n",
+       "      'y1': 743.41}]},\n",
+       "   'hash': '77baa9ef95633b4c77c243ed3db29b4555c4f1f78c5b68f620eb8c4ff7f0a480',\n",
+       "   'class_name': 'RelatedNodeInfo'},\n",
+       "  <NodeRelationship.NEXT: '3'>: {'node_id': '50744a8a-4ccb-4efa-a625-a1e7e3feec0c',\n",
+       "   'node_type': <ObjectType.TEXT: '1'>,\n",
+       "   'metadata': {'bbox': [{'page': 0,\n",
+       "      'page_height': 792.0,\n",
+       "      'page_width': 612.0,\n",
+       "      'x0': 17.31,\n",
+       "      'y0': 211.34,\n",
+       "      'x1': 586.62,\n",
+       "      'y1': 290.85}]},\n",
+       "   'hash': '965f5304799146fde0d2bb8fb5726c0646ddf46df3ada30a902adee9005c2333',\n",
+       "   'class_name': 'RelatedNodeInfo'},\n",
+       "  <NodeRelationship.PARENT: '4'>: {'node_id': 'dc94d72c-ec16-41b7-9e01-f359284464d2',\n",
+       "   'node_type': <ObjectType.DOCUMENT: '4'>,\n",
+       "   'metadata': {'file_name': 'lyft-10k.pdf',\n",
+       "    'file_size': 379188,\n",
+       "    'creation_date': '2024-05-01',\n",
+       "    'last_modified_date': '2024-04-07'},\n",
+       "   'hash': '60b974c64ec56d53a58cfe7703901cd049f6f11c39af6612861193672cc07bd9',\n",
+       "   'class_name': 'RelatedNodeInfo'}},\n",
+       " 'text': 'Securities registered pursuant to Section 12(g) of the Act:**None**\\nIndicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒ No ☐\\nIndicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or 15(d) of the Act. Yes ☐ No ☒\\nIndicate by check mark whether the Registrant: (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such\\nshorter period that the Registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. Yes ☒ No ☐\\nIndicate by check mark whether the Registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§232.405 of this chapter) during\\nthe preceding 12 months (or for such shorter period that the Registrant was required to submit such files). Yes ☒ No ☐\\nIndicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, smaller reporting company, or an emerging growth company. See the definitions of\\n“large accelerated filer,” “accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.',\n",
+       " 'start_char_idx': None,\n",
+       " 'end_char_idx': None,\n",
+       " 'text_template': '{metadata_str}\\n\\n{content}',\n",
+       " 'metadata_template': '{key}: {value}',\n",
+       " 'metadata_seperator': '\\n',\n",
+       " 'class_name': 'TextNode'}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nodes[1].dict()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Let's add the nodes to a vector store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import VectorStoreIndex\n",
+    "\n",
+    "index = VectorStoreIndex(nodes=nodes)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Now let's query our index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "They generate revenue primarily from service fees and commissions collected from drivers for their use of the ridesharing marketplace. Additionally, they earn revenue from riders renting Light Vehicles, drivers renting vehicles through Express Drive, Lyft Rentals renters, Lyft Driver Center and Lyft Auto Care users, and by providing their ridesharing marketplace to organizations through Lyft Business offerings. In the second quarter of 2021, they also started generating revenues from licensing and data access agreements with third-party autonomous vehicle companies.\n"
+     ]
+    }
+   ],
+   "source": [
+    "query_engine = index.as_query_engine()\n",
+    "response = query_engine.query(\"What do they do to make money?\")\n",
+    "print(response)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "open-parse-notebooks",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/openparse/doc_parser.py b/src/openparse/doc_parser.py
index 4c474bc..390f549 100644
--- a/src/openparse/doc_parser.py
+++ b/src/openparse/doc_parser.py
@@ -117,6 +117,10 @@ def parse(
             table_parsing_kwargs=(
                 table_args_obj.model_dump() if table_args_obj else None
             ),
+            creation_date=doc.file_metadata.get("creation_date"),
+            last_modified_date=doc.file_metadata.get("last_modified_date"),
+            last_accessed_date=doc.file_metadata.get("last_accessed_date"),
+            file_size=doc.file_metadata.get("file_size"),
         )
         return parsed_doc
diff --git a/src/openparse/pdf.py b/src/openparse/pdf.py
index 100f347..04d6b5c 100644
--- a/src/openparse/pdf.py
+++ b/src/openparse/pdf.py
@@ -1,15 +1,16 @@
+import os
+import mimetypes
+import datetime as dt
 import random
 import io
 from pathlib import Path
-from typing import Iterator, List, Literal, Optional, Union, Tuple, Any
-
+from typing import Iterator, List, Literal, Optional, Union, Tuple, Any, Dict
 from pydantic import BaseModel
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTPage
 from pypdf import PdfReader, PdfWriter
 
 from openparse.schemas import Bbox, Node
-from openparse import consts
 
 
 class _BboxWithColor(BaseModel):
@@ -60,13 +61,41 @@ def _prepare_bboxes_for_drawing(
     return res
 
 
+def file_metadata(file_path: Union[str, Path]) -> Dict:
+    """Get some handy metadata from the filesystem.
+
+    Args:
+        file_path: Path to the file on disk.
+    """
+    return {
+        "file_path": str(file_path),
+        "file_name": os.path.basename(file_path),
+        "file_type": mimetypes.guess_type(file_path)[0],
+        "file_size": os.path.getsize(file_path),
+        "creation_date": dt.datetime.fromtimestamp(
+            Path(file_path).stat().st_ctime
+        ).strftime("%Y-%m-%d"),
+        "last_modified_date": dt.datetime.fromtimestamp(
+            Path(file_path).stat().st_mtime
+        ).strftime("%Y-%m-%d"),
+        "last_accessed_date": dt.datetime.fromtimestamp(
+            Path(file_path).stat().st_atime
+        ).strftime("%Y-%m-%d"),
+    }
+
+
 class Pdf:
     """
     Simple utility class for working with PDF files. This class wraps the PdfReader and PdfWriter classes from pypdf.
     """
 
     def __init__(self, file: Union[str, Path, PdfReader]):
-        self.file_path = str(file) if isinstance(file, (str, Path)) else None
+        self.file_path = None
+        self.file_metadata = dict()
+        if isinstance(file, (str, Path)):
+            self.file_path = str(file)
+            self.file_metadata = file_metadata(file)
+
         self.reader = PdfReader(file) if isinstance(file, (str, Path)) else file
         self.writer = PdfWriter()
         for page in self.reader.pages:
@@ -107,10 +136,10 @@ def to_pymupdf_doc(self):
         """
         try:
             import fitz  # type: ignore
-        except ImportError:
+        except ImportError as err:
             raise ImportError(
                 "PyMuPDF (fitz) is not installed. This method requires PyMuPDF."
-            )
+            ) from err
 
         if not self.writer.pages:
             return fitz.open(self.file_path)
@@ -126,10 +155,10 @@ def _draw_bboxes(
     ):
         try:
             import fitz
-        except ImportError:
+        except ImportError as err:
             raise ImportError(
                 "PyMuPDF (fitz) is not installed. This method requires PyMuPDF."
-            )
+            ) from err
 
         pdf = self.to_pymupdf_doc()
 
@@ -167,15 +196,15 @@ def display_with_bboxes(
         """
         try:
             from IPython.display import Image, display  # type: ignore
-        except ImportError:
+        except ImportError as err:
             raise ImportError(
                 "IPython is required to display PDFs. Please install it with `pip install ipython`."
-            )
+            ) from err
 
         assert nodes, "At least one node is required."
         bboxes = [node.bbox for node in nodes]
         flattened_bboxes = _prepare_bboxes_for_drawing(bboxes, annotations)
-        marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0]._coordinates)
+        marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0].coordinate_system)
 
         if not page_nums:
             page_nums = list(range(marked_up_doc.page_count))
         for page_num in page_nums:
@@ -193,7 +222,7 @@ def export_with_bboxes(
 
         bboxes = [node.bbox for node in nodes]
         flattened_bboxes = _prepare_bboxes_for_drawing(bboxes, annotations)
-        marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0]._coordinates)
+        marked_up_doc = self._draw_bboxes(flattened_bboxes, nodes[0].coordinate_system)
 
         marked_up_doc.save(str(output_pdf))
 
     def _flip_coordinates(self, bbox: Bbox) -> Bbox:
diff --git a/src/openparse/processing/basic_transforms.py b/src/openparse/processing/basic_transforms.py
index d2ba0a4..a055c0f 100644
--- a/src/openparse/processing/basic_transforms.py
+++ b/src/openparse/processing/basic_transforms.py
@@ -33,18 +33,21 @@ def process(self, nodes: List[Node]) -> List[Node]:
                 updated_nodes.append(node)
                 continue
 
-            new_elements = [
-                element
-                for element in node.elements
-                if not (
+            new_elements = []
+            for element in node.elements:
+                should_include = not (
                     isinstance(element, TextElement)
                     and self.intersects_any_table(
                         element.bbox, tables_by_page[element.page]
                    )
                )
-            ]
-            if new_elements:
+                if should_include:
+                    new_elements.append(element)
+
+            if new_elements and len(new_elements) != len(node.elements):
                 updated_nodes.append(Node(elements=tuple(new_elements)))
+            elif len(new_elements) == len(node.elements):
+                updated_nodes.append(node)
 
         return updated_nodes
diff --git a/src/openparse/processing/semantic_transforms.py b/src/openparse/processing/semantic_transforms.py
index d01d085..2a0f28d 100644
--- a/src/openparse/processing/semantic_transforms.py
+++ b/src/openparse/processing/semantic_transforms.py
@@ -61,10 +61,10 @@ def embed_many(self, texts: List[str]) -> List[List[float]]:
 
     def _create_client(self):
         try:
             from openai import OpenAI
-        except ImportError:
+        except ImportError as err:
             raise ImportError(
                 "You need to install the openai package to use this feature."
-            )
+            ) from err
 
         return OpenAI(api_key=self.api_key)
diff --git a/src/openparse/schemas.py b/src/openparse/schemas.py
index de07c64..0d0f45f 100644
--- a/src/openparse/schemas.py
+++ b/src/openparse/schemas.py
@@ -1,6 +1,8 @@
 import re
 from collections import defaultdict, namedtuple
 from enum import Enum
+import datetime as dt
+import uuid
 from functools import cached_property
 from typing import Any, List, Literal, Optional, Tuple, Union, Set
 
@@ -355,12 +357,31 @@ def _determine_relationship(
 
 
 class Node(BaseModel):
-    elements: Tuple[Union[TextElement, TableElement], ...] = Field(exclude=True)
-    _tokenization_lower_limit: int = consts.TOKENIZATION_LOWER_LIMIT
-    _tokenization_upper_limit: int = consts.TOKENIZATION_UPPER_LIMIT
-    _coordinates: Literal["top-left", "bottom-left"] = (
-        consts.COORDINATE_SYSTEM
+    id_: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique ID of the node.",
+        exclude=True,
+    )
+    elements: Tuple[Union[TextElement, TableElement], ...] = Field(
+        exclude=True, frozen=True
+    )
+    tokenization_lower_limit: int = Field(
+        default=consts.TOKENIZATION_LOWER_LIMIT, frozen=True, exclude=True
+    )
+    tokenization_upper_limit: int = Field(
+        default=consts.TOKENIZATION_UPPER_LIMIT, frozen=True, exclude=True
+    )
+    coordinate_system: Literal["top-left", "bottom-left"] = Field(
+        default=consts.COORDINATE_SYSTEM, frozen=True, exclude=True
     )  # controlled globally for now, should be moved into elements
+    embedding: Optional[List[float]] = Field(
+        default=None, description="Embedding of the node."
+    )
+
+    @computed_field  # type: ignore
+    @cached_property
+    def node_id(self) -> str:
+        return self.id_
 
     @computed_field  # type: ignore
     @cached_property
@@ -464,11 +485,11 @@ def is_stub(self) -> bool:
 
     @cached_property
     def is_small(self) -> bool:
-        return self.tokens < self._tokenization_lower_limit
+        return self.tokens < self.tokenization_lower_limit
 
     @cached_property
     def is_large(self) -> bool:
-        return self.tokens > self._tokenization_upper_limit
+        return self.tokens > self.tokenization_upper_limit
 
     @cached_property
     def num_pages(self) -> int:
@@ -494,7 +515,7 @@ def reading_order(self) -> ReadingOrder:
         min_page = min(element.bbox.page for element in self.elements)
         min_x0 = min(element.bbox.x0 for element in self.elements)
 
-        if self._coordinates == "bottom-left":
+        if self.coordinate_system == "bottom-left":
             y_position = -min(element.bbox.y0 for element in self.elements)
         else:
             raise NotImplementedError(
@@ -527,11 +548,29 @@ def overlaps(
 
         return False
 
+    def to_llama_index(self):
+        try:
+            from llama_index.core.schema import TextNode as LlamaIndexTextNode
+        except ImportError as err:
+            raise ImportError(
+                "llama_index is not installed. Please install it with `pip install llama-index`."
+            ) from err
+        return LlamaIndexTextNode(
+            id_=self.id_,
+            text=self.text,
+            embedding=self.embedding,
+            metadata={"bbox": [b.model_dump(mode="json") for b in self.bbox]},
+            excluded_embed_metadata_keys=["bbox"],
+            excluded_llm_metadata_keys=["bbox"],
+        )
+
     def __lt__(self, other: "Node") -> bool:
         if not isinstance(other, Node):
             return NotImplemented
 
-        assert self._coordinates == other._coordinates, "Coordinate systems must match."
+        assert (
+            self.coordinate_system == other.coordinate_system
+        ), "Coordinate systems must match."
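+        # ReadingOrder tuples compare as (page, y position, x0), i.e. top-to-bottom, left-to-right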
         return self.reading_order < other.reading_order
 
 
@@ -552,8 +591,6 @@ def __add__(self, other: "Node") -> "Node":
         new_elems = self.elements + other.elements
         return Node(elements=new_elems)
 
-    model_config = ConfigDict(frozen=True)
-
 
 #######################
 ### PARSED DOCUMENT ###
@@ -561,8 +598,79 @@
 
 
 class ParsedDocument(BaseModel):
+    id_: str = Field(
+        default_factory=lambda: str(uuid.uuid4()),
+        description="Unique ID of the document.",
+        exclude=True,
+    )
     nodes: List[Node]
     filename: str
     num_pages: int
     coordinate_system: Literal["top-left", "bottom-left"] = "bottom-left"
     table_parsing_kwargs: Optional[dict] = None
+    last_modified_date: Optional[dt.date] = None
+    last_accessed_date: Optional[dt.date] = None
+    creation_date: Optional[dt.date] = None
+    file_size: Optional[int] = None
+
+    @computed_field  # type: ignore
+    @cached_property
+    def doc_id(self) -> str:
+        return self.id_
+
+    def to_llama_index_nodes(self):
+        try:
+            from llama_index.core.schema import Document as LlamaIndexDocument
+        except ImportError as err:
+            raise ImportError(
+                "llama_index is not installed. Please install it with `pip install llama-index`."
+            ) from err
+
+        li_doc = LlamaIndexDocument(
+            id_=self.id_,
+            metadata={
+                "file_name": self.filename,
+                "file_size": self.file_size,
+                "creation_date": (
+                    self.creation_date.isoformat() if self.creation_date else None
+                ),
+                "last_modified_date": (
+                    self.last_modified_date.isoformat()
+                    if self.last_modified_date
+                    else None
+                ),
+            },
+            excluded_embed_metadata_keys=[
+                "file_size",
+                "creation_date",
+                "last_modified_date",
+            ],
+            excluded_llm_metadata_keys=[
+                "file_name",
+                "file_size",
+                "creation_date",
+                "last_modified_date",
+            ],
+        )
+        li_nodes = self._nodes_to_llama_index(li_doc)
+
+        return li_nodes
+
+    def _nodes_to_llama_index(self, llama_index_doc):
+        try:
+            from llama_index.core.schema import NodeRelationship
+        except ImportError as err:
+            raise ImportError(
+                "llama_index is not installed. Please install it with `pip install llama-index`."
+            ) from err
+
+        li_nodes = [node.to_llama_index() for node in sorted(self.nodes)]
+        for i in range(len(li_nodes) - 1):
+            li_nodes[i].relationships[NodeRelationship.NEXT] = li_nodes[
+                i + 1
+            ].as_related_node_info()
+
+            li_nodes[i + 1].relationships[NodeRelationship.PREVIOUS] = li_nodes[
+                i
+            ].as_related_node_info()
+
+        for li_node in li_nodes:
+            li_node.relationships[NodeRelationship.PARENT] = (
+                llama_index_doc.as_related_node_info()
+            )
+
+        return li_nodes
diff --git a/src/openparse/version.py b/src/openparse/version.py
index d690686..cd0bc57 100644
--- a/src/openparse/version.py
+++ b/src/openparse/version.py
@@ -1,4 +1,4 @@
-OPEN_PARSE_VERSION = "0.5.5"
+OPEN_PARSE_VERSION = "0.5.6"
 
 
 def version_info() -> str:
diff --git a/src/tests/test_doc_parser.py b/src/tests/test_doc_parser.py
index 6b90de8..ddb126a 100644
--- a/src/tests/test_doc_parser.py
+++ b/src/tests/test_doc_parser.py
@@ -3,7 +3,6 @@
 
 
 def test_parse_doc():
-
     basic_doc_path = "src/evals/data/full-pdfs/mock-1-page-lease.pdf"
     parser = openparse.DocumentParser()
     parsed_basic_doc = parser.parse(basic_doc_path)
@@ -49,3 +48,12 @@ def test_parse_tables_with_pymupdf():
     parsed_doc2 = parser.parse(doc_with_tables_path)
     assert len(parsed_doc2.nodes) >= 1
     assert parsed_doc2.nodes[-1].text.startswith("= 1
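For reference, the integration added above can be exercised end to end with a few extra lines. This is a minimal sketch, not part of the diff: it assumes llama-index's default OpenAI-backed embedding model, and the `./index-storage` directory is just an example path.

```py
import openparse
from llama_index.core import (
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

# Parse the PDF and convert the resulting nodes for llama-index.
parsed = openparse.DocumentParser().parse("./sample-docs/lyft-10k.pdf")
index = VectorStoreIndex(nodes=parsed.to_llama_index_nodes())

# Persist the index so later runs can skip re-parsing and re-embedding.
index.storage_context.persist(persist_dir="./index-storage")

# Reload the persisted index and query it.
storage_context = StorageContext.from_defaults(persist_dir="./index-storage")
index = load_index_from_storage(storage_context)
print(index.as_query_engine().query("What do they do to make money?"))
```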