Filimoa · Filimoa · May 20, 2024 · Apr 14, 2024 · Apr 14, 2024 · Apr 19, 2024
diff --git a/.github/ISSUE_TEMPLATE/bug.yml b/.github/ISSUE_TEMPLATE/bug.yml
@@ -8,14 +8,13 @@ body:
       value: Thank you for contributing to open-parse! ✊
 
   - type: checkboxes
-    id: checks
+    id: version-checks
     attributes:
       label: Initial Checks
       description: Just making sure you're using the latest version
       options:
         - label: I confirm that I'm on the latest version
           required: true
-
   - type: textarea
     id: description
     attributes:
@@ -41,3 +40,20 @@ body:
 
         ...
       render: Python
+
+  - type: textarea
+    id: version
+    attributes:
+      label: Python, open-parse & OS Version
+      description: |
+        Which version of Python & open-parse are you using, and which Operating System?
+
+        Please run the following command and copy the output below:
+
+        ```bash
+        python -c "import openparse.version; print(openparse.version.version_info())"
+        ```
+
+      render: Text
+    validations:
+      required: true
diff --git a/.gitignore b/.gitignore
@@ -37,4 +37,5 @@ notebooks/
 sample-docs/
 weights/
 .env
+.python-version
 
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
@@ -20,7 +20,7 @@ We parse text into markdown by looking at the font size and style charachter by
 
 Spans get combined into lines and lines get combined into elements. Elements are the basic building blocks of the document. They can be headings, paragraphs, lists of bullets, etc.
 
-Optionally we can use PyMuPDF to OCR the document. This is not recommended as a default due to the additional computational cost and inherent inaccuracies of OCR. We're looking at integrating [doctr](https://github.com/mindee/doctr).
+Optionally we can use PyMuPDF to OCR the document. This is not recommended as a default due to the additional computational cost and inherent inaccuracies of OCR. We're looking at integrating [doctr](https://github.com/mindee/doctr).  Early tests make this library seem very heavy and slow.
 
 Here's an article that goes into more details on available [OCR libraries](https://source.opennews.org/articles/our-search-best-ocr-tool-2023/).
 

diff --git a/docs/index.md b/docs/index.md
@@ -63,7 +63,7 @@ parsed_content = parser.parse(basic_doc_path)
 
 ## Cookbooks
 
-https://github.com/Filimoa/open-parse/tree/main/src/cookbooks
+[Other Cookbooks](https://github.com/Filimoa/open-parse/tree/main/src/cookbooks)
 
 
 ## Sponsors

diff --git a/docs/integrations.md b/docs/integrations.md
@@ -0,0 +1,29 @@
+## Llama Index 
+
+We have a simple integration with Llama Index. You can convert the parsed document to Llama Index nodes and then create an index from those nodes.
+
+```py
+import openparse
+from llama_index.core import VectorStoreIndex
+
+doc_path = "./sample-docs/lyft-10k.pdf"
+parser = openparse.DocumentParser()
+parsed_doc = parser.parse(doc_path)
+
+nodes = parsed_doc.to_llama_index_nodes()
+index = VectorStoreIndex(nodes=nodes)
+```
+
+Now you can query the index 
+
+```py
+query_engine = index.as_query_engine()
+response = query_engine.query("What do they do to make money?")
+print(response)
+```
+
+You can also add nodes to an existing index
+
+```py
+existing_index.insert_nodes(nodes)
+```
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -78,6 +78,7 @@ nav:
       - Advanced:
           - Customization: processing/customization.md
   - Serializing Results: serialization.md
+  - Integrations: integrations.md
   - Visualization: visualization.md
   - Config: config.md
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,13 +1,14 @@
 [build-system]
-requires = ["setuptools>=42", "wheel"]
+requires = ["setuptools", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "openparse"
 description = "Streamlines the process of preparing documents for LLM's."
 readme = "README.md"
 requires-python = ">=3.8"
-version = "0.5.2"
+license = { text = "MIT" }
+version = "0.5.6"
 authors = [{name = "Sergey Filimonov", email = "hello@sergey.fyi"}]
 dependencies = [
     "PyMuPDF >= 1.23.2",

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -9,3 +9,4 @@ beautifulsoup4
 twine
 packaging
 wheel
+llama-index
diff --git a/src/cookbooks/llama_index.ipynb b/src/cookbooks/llama_index.ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2024-05-01 17:04:54--  https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/lyft_2021-first-20-pages.pdf\n",
+      "Resolving sergey-filimonov.nyc3.digitaloceanspaces.com (sergey-filimonov.nyc3.digitaloceanspaces.com)... 162.243.189.2\n",
+      "Connecting to sergey-filimonov.nyc3.digitaloceanspaces.com (sergey-filimonov.nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 379188 (370K) [application/pdf]\n",
+      "Saving to: ‘sample-docs/lyft-10k.pdf’\n",
+      "\n",
+      "sample-docs/lyft-10 100%[===================>] 370.30K  1.99MB/s    in 0.2s    \n",
+      "\n",
+      "2024-05-01 17:04:57 (1.99 MB/s) - ‘sample-docs/lyft-10k.pdf’ saved [379188/379188]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "\n",
+    "sys.path.append(\"..\")\n",
+    "\n",
+    "!wget https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/lyft_2021-first-20-pages.pdf -O sample-docs/lyft-10k.pdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# make sure llama-index is installed, it's not a formal dependency of open-parse\n",
+    "# %pip install llama-index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Finished parsing\n"
+     ]
+    }
+   ],
+   "source": [
+    "import openparse\n",
+    "\n",
+    "doc_path = \"./sample-docs/lyft-10k.pdf\"\n",
+    "parser = openparse.DocumentParser()\n",
+    "parsed_doc = parser.parse(doc_path)\n",
+    "\n",
+    "print(\"Finished parsing\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Node ID: 33747e2d-0478-4628-b112-d733b1fc5039\n",
+      "Text: Securities registered pursuant to Section 12(g) of the\n",
+      "Act:**None** Indicate by check mark if the Registrant is a well-known\n",
+      "seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒\n",
+      "No ☐ Indicate by check mark if the Registrant is not required to file\n",
+      "reports pursuant to Section 13 or 15(d) of the Act. Yes ☐ No ☒\n",
+      "Indicate by check ma...\n"
+     ]
+    }
+   ],
+   "source": [
+    "nodes = parsed_doc.to_llama_index_nodes()\n",
+    "\n",
+    "print(nodes[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'id_': '33747e2d-0478-4628-b112-d733b1fc5039',\n",
+       " 'embedding': None,\n",
+       " 'metadata': {'bbox': [{'page': 0,\n",
+       "    'page_height': 792.0,\n",
+       "    'page_width': 612.0,\n",
+       "    'x0': 17.31,\n",
+       "    'y0': 332.53,\n",
+       "    'x1': 586.25,\n",
+       "    'y1': 424.21}]},\n",
+       " 'excluded_embed_metadata_keys': ['bbox'],\n",
+       " 'excluded_llm_metadata_keys': ['bbox'],\n",
+       " 'relationships': {<NodeRelationship.PREVIOUS: '2'>: {'node_id': '59644551-d995-4d0a-88f5-49bbf22f0617',\n",
+       "   'node_type': <ObjectType.TEXT: '1'>,\n",
+       "   'metadata': {'bbox': [{'page': 0,\n",
+       "      'page_height': 792.0,\n",
+       "      'page_width': 612.0,\n",
+       "      'x0': 17.31,\n",
+       "      'y0': 457.56,\n",
+       "      'x1': 590.92,\n",
+       "      'y1': 743.41}]},\n",
+       "   'hash': '77baa9ef95633b4c77c243ed3db29b4555c4f1f78c5b68f620eb8c4ff7f0a480',\n",
+       "   'class_name': 'RelatedNodeInfo'},\n",
+       "  <NodeRelationship.NEXT: '3'>: {'node_id': '50744a8a-4ccb-4efa-a625-a1e7e3feec0c',\n",
+       "   'node_type': <ObjectType.TEXT: '1'>,\n",
+       "   'metadata': {'bbox': [{'page': 0,\n",
+       "      'page_height': 792.0,\n",
+       "      'page_width': 612.0,\n",
+       "      'x0': 17.31,\n",
+       "      'y0': 211.34,\n",
+       "      'x1': 586.62,\n",
+       "      'y1': 290.85}]},\n",
+       "   'hash': '965f5304799146fde0d2bb8fb5726c0646ddf46df3ada30a902adee9005c2333',\n",
+       "   'class_name': 'RelatedNodeInfo'},\n",
+       "  <NodeRelationship.PARENT: '4'>: {'node_id': 'dc94d72c-ec16-41b7-9e01-f359284464d2',\n",
+       "   'node_type': <ObjectType.DOCUMENT: '4'>,\n",
+       "   'metadata': {'file_name': 'lyft-10k.pdf',\n",
+       "    'file_size': 379188,\n",
+       "    'creation_date': '2024-05-01',\n",
+       "    'last_modified_date': '2024-04-07'},\n",
+       "   'hash': '60b974c64ec56d53a58cfe7703901cd049f6f11c39af6612861193672cc07bd9',\n",
+       "   'class_name': 'RelatedNodeInfo'}},\n",
+       " 'text': 'Securities registered pursuant to Section 12(g) of the Act:**None**\\nIndicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒ No ☐\\nIndicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or 15(d) of the Act. Yes ☐ No ☒\\nIndicate by check mark whether the Registrant: (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such\\nshorter period that the Registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. Yes ☒ No ☐\\nIndicate by check mark whether the Registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§232.405 of this chapter) during\\nthe preceding 12 months (or for such shorter period that the Registrant was required to submit such files). Yes ☒ No ☐\\nIndicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, smaller reporting company, or an emerging growth company. See the definitions of\\n“large accelerated filer,” “accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.',\n",
+       " 'start_char_idx': None,\n",
+       " 'end_char_idx': None,\n",
+       " 'text_template': '{metadata_str}\\n\\n{content}',\n",
+       " 'metadata_template': '{key}: {value}',\n",
+       " 'metadata_seperator': '\\n',\n",
+       " 'class_name': 'TextNode'}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nodes[1].dict()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Let's add the nodes to a vector store"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import VectorStoreIndex\n",
+    "\n",
+    "index = VectorStoreIndex(nodes=nodes)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Now let's query our index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "They generate revenue primarily from service fees and commissions collected from drivers for their use of the ridesharing marketplace. Additionally, they earn revenue from riders renting Light Vehicles, drivers renting vehicles through Express Drive, Lyft Rentals renters, Lyft Driver Center and Lyft Auto Care users, and by providing their ridesharing marketplace to organizations through Lyft Business offerings. In the second quarter of 2021, they also started generating revenues from licensing and data access agreements with third-party autonomous vehicle companies.\n"
+     ]
+    }
+   ],
+   "source": [
+    "query_engine = index.as_query_engine()\n",
+    "response = query_engine.query(\"What do they do to make money?\")\n",
+    "print(response)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "open-parse-notebooks",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/openparse/types.py → src/openparse/_types.py b/src/openparse/types.py → src/openparse/_types.py
diff --git a/src/openparse/doc_parser.py b/src/openparse/doc_parser.py
@@ -3,7 +3,7 @@
 
 from openparse import tables, text, consts
 from openparse.pdf import Pdf
-from openparse.types import NOT_GIVEN, NotGiven
+from openparse._types import NOT_GIVEN, NotGiven
 from openparse.processing import (
     IngestionPipeline,
     BasicIngestionPipeline,
@@ -34,7 +34,7 @@ class PyMuPDFArgsDict(TypedDict, total=False):
 
 
 def _table_args_dict_to_model(
-    args_dict: Union[TableTransformersArgsDict, PyMuPDFArgsDict]
+    args_dict: Union[TableTransformersArgsDict, PyMuPDFArgsDict],
 ) -> Union[tables.TableTransformersArgs, tables.PyMuPDFArgs]:
     if args_dict["parsing_algorithm"] == "table-transformers":
         return tables.TableTransformersArgs(**args_dict)