Llama Index Integration #41

Merged: 8 commits, May 2, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -36,4 +36,5 @@ test-output.pdf
notebooks/
sample-docs/
weights/
.env

29 changes: 29 additions & 0 deletions docs/integrations.md
@@ -0,0 +1,29 @@
## Llama Index

We have a simple integration with Llama Index. You can convert the parsed document to Llama Index nodes and then create an index from those nodes.

```py
import openparse
from llama_index.core import VectorStoreIndex

doc_path = "./sample-docs/lyft-10k.pdf"
parser = openparse.DocumentParser()
parsed_doc = parser.parse(doc_path)

nodes = parsed_doc.to_llama_index_nodes()
index = VectorStoreIndex(nodes=nodes)
```
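Building the `VectorStoreIndex` embeds the nodes, which with LlamaIndex's default settings calls the OpenAI embedding API. A minimal sketch of the setup this assumes (the key value is a placeholder, not a real credential):

```py
import os

# Assumes LlamaIndex's default OpenAI embedding model; replace with your own key
# or configure a different embedding model before building the index.
os.environ["OPENAI_API_KEY"] = "sk-..."
```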

Now you can query the index:

```py
query_engine = index.as_query_engine()
response = query_engine.query("What do they do to make money?")
print(response)
```
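Each node keeps its bounding-box metadata, so answers can be traced back to locations on the page. A minimal sketch of inspecting the retrieved sources, assuming the standard LlamaIndex response object:

```py
# Show which chunks the answer was synthesized from and where they sit on the page.
for source in response.source_nodes:
    print(source.score, source.node.metadata.get("bbox"))
```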

You can also add nodes to an existing index:

```py
existing_index.insert_nodes(nodes)
```
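Here `existing_index` is assumed to already exist; for example, if an index was persisted earlier with `index.storage_context.persist()`, it could be loaded back first. A minimal sketch under that assumption:

```py
from llama_index.core import StorageContext, load_index_from_storage

# Assumes an index was previously persisted to ./storage via
# index.storage_context.persist(persist_dir="./storage").
storage_context = StorageContext.from_defaults(persist_dir="./storage")
existing_index = load_index_from_storage(storage_context)
existing_index.insert_nodes(nodes)
```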
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -78,6 +78,7 @@ nav:
- Advanced:
- Customization: processing/customization.md
- Serializing Results: serialization.md
- Integrations: integrations.md
- Visualization: visualization.md
- Config: config.md

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -8,7 +8,7 @@ description = "Streamlines the process of preparing documents for LLM's."
readme = "README.md"
requires-python = ">=3.8"
license = "MIT"
version = "0.5.5"
version = "0.5.6"
authors = [{name = "Sergey Filimonov", email = "hello@sergey.fyi"}]
dependencies = [
"PyMuPDF >= 1.23.2",
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -9,3 +9,4 @@ beautifulsoup4
twine
packaging
wheel
llama-index
225 changes: 225 additions & 0 deletions src/cookbooks/llama_index.ipynb
@@ -0,0 +1,225 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2024-05-01 17:04:54-- https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/lyft_2021-first-20-pages.pdf\n",
"Resolving sergey-filimonov.nyc3.digitaloceanspaces.com (sergey-filimonov.nyc3.digitaloceanspaces.com)... 162.243.189.2\n",
"Connecting to sergey-filimonov.nyc3.digitaloceanspaces.com (sergey-filimonov.nyc3.digitaloceanspaces.com)|162.243.189.2|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 379188 (370K) [application/pdf]\n",
"Saving to: ‘sample-docs/lyft-10k.pdf’\n",
"\n",
"sample-docs/lyft-10 100%[===================>] 370.30K 1.99MB/s in 0.2s \n",
"\n",
"2024-05-01 17:04:57 (1.99 MB/s) - ‘sample-docs/lyft-10k.pdf’ saved [379188/379188]\n",
"\n"
]
}
],
"source": [
"import sys\n",
"\n",
"sys.path.append(\"..\")\n",
"\n",
"!wget https://sergey-filimonov.nyc3.digitaloceanspaces.com/open-parse/sample-docs/lyft_2021-first-20-pages.pdf -O sample-docs/lyft-10k.pdf"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# make sure llama-index is installed, it's not a formal dependency of open-parse\n",
"# %pip install llama-index"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished parsing\n"
]
}
],
"source": [
"import openparse\n",
"\n",
"doc_path = \"./sample-docs/lyft-10k.pdf\"\n",
"parser = openparse.DocumentParser()\n",
"parsed_doc = parser.parse(doc_path)\n",
"\n",
"print(\"Finished parsing\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Node ID: 33747e2d-0478-4628-b112-d733b1fc5039\n",
"Text: Securities registered pursuant to Section 12(g) of the\n",
"Act:**None** Indicate by check mark if the Registrant is a well-known\n",
"seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒\n",
"No ☐ Indicate by check mark if the Registrant is not required to file\n",
"reports pursuant to Section 13 or 15(d) of the Act. Yes ☐ No ☒\n",
"Indicate by check ma...\n"
]
}
],
"source": [
"nodes = parsed_doc.to_llama_index_nodes()\n",
"\n",
"print(nodes[1])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id_': '33747e2d-0478-4628-b112-d733b1fc5039',\n",
" 'embedding': None,\n",
" 'metadata': {'bbox': [{'page': 0,\n",
" 'page_height': 792.0,\n",
" 'page_width': 612.0,\n",
" 'x0': 17.31,\n",
" 'y0': 332.53,\n",
" 'x1': 586.25,\n",
" 'y1': 424.21}]},\n",
" 'excluded_embed_metadata_keys': ['bbox'],\n",
" 'excluded_llm_metadata_keys': ['bbox'],\n",
" 'relationships': {<NodeRelationship.PREVIOUS: '2'>: {'node_id': '59644551-d995-4d0a-88f5-49bbf22f0617',\n",
" 'node_type': <ObjectType.TEXT: '1'>,\n",
" 'metadata': {'bbox': [{'page': 0,\n",
" 'page_height': 792.0,\n",
" 'page_width': 612.0,\n",
" 'x0': 17.31,\n",
" 'y0': 457.56,\n",
" 'x1': 590.92,\n",
" 'y1': 743.41}]},\n",
" 'hash': '77baa9ef95633b4c77c243ed3db29b4555c4f1f78c5b68f620eb8c4ff7f0a480',\n",
" 'class_name': 'RelatedNodeInfo'},\n",
" <NodeRelationship.NEXT: '3'>: {'node_id': '50744a8a-4ccb-4efa-a625-a1e7e3feec0c',\n",
" 'node_type': <ObjectType.TEXT: '1'>,\n",
" 'metadata': {'bbox': [{'page': 0,\n",
" 'page_height': 792.0,\n",
" 'page_width': 612.0,\n",
" 'x0': 17.31,\n",
" 'y0': 211.34,\n",
" 'x1': 586.62,\n",
" 'y1': 290.85}]},\n",
" 'hash': '965f5304799146fde0d2bb8fb5726c0646ddf46df3ada30a902adee9005c2333',\n",
" 'class_name': 'RelatedNodeInfo'},\n",
" <NodeRelationship.PARENT: '4'>: {'node_id': 'dc94d72c-ec16-41b7-9e01-f359284464d2',\n",
" 'node_type': <ObjectType.DOCUMENT: '4'>,\n",
" 'metadata': {'file_name': 'lyft-10k.pdf',\n",
" 'file_size': 379188,\n",
" 'creation_date': '2024-05-01',\n",
" 'last_modified_date': '2024-04-07'},\n",
" 'hash': '60b974c64ec56d53a58cfe7703901cd049f6f11c39af6612861193672cc07bd9',\n",
" 'class_name': 'RelatedNodeInfo'}},\n",
" 'text': 'Securities registered pursuant to Section 12(g) of the Act:**None**\\nIndicate by check mark if the Registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. Yes ☒ No ☐\\nIndicate by check mark if the Registrant is not required to file reports pursuant to Section 13 or 15(d) of the Act. Yes ☐ No ☒\\nIndicate by check mark whether the Registrant: (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act of 1934 during the preceding 12 months (or for such\\nshorter period that the Registrant was required to file such reports), and (2) has been subject to such filing requirements for the past 90 days. Yes ☒ No ☐\\nIndicate by check mark whether the Registrant has submitted electronically every Interactive Data File required to be submitted pursuant to Rule 405 of Regulation S-T (§232.405 of this chapter) during\\nthe preceding 12 months (or for such shorter period that the Registrant was required to submit such files). Yes ☒ No ☐\\nIndicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, smaller reporting company, or an emerging growth company. See the definitions of\\n“large accelerated filer,” “accelerated filer,” “smaller reporting company,” and “emerging growth company” in Rule 12b-2 of the Exchange Act.',\n",
" 'start_char_idx': None,\n",
" 'end_char_idx': None,\n",
" 'text_template': '{metadata_str}\\n\\n{content}',\n",
" 'metadata_template': '{key}: {value}',\n",
" 'metadata_seperator': '\\n',\n",
" 'class_name': 'TextNode'}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nodes[1].dict()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Let's add the nodes to a vector store"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import VectorStoreIndex\n",
"\n",
"index = VectorStoreIndex(nodes=nodes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Now let's query our index"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"They generate revenue primarily from service fees and commissions collected from drivers for their use of the ridesharing marketplace. Additionally, they earn revenue from riders renting Light Vehicles, drivers renting vehicles through Express Drive, Lyft Rentals renters, Lyft Driver Center and Lyft Auto Care users, and by providing their ridesharing marketplace to organizations through Lyft Business offerings. In the second quarter of 2021, they also started generating revenues from licensing and data access agreements with third-party autonomous vehicle companies.\n"
]
}
],
"source": [
"query_engine = index.as_query_engine()\n",
"response = query_engine.query(\"What do they do to make money?\")\n",
"print(response)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "open-parse-notebooks",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
4 changes: 4 additions & 0 deletions src/openparse/doc_parser.py
@@ -117,6 +117,10 @@ def parse(
table_parsing_kwargs=(
table_args_obj.model_dump() if table_args_obj else None
),
creation_date=doc.file_metadata.get("creation_date"),
last_modified_date=doc.file_metadata.get("last_modified_date"),
last_accessed_date=doc.file_metadata.get("last_accessed_date"),
file_size=doc.file_metadata.get("file_size"),
)
return parsed_doc
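
For context, these new file-metadata fields surface on the source-document relationship of the nodes returned by `to_llama_index_nodes()`, as the notebook output above shows. A minimal sketch of inspecting them (the attribute access is an assumption based on that output, not part of this diff):

```py
from llama_index.core.schema import NodeRelationship

import openparse

parsed_doc = openparse.DocumentParser().parse("./sample-docs/lyft-10k.pdf")
nodes = parsed_doc.to_llama_index_nodes()

# The parent (source document) relationship carries file_name, file_size,
# creation_date and last_modified_date, populated from doc.file_metadata.
parent = nodes[0].relationships[NodeRelationship.PARENT]
print(parent.metadata)
```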
