Skip to content

Commit

Permalink
fixed the pdf extraction workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
diptanu committed Sep 28, 2024
1 parent 5f3c2f1 commit f1e18a9
Show file tree
Hide file tree
Showing 3 changed files with 237 additions and 204 deletions.
6 changes: 4 additions & 2 deletions examples/pdf_document_extraction/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,7 @@ httpx
pydantic
--extra-index-url https://miropsota.github.io/torch_packages_builder
detectron2
deepdoctection[pt]
langchain_text_splitters
langchain_text_splitters
py-inkwell
lancedb
pyarrow
31 changes: 20 additions & 11 deletions examples/pdf_document_extraction/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import httpx
from pydantic import BaseModel

from indexify.extractors.pdf_parser import Page, PageFragmentType, PDFParser
from indexify.functions_sdk.data_objects import File, IndexifyData
from indexify.functions_sdk.graph import Graph
from indexify.functions_sdk.indexify_functions import (
Expand All @@ -25,17 +24,27 @@ def download_pdf(url: str) -> File:


# Imported next to the model rather than inside its class body: an import
# statement in the body binds ``Page`` as a non-annotated class attribute of
# the pydantic model. The import still runs when this module is imported,
# exactly as it did inside the class body.
from inkwell import Page


class Document(BaseModel):
    """Parsed PDF document: the list of pages produced by the parser."""

    # Pages in the order returned by the parsing step.
    pages: List[Page]


@indexify_function()
def parse_pdf(file: File) -> Document:
    """
    Parse the PDF bytes held in ``file.data`` and return the resulting pages
    wrapped in a Document.
    """
    return Document(pages=PDFParser(file.data).parse())
class PDFParse(IndexifyFunction):
    """Indexify function that runs an inkwell ``Pipeline`` over a PDF file."""

    name = "pdf-parse"
    description = "Parser class that captures a pdf file"

    def __init__(self):
        """Create the inkwell pipeline once so every ``run`` call reuses it."""
        super().__init__()
        # inkwell is imported locally, matching the other functions in this
        # workflow.
        from inkwell import Pipeline

        self._pipeline = Pipeline()

    def run(self, input: File) -> Document:
        """Write the PDF bytes to disk and process the file with the pipeline.

        Args:
            input: File whose ``data`` attribute holds the raw PDF bytes.

        Returns:
            Document wrapping the pages returned by the pipeline.
        """
        import os
        import tempfile

        from inkwell import Page

        # tempfile.TemporaryFile() does not provide a usable filesystem path
        # (on POSIX its ``.name`` is an integer file descriptor), and the
        # bytes written to it were never flushed before being processed.
        # Write to a real file inside a temporary directory and close it
        # first, so the complete content is on disk under a valid path.
        # NOTE(review): assumes Pipeline.process accepts a filesystem path,
        # as the original ``f.name`` argument implies - confirm.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pdf_path = os.path.join(tmp_dir, "input.pdf")
            with open(pdf_path, "wb") as pdf_file:
                pdf_file.write(input.data)
            pages: List[Page] = self._pipeline.process(pdf_path)
        return Document(pages=pages)



class TextChunk(IndexifyData):
Expand All @@ -50,6 +59,7 @@ def extract_chunks(document: Document) -> List[TextChunk]:
"""
Extract chunks from document
"""
from inkwell import PageFragmentType
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
Expand Down Expand Up @@ -103,6 +113,7 @@ def __init__(self):

def run(self, document: Document) -> List[ImageWithEmbedding]:
from PIL import Image
from inkwell import PageFragmentType
from sentence_transformers import SentenceTransformer

if self.model is None:
Expand All @@ -125,8 +136,6 @@ def run(self, document: Document) -> List[ImageWithEmbedding]:


from lancedb.pydantic import LanceModel, Vector


class ImageEmbeddingTable(LanceModel):
vector: Vector(512)
page_number: int
Expand Down
Loading

0 comments on commit f1e18a9

Please sign in to comment.