-
-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: quivr core tox test + parsers (#2929)
- Loading branch information
Showing
43 changed files
with
8,359 additions
and
333 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Using a slim version for a smaller base image | ||
FROM python:3.11.6-slim-bullseye | ||
|
||
# Install build tools and document-handling dependencies, then clean up the apt lists | ||
RUN apt-get clean && apt-get update && apt-get install -y \ | ||
curl \ | ||
gcc \ | ||
# Additional dependencies for document handling | ||
libmagic-dev \ | ||
tesseract-ocr \ | ||
poppler-utils \ | ||
libreoffice \ | ||
pandoc && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
# Set the working directory | ||
WORKDIR /code | ||
|
||
# Install Poetry | ||
RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \ | ||
cd /usr/local/bin && \ | ||
ln -s /opt/poetry/bin/poetry && \ | ||
poetry config virtualenvs.create false | ||
|
||
# Add Poetry to PATH | ||
ENV PATH="/root/.local/bin:$PATH" | ||
|
||
# Copy the dependency manifests (pyproject.toml and poetry.lock) into the container at /code | ||
COPY ./pyproject.toml ./poetry.lock* /code/ | ||
|
||
# Install project dependencies | ||
RUN poetry install --no-root --with test | ||
|
||
ENV PYTHONPATH=/code |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
56 changes: 56 additions & 0 deletions
56
backend/core/quivr_core/processor/implementations/megaparse_processor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import logging | ||
|
||
import tiktoken | ||
from langchain_core.documents import Document | ||
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter | ||
from megaparse import MegaParse | ||
|
||
from quivr_core.files.file import QuivrFile | ||
from quivr_core.processor.processor_base import ProcessorBase | ||
from quivr_core.processor.registry import FileExtension | ||
from quivr_core.processor.splitter import SplitterConfig | ||
|
||
logger = logging.getLogger("quivr_core") | ||
|
||
|
||
class MegaparseProcessor(ProcessorBase):
    """Parse PDF files with MegaParse and split the result into chunks.

    The file is loaded as a single ``Document`` through the configured loader
    class; when its text is longer than the configured chunk size it is split
    with ``text_splitter`` and every chunk is tagged with its token count.
    """

    supported_extensions = [FileExtension.pdf]

    def __init__(
        self,
        splitter: TextSplitter | None = None,
        splitter_config: SplitterConfig | None = None,
        llama_parse_api_key: str | None = None,
        strategy: str = "fast",
    ) -> None:
        """Configure the parser and the text splitter.

        Args:
            splitter: Splitter to use as-is. When omitted, a
                ``RecursiveCharacterTextSplitter`` is built from
                ``splitter_config`` using the tiktoken encoder.
            splitter_config: Chunk size / overlap settings. When omitted, a
                fresh ``SplitterConfig()`` is created for this instance.
            llama_parse_api_key: Forwarded to the loader unchanged.
            strategy: Forwarded to the loader unchanged.
        """
        # Build the default here rather than in the signature: a default
        # evaluated at definition time would be one object shared by every
        # processor, so mutating it on one instance would leak to the others.
        if splitter_config is None:
            splitter_config = SplitterConfig()

        self.loader_cls = MegaParse
        self.enc = tiktoken.get_encoding("cl100k_base")
        self.splitter_config = splitter_config
        self.megaparse_kwargs = {
            "llama_parse_api_key": llama_parse_api_key,
            "strategy": strategy,
        }

        if splitter:
            self.text_splitter = splitter
        else:
            self.text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
                chunk_size=splitter_config.chunk_size,
                chunk_overlap=splitter_config.chunk_overlap,
            )

    @property
    def processor_metadata(self):
        """Return the splitter settings reported for this processor."""
        return {
            "chunk_overlap": self.splitter_config.chunk_overlap,
        }

    async def process_file_inner(self, file: QuivrFile) -> list[Document]:
        """Load ``file`` and return it as one document or as a list of chunks.

        Args:
            file: The file to parse; only ``file.path`` is read here.

        Returns:
            ``[document]`` untouched when its text is not longer than
            ``splitter_config.chunk_size``; otherwise the chunks produced by
            ``text_splitter``, each with its metadata replaced by
            ``{"chunk_size": <token count under cl100k_base>}``.
        """
        # Go through loader_cls so the class stored in __init__ is the one
        # actually instantiated.
        mega_parse = self.loader_cls(file_path=file.path, **self.megaparse_kwargs)  # type: ignore
        document: Document = await mega_parse.aload()

        # NOTE(review): this gate compares a character count against
        # chunk_size, while the default splitter is built with the tiktoken
        # encoder -- confirm the mismatch in units is intended.
        if len(document.page_content) <= self.splitter_config.chunk_size:
            return [document]

        docs = self.text_splitter.split_documents([document])
        for doc in docs:
            doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
        return docs
Oops, something went wrong.