feat: quivr core tox test + parsers (#2929)
AmineDiro authored Jul 30, 2024
1 parent 3f9f24f commit 6855585
Showing 43 changed files with 8,359 additions and 333 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -95,3 +95,6 @@ backend/core/examples/chatbot/.chainlit/config.toml
backend/core/examples/chatbot/.chainlit/translations/en-US.json

*.log

# Tox
.tox
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -3,6 +3,7 @@ repos:
rev: v4.6.0
hooks:
- id: check-added-large-files
args: ["--maxkb=5000"]
- id: check-toml
- id: check-yaml
- id: end-of-file-fixer
1 change: 0 additions & 1 deletion backend/Dockerfile
@@ -57,7 +57,6 @@ ENV PATH="/root/.cargo/bin:${PATH}" \

WORKDIR /code


# Copy monorepo dependencies
# CORE
COPY core/pyproject.toml core/README.md core/poetry.lock /code/core/
34 changes: 34 additions & 0 deletions backend/core/Dockerfile.test
@@ -0,0 +1,34 @@
# Using a slim version for a smaller base image
FROM python:3.11.6-slim-bullseye

# Install GEOS library, Rust, and other dependencies, then clean up
RUN apt-get clean && apt-get update && apt-get install -y \
curl \
gcc \
# Additional dependencies for document handling
libmagic-dev \
tesseract-ocr \
poppler-utils \
libreoffice \
pandoc && \
rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /code

# Install Poetry
RUN curl -sSL https://install.python-poetry.org | POETRY_HOME=/opt/poetry python && \
cd /usr/local/bin && \
ln -s /opt/poetry/bin/poetry && \
poetry config virtualenvs.create false

# Add Poetry to PATH
ENV PATH="/root/.local/bin:$PATH"

# Copy the current directory contents into the container at /app
COPY ./pyproject.toml ./poetry.lock* /code/

# Install project dependencies
RUN poetry install --no-root --with test

ENV PYTHONPATH=/code
4,152 changes: 3,899 additions & 253 deletions backend/core/poetry.lock

Large diffs are not rendered by default.

65 changes: 62 additions & 3 deletions backend/core/pyproject.toml
@@ -7,21 +7,69 @@ readme = "README.md"
repository = "https://github.com/QuivrHQ/quivr"

[tool.poetry.dependencies]
python = "^3.11"
python = "^3.11,<3.13"
pydantic = "^2.7.4"
langchain-core = "^0.2.10"
langchain = "^0.2.9"
httpx = "^0.27.0"
rich = "^13.7.1"
tiktoken = "^0.7.0"
aiofiles = ">=23.0.0,<25.0.0"
faiss-cpu = { version = "^1.8.0.post1", optional = true }
langchain-community = { version = "^0.2.6", optional = true }
langchain-openai = { version = "^0.1.14", optional = true }
# To install unstructured, you’ll also need to install the following system dependencies:
# libmagic, poppler, libreoffice, pandoc, and tesseract.
# NOTE: for now poetry doesn't support groups as extra:
# see: https://github.com/python-poetry/poetry/issues/4842

torch = [
{ version = "2.3.1", source = "pypi", platform = "darwin", optional = true },
{ version = "2.3.1", source = "pypi", platform = "linux", markers = "platform_machine != 'x86_64'", optional = true },
{ version = "2.3.1+cpu", source = "pytorch-cpu-src", platform = "linux", markers = "platform_machine == 'x86_64'", optional = true },
{ version = "2.3.1+cpu", source = "pytorch-cpu-src", platform = "win32", optional = true },
]
torchvision = [
{ version = "0.18.1", source = "pypi", platform = "darwin", optional = true },
{ version = "0.18.1", source = "pypi", platform = "linux", markers = "platform_machine != 'x86_64'", optional = true },
{ version = "0.18.1+cpu", source = "pytorch-cpu-src", platform = "linux", markers = "platform_machine == 'x86_64'", optional = true },
{ version = "0.18.1+cpu", source = "pytorch-cpu-src", platform = "win32", optional = true },
]
megaparse = { version = "0.0.24", optional = true, source = "pypi" }

docx2txt = { version = "^0.8", optional = true }
unstructured = { version = "^0.15.0", optional = true, extras = [
"epub",
"odt",
"docx",
"doc",
"pptx",
"ppt",
"xlsx",
] }

[tool.poetry.extras]
base = ["langchain-community", "faiss-cpu", "langchain-openai"]
pdf = []
csv = ["langchain-community"]
md = ["langchain-community"]
ipynb = ["langchain-community"]
epub = ["unstructured", "langchain-community"]
odt = ["unstructured", "langchain-community", "docx2txt"]
docx = ["unstructured", "langchain-community", "docx2txt"]
pptx = ["unstructured", "langchain-community"]
xlsx = ["unstructured", "langchain-community"]
pdf = ["langchain-community", "megaparse"]
all = [
"langchain-community",
"faiss-cpu",
"langchain-openai",
"unstructured",
"docx2txt",
"megaparse",
]

[tool.poetry.group.dev]
optional = true

[tool.poetry.group.dev.dependencies]
mypy = "^1.10.0"
@@ -30,12 +78,16 @@ ipykernel = "*"
ruff = "^0.4.8"
flake8 = "*"
flake8-black = "*"
pytest-cov = "^5.0.0"

[tool.poetry.group.test]
optional = true

[tool.poetry.group.test.dependencies]
pytest-asyncio = "^0.23.7"
pytest = "^8.2.2"
pytest-xdist = "^3.6.1"
pytest-cov = "^5.0.0"
tox = "^4.16.0"


[tool.mypy]
@@ -86,8 +138,15 @@ filterwarnings = ["ignore::DeprecationWarning"]
markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"base: these tests require quivr-core with extra `base` to be installed",
"tika: these tests require a tika server to be running",
"unstructured: these tests require `unstructured` dependency",
]
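The two new markers (tika, unstructured) behave like any other registered pytest marker. A hypothetical test consuming them could look like the sketch below; the test name and body are illustrative and not part of this commit.

import pytest


@pytest.mark.unstructured
def test_docx_parsing_needs_unstructured() -> None:
    # Skip cleanly when the optional dependency is absent;
    # deselect the whole group with: pytest -m "not unstructured"
    pytest.importorskip("unstructured")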

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[[tool.poetry.source]]
name = "pytorch-cpu-src"
url = "https://download.pytorch.org/whl/cpu"
priority = "explicit"
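Because most parser dependencies above are optional extras, a given environment may or may not ship them. One defensive pattern for probing an extra at runtime is sketched below; it is illustrative only and not code from this commit.

# Illustrative: check whether the `pdf` / `all` extras (which pull in megaparse)
# are installed before wiring up a PDF processor.
try:
    from megaparse import MegaParse  # noqa: F401  (import check only)
    HAS_MEGAPARSE = True
except ImportError:
    HAS_MEGAPARSE = False

print("megaparse available:", HAS_MEGAPARSE)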
2 changes: 2 additions & 0 deletions backend/core/quivr_core/files/file.py
@@ -15,13 +15,15 @@ class FileExtension(str, Enum):
txt = ".txt"
pdf = ".pdf"
csv = ".csv"
doc = ".doc"
docx = ".docx"
pptx = ".pptx"
xls = ".xls"
xlsx = ".xlsx"
md = ".md"
mdx = ".mdx"
markdown = ".markdown"
bib = ".bib"
epub = ".epub"
html = ".html"
odt = ".odt"
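Since FileExtension mixes in str, the new members added here (.doc and .bib) resolve directly from a raw suffix. A quick illustrative check, assuming quivr-core is importable:

from quivr_core.files.file import FileExtension

# Value lookup on a str-based Enum returns the matching member.
assert FileExtension(".doc") is FileExtension.doc
assert FileExtension(".bib") is FileExtension.bib
# Members also compare equal to their plain string values.
assert FileExtension.docx == ".docx"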
28 changes: 17 additions & 11 deletions backend/core/quivr_core/processor/implementations/default.py
@@ -1,3 +1,4 @@
import logging
from typing import Any, List, Type, TypeVar

import tiktoken
@@ -9,10 +10,11 @@
PythonLoader,
UnstructuredEPubLoader,
UnstructuredExcelLoader,
UnstructuredFileLoader,
UnstructuredHTMLLoader,
UnstructuredMarkdownLoader,
UnstructuredODTLoader,
UnstructuredPDFLoader,
UnstructuredPowerPointLoader,
)
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.text import TextLoader
@@ -23,8 +25,7 @@
from quivr_core.processor.processor_base import ProcessorBase
from quivr_core.processor.splitter import SplitterConfig

enc = tiktoken.get_encoding("cl100k_base")

logger = logging.getLogger("quivr_core")

P = TypeVar("P", bound=BaseLoader)

@@ -40,6 +41,8 @@ def __init__(self, *args, **loader_kwargs) -> None:
def _build_processor(
cls_name: str, load_cls: Type[P], cls_extensions: List[FileExtension | str]
) -> Type[ProcessorInit]:
enc = tiktoken.get_encoding("cl100k_base")

class _Processor(ProcessorBase):
supported_extensions = cls_extensions

@@ -72,9 +75,9 @@ def processor_metadata(self) -> dict[str, Any]:
}

async def process_file_inner(self, file: QuivrFile) -> list[Document]:
if "__init__" in self.loader_cls.__dict__:
if hasattr(self.loader_cls, "__init__"):
# NOTE: mypy can't correctly type this as BaseLoader doesn't have a constructor method
loader = self.loader_cls(file.path, **self.loader_kwargs) # type: ignore
loader = self.loader_cls(file_path=file.path, **self.loader_kwargs) # type: ignore
else:
loader = self.loader_cls()

@@ -93,12 +96,14 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
TikTokenTxtProcessor = _build_processor(
"TikTokenTxtProcessor", TextLoader, [FileExtension.txt]
)
DOCXProcessor = _build_processor("DOCXProcessor", Docx2txtLoader, [FileExtension.docx])
DOCXProcessor = _build_processor(
"DOCXProcessor", Docx2txtLoader, [FileExtension.docx, FileExtension.doc]
)
XLSXProcessor = _build_processor(
"XLSXProcessor", UnstructuredExcelLoader, [FileExtension.xlsx, FileExtension.xls]
)
PPTProcessor = _build_processor(
"PPTProcessor", UnstructuredFileLoader, [FileExtension.pptx]
"PPTProcessor", UnstructuredPowerPointLoader, [FileExtension.pptx]
)
MarkdownProcessor = _build_processor(
"MarkdownProcessor",
@@ -108,11 +113,9 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
EpubProcessor = _build_processor(
"EpubProcessor", UnstructuredEPubLoader, [FileExtension.epub]
)
BibTexProcessor = _build_processor(
"BibTexProcessor", BibtexLoader, [FileExtension.epub]
)
BibTexProcessor = _build_processor("BibTexProcessor", BibtexLoader, [FileExtension.bib])
ODTProcessor = _build_processor(
"ODTProcessor", UnstructuredPDFLoader, [FileExtension.odt]
"ODTProcessor", UnstructuredODTLoader, [FileExtension.odt]
)
HTMLProcessor = _build_processor(
"HTMLProcessor", UnstructuredHTMLLoader, [FileExtension.html]
@@ -121,3 +124,6 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]:
NotebookProcessor = _build_processor(
"NotebookProcessor", NotebookLoader, [FileExtension.ipynb]
)
UnstructuredPDFProcessor = _build_processor(
"UnstructuredPDFProcessor", UnstructuredPDFLoader, [FileExtension.pdf]
)
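For readers skimming the diff: _build_processor is a small class factory that closes over a loader class and an extension list and returns a ProcessorBase subclass, which is why each file type above is a single factory call. A stripped-down, standalone sketch of the same pattern follows (plain Python, no quivr-core imports; all names are illustrative):

from typing import Any, Callable, List


def build_processor(cls_name: str, load_fn: Callable[[str], str], extensions: List[str]) -> type:
    """Return a new class whose instances read files with `load_fn`."""

    class _Processor:
        supported_extensions = extensions

        def __init__(self, **loader_kwargs: Any) -> None:
            # Stored keyword arguments mirror how the real code forwards
            # **loader_kwargs to the underlying langchain loader.
            self.loader_kwargs = loader_kwargs

        def process(self, path: str) -> str:
            return load_fn(path)

    _Processor.__name__ = cls_name
    _Processor.__qualname__ = cls_name
    return _Processor


# One factory call per file type, echoing the DOCXProcessor / PPTProcessor calls above.
TxtProcessor = build_processor(
    "TxtProcessor", lambda p: open(p, encoding="utf-8").read(), [".txt"]
)
print(TxtProcessor.__name__, TxtProcessor.supported_extensions)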
@@ -0,0 +1,56 @@
import logging

import tiktoken
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter
from megaparse import MegaParse

from quivr_core.files.file import QuivrFile
from quivr_core.processor.processor_base import ProcessorBase
from quivr_core.processor.registry import FileExtension
from quivr_core.processor.splitter import SplitterConfig

logger = logging.getLogger("quivr_core")


class MegaparseProcessor(ProcessorBase):
supported_extensions = [FileExtension.pdf]

def __init__(
self,
splitter: TextSplitter | None = None,
splitter_config: SplitterConfig = SplitterConfig(),
llama_parse_api_key: str | None = None,
strategy: str = "fast",
) -> None:
self.loader_cls = MegaParse
self.enc = tiktoken.get_encoding("cl100k_base")
self.splitter_config = splitter_config
self.megaparse_kwargs = {
"llama_parse_api_key": llama_parse_api_key,
"strategy": strategy,
}

if splitter:
self.text_splitter = splitter
else:
self.text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=splitter_config.chunk_size,
chunk_overlap=splitter_config.chunk_overlap,
)

@property
def processor_metadata(self):
return {
"chunk_overlap": self.splitter_config.chunk_overlap,
}

async def process_file_inner(self, file: QuivrFile) -> list[Document]:
mega_parse = MegaParse(file_path=file.path, **self.megaparse_kwargs) # type: ignore
document: Document = await mega_parse.aload()
if len(document.page_content) > self.splitter_config.chunk_size:
docs = self.text_splitter.split_documents([document])
for doc in docs:
doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))}
return docs
return [document]
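A hedged usage sketch for the new processor: only the constructor parameters visible above are used, and the import path is an assumption since this file's path is not shown in this rendering. Processing a file additionally requires a QuivrFile instance and a running event loop.

# NOTE: the module path below is assumed; it is not visible in this diff view.
from quivr_core.processor.implementations.megaparse_processor import MegaparseProcessor
from quivr_core.processor.splitter import SplitterConfig

processor = MegaparseProcessor(
    splitter_config=SplitterConfig(),  # default chunk_size / chunk_overlap
    strategy="fast",                   # forwarded to MegaParse
    llama_parse_api_key=None,          # only needed for MegaParse's LlamaParse path
)
print(processor.processor_metadata)    # -> {'chunk_overlap': ...}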