Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: move parsers quivr core #2884

Merged
merged 14 commits into from
Jul 22, 2024
48 changes: 24 additions & 24 deletions backend/core/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion backend/core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@ python = "^3.11"
pydantic = "^2.7.4"
langchain-core = "^0.2.10"
httpx = "^0.27.0"
rich = "^13.7.1"
tiktoken = "^0.7.0"
aiofiles = ">=23.0.0,<25.0.0"
faiss-cpu = { version = "^1.8.0.post1", optional = true }
langchain-community = { version = "^0.2.6", optional = true }
langchain-openai = { version = "^0.1.14", optional = true }
rich = "^13.7.1"

[tool.poetry.extras]
base = ["langchain-community", "faiss-cpu", "langchain-openai"]
Expand Down
2 changes: 1 addition & 1 deletion backend/core/quivr_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def register_entries():
name,
spec.value.replace(":", "."),
errtxt=err_msg,
override=True,
append=True,
)


Expand Down
70 changes: 16 additions & 54 deletions backend/core/quivr_core/brain/brain.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,62 +14,18 @@

from quivr_core.brain.info import BrainInfo, ChatHistoryInfo
from quivr_core.chat import ChatHistory
from quivr_core.config import LLMEndpointConfig, RAGConfig
from quivr_core.config import RAGConfig
from quivr_core.files.file import load_qfile
from quivr_core.llm import LLMEndpoint
from quivr_core.models import ParsedRAGChunkResponse, ParsedRAGResponse, SearchResult
from quivr_core.processor.registry import get_processor_class
from quivr_core.quivr_rag import QuivrQARAG
from quivr_core.storage.file import load_qfile
from quivr_core.storage.local_storage import TransparentStorage
from quivr_core.storage.storage_base import StorageBase

logger = logging.getLogger("quivr_core")


async def _build_default_vectordb(
docs: list[Document], embedder: Embeddings
) -> VectorStore:
try:
from langchain_community.vectorstores import FAISS

logger.debug("Using Faiss-CPU as vector store.")
# TODO(@aminediro) : embedding call is not concurrent for all documents but waits
# We can actually wait on all processing
if len(docs) > 0:
vector_db = await FAISS.afrom_documents(documents=docs, embedding=embedder)
return vector_db
else:
raise ValueError("can't initialize brain without documents")

except ImportError as e:
raise ImportError(
"Please provide a valid vector store or install quivr-core['base'] package for using the default one."
) from e


def _default_embedder() -> Embeddings:
try:
from langchain_openai import OpenAIEmbeddings
from .brain_defaults import build_default_vectordb, default_embedder, default_llm

logger.debug("Loaded OpenAIEmbeddings as default LLM for brain")
embedder = OpenAIEmbeddings()
return embedder
except ImportError as e:
raise ImportError(
"Please provide a valid Embedder or install quivr-core['base'] package for using the defaultone."
) from e


def _default_llm() -> LLMEndpoint:
try:
logger.debug("Loaded ChatOpenAI as default LLM for brain")
llm = LLMEndpoint.from_config(LLMEndpointConfig())
return llm

except ImportError as e:
raise ImportError(
"Please provide a valid BaseLLM or install quivr-core['base'] package"
) from e
logger = logging.getLogger("quivr_core")


async def process_files(
Expand All @@ -80,6 +36,7 @@ async def process_files(
try:
if file.file_extension:
processor_cls = get_processor_class(file.file_extension)
logger.debug(f"processing {file} using class {processor_cls.__name__}")
processor = processor_cls(**processor_kwargs)
docs = await processor.process_file(file)
knowledge.extend(docs)
Expand Down Expand Up @@ -171,18 +128,21 @@ async def afrom_files(
skip_file_error: bool = False,
):
if llm is None:
llm = _default_llm()
llm = default_llm()

if embedder is None:
embedder = _default_embedder()
embedder = default_embedder()

brain_id = uuid4()

# TODO: run in parallel using tasks

for path in file_paths:
file = await load_qfile(brain_id, path)
await storage.upload_file(file)

logger.debug(f"uploaded all files to {storage}")

# Parse files
docs = await process_files(
storage=storage,
Expand All @@ -191,10 +151,12 @@ async def afrom_files(

# Building brain's vectordb
if vector_db is None:
vector_db = await _build_default_vectordb(docs, embedder)
vector_db = await build_default_vectordb(docs, embedder)
else:
await vector_db.aadd_documents(docs)

logger.debug(f"added {len(docs)} chunks to vectordb")

return cls(
id=brain_id,
name=name,
Expand Down Expand Up @@ -241,16 +203,16 @@ async def afrom_langchain_documents(
embedder: Embeddings | None = None,
) -> Self:
if llm is None:
llm = _default_llm()
llm = default_llm()

if embedder is None:
embedder = _default_embedder()
embedder = default_embedder()

brain_id = uuid4()

# Building brain's vectordb
if vector_db is None:
vector_db = await _build_default_vectordb(langchain_documents, embedder)
vector_db = await build_default_vectordb(langchain_documents, embedder)
else:
await vector_db.aadd_documents(langchain_documents)

Expand Down
55 changes: 55 additions & 0 deletions backend/core/quivr_core/brain/brain_defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import logging

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

from quivr_core.config import LLMEndpointConfig
from quivr_core.llm import LLMEndpoint

logger = logging.getLogger("quivr_core")


async def build_default_vectordb(
    docs: list[Document], embedder: Embeddings
) -> VectorStore:
    """Build the default in-memory FAISS vector store from *docs*.

    Every document is embedded with *embedder* and the populated store is
    returned. Used by ``Brain`` constructors when the caller does not
    provide a vector store of their own.

    Raises:
        ImportError: if ``langchain_community`` (shipped with the
            ``quivr-core['base']`` extra) is not installed.
        ValueError: if *docs* is empty — a brain cannot be initialized
            without content.
    """
    try:
        # Optional dependency: only imported when the default store is needed.
        from langchain_community.vectorstores import FAISS
    except ImportError as e:
        raise ImportError(
            "Please provide a valid vector store or install quivr-core['base'] package for using the default one."
        ) from e

    logger.debug("Using Faiss-CPU as vector store.")

    if not docs:
        raise ValueError("can't initialize brain without documents")

    # TODO(@aminediro) : embedding call is usually not concurrent for all documents but waits
    return await FAISS.afrom_documents(documents=docs, embedding=embedder)


def default_embedder() -> Embeddings:
    """Return the default embedder (OpenAI's ``OpenAIEmbeddings``).

    Used by ``Brain`` constructors when the caller does not supply an
    embedder of their own.

    Raises:
        ImportError: if ``langchain_openai`` (shipped with the
            ``quivr-core['base']`` extra) is not installed.
    """
    try:
        from langchain_openai import OpenAIEmbeddings

        # Fixed: this builds the default *embedder*, not the default LLM.
        logger.debug("Loaded OpenAIEmbeddings as default embedder for brain")
        return OpenAIEmbeddings()
    except ImportError as e:
        # Fixed typo in the user-facing message: "defaultone" -> "default one".
        raise ImportError(
            "Please provide a valid Embedder or install quivr-core['base'] package for using the default one."
        ) from e


def default_llm() -> LLMEndpoint:
    """Return the default LLM endpoint: ChatOpenAI built from a stock
    ``LLMEndpointConfig``.

    Used by ``Brain`` constructors when the caller does not supply an
    LLM of their own.

    Raises:
        ImportError: if the optional OpenAI dependencies from the
            ``quivr-core['base']`` extra are missing.
    """
    try:
        logger.debug("Loaded ChatOpenAI as default LLM for brain")
        return LLMEndpoint.from_config(LLMEndpointConfig())
    except ImportError as e:
        raise ImportError(
            "Please provide a valid BaseLLM or install quivr-core['base'] package"
        ) from e
3 changes: 3 additions & 0 deletions backend/core/quivr_core/files/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .file import QuivrFile

__all__ = ["QuivrFile"]
Loading
Loading