More Embedding Models [Draft] #23

Draft · wants to merge 6 commits into main
1 change: 1 addition & 0 deletions .gitignore
@@ -37,4 +37,5 @@ notebooks/
sample-docs/
weights/
.env
.python-version

17 changes: 16 additions & 1 deletion pyproject.toml
@@ -17,8 +17,8 @@ dependencies = [
"pypdf >= 4.0.0",
"pdfminer.six >= 20200401",
"tiktoken >= 0.3",
"openai >= 1.0.0",
"numpy",
"llama-index-embeddings-openai",
]

[project.urls]
@@ -33,6 +33,21 @@ ml = [
"transformers",
"tokenizers",
]
embeddings-azure-openai = [
"llama-index-embeddings-azure-openai",
]
embeddings-cohere = [
"llama-index-embeddings-cohere",
]
embeddings-huggingface = [
"llama-index-embeddings-huggingface",
]
embeddings-huggingface-optimum = [
"llama-index-embeddings-huggingface-optimum",
]
embeddings-text-embeddings-inference = [
"llama-index-embeddings-text-embeddings-inference",
]

[project.scripts]
openparse-download = "openparse.cli:download_unitable_weights"
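Each optional group above maps to a single llama-index embeddings package, so the intended workflow appears to be installing the base package plus whichever extras are needed. A minimal sketch, assuming the embeddings-huggingface extra is installed; the model id and the get_text_embedding call come from llama-index's embedding interface, not from this PR, and the class is exposed through the new openparse.embeddings module further down in the diff:

# e.g. pip install "openparse[embeddings-huggingface]"
from openparse import embeddings

# Hypothetical model id, used only to show the call shape; any model supported
# by the underlying llama-index class would do.
embed_model = embeddings.HuggingFaceInferenceAPIEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)
vector = embed_model.get_text_embedding("hello world")  # list of floats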
300 changes: 300 additions & 0 deletions src/cookbooks/experimental/clustering_nodes.ipynb

Large diffs are not rendered by default.

Binary file added src/evals/data/full-pdfs/insurance-doc-1.pdf
Binary file not shown.
Binary file added src/evals/data/full-pdfs/insurance-doc-2.pdf
Binary file not shown.
Binary file added src/evals/data/full-pdfs/insurance-doc-3.pdf
Binary file not shown.
Binary file added src/evals/data/full-pdfs/insurance-doc-4.pdf
Binary file not shown.
3 changes: 2 additions & 1 deletion src/openparse/__init__.py
@@ -2,7 +2,7 @@
from openparse.doc_parser import (
    DocumentParser,
)
from openparse import processing, version
from openparse import processing, version, embeddings
from openparse.config import config
from openparse.schemas import (
    Bbox,
@@ -28,4 +28,5 @@
"processing",
"version",
"config",
"embeddings",
]
96 changes: 96 additions & 0 deletions src/openparse/embeddings/__init__.py
@@ -0,0 +1,96 @@
"""
This is meant to provide a simple wrapper around llama_index's embeddings classes.
"""

from typing import Dict, Type

from llama_index.core.embeddings import BaseEmbedding


class ImportErrorProxy:
"""
Used to raise an ImportError when an attribute or method is accessed on a class that failed to import.
"""

def __init__(self, class_name, install_command):
self.class_name = class_name
self.install_command = install_command
self.error_message = (
f"Missing optional dependency for '{class_name}'. "
f"Please install it by running: '{install_command}'."
)

def __getattr__(self, name):
raise ImportError(
f"{self.error_message} The attribute '{name}' cannot be used."
)

def __call__(self, *args, **kwargs):
raise ImportError(self.error_message)


try:
from llama_index.embeddings.openai import (
OpenAIEmbedding,
)

except ImportError:
OpenAIEmbedding = ImportErrorProxy(
"OpenAIEmbedding",
"pip install openparse[embeddings-openai]",
)

try:
from llama_index.embeddings.azure_openai import (
AzureOpenAIEmbedding,
)

except ImportError:
AzureOpenAIEmbedding = ImportErrorProxy(
"AzureOpenAIEmbedding",
"pip install openparse[embeddings-azure-openai]",
)

try:
from llama_index.embeddings.huggingface import (
HuggingFaceInferenceAPIEmbedding,
)

except ImportError:
HuggingFaceInferenceAPIEmbedding = ImportErrorProxy(
"HuggingFaceInferenceAPIEmbedding",
"pip install openparse[embeddings-huggingface]",
)


try:
from llama_index.embeddings.huggingface_optimum import (
OptimumEmbedding,
)

except ImportError:
OptimumEmbedding = ImportErrorProxy(
"OptimumEmbedding",
"pip install openparse[embeddings-huggingface-optimum]",
)

try:
from llama_index.embeddings.cohere import CohereEmbedding

except ImportError:
CohereEmbedding = ImportErrorProxy(
"CohereEmbedding",
"pip install openparse[embeddings-cohere]",
)


try:
from llama_index.embeddings.text_embeddings_inference import (
TextEmbeddingsInference,
)

except ImportError:
TextEmbeddingsInference = ImportErrorProxy(
"TextEmbeddingsInference",
"pip install openparse[embeddings-text-embeddings-inference]",
)
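When the matching extra is not installed, each of these names resolves to an ImportErrorProxy rather than the real class, so the failure surfaces lazily at use time instead of at import time. A minimal sketch of that failure mode, assuming the embeddings-cohere extra is absent; the constructor argument is illustrative only, while the error text follows ImportErrorProxy above:

from openparse import embeddings

try:
    # With the extra missing, CohereEmbedding is an ImportErrorProxy instance,
    # so calling it raises immediately.
    embeddings.CohereEmbedding(model_name="embed-english-v3.0")
except ImportError as err:
    # "Missing optional dependency for 'CohereEmbedding'. Please install it
    # by running: 'pip install openparse[embeddings-cohere]'."
    print(err)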
3 changes: 1 addition & 2 deletions src/openparse/processing/__init__.py
@@ -15,7 +15,7 @@
    CombineNodesSpatially,
    RemoveNodesBelowNTokens,
)
from .semantic_transforms import CombineNodesSemantically, OpenAIEmbeddings
from .semantic_transforms import CombineNodesSemantically

__all__ = [
"ProcessingStep",
@@ -32,5 +32,4 @@
"NoOpIngestionPipeline",
"RemoveNodesBelowNTokens",
"CombineNodesSemantically",
"OpenAIEmbeddings",
]
4 changes: 4 additions & 0 deletions src/openparse/schemas.py
@@ -674,3 +674,7 @@ def _nodes_to_llama_index(self, llama_index_doc):
            )

        return li_nodes
    last_modified_date: Optional[dt.date] = None
    last_accessed_date: Optional[dt.date] = None
    creation_date: Optional[dt.date] = None
    file_size: Optional[int] = None
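A minimal sketch of how these four new optional fields would surface, assuming they are fields of the ParsedDocument model that this hunk belongs to (the class header is outside the hunk) and default to None when the parser does not populate them; the call to parse and the sample path are illustrative only, reusing one of the PDFs added in this PR:

import openparse

parser = openparse.DocumentParser()
parsed = parser.parse("src/evals/data/full-pdfs/insurance-doc-1.pdf")

# New file-level metadata alongside the existing parsed nodes.
print(parsed.creation_date, parsed.last_modified_date)
print(parsed.last_accessed_date, parsed.file_size)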