embeddings: nomic embed vision (#22482)
Thank you for contributing to LangChain!

**Description:** Adds LangChain support for Nomic Embed Vision.
**Twitter handle:** nomic_ai, zach_nussbaum
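
For reviewers, a minimal sketch of how the new multimodal support is meant to be used (class and parameter names follow the diff below; the model names are taken from the updated templates, and remote inference assumes Nomic API credentials are configured):

```python
from langchain_nomic import NomicEmbeddings

# Pair a Nomic text model with the new vision model (illustrative model names).
embeddings = NomicEmbeddings(
    model="nomic-embed-text-v1",
    vision_model="nomic-embed-vision-v1",
)

# One embedding vector per image URI, alongside the existing text methods.
image_vectors = embeddings.embed_image(uris=["photos/matcha_soft_serve.jpg"])
query_vector = embeddings.embed_query("which desserts did I photograph?")
```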


- [x] **Add tests and docs**: If you're adding a new integration, please include
  1. a test for the integration, preferably unit tests that do not rely on network access,
  2. an example notebook showing its use. It lives in the `docs/docs/integrations` directory.


- [ ] **Lint and test**: Run `make format`, `make lint` and `make test`
from the root of the package(s) you've modified. See contribution
guidelines for more: https://python.langchain.com/docs/contributing/

Additional guidelines:
- Make sure optional dependencies are imported within a function.
- Please do not add dependencies to pyproject.toml files (even optional
ones) unless they are required for unit tests.
- Most PRs should not touch more than one package.
- Changes should be backwards compatible.
- If you are adding something to community, do not re-import it in
langchain.

If no one reviews your PR within a few days, please @-mention one of
baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17.

---------

Co-authored-by: Lance Martin <122662504+rlancemartin@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
3 people authored Jun 5, 2024
1 parent 3280a5b commit 14f3014
Showing 9 changed files with 543 additions and 29 deletions.
497 changes: 497 additions & 0 deletions cookbook/nomic_multimodal_rag.ipynb

(Large diff not rendered.)

12 changes: 8 additions & 4 deletions docs/scripts/arxiv_references.py
@@ -515,7 +515,8 @@ def log_results(arxiv_id2type2key2urls):
 def generate_arxiv_references_page(file_name: Path, papers: list[ArxivPaper]) -> None:
     with open(file_name, "w") as f:
         # Write the table headers
-        f.write("""# arXiv
+        f.write(
+            """# arXiv
 LangChain implements the latest research in the field of Natural Language Processing.
 This page contains `arXiv` papers referenced in the LangChain Documentation, API Reference,
@@ -525,7 +526,8 @@ def generate_arxiv_references_page(file_name: Path, papers: list[ArxivPaper]) ->
 | arXiv id / Title | Authors | Published date 🔻 | LangChain Documentation|
 |------------------|---------|-------------------|------------------------|
-""")
+"""
+        )
         for paper in papers:
             refs = []
             if paper.referencing_doc2url:
@@ -595,7 +597,8 @@ def generate_arxiv_references_page(file_name: Path, papers: list[ArxivPaper]) ->
                     if el
                 ]
             )
-            f.write(f"""
+            f.write(
+                f"""
 ## {paper.title}
 - **arXiv id:** {paper.arxiv_id}
@@ -608,7 +611,8 @@ def generate_arxiv_references_page(file_name: Path, papers: list[ArxivPaper]) ->
 {refs}
 **Abstract:** {paper.abstract}
-""")
+"""
+            )
 
     logger.warning(f"Created the {file_name} file with {len(papers)} arXiv references.")
4 changes: 1 addition & 3 deletions libs/partners/nomic/langchain_nomic/__init__.py
@@ -1,5 +1,3 @@
 from langchain_nomic.embeddings import NomicEmbeddings
 
-__all__ = [
-    "NomicEmbeddings",
-]
+__all__ = ["NomicEmbeddings"]
11 changes: 11 additions & 0 deletions libs/partners/nomic/langchain_nomic/embeddings.py
@@ -22,6 +22,7 @@ def __init__(
         self,
         *,
         model: str,
+        nomic_api_key: Optional[str] = ...,
         dimensionality: Optional[int] = ...,
         inference_mode: Literal["remote"] = ...,
     ):
@@ -32,6 +33,7 @@ def __init__(
         self,
         *,
         model: str,
+        nomic_api_key: Optional[str] = ...,
         dimensionality: Optional[int] = ...,
         inference_mode: Literal["local", "dynamic"],
         device: Optional[str] = ...,
@@ -43,6 +45,7 @@ def __init__(
         self,
         *,
         model: str,
+        nomic_api_key: Optional[str] = ...,
         dimensionality: Optional[int] = ...,
         inference_mode: str,
         device: Optional[str] = ...,
@@ -57,6 +60,7 @@ def __init__(
         dimensionality: Optional[int] = None,
         inference_mode: str = "remote",
         device: Optional[str] = None,
+        vision_model: Optional[str] = None,
     ):
         """Initialize NomicEmbeddings model.
@@ -80,6 +84,7 @@ def __init__(
         self.dimensionality = dimensionality
         self.inference_mode = inference_mode
         self.device = device
+        self.vision_model = vision_model
 
     def embed(self, texts: List[str], *, task_type: str) -> List[List[float]]:
         """Embed texts.
@@ -121,3 +126,9 @@ def embed_query(self, text: str) -> List[float]:
             texts=[text],
             task_type="search_query",
         )[0]
+
+    def embed_image(self, uris: List[str]) -> List[List[float]]:
+        return embed.image(
+            images=uris,
+            model=self.vision_model,
+        )["embeddings"]
14 changes: 7 additions & 7 deletions libs/partners/nomic/poetry.lock

(Generated file; diff not rendered.)

1 change: 1 addition & 0 deletions libs/partners/nomic/pyproject.toml
@@ -14,6 +14,7 @@ license = "MIT"
 python = ">=3.8.1,<4.0"
 langchain-core = ">=0.1.46,<0.3"
 nomic = "^3.0.29"
+pillow = "^10.3.0"
 
 [tool.poetry.group.test]
 optional = true
21 changes: 11 additions & 10 deletions templates/rag-multi-modal-local/README.md
@@ -7,11 +7,11 @@ With the release of open source, multi-modal LLMs it's possible to build this ki
 
 This template demonstrates how to perform private visual search and question-answering over a collection of your photos.
 
-It uses OpenCLIP embeddings to embed all of the photos and stores them in Chroma.
+It uses [`nomic-embed-vision-v1`](https://huggingface.co/nomic-ai/nomic-embed-vision-v1) multi-modal embeddings to embed the images and `Ollama` for question-answering.
 
 Given a question, relevant photos are retrieved and passed to an open source multi-modal LLM of your choice for answer synthesis.
 
-![Diagram illustrating the visual search process with OpenCLIP embeddings and multi-modal LLM for question-answering, featuring example food pictures and a matcha soft serve answer trace.](https://github.com/langchain-ai/langchain/assets/122662504/da543b21-052c-4c43-939e-d4f882a45d75 "Visual Search Process Diagram")
+![Diagram illustrating the visual search process with nomic-embed-vision-v1 embeddings and multi-modal LLM for question-answering, featuring example food pictures and a matcha soft serve answer trace.](https://github.com/langchain-ai/langchain/assets/122662504/da543b21-052c-4c43-939e-d4f882a45d75 "Visual Search Process Diagram")
 
 ## Input
@@ -34,22 +34,23 @@ python ingest.py
 
 ## Storage
 
-This template will use [OpenCLIP](https://github.com/mlfoundations/open_clip) multi-modal embeddings to embed the images.
-
-You can select different embedding model options (see results [here](https://github.com/mlfoundations/open_clip/blob/main/docs/openclip_results.csv)).
+This template will use [nomic-embed-vision-v1](https://huggingface.co/nomic-ai/nomic-embed-vision-v1) multi-modal embeddings to embed the images.
 
 The first time you run the app, it will automatically download the multimodal embedding model.
 
-By default, LangChain will use an embedding model with moderate performance but lower memory requirements, `ViT-H-14`.
-
-You can choose alternative `OpenCLIPEmbeddings` models in `rag_chroma_multi_modal/ingest.py`:
+You can choose alternative models in `rag_chroma_multi_modal/ingest.py`, such as `OpenCLIPEmbeddings`.
 ```
+from langchain_experimental.open_clip import OpenCLIPEmbeddings
+
+embedding_function=OpenCLIPEmbeddings(
+    model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k"
+)
+
 vectorstore_mmembd = Chroma(
     collection_name="multi-modal-rag",
     persist_directory=str(re_vectorstore_path),
-    embedding_function=OpenCLIPEmbeddings(
-        model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k"
-    ),
+    embedding_function=embedding_function
 )
 ```
6 changes: 4 additions & 2 deletions templates/rag-multi-modal-local/ingest.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 
 from langchain_community.vectorstores import Chroma
-from langchain_experimental.open_clip import OpenCLIPEmbeddings
+from langchain_nomic import NomicMultimodalEmbeddings
 
 # Load images
 img_dump_path = Path(__file__).parent / "docs/"
@@ -21,7 +21,9 @@
 
 # Load embedding function
 print("Loading embedding function")
-embedding = OpenCLIPEmbeddings(model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k")
+embedding = NomicMultimodalEmbeddings(
+    vision_model="nomic-embed-vision-v1", text_model="nomic-embed-text-v1"
+)
 
 # Create chroma
 vectorstore_mmembd = Chroma(
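
The rest of `ingest.py` (not shown in this hunk) presumably indexes the collected photos. As a hedged sketch of that step, `Chroma` in `langchain_community` exposes `add_images`, which embeds image URIs through the embedding function's `embed_image`; the paths and the use of `NomicEmbeddings` here are illustrative rather than the template's exact code:

```python
from pathlib import Path

from langchain_community.vectorstores import Chroma
from langchain_nomic import NomicEmbeddings

embedding = NomicEmbeddings(
    model="nomic-embed-text-v1", vision_model="nomic-embed-vision-v1"
)

vectorstore = Chroma(
    collection_name="multi-modal-rag",
    persist_directory=str(Path(__file__).parent / "chroma_db_multi_modal"),
    embedding_function=embedding,
)

# Index every photo dropped into docs/; Chroma routes the URIs to embed_image.
image_uris = sorted(str(p) for p in (Path(__file__).parent / "docs").glob("*.jpg"))
vectorstore.add_images(uris=image_uris)
```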
3 changes: 3 additions & 3 deletions
@@ -9,7 +9,7 @@
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.pydantic_v1 import BaseModel
 from langchain_core.runnables import RunnableLambda, RunnablePassthrough
-from langchain_experimental.open_clip import OpenCLIPEmbeddings
+from langchain_nomic import NomicMultimodalEmbeddings
 from PIL import Image
@@ -102,8 +102,8 @@ def multi_modal_rag_chain(retriever):
 vectorstore_mmembd = Chroma(
     collection_name="multi-modal-rag",
     persist_directory=str(Path(__file__).parent.parent / "chroma_db_multi_modal"),
-    embedding_function=OpenCLIPEmbeddings(
-        model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k"
+    embedding_function=NomicMultimodalEmbeddings(
+        vision_model="nomic-embed-vision-v1", text_model="nomic-embed-text-v1"
     ),
 )
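
For completeness, a brief usage sketch of the updated chain, relying on the `vectorstore_mmembd` and `multi_modal_rag_chain` defined in this file (the question string is illustrative):

```python
# Build a retriever over the multimodal store and assemble the RAG chain.
retriever_mmembd = vectorstore_mmembd.as_retriever()
chain = multi_modal_rag_chain(retriever_mmembd)

# Retrieved photos are passed to the local multi-modal LLM for answer synthesis.
print(chain.invoke("What desserts show up in my photos?"))
```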
