From af7af69f66bd0399bd4643caac5624a51885f5c9 Mon Sep 17 00:00:00 2001
From: Eno Reyes <enoreyes@Enos-MacBook-Pro.local>
Date: Wed, 25 Jan 2023 16:41:49 -0800
Subject: [PATCH 1/8] Added Instructor Model to Embeddings

---
 langchain/embeddings/huggingface.py           | 37 +++++++++++++------
 .../embeddings/test_huggingface.py            | 11 ++++++
 2 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/langchain/embeddings/huggingface.py b/langchain/embeddings/huggingface.py
index 98f9986ad2ca7..508dc5283c652 100644
--- a/langchain/embeddings/huggingface.py
+++ b/langchain/embeddings/huggingface.py
@@ -6,12 +6,14 @@
 from langchain.embeddings.base import Embeddings
 
 DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
-
+MODEL_LIST = [DEFAULT_MODEL_NAME,
+              "hkunlp/instructor-large"]
 
 class HuggingFaceEmbeddings(BaseModel, Embeddings):
     """Wrapper around sentence_transformers embedding models.
 
-    To use, you should have the ``sentence_transformers`` python package installed.
+    To use sentence transformers, you should have the ``sentence_transformers`` python package installed.
+    To use Instructor, you should have the ``InstructorEmbedding`` python package installed.
 
     Example:
         .. code-block:: python
@@ -28,15 +30,28 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
     def __init__(self, **kwargs: Any):
         """Initialize the sentence_transformer."""
         super().__init__(**kwargs)
-        try:
-            import sentence_transformers
-
-            self.client = sentence_transformers.SentenceTransformer(self.model_name)
-        except ImportError:
-            raise ValueError(
-                "Could not import sentence_transformers python package. "
-                "Please install it with `pip install sentence_transformers`."
-            )
+        # Any non-Instructor model name is loaded through sentence_transformers.
+        if "instructor" not in self.model_name:
+            try:
+                import sentence_transformers
+
+                self.client = sentence_transformers.SentenceTransformer(self.model_name)
+            except ImportError:
+                raise ValueError(
+                    "Could not import sentence_transformers python package. "
+                    "Please install it with `pip install sentence_transformers`."
+                )
+        elif ("instructor" in self.model_name):
+            try:
+                from InstructorEmbedding import INSTRUCTOR
+
+                self.client = INSTRUCTOR(self.model_name)
+            except ImportError:
+                raise ValueError(
+                    "Could not import InstructorEmbedding python package. "
+                    "Please install it with `pip install InstructorEmbedding`."
+                )
+
 
     class Config:
         """Configuration for this pydantic object."""
diff --git a/tests/integration_tests/embeddings/test_huggingface.py b/tests/integration_tests/embeddings/test_huggingface.py
index e71fbb0066706..7762d329dcf18 100644
--- a/tests/integration_tests/embeddings/test_huggingface.py
+++ b/tests/integration_tests/embeddings/test_huggingface.py
@@ -21,3 +21,14 @@ def test_huggingface_embedding_query() -> None:
     embedding = HuggingFaceEmbeddings()
     output = embedding.embed_query(document)
     assert len(output) == 768
+
+def test_huggingface_instructor_embedding_documents() -> None:
+    """Test huggingface embeddings."""
+    documents = ["foo bar"]
+    embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large")
+    output = embedding.embed_documents(documents)
+    assert len(output) == 1
+    assert len(output[0]) == 768
+
+if __name__ == '__main__':
+    test_huggingface_instructor_embedding_documents()
\ No newline at end of file

From 477069b24ac3a65fcdcf54b55af3af292573264a Mon Sep 17 00:00:00 2001
From: Eno Reyes <enoreyes@Enos-MacBook-Pro.local>
Date: Fri, 27 Jan 2023 10:27:47 -0800
Subject: [PATCH 2/8] Updated embeddings and tests

---
 langchain/embeddings/huggingface.py           | 55 ++++++++++++++++---
 .../embeddings/test_huggingface.py            | 12 +++-
 2 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/langchain/embeddings/huggingface.py b/langchain/embeddings/huggingface.py
index 508dc5283c652..f600b1b6d8071 100644
--- a/langchain/embeddings/huggingface.py
+++ b/langchain/embeddings/huggingface.py
@@ -1,13 +1,16 @@
 """Wrapper around HuggingFace embedding models."""
 from typing import Any, List
+from enum import Enum
 
 from pydantic import BaseModel, Extra
 
 from langchain.embeddings.base import Embeddings
 
 DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
-MODEL_LIST = [DEFAULT_MODEL_NAME,
-              "hkunlp/instructor-large"]
+
+class MODEL_TYPE(Enum):
+    SENTENCE_TRANSFORMER = 1
+    INSTRUCTION_EMBEDDING = 2
 
 class HuggingFaceEmbeddings(BaseModel, Embeddings):
     """Wrapper around sentence_transformers embedding models.
@@ -25,6 +28,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
 
     client: Any  #: :meta private:
     model_name: str = DEFAULT_MODEL_NAME
+    model_type: str = MODEL_TYPE.SENTENCE_TRANSFORMER
     """Model name to use."""
 
     def __init__(self, **kwargs: Any):
@@ -44,7 +48,7 @@ def __init__(self, **kwargs: Any):
         elif ("instructor" in self.model_name):
             try:
                 from InstructorEmbedding import INSTRUCTOR
-
+                self.model_type = MODEL_TYPE.INSTRUCTION_EMBEDDING
                 self.client = INSTRUCTOR(self.model_name)
             except ImportError:
                 raise ValueError(
@@ -69,9 +73,38 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
         """
         texts = list(map(lambda x: x.replace("\n", " "), texts))
         embeddings = self.client.encode(texts)
+
+        if (self.model_name == DEFAULT_MODEL_NAME):
+            return embeddings.tolist()
+
         return embeddings.tolist()
 
-    def embed_query(self, text: str) -> List[float]:
+    ## Embedding instruction-tuned models requires a list of instruction, text pairs.
+    def embed_documents(self, texts: List[List[str]]) -> List[List[float]]:
+        """Compute doc embeddings using a HuggingFace transformer model.
+
+        Args:
+            texts: The list of texts to embed.
+
+        Returns:
+            List of embeddings, one for each text.
+        """
+        if (self.model_type != MODEL_TYPE.INSTRUCTION_EMBEDDING):
+            raise ValueError(
+                    "Erorr: You passed a list of string pairs but did not instantiate an Instruction embedding model. "
+                ) 
+
+        for text_list in texts:
+            for text in text_list:
+                if isinstance(text, str):
+                    text = text.replace("\n", " ")
+
+        embeddings = self.client.encode(texts)
+
+        return embeddings.tolist()
+
+    ## Embedding instruction-tuned model queries requires a list of instruction, text pairs.
+    def embed_query(self, texts: List[str]) -> List[float]:
         """Compute query embeddings using a HuggingFace transformer model.
 
         Args:
@@ -80,6 +113,14 @@ def embed_query(self, text: str) -> List[float]:
         Returns:
             Embeddings for the text.
         """
-        text = text.replace("\n", " ")
-        embedding = self.client.encode(text)
-        return embedding.tolist()
+        if (self.model_type != MODEL_TYPE.INSTRUCTION_EMBEDDING):
+            raise ValueError(
+                    "Erorr: You passed a string pair but did not instantiate an Instruction embedding model. "
+                ) 
+
+        for text in texts:
+            if isinstance(text, str):
+                text = text.replace("\n", " ")
+
+        embedding = self.client.encode(texts)
+        return embedding.tolist()
\ No newline at end of file
diff --git a/tests/integration_tests/embeddings/test_huggingface.py b/tests/integration_tests/embeddings/test_huggingface.py
index 7762d329dcf18..63a965446c0f6 100644
--- a/tests/integration_tests/embeddings/test_huggingface.py
+++ b/tests/integration_tests/embeddings/test_huggingface.py
@@ -24,11 +24,19 @@ def test_huggingface_embedding_query() -> None:
 
 def test_huggingface_instructor_embedding_documents() -> None:
     """Test huggingface embeddings."""
-    documents = ["foo bar"]
+    documents = [["foo bar instruction", "foo bar document", 0]]
     embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large")
     output = embedding.embed_documents(documents)
     assert len(output) == 1
     assert len(output[0]) == 768
 
+def test_huggingface_instructor_embedding_query() -> None:
+    """Test huggingface embeddings."""
+    query = [["foo bar instruction", "foo bar query", 0]]
+    embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large")
+    output = embedding.embed_query(query)
+    assert len(output[0]) == 768
+
 if __name__ == '__main__':
-    test_huggingface_instructor_embedding_documents()
\ No newline at end of file
+    test_huggingface_instructor_embedding_documents()
+    test_huggingface_instructor_embedding_query()
\ No newline at end of file

From f3815685b92063076d3c685aa3d89feed5ee5837 Mon Sep 17 00:00:00 2001
From: Eno Reyes <enoreyes@Enos-MacBook-Pro.local>
Date: Fri, 27 Jan 2023 13:35:55 -0800
Subject: [PATCH 3/8] Updated embeddings and tests

---
 langchain/embeddings/huggingface.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/langchain/embeddings/huggingface.py b/langchain/embeddings/huggingface.py
index f600b1b6d8071..c0260caead9f1 100644
--- a/langchain/embeddings/huggingface.py
+++ b/langchain/embeddings/huggingface.py
@@ -103,12 +103,24 @@ def embed_documents(self, texts: List[List[str]]) -> List[List[float]]:
 
         return embeddings.tolist()
 
+    def embed_query(self, text: str) -> List[float]:
+        """Compute query embeddings using a HuggingFace transformer model.
+        Args:
+            text: The text to embed.
+
+        Returns:
+            Embeddings for the text.
+        """
+        text = text.replace("\n", " ")
+        embedding = self.client.encode(text)
+        return embedding.tolist()
+
     ## Embedding instruction-tuned model queries requires a list of instruction, text pairs.
     def embed_query(self, texts: List[str]) -> List[float]:
-        """Compute query embeddings using a HuggingFace transformer model.
+        """Compute query embeddings using a HuggingFace instructor transformer model.
 
         Args:
-            text: The text to embed.
+            texts: The instruction/query pair to embed.
 
         Returns:
             Embeddings for the text.

From 14b6cd77eb0dd901b794290e2cff6316b1ca7264 Mon Sep 17 00:00:00 2001
From: Eno Reyes <enoreyes@Enos-MacBook-Pro.local>
Date: Fri, 27 Jan 2023 13:54:46 -0800
Subject: [PATCH 4/8] Update structure

---
 tests/integration_tests/embeddings/test_huggingface.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/integration_tests/embeddings/test_huggingface.py b/tests/integration_tests/embeddings/test_huggingface.py
index 63a965446c0f6..9740bd69d6a71 100644
--- a/tests/integration_tests/embeddings/test_huggingface.py
+++ b/tests/integration_tests/embeddings/test_huggingface.py
@@ -24,16 +24,16 @@ def test_huggingface_embedding_query() -> None:
 
 def test_huggingface_instructor_embedding_documents() -> None:
     """Test huggingface embeddings."""
-    documents = [["foo bar instruction", "foo bar document", 0]]
-    embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large")
+    documents = ["foo bar"]
+    embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large", instruction="Represent the text")
     output = embedding.embed_documents(documents)
     assert len(output) == 1
     assert len(output[0]) == 768
 
 def test_huggingface_instructor_embedding_query() -> None:
     """Test huggingface embeddings."""
-    query = [["foo bar instruction", "foo bar query", 0]]
-    embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large")
+    query = "foo bar"
+    embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large", instruction="Represent the text")
     output = embedding.embed_query(query)
     assert len(output[0]) == 768
 

From d81591e9a7be44a25b94002936f01255ed676728 Mon Sep 17 00:00:00 2001
From: Eno Reyes <enoreyes@Enos-MacBook-Pro.local>
Date: Fri, 27 Jan 2023 13:57:47 -0800
Subject: [PATCH 5/8] Update structure

---
 tests/integration_tests/embeddings/test_huggingface.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/tests/integration_tests/embeddings/test_huggingface.py b/tests/integration_tests/embeddings/test_huggingface.py
index 9740bd69d6a71..d794288f7b923 100644
--- a/tests/integration_tests/embeddings/test_huggingface.py
+++ b/tests/integration_tests/embeddings/test_huggingface.py
@@ -35,8 +35,4 @@ def test_huggingface_instructor_embedding_query() -> None:
     query = "foo bar"
     embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large", instruction="Represent the text")
     output = embedding.embed_query(query)
-    assert len(output[0]) == 768
-
-if __name__ == '__main__':
-    test_huggingface_instructor_embedding_documents()
-    test_huggingface_instructor_embedding_query()
\ No newline at end of file
+    assert len(output[0]) == 768
\ No newline at end of file

From f63cebd1ceececb170de86b92543612e72615fe9 Mon Sep 17 00:00:00 2001
From: Eno Reyes <enoreyes@gmail.com>
Date: Fri, 27 Jan 2023 14:18:06 -0800
Subject: [PATCH 6/8] Update huggingface.py

---
 langchain/embeddings/huggingface.py | 67 ++++++++---------------------
 1 file changed, 18 insertions(+), 49 deletions(-)

diff --git a/langchain/embeddings/huggingface.py b/langchain/embeddings/huggingface.py
index c0260caead9f1..3adf5836e2222 100644
--- a/langchain/embeddings/huggingface.py
+++ b/langchain/embeddings/huggingface.py
@@ -7,6 +7,7 @@
 from langchain.embeddings.base import Embeddings
 
 DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
+DEFAULT_INSTRUCTION = "Represent the following text:"
 
 class MODEL_TYPE(Enum):
     SENTENCE_TRANSFORMER = 1
@@ -29,6 +30,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
     client: Any  #: :meta private:
     model_name: str = DEFAULT_MODEL_NAME
     model_type: str = MODEL_TYPE.SENTENCE_TRANSFORMER
+    instruction: str = DEFAULT_INSTRUCTION
     """Model name to use."""
 
     def __init__(self, **kwargs: Any):
@@ -49,7 +51,7 @@ def __init__(self, **kwargs: Any):
             try:
                 from InstructorEmbedding import INSTRUCTOR
                 self.model_type = MODEL_TYPE.INSTRUCTION_EMBEDDING
-                self.client = INSTRUCTOR(self.model_name)
+                self.client = INSTRUCTOR(self.model_name)                    
             except ImportError:
                 raise ValueError(
                     "Could not import InstructorEmbedding python package. "
@@ -72,37 +74,20 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]:
             List of embeddings, one for each text.
         """
         texts = list(map(lambda x: x.replace("\n", " "), texts))
-        embeddings = self.client.encode(texts)
+
+        if self.model_type == MODEL_TYPE.INSTRUCTION_EMBEDDING:
+            # Instruction-tuned models take [instruction, text] pairs, so pair
+            # each document with the configured instruction before encoding.
+            instruction_pairs = [[self.instruction, text] for text in texts]
+            embeddings = self.client.encode(instruction_pairs)
+        else:
+            embeddings = self.client.encode(texts)
 
         if (self.model_name == DEFAULT_MODEL_NAME):
             return embeddings.tolist()
 
         return embeddings.tolist()
 
-    ## Embedding instruction-tuned models requires a list of instruction, text pairs.
-    def embed_documents(self, texts: List[List[str]]) -> List[List[float]]:
-        """Compute doc embeddings using a HuggingFace transformer model.
-
-        Args:
-            texts: The list of texts to embed.
-
-        Returns:
-            List of embeddings, one for each text.
-        """
-        if (self.model_type != MODEL_TYPE.INSTRUCTION_EMBEDDING):
-            raise ValueError(
-                    "Erorr: You passed a list of string pairs but did not instantiate an Instruction embedding model. "
-                ) 
-
-        for text_list in texts:
-            for text in text_list:
-                if isinstance(text, str):
-                    text = text.replace("\n", " ")
-
-        embeddings = self.client.encode(texts)
-
-        return embeddings.tolist()
-
     def embed_query(self, text: str) -> List[float]:
         """Compute query embeddings using a HuggingFace transformer model.
         Args:
@@ -112,27 +97,11 @@ def embed_query(self, text: str) -> List[float]:
             Embeddings for the text.
         """
         text = text.replace("\n", " ")
-        embedding = self.client.encode(text)
-        return embedding.tolist()
-
-    ## Embedding instruction-tuned model queries requires a list of instruction, text pairs.
-    def embed_query(self, texts: List[str]) -> List[float]:
-        """Compute query embeddings using a HuggingFace instructor transformer model.
-
-        Args:
-            texts: The instruction/query pair to embed.
-
-        Returns:
-            Embeddings for the text.
-        """
-        if (self.model_type != MODEL_TYPE.INSTRUCTION_EMBEDDING):
-            raise ValueError(
-                    "Erorr: You passed a string pair but did not instantiate an Instruction embedding model. "
-                ) 
 
-        for text in texts:
-            if isinstance(text, str):
-                text = text.replace("\n", " ")
-
-        embedding = self.client.encode(texts)
-        return embedding.tolist()
\ No newline at end of file
+        if (self.model_type == MODEL_TYPE.INSTRUCTION_EMBEDDING):
+            instruction_pair = [self.instruction, text]
+            embedding = self.client.encode(instruction_pair)
+        else:
+            embedding = self.client.encode(text)
+        
+        return embedding.tolist()

From b9799a67820cf7bce8e838fbbfa0201276f08bb6 Mon Sep 17 00:00:00 2001
From: Eno Reyes <enoreyes@gmail.com>
Date: Mon, 30 Jan 2023 12:31:39 -0800
Subject: [PATCH 7/8] Update
 tests/integration_tests/embeddings/test_huggingface.py

Co-authored-by: seanaedmiston <seane999@gmail.com>
---
 tests/integration_tests/embeddings/test_huggingface.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration_tests/embeddings/test_huggingface.py b/tests/integration_tests/embeddings/test_huggingface.py
index d794288f7b923..cf747114c5782 100644
--- a/tests/integration_tests/embeddings/test_huggingface.py
+++ b/tests/integration_tests/embeddings/test_huggingface.py
@@ -35,4 +35,4 @@ def test_huggingface_instructor_embedding_query() -> None:
     query = "foo bar"
     embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large", instruction="Represent the text")
     output = embedding.embed_query(query)
-    assert len(output[0]) == 768
\ No newline at end of file
+    assert len(output) == 768
\ No newline at end of file

From 196c9b7f15e10f3650a3dfdffe9c18a0a3be2085 Mon Sep 17 00:00:00 2001
From: Eno Reyes <enoreyes@gmail.com>
Date: Mon, 30 Jan 2023 12:33:02 -0800
Subject: [PATCH 8/8] Update langchain/embeddings/huggingface.py

Co-authored-by: seanaedmiston <seane999@gmail.com>
---
 langchain/embeddings/huggingface.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/langchain/embeddings/huggingface.py b/langchain/embeddings/huggingface.py
index 3adf5836e2222..17bd4cca38dc7 100644
--- a/langchain/embeddings/huggingface.py
+++ b/langchain/embeddings/huggingface.py
@@ -100,7 +100,7 @@ def embed_query(self, text: str) -> List[float]:
 
         if (self.model_type == MODEL_TYPE.INSTRUCTION_EMBEDDING):
             instruction_pair = [self.instruction, text]
-            embedding = self.client.encode(instruction_pair)
+            embedding = self.client.encode([instruction_pair])[0]
         else:
             embedding = self.client.encode(text)