From af7af69f66bd0399bd4643caac5624a51885f5c9 Mon Sep 17 00:00:00 2001 From: Eno Reyes Date: Wed, 25 Jan 2023 16:41:49 -0800 Subject: [PATCH 1/8] Added Instructor Model to Embeddings --- langchain/embeddings/huggingface.py | 37 +++++++++++++------ .../embeddings/test_huggingface.py | 11 ++++++ 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/langchain/embeddings/huggingface.py b/langchain/embeddings/huggingface.py index 98f9986ad2ca7..508dc5283c652 100644 --- a/langchain/embeddings/huggingface.py +++ b/langchain/embeddings/huggingface.py @@ -6,12 +6,14 @@ from langchain.embeddings.base import Embeddings DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" - +MODEL_LIST = [DEFAULT_MODEL_NAME, + "hkunlp/instructor-large"] class HuggingFaceEmbeddings(BaseModel, Embeddings): """Wrapper around sentence_transformers embedding models. - To use, you should have the ``sentence_transformers`` python package installed. + To use sentence transformers, you should have the ``sentence_transformers`` python package installed. + To use Instructor, you should have ``InstructorEmbedding`` python package installed. Example: .. code-block:: python @@ -28,15 +30,28 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings): def __init__(self, **kwargs: Any): """Initialize the sentence_transformer.""" super().__init__(**kwargs) - try: - import sentence_transformers - - self.client = sentence_transformers.SentenceTransformer(self.model_name) - except ImportError: - raise ValueError( - "Could not import sentence_transformers python package. " - "Please install it with `pip install sentence_transformers`." - ) + + if (self.model_name == DEFAULT_MODEL_NAME): + try: + import sentence_transformers + + self.client = sentence_transformers.SentenceTransformer(self.model_name) + except ImportError: + raise ValueError( + "Could not import sentence_transformers python package. " + "Please install it with `pip install sentence_transformers`." + ) + elif ("instructor" in self.model_name): + try: + from InstructorEmbedding import INSTRUCTOR + + self.client = INSTRUCTOR(self.model_name) + except ImportError: + raise ValueError( + "Could not import InstructorEmbedding python package. " + "Please install it with `pip install InstructorEmbedding`." + ) + class Config: """Configuration for this pydantic object.""" diff --git a/tests/integration_tests/embeddings/test_huggingface.py b/tests/integration_tests/embeddings/test_huggingface.py index e71fbb0066706..7762d329dcf18 100644 --- a/tests/integration_tests/embeddings/test_huggingface.py +++ b/tests/integration_tests/embeddings/test_huggingface.py @@ -21,3 +21,14 @@ def test_huggingface_embedding_query() -> None: embedding = HuggingFaceEmbeddings() output = embedding.embed_query(document) assert len(output) == 768 + +def test_huggingface_instructor_embedding_documents() -> None: + """Test huggingface embeddings.""" + documents = ["foo bar"] + embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large") + output = embedding.embed_documents(documents) + assert len(output) == 1 + assert len(output[0]) == 768 + +if __name__ == '__main__': + test_huggingface_instructor_embedding_documents() \ No newline at end of file From 477069b24ac3a65fcdcf54b55af3af292573264a Mon Sep 17 00:00:00 2001 From: Eno Reyes Date: Fri, 27 Jan 2023 10:27:47 -0800 Subject: [PATCH 2/8] Updated embeddings and tests --- langchain/embeddings/huggingface.py | 55 ++++++++++++++++--- .../embeddings/test_huggingface.py | 12 +++- 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/langchain/embeddings/huggingface.py b/langchain/embeddings/huggingface.py index 508dc5283c652..f600b1b6d8071 100644 --- a/langchain/embeddings/huggingface.py +++ b/langchain/embeddings/huggingface.py @@ -1,13 +1,16 @@ """Wrapper around HuggingFace embedding models.""" from typing import Any, List +from enum import Enum from pydantic import BaseModel, Extra from langchain.embeddings.base import Embeddings DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" -MODEL_LIST = [DEFAULT_MODEL_NAME, - "hkunlp/instructor-large"] + +class MODEL_TYPE(Enum): + SENTENCE_TRANSFORMER = 1 + INSTRUCTION_EMBEDDING = 2 class HuggingFaceEmbeddings(BaseModel, Embeddings): """Wrapper around sentence_transformers embedding models. @@ -25,6 +28,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings): client: Any #: :meta private: model_name: str = DEFAULT_MODEL_NAME + model_type: str = MODEL_TYPE.SENTENCE_TRANSFORMER """Model name to use.""" def __init__(self, **kwargs: Any): @@ -44,7 +48,7 @@ def __init__(self, **kwargs: Any): elif ("instructor" in self.model_name): try: from InstructorEmbedding import INSTRUCTOR - + self.model_type = MODEL_TYPE.INSTRUCTION_EMBEDDING self.client = INSTRUCTOR(self.model_name) except ImportError: raise ValueError( @@ -69,9 +73,38 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]: """ texts = list(map(lambda x: x.replace("\n", " "), texts)) embeddings = self.client.encode(texts) + + if (self.model_name == DEFAULT_MODEL_NAME): + return embeddings.tolist() + return embeddings.tolist() - def embed_query(self, text: str) -> List[float]: + ## Embedding instruction-tuned models requires a list of instruction, text pairs. + def embed_documents(self, texts: List[List[str]]) -> List[List[float]]: + """Compute doc embeddings using a HuggingFace transformer model. + + Args: + texts: The list of texts to embed. + + Returns: + List of embeddings, one for each text. + """ + if (self.model_type != MODEL_TYPE.INSTRUCTION_EMBEDDING): + raise ValueError( + "Erorr: You passed a list of string pairs but did not instantiate an Instruction embedding model. " + ) + + for text_list in texts: + for text in text_list: + if isinstance(text, str): + text = text.replace("\n", " ") + + embeddings = self.client.encode(texts) + + return embeddings.tolist() + + ## Embedding instruction-tuned model queries requires a list of instruction, text pairs. + def embed_query(self, texts: List[str]) -> List[float]: """Compute query embeddings using a HuggingFace transformer model. Args: @@ -80,6 +113,14 @@ def embed_query(self, text: str) -> List[float]: Returns: Embeddings for the text. """ - text = text.replace("\n", " ") - embedding = self.client.encode(text) - return embedding.tolist() + if (self.model_type != MODEL_TYPE.INSTRUCTION_EMBEDDING): + raise ValueError( + "Erorr: You passed a string pair but did not instantiate an Instruction embedding model. " + ) + + for text in texts: + if isinstance(text, str): + text = text.replace("\n", " ") + + embedding = self.client.encode(texts) + return embedding.tolist() \ No newline at end of file diff --git a/tests/integration_tests/embeddings/test_huggingface.py b/tests/integration_tests/embeddings/test_huggingface.py index 7762d329dcf18..63a965446c0f6 100644 --- a/tests/integration_tests/embeddings/test_huggingface.py +++ b/tests/integration_tests/embeddings/test_huggingface.py @@ -24,11 +24,19 @@ def test_huggingface_embedding_query() -> None: def test_huggingface_instructor_embedding_documents() -> None: """Test huggingface embeddings.""" - documents = ["foo bar"] + documents = [["foo bar instruction", "foo bar document", 0]] embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large") output = embedding.embed_documents(documents) assert len(output) == 1 assert len(output[0]) == 768 +def test_huggingface_instructor_embedding_query() -> None: + """Test huggingface embeddings.""" + query = [["foo bar instruction", "foo bar query", 0]] + embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large") + output = embedding.embed_query(query) + assert len(output[0]) == 768 + if __name__ == '__main__': - test_huggingface_instructor_embedding_documents() \ No newline at end of file + test_huggingface_instructor_embedding_documents() + test_huggingface_instructor_embedding_query() \ No newline at end of file From f3815685b92063076d3c685aa3d89feed5ee5837 Mon Sep 17 00:00:00 2001 From: Eno Reyes Date: Fri, 27 Jan 2023 13:35:55 -0800 Subject: [PATCH 3/8] Updated embeddings and tests --- langchain/embeddings/huggingface.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/langchain/embeddings/huggingface.py b/langchain/embeddings/huggingface.py index f600b1b6d8071..c0260caead9f1 100644 --- a/langchain/embeddings/huggingface.py +++ b/langchain/embeddings/huggingface.py @@ -103,12 +103,24 @@ def embed_documents(self, texts: List[List[str]]) -> List[List[float]]: return embeddings.tolist() + def embed_query(self, text: str) -> List[float]: + """Compute query embeddings using a HuggingFace transformer model. + Args: + text: The text to embed. + + Returns: + Embeddings for the text. + """ + text = text.replace("\n", " ") + embedding = self.client.encode(text) + return embedding.tolist() + ## Embedding instruction-tuned model queries requires a list of instruction, text pairs. def embed_query(self, texts: List[str]) -> List[float]: - """Compute query embeddings using a HuggingFace transformer model. + """Compute query embeddings using a HuggingFace instructor transformer model. Args: - text: The text to embed. + texts: The instruction/query pair to embed. Returns: Embeddings for the text. From 14b6cd77eb0dd901b794290e2cff6316b1ca7264 Mon Sep 17 00:00:00 2001 From: Eno Reyes Date: Fri, 27 Jan 2023 13:54:46 -0800 Subject: [PATCH 4/8] Update structure --- tests/integration_tests/embeddings/test_huggingface.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration_tests/embeddings/test_huggingface.py b/tests/integration_tests/embeddings/test_huggingface.py index 63a965446c0f6..9740bd69d6a71 100644 --- a/tests/integration_tests/embeddings/test_huggingface.py +++ b/tests/integration_tests/embeddings/test_huggingface.py @@ -24,16 +24,16 @@ def test_huggingface_embedding_query() -> None: def test_huggingface_instructor_embedding_documents() -> None: """Test huggingface embeddings.""" - documents = [["foo bar instruction", "foo bar document", 0]] - embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large") + documents = ["foo bar"] + embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large", instruction="Represent the text") output = embedding.embed_documents(documents) assert len(output) == 1 assert len(output[0]) == 768 def test_huggingface_instructor_embedding_query() -> None: """Test huggingface embeddings.""" - query = [["foo bar instruction", "foo bar query", 0]] - embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large") + query = "foo bar" + embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large", instruction="Represent the text") output = embedding.embed_query(query) assert len(output[0]) == 768 From d81591e9a7be44a25b94002936f01255ed676728 Mon Sep 17 00:00:00 2001 From: Eno Reyes Date: Fri, 27 Jan 2023 13:57:47 -0800 Subject: [PATCH 5/8] Update structure --- tests/integration_tests/embeddings/test_huggingface.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/integration_tests/embeddings/test_huggingface.py b/tests/integration_tests/embeddings/test_huggingface.py index 9740bd69d6a71..d794288f7b923 100644 --- a/tests/integration_tests/embeddings/test_huggingface.py +++ b/tests/integration_tests/embeddings/test_huggingface.py @@ -35,8 +35,4 @@ def test_huggingface_instructor_embedding_query() -> None: query = "foo bar" embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large", instruction="Represent the text") output = embedding.embed_query(query) - assert len(output[0]) == 768 - -if __name__ == '__main__': - test_huggingface_instructor_embedding_documents() - test_huggingface_instructor_embedding_query() \ No newline at end of file + assert len(output[0]) == 768 \ No newline at end of file From f63cebd1ceececb170de86b92543612e72615fe9 Mon Sep 17 00:00:00 2001 From: Eno Reyes Date: Fri, 27 Jan 2023 14:18:06 -0800 Subject: [PATCH 6/8] Update huggingface.py --- langchain/embeddings/huggingface.py | 67 ++++++++--------------------- 1 file changed, 18 insertions(+), 49 deletions(-) diff --git a/langchain/embeddings/huggingface.py b/langchain/embeddings/huggingface.py index c0260caead9f1..3adf5836e2222 100644 --- a/langchain/embeddings/huggingface.py +++ b/langchain/embeddings/huggingface.py @@ -7,6 +7,7 @@ from langchain.embeddings.base import Embeddings DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" +DEFAULT_INSTRUCTION = "Represent the following text:" class MODEL_TYPE(Enum): SENTENCE_TRANSFORMER = 1 @@ -29,6 +30,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings): client: Any #: :meta private: model_name: str = DEFAULT_MODEL_NAME model_type: str = MODEL_TYPE.SENTENCE_TRANSFORMER + instruction: str = DEFAULT_INSTRUCTION """Model name to use.""" def __init__(self, **kwargs: Any): @@ -49,7 +51,7 @@ def __init__(self, **kwargs: Any): try: from InstructorEmbedding import INSTRUCTOR self.model_type = MODEL_TYPE.INSTRUCTION_EMBEDDING - self.client = INSTRUCTOR(self.model_name) + self.client = INSTRUCTOR(self.model_name) except ImportError: raise ValueError( "Could not import InstructorEmbedding python package. " @@ -72,37 +74,20 @@ def embed_documents(self, texts: List[str]) -> List[List[float]]: List of embeddings, one for each text. """ texts = list(map(lambda x: x.replace("\n", " "), texts)) - embeddings = self.client.encode(texts) + + if (self.model_type == MODEL_TYPE.INSTRUCTION_EMBEDDING): + instruction_pairs = [] + for text in texts: + instruction_pairs.append([self.instruction, text]) + embeddings = self.client.encode(instruction_pairs) + else: + embeddings = self.client.encode(texts) if (self.model_name == DEFAULT_MODEL_NAME): return embeddings.tolist() return embeddings.tolist() - ## Embedding instruction-tuned models requires a list of instruction, text pairs. - def embed_documents(self, texts: List[List[str]]) -> List[List[float]]: - """Compute doc embeddings using a HuggingFace transformer model. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - if (self.model_type != MODEL_TYPE.INSTRUCTION_EMBEDDING): - raise ValueError( - "Erorr: You passed a list of string pairs but did not instantiate an Instruction embedding model. " - ) - - for text_list in texts: - for text in text_list: - if isinstance(text, str): - text = text.replace("\n", " ") - - embeddings = self.client.encode(texts) - - return embeddings.tolist() - def embed_query(self, text: str) -> List[float]: """Compute query embeddings using a HuggingFace transformer model. Args: @@ -112,27 +97,11 @@ def embed_query(self, text: str) -> List[float]: Embeddings for the text. """ text = text.replace("\n", " ") - embedding = self.client.encode(text) - return embedding.tolist() - - ## Embedding instruction-tuned model queries requires a list of instruction, text pairs. - def embed_query(self, texts: List[str]) -> List[float]: - """Compute query embeddings using a HuggingFace instructor transformer model. - - Args: - texts: The instruction/query pair to embed. - - Returns: - Embeddings for the text. - """ - if (self.model_type != MODEL_TYPE.INSTRUCTION_EMBEDDING): - raise ValueError( - "Erorr: You passed a string pair but did not instantiate an Instruction embedding model. " - ) - for text in texts: - if isinstance(text, str): - text = text.replace("\n", " ") - - embedding = self.client.encode(texts) - return embedding.tolist() \ No newline at end of file + if (self.model_type == MODEL_TYPE.INSTRUCTION_EMBEDDING): + instruction_pair = [self.instruction, text] + embedding = self.client.encode(instruction_pair) + else: + embedding = self.client.encode(text) + + return embedding.tolist() From b9799a67820cf7bce8e838fbbfa0201276f08bb6 Mon Sep 17 00:00:00 2001 From: Eno Reyes Date: Mon, 30 Jan 2023 12:31:39 -0800 Subject: [PATCH 7/8] Update tests/integration_tests/embeddings/test_huggingface.py Co-authored-by: seanaedmiston --- tests/integration_tests/embeddings/test_huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/embeddings/test_huggingface.py b/tests/integration_tests/embeddings/test_huggingface.py index d794288f7b923..cf747114c5782 100644 --- a/tests/integration_tests/embeddings/test_huggingface.py +++ b/tests/integration_tests/embeddings/test_huggingface.py @@ -35,4 +35,4 @@ def test_huggingface_instructor_embedding_query() -> None: query = "foo bar" embedding = HuggingFaceEmbeddings(model_name="hkunlp/instructor-large", instruction="Represent the text") output = embedding.embed_query(query) - assert len(output[0]) == 768 \ No newline at end of file + assert len(output) == 768 \ No newline at end of file From 196c9b7f15e10f3650a3dfdffe9c18a0a3be2085 Mon Sep 17 00:00:00 2001 From: Eno Reyes Date: Mon, 30 Jan 2023 12:33:02 -0800 Subject: [PATCH 8/8] Update langchain/embeddings/huggingface.py Co-authored-by: seanaedmiston --- langchain/embeddings/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain/embeddings/huggingface.py b/langchain/embeddings/huggingface.py index 3adf5836e2222..17bd4cca38dc7 100644 --- a/langchain/embeddings/huggingface.py +++ b/langchain/embeddings/huggingface.py @@ -100,7 +100,7 @@ def embed_query(self, text: str) -> List[float]: if (self.model_type == MODEL_TYPE.INSTRUCTION_EMBEDDING): instruction_pair = [self.instruction, text] - embedding = self.client.encode(instruction_pair) + embedding = self.client.encode([instruction_pair])[0] else: embedding = self.client.encode(text)