Skip to content

Commit

Permalink
Sync with main repo
Browse files Browse the repository at this point in the history
  • Loading branch information
JacksonCakes committed Apr 13, 2024
2 parents 263d21c + d105ec8 commit 5c054da
Show file tree
Hide file tree
Showing 14 changed files with 348 additions and 186 deletions.
47 changes: 30 additions & 17 deletions examples/training/matryoshka/matryoshka_nli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with MatryoshkaLoss using MultipleNegativesRankingLoss. This trains a model at output dimensions [768, 512, 256, 128, 64].
Entailments are positive pairs and the contradiction on AllNLI dataset is added as a hard negative.
At every 10% training steps, the model is evaluated on the STS benchmark dataset
After every 10% of the training steps, the model is evaluated on the STS benchmark dataset at each of the different output dimensions.
Usage:
python matryoshka_nli.py
Expand All @@ -15,7 +15,7 @@
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
Expand All @@ -34,6 +34,7 @@
train_batch_size = 128 # The larger you select this, the better the results (usually). But it requires more GPU memory
max_seq_length = 75
num_epochs = 1
matryoshka_dims = [768, 512, 256, 128, 64]

# Save path of the model
model_save_path = (
Expand Down Expand Up @@ -97,16 +98,22 @@ def add_to_samples(sent1, sent2, label):

# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [768, 512, 256, 128, 64])
train_loss = losses.MatryoshkaLoss(model, train_loss, matryoshka_dims=matryoshka_dims)

stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
evaluators = []
for dim in matryoshka_dims:
evaluators.append(
EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name=f"sts-dev-{dim}",
truncate_dim=dim,
)
)
dev_evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[0])

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
Expand Down Expand Up @@ -134,13 +141,19 @@ def add_to_samples(sent1, sent2, label):

model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
evaluators = []
for dim in matryoshka_dims:
evaluators.append(
EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name=f"sts-test-{dim}",
truncate_dim=dim,
)
)
test_evaluator = SequentialEvaluator(evaluators)
test_evaluator(model, output_path=model_save_path)


Expand Down
48 changes: 31 additions & 17 deletions examples/training/matryoshka/matryoshka_nli_reduced_dim.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with MatryoshkaLoss using MultipleNegativesRankingLoss. This trains a model at output dimensions [768, 512, 256, 128, 64].
Entailments are positive pairs and the contradiction on AllNLI dataset is added as a hard negative.
At every 10% training steps, the model is evaluated on the STS benchmark dataset
After every 10% of the training steps, the model is evaluated on the STS benchmark dataset at each of the different output dimensions.
The difference between this script and matryoshka_nli.py is that this script uses a reduced dimensionality of the base
model by adding a Dense layer with `reduced_dim=256` output dimensions. This might be useful when your desired output
Expand All @@ -19,7 +19,7 @@
from datasets import load_dataset
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator, SimilarityFunction
import logging
from datetime import datetime
import sys
Expand All @@ -39,6 +39,7 @@
max_seq_length = 75
num_epochs = 1
reduced_dim = 256
matryoshka_dims = [256, 128, 64, 32, 16]

# Save path of the model
model_save_path = (
Expand Down Expand Up @@ -103,16 +104,22 @@ def add_to_samples(sent1, sent2, label):

# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)
train_loss = losses.MatryoshkaLoss(model, train_loss, [256, 128, 64, 32, 16])
train_loss = losses.MatryoshkaLoss(model, train_loss, matryoshka_dims=matryoshka_dims)

stsb_dev = load_dataset("mteb/stsbenchmark-sts", split="validation")
dev_evaluator = EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-dev",
)
evaluators = []
for dim in matryoshka_dims:
evaluators.append(
EmbeddingSimilarityEvaluator(
stsb_dev["sentence1"],
stsb_dev["sentence2"],
[score / 5 for score in stsb_dev["score"]],
main_similarity=SimilarityFunction.COSINE,
name=f"sts-dev-{dim}",
truncate_dim=dim,
)
)
dev_evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[0])

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up
Expand Down Expand Up @@ -140,15 +147,22 @@ def add_to_samples(sent1, sent2, label):

model = SentenceTransformer(model_save_path)
stsb_test = load_dataset("mteb/stsbenchmark-sts", split="test")
test_evaluator = EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name="sts-test",
)
evaluators = []
for dim in matryoshka_dims:
evaluators.append(
EmbeddingSimilarityEvaluator(
stsb_test["sentence1"],
stsb_test["sentence2"],
[score / 5 for score in stsb_test["score"]],
main_similarity=SimilarityFunction.COSINE,
name=f"sts-test-{dim}",
truncate_dim=dim,
)
)
test_evaluator = SequentialEvaluator(evaluators)
test_evaluator(model, output_path=model_save_path)


# Optionally, save the model to the Hugging Face Hub!
# It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
Expand Down
64 changes: 39 additions & 25 deletions sentence_transformers/evaluation/BinaryClassificationEvaluator.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from sentence_transformers import SentenceTransformer
from contextlib import nullcontext
from . import SentenceEvaluator
import logging
import os
import csv
from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances
from sklearn.metrics import average_precision_score
import numpy as np
from typing import List
from typing import List, Optional
from ..readers import InputExample


Expand All @@ -30,6 +32,8 @@ class BinaryClassificationEvaluator(SentenceEvaluator):
:param batch_size: Batch size used to compute embeddings
:param show_progress_bar: If true, prints a progress bar
:param write_csv: Write results to a CSV file
:param truncate_dim: The dimension to truncate sentence embeddings to. `None` uses the model's current truncation
dimension. Defaults to None.
"""

def __init__(
Expand All @@ -41,10 +45,12 @@ def __init__(
batch_size: int = 32,
show_progress_bar: bool = False,
write_csv: bool = True,
truncate_dim: Optional[int] = None,
):
self.sentences1 = sentences1
self.sentences2 = sentences2
self.labels = labels
self.truncate_dim = truncate_dim

assert len(self.sentences1) == len(self.sentences2)
assert len(self.sentences1) == len(self.labels)
Expand Down Expand Up @@ -106,16 +112,18 @@ def from_input_examples(cls, examples: List[InputExample], **kwargs):
scores.append(example.label)
return cls(sentences1, sentences2, scores, **kwargs)

def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
def __call__(self, model: SentenceTransformer, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = f" after epoch {epoch}:"
out_txt = f" after epoch {epoch}"
else:
out_txt = f" in epoch {epoch} after {steps} steps:"
out_txt = f" in epoch {epoch} after {steps} steps"
else:
out_txt = ":"
out_txt = ""
if self.truncate_dim is not None:
out_txt += f" (truncated to {self.truncate_dim})"

logger.info("Binary Accuracy Evaluation of the model on " + self.name + " dataset" + out_txt)
logger.info(f"Binary Accuracy Evaluation of the model on the {self.name} dataset{out_txt}:")

scores = self.compute_metrices(model)

Expand Down Expand Up @@ -144,25 +152,31 @@ def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int =
return main_score

def compute_metrices(self, model):
try:
# If the sentences are hashable, then we can use a set to avoid embedding the same sentences multiple times
sentences = list(set(self.sentences1 + self.sentences2))
embeddings = model.encode(
sentences, batch_size=self.batch_size, show_progress_bar=self.show_progress_bar, convert_to_numpy=True
)
emb_dict = {sent: emb for sent, emb in zip(sentences, embeddings)}
embeddings1 = [emb_dict[sent] for sent in self.sentences1]
embeddings2 = [emb_dict[sent] for sent in self.sentences2]
except TypeError:
# Otherwise we just embed everything, e.g. if the sentences are images for evaluating a CLIP model
embeddings = model.encode(
self.sentences1 + self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
embeddings1 = embeddings[: len(self.sentences1)]
embeddings2 = embeddings[len(self.sentences1) :]
with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
try:
# If the sentences are hashable, then we can use a set to avoid embedding the same sentences multiple
# times
sentences = list(set(self.sentences1 + self.sentences2))
except TypeError:
# Otherwise we just embed everything, e.g. if the sentences are images for evaluating a CLIP model
embeddings = model.encode(
self.sentences1 + self.sentences2,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
embeddings1 = embeddings[: len(self.sentences1)]
embeddings2 = embeddings[len(self.sentences1) :]
else:
embeddings = model.encode(
sentences,
batch_size=self.batch_size,
show_progress_bar=self.show_progress_bar,
convert_to_numpy=True,
)
emb_dict = {sent: emb for sent, emb in zip(sentences, embeddings)}
embeddings1 = [emb_dict[sent] for sent in self.sentences1]
embeddings2 = [emb_dict[sent] for sent in self.sentences2]

cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
Expand Down
14 changes: 9 additions & 5 deletions sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from contextlib import nullcontext

from sentence_transformers import SentenceTransformer
from . import SentenceEvaluator, SimilarityFunction
import logging
import os
Expand Down Expand Up @@ -101,16 +103,18 @@ def from_input_examples(cls, examples: List[InputExample], **kwargs):
scores.append(example.label)
return cls(sentences1, sentences2, scores, **kwargs)

def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
def __call__(self, model: SentenceTransformer, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
if epoch != -1:
if steps == -1:
out_txt = " after epoch {}:".format(epoch)
out_txt = f" after epoch {epoch}"
else:
out_txt = " in epoch {} after {} steps:".format(epoch, steps)
out_txt = f" in epoch {epoch} after {steps} steps"
else:
out_txt = ":"
out_txt = ""
if self.truncate_dim is not None:
out_txt += f" (truncated to {self.truncate_dim})"

logger.info("EmbeddingSimilarityEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)
logger.info(f"EmbeddingSimilarityEvaluator: Evaluating the model on the {self.name} dataset{out_txt}:")

with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
embeddings1 = model.encode(
Expand Down
Loading

0 comments on commit 5c054da

Please sign in to comment.