Commit
Merge remote-tracking branch 'origin/main' into multilnag_nli_tasks
hynky1999 committed Sep 30, 2024
2 parents 4e6100d + 170ed87 commit 7b561fe
Showing 31 changed files with 496 additions and 610 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
@@ -26,7 +26,7 @@ jobs:
        with:
          cache: 'pip'
      - name: Install lighteval in editable mode
        run: |
-          pip install -e .[dev,extended_tasks]
+          pip install -e .[dev,extended_tasks,multilingual]
      - name: Get cached files
        uses: actions/cache@v2
        id: get-cache
525 changes: 63 additions & 462 deletions README.md

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions assets/lighteval-doc.svg
2 changes: 1 addition & 1 deletion community_tasks/_template.py
@@ -33,7 +33,7 @@

from lighteval.metrics import Metrics
from lighteval.metrics.metrics import SampleLevelMetric
-from lighteval.metrics.utils.utils import MetricCategory, MetricUseCase
+from lighteval.metrics.utils.metric_utils import MetricCategory, MetricUseCase
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -99,7 +99,7 @@ extended_tasks = [
s3 = ["s3fs"]
multilingual = [
    "stanza",
-    "spacy[ja,ko]",
+    "spacy[ja,ko,th]",
    "jieba",  # for chinese tokenizer
    "pyvi",  # for vietnamese tokenizer
]
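
The new th extra pulls in spaCy's Thai word-segmentation dependency (pythainlp), alongside the existing Japanese and Korean support. Below is a minimal sketch of how these tokenizers can be exercised once the multilingual extra is installed; the sample sentences are illustrative and the snippet is not part of the repository:

# Quick check that the language-specific tokenizer extras are importable and usable.
# Assumes `pip install -e .[dev,extended_tasks,multilingual]`, as in the CI change above.
import spacy

# spaCy delegates Thai word segmentation to pythainlp, which spacy[th] installs.
nlp_th = spacy.blank("th")
print([token.text for token in nlp_th("ฉันรักภาษาไทย")])

# Japanese segmentation uses SudachiPy, installed via spacy[ja].
nlp_ja = spacy.blank("ja")
print([token.text for token in nlp_ja("私は日本語が好きです")])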
2 changes: 1 addition & 1 deletion src/lighteval/metrics/dynamic_metrics.py
@@ -37,7 +37,7 @@
    LogProbTokenNorm,
    get_multilingual_normalizer,
)
-from lighteval.metrics.utils.utils import MetricCategory, MetricUseCase, SampleLevelMetric
+from lighteval.metrics.utils.metric_utils import MetricCategory, MetricUseCase, SampleLevelMetric
from lighteval.utils.language import Language
1 change: 0 additions & 1 deletion src/lighteval/metrics/imports/bert_scorer.py
@@ -383,7 +383,6 @@ def __init__(

        self._baseline_vals = None
        self.baseline_path = baseline_path
-        self.use_custom_baseline = self.baseline_path is not None
        if self.baseline_path is None:
            self.baseline_path = os.path.join(
                os.path.dirname(__file__),
14 changes: 9 additions & 5 deletions src/lighteval/metrics/metrics.py
@@ -40,15 +40,15 @@
    ROUGE,
    BertScore,
    ExactMatches,
+    Extractiveness,
    F1_score,
+    Faithfulness,
    JudgeLLM,
    LoglikelihoodAcc,
    MajAtK,
    Recall,
    StringDistance,
    acc_golds_likelihood,
-    extractiveness,
-    faithfulness,
)
from lighteval.metrics.normalizations import (
    LogProbCharNorm,
@@ -61,7 +61,7 @@
    remove_braces_and_strip,
)
from lighteval.metrics.sample_preparator import GenerativePreparator, LoglikelihoodPreparator, PerplexityPreparator
-from lighteval.metrics.utils.utils import (
+from lighteval.metrics.utils.metric_utils import (
    CorpusLevelMetric,
    CorpusLevelMetricGrouping,
    Metric,
@@ -175,7 +175,9 @@ class Metrics(Enum):
    )
    extractiveness = SampleLevelMetricGrouping(
        metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
-        sample_level_fn=extractiveness,
+        sample_level_fn=Extractiveness(
+            normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text"
+        ).compute,
        category=MetricCategory.GENERATIVE,
        use_case=MetricUseCase.SUMMARIZATION,
        corpus_level_fn={
@@ -223,7 +225,9 @@
    )
    faithfulness = SampleLevelMetric(
        metric_name="summac",
-        sample_level_fn=faithfulness,
+        sample_level_fn=Faithfulness(
+            normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text"
+        ).compute,
        category=MetricCategory.GENERATIVE,
        use_case=MetricUseCase.SUMMARIZATION,
        corpus_level_fn=np.mean,
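
For orientation, the extractiveness and faithfulness entries now pass the bound compute method of a metric object as sample_level_fn instead of a bare function. A hedged sketch of the same pattern for a hypothetical custom metric follows; it only assumes that sample_level_fn may be any callable taking predictions, formatted_doc, and **kwargs, and the class and score names are invented for illustration:

# Hypothetical sample-level metric following the class-with-compute pattern
# introduced for Extractiveness and Faithfulness; not part of this commit.
from types import SimpleNamespace


class PredictionLength:
    def __init__(self, normalize_pred=str.strip):
        self.normalize_pred = normalize_pred

    def compute(self, predictions: list[str], formatted_doc=None, **kwargs) -> dict[str, float]:
        # Score a single prediction; lighteval passes extra context via **kwargs.
        prediction = self.normalize_pred(predictions[0]) if self.normalize_pred else predictions[0]
        return {"prediction_length": float(len(prediction))}


# The bound method can then be handed to a metric definition as sample_level_fn,
# mirroring Extractiveness(...).compute in the hunk above.
scores = PredictionLength().compute(predictions=["  a short summary  "], formatted_doc=SimpleNamespace(specific={}))
print(scores)  # {'prediction_length': 15.0}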
120 changes: 101 additions & 19 deletions src/lighteval/metrics/metrics_sample.py
@@ -159,6 +159,8 @@ def __init__(
            Defaults to None if no normalization is applied.
            strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
        """
+        if aggregation_function is None:
+            aggregation_function = max

        self.aggregation_function = aggregation_function
        self.normalize_gold = normalize_gold
@@ -562,7 +564,7 @@ def __init__(
        self.normalize_gold = normalize_gold
        self.normalize_pred = normalize_pred

-    def compute(self, golds: list[str], predictions: list[str]) -> dict:
+    def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict:
        """Computes the precision, recall and F1 score using the BERT scorer.

        Args:
@@ -591,24 +593,104 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict:
        return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()}


-# todo: make into clean classes with call to normalizer
-def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs):
-    inp = remove_braces(formatted_doc.specific["text"])
-    pred = remove_braces_and_strip(predictions[0])
-    stats = DataStatsMetric().evaluate_example(pred, inp)
-    return {
-        "summarization_coverage": stats["coverage"],
-        "summarization_density": stats["density"],
-        "summarization_compression": stats["compression"],
-    }


-# todo: make into clean classes with call to normalizer
-def faithfulness(formatted_doc: Doc, predictions: list[str], **kwargs):
-    inp = remove_braces(formatted_doc.specific["text"])
-    pred = remove_braces_and_strip(predictions[0])
-    summac = SummaCZS(granularity="sentence", model_name="vitc", imager_load_cache=False)  # , device=device)
-    return summac.score_one(inp, pred)["score"]
+class Extractiveness:
+    def __init__(
+        self,
+        normalize_input: callable = remove_braces,
+        normalize_pred: callable = remove_braces_and_strip,
+        input_column: str = "text",
+    ):
+        """
+        Extractiveness metric class.
+
+        Args:
+            normalize_input (callable, optional): Function to normalize the input strings.
+                Defaults to remove_braces from lighteval.metrics.normalizations if no normalization is applied.
+            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
+                Defaults to remove_braces_and_strip from lighteval.metrics.normalizations if no normalization is applied.
+            input_column (str): Column in the formatted_doc to use for the input. Defaults to "text".
+        """
+        self.stats_metric = None
+        self.normalize_input = normalize_input
+        self.normalize_pred = normalize_pred
+        self.input_column = input_column
+
+    def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:
+        """
+        Compute the extractiveness of the predictions.
+
+        This method calculates coverage, density, and compression scores for a single
+        prediction against the input text.
+
+        Args:
+            predictions (list[str]): Predicted strings, a list of length 1.
+            formatted_doc (Doc): The formatted document.
+
+        Returns:
+            dict[str, float]: The extractiveness scores.
+        """
+        if self.stats_metric is None:
+            self.stats_metric = DataStatsMetric()
+
+        inp = formatted_doc.specific[self.input_column]
+        prediction = predictions[0]
+        if self.normalize_input:
+            inp = self.normalize_input(inp)
+        if self.normalize_pred:
+            prediction = self.normalize_pred(prediction)
+
+        stats = self.stats_metric.evaluate_example(prediction, inp)
+        return {
+            "summarization_coverage": stats["coverage"],
+            "summarization_density": stats["density"],
+            "summarization_compression": stats["compression"],
+        }
+
+
+class Faithfulness:
+    def __init__(
+        self,
+        normalize_input: callable = remove_braces,
+        normalize_pred: callable = remove_braces_and_strip,
+        input_column: str = "text",
+    ):
+        """
+        Faithfulness metric class.
+
+        Args:
+            normalize_input (callable, optional): Function to normalize the input strings.
+                Defaults to remove_braces from lighteval.metrics.normalizations if no normalization is applied.
+            normalize_pred (callable, optional): Function to use to normalize the predicted strings.
+                Defaults to remove_braces_and_strip from lighteval.metrics.normalizations if no normalization is applied.
+            input_column (str): Column in the formatted_doc to use for the input. Defaults to "text".
+        """
+        self.summac = None
+        self.normalize_input = normalize_input
+        self.normalize_pred = normalize_pred
+        self.input_column = input_column
+
+    def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> float:
+        """
+        Compute the faithfulness of the predictions.
+
+        The SummaCZS (Summary Content Zero-Shot) model is used with configurable granularity and model variation.
+
+        Args:
+            predictions (list[str]): Predicted strings, a list of length 1.
+            formatted_doc (Doc): The formatted document.
+
+        Returns:
+            float: The faithfulness score.
+        """
+        if self.summac is None:
+            self.summac = SummaCZS(granularity="sentence", model_name="vitc", imager_load_cache=False)  # , device=device)
+        inp = formatted_doc.specific[self.input_column]
+        prediction = predictions[0]
+        if self.normalize_input:
+            inp = self.normalize_input(inp)
+        if self.normalize_pred:
+            prediction = self.normalize_pred(prediction)
+        return self.summac.score_one(inp, prediction)["score"]


class BLEURT:
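
To make the new classes concrete, a hedged usage sketch follows. It relies only on what the diff shows (the compute signatures and the formatted_doc.specific[input_column] lookup); the stand-in document object and sample strings are illustrative, and running it requires the summarization dependencies behind DataStatsMetric and SummaCZS:

# Illustrative only; the stand-in object below mimics the single field that compute reads.
from types import SimpleNamespace

source_text = "The cat sat on the mat. It then fell asleep in the sun."
doc = SimpleNamespace(specific={"text": source_text})  # stands in for a lighteval Doc

extractiveness = Extractiveness()  # defaults: remove_braces / remove_braces_and_strip / "text"
coverage_scores = extractiveness.compute(predictions=["The cat slept on the mat."], formatted_doc=doc)
# -> {"summarization_coverage": ..., "summarization_density": ..., "summarization_compression": ...}

faithfulness = Faithfulness()
consistency = faithfulness.compute(predictions=["The cat slept on the mat."], formatted_doc=doc)
# -> a single SummaC consistency score (the "score" field)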
24 changes: 14 additions & 10 deletions src/lighteval/metrics/normalizations.py
@@ -104,17 +104,20 @@ def _remove_boxed(text: str | None) -> str:
    """
    if text is None:
        return ""
-    if "\\boxed " in text:
-        left = "\\boxed "
-        assert text[: len(left)] == left
-        return text[len(left) :]
+    try:
+        if "\\boxed " in text:
+            left = "\\boxed "
+            assert text[: len(left)] == left
+            return text[len(left) :]

-    left = "\\boxed{"
+        left = "\\boxed{"

-    assert text[: len(left)] == left
-    assert text[-1] == "}"
+        assert text[: len(left)] == left
+        assert text[-1] == "}"

-    return text[len(left) : -1]
+        return text[len(left) : -1]
+    except Exception:
+        return ""


def _last_boxed_only_string(text: str) -> str | None:
    """Extract the last \\boxed{...} or \\fbox{...} element from a string."""
@@ -386,8 +389,9 @@ def remove_articles(text: str, lang: Language) -> str:
    """
    Removes definite and indefinite articles from the text.
    Generated using LLM then manually checked by non-expert.
-    Only languages that don't blend the articles, if you are native speaker,
-    we would appreciate adding also languages that blend the articles.
+    We currently only support languages that don't blend articles.
+    If you are a native speaker of a language where articles are blended,
+    we would appreciate your contribution!
    """
    pattern = _ARTICLE_PATTERNS.get(lang)
    return re.sub(pattern, " ", text) if pattern else text
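
The try/except wrapper changes _remove_boxed so that malformed input yields an empty string instead of raising. A small sketch of the behaviour implied by the patched code above; the example inputs are illustrative:

# Behaviour implied by the patched _remove_boxed; example inputs are illustrative.
assert _remove_boxed("\\boxed{42}") == "42"   # well-formed: braces stripped
assert _remove_boxed("\\boxed 42") == "42"    # space form: prefix stripped
assert _remove_boxed("just 42") == ""         # malformed: assertion fails, caught, returns ""
assert _remove_boxed(None) == ""              # None is handled before the try block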
File renamed without changes.
5 changes: 2 additions & 3 deletions src/lighteval/models/base_model.py
@@ -126,7 +126,7 @@ def add_special_tokens(self):
    def max_length(self) -> int:
        return self._max_length

-    def init_model_parallel(self, model_parallel: bool = None) -> Tuple[bool, Optional[dict], Optional[str]]:
+    def init_model_parallel(self, model_parallel: bool | None = None) -> Tuple[bool, Optional[dict], Optional[str]]:
        """Compute all the parameters related to model_parallel"""
        if not is_accelerate_available():
            return False, None, None
@@ -147,7 +147,7 @@ def init_model_parallel(self, model_parallel: bool = None) -> Tuple[bool, Option
            f"the number of local processes is {self.num_local_processes} "
            f"and the number of GPUs is {len(max_memory_all_gpus)}"
        )
-        if model_parallel:
+        if model_parallel is True:
            max_memory_all_gpus = get_max_memory()  # A dict of the max memory for all the gpus
            if "cpu" in max_memory_all_gpus:
                del max_memory_all_gpus["cpu"]
@@ -569,7 +569,6 @@ def greedy_until(
        if max_new_tokens is None:  # If generation size is not set, we go all the way
            max_new_tokens = self.max_length - context_size
        else:
-            print(self.max_length, context_size, max_new_tokens)
            max_new_tokens = min(self.max_length - context_size, max_new_tokens)
            if max_new_tokens < 1:
                max_new_tokens = 1
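
The signature change makes the tri-state handling of model_parallel explicit: None means auto-detect, an explicit True or False from the user is honoured, and only True takes the get_max_memory() branch. A standalone sketch of that dispatch follows; the helper name is hypothetical and the auto-detection heuristic shown (more GPUs than local processes) is an assumption for illustration, not a quotation of the method:

# Standalone illustration of the tri-state dispatch; the helper and its heuristic are hypothetical.
from typing import Optional


def resolve_model_parallel(model_parallel: Optional[bool], num_gpus: int, num_local_processes: int) -> bool:
    if model_parallel is None:
        # Assumed auto-detection: parallelize only when there are more GPUs than local processes.
        return num_gpus > num_local_processes
    # An explicit True/False from the user wins, mirroring the `if model_parallel is True:` check above.
    return model_parallel


print(resolve_model_parallel(None, num_gpus=4, num_local_processes=1))   # True  (auto-detected)
print(resolve_model_parallel(False, num_gpus=4, num_local_processes=1))  # False (explicitly disabled)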
18 changes: 11 additions & 7 deletions src/lighteval/models/model_config.py
@@ -118,6 +118,8 @@ class BaseModelConfig:
    def __post_init__(self):
        # Making sure this parameter is a boolean
        self.multichoice_continuations_start_space = boolstring_to_bool(self.multichoice_continuations_start_space)
+        self.model_parallel = boolstring_to_bool(self.model_parallel)
+        self.compile = boolstring_to_bool(self.compile)

        if self.quantization_config is not None and not is_bnb_available():
            raise ImportError(NO_BNB_ERROR_MSG)
@@ -209,19 +211,21 @@ def init_configs(self, env_config: EnvConfig):
@dataclass
class VLLMModelConfig:
    pretrained: str
-    gpu_memory_utilisation: float = 0.8
-    batch_size: int = -1
-    revision: str = "main"
+    gpu_memory_utilisation: float = 0.9  # lower this if you are running out of memory
+    revision: str = "main"  # revision of the model
    dtype: str | None = None
-    tensor_parallel_size: int = 1
-    data_parallel_size: int = 1
-    max_model_length: int = 1024
+    tensor_parallel_size: int = 1  # how many GPUs to use for tensor parallelism
+    pipeline_parallel_size: int = 1  # how many GPUs to use for pipeline parallelism
+    data_parallel_size: int = 1  # how many GPUs to use for data parallelism
+    max_model_length: int | None = None  # maximum length of the model; usually inferred automatically. Reduce this if you encounter OOM issues; 4096 is usually enough
    swap_space: int = 4  # CPU swap space size (GiB) per GPU.
    seed: int = 1234
    trust_remote_code: bool = False
    use_chat_template: bool = False
    add_special_tokens: bool = True
-    multichoice_continuations_start_space: bool = True
+    multichoice_continuations_start_space: bool = (
+        True  # whether to add a space at the start of each continuation in multichoice generation
+    )
    subfolder: Optional[str] = None
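
For reference, a hedged construction example for the updated VLLMModelConfig, using only fields visible in this diff; the model id is a placeholder and how the config is consumed downstream is not shown here:

# Illustrative construction of the updated dataclass; the model id is a placeholder.
from lighteval.models.model_config import VLLMModelConfig

config = VLLMModelConfig(
    pretrained="my-org/my-model",   # placeholder model id
    gpu_memory_utilisation=0.9,     # lower this if you are running out of memory
    tensor_parallel_size=2,         # shard the model across 2 GPUs
    pipeline_parallel_size=1,
    data_parallel_size=1,
    max_model_length=None,          # None lets the maximum length be inferred automatically
    use_chat_template=True,
)
print(config.gpu_memory_utilisation)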

0 comments on commit 7b561fe
