diff --git a/src/lighteval/config/lighteval_config.py b/src/lighteval/config/lighteval_config.py index 784d7898..1f77b34e 100644 --- a/src/lighteval/config/lighteval_config.py +++ b/src/lighteval/config/lighteval_config.py @@ -76,7 +76,7 @@ class LightEvalTasksArgs: dataset_loading_processes: int = 8 multichoice_continuations_start_space: Optional[bool] = None - pair_wise_tokenization: bool = False + pairwise_tokenization: bool = False @dataclass diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index f007470d..b96a875d 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -476,7 +476,7 @@ class MetricsLogger: default_factory=lambda: collections.defaultdict(lambda: collections.defaultdict(list)) ) metric_aggregated: dict[str, dict[str, float]] = field( - default_factory=lambda: collections.defaultdict(lambda: collections.defaultdict(dict)) + default_factory=lambda: collections.defaultdict(lambda: collections.defaultdict(float)) ) def log(self, task_name: str, metrics: dict) -> None: diff --git a/src/lighteval/models/abstract_model.py b/src/lighteval/models/abstract_model.py index 36fcb360..e66f277e 100644 --- a/src/lighteval/models/abstract_model.py +++ b/src/lighteval/models/abstract_model.py @@ -183,23 +183,28 @@ def tok_encode_pair(self, context, continuation, pairwise: bool = False): context = context[:-n_spaces] if pairwise: - context_enc, continuation_enc = self.tok_encode(context), self.tok_encode(continuation) - if self.add_special_tokens: - tokenized_with_special_tokens = self.tokenizer.build_inputs_with_special_tokens( - context_enc + continuation_enc - ) - # If this fails something went wrong as the function above should only add special tokens - first_non_prefix_token_idx = tokenized_with_special_tokens.index(context_enc[0]) - last_context_token_idx = first_non_prefix_token_idx + len(context_enc) - context_enc, continuation_enc = ( - 
tokenized_with_special_tokens[:last_context_token_idx], - tokenized_with_special_tokens[last_context_token_idx:], - ) + # We don't add special tokens to the continuation as if bos is added + # models tend to completely ignore a context + context_enc, continuation_enc = ( + self.tok_encode(context, add_special_tokens=self.add_special_tokens), + self.tok_encode(continuation, add_special_tokens=False), + ) + + # In theory the context_enc can be ended with eos token, this would again + # cause the model to ignore the context. We thus strip the eos token from context_enc + if len(context_enc) > 0 and context_enc[-1] == self.tokenizer.eos_token_id: + context_enc = context_enc[:-1] + return context_enc, continuation_enc whole_enc = self.tok_encode(context + continuation) context_enc = self.tok_encode(context) context_enc_len = len(context_enc) + # In case continuation tokens merge with context tokens we use the merged token as continuation + if len(context_enc) == len(whole_enc): + context_enc_len = len(context_enc) - 1 + context_enc = whole_enc[:context_enc_len] + continuation_enc = whole_enc[context_enc_len:] return context_enc, continuation_enc diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index c9ec1327..3ef7482a 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -112,7 +112,7 @@ def __init__( model_size=model_size, ) - self.pair_wise_tokenization = config.pair_wise_tokenization + self.pairwise_tokenization = config.pairwise_tokenization @property def tokenizer(self): @@ -697,7 +697,7 @@ def loglikelihood( else: # The following line is mandatory for compatibility with the harness request.tokenized_context, request.tokenized_continuation = self.tok_encode_pair( - request.context, request.choice, pairwise=self.pair_wise_tokenization + request.context, request.choice, pairwise=self.pairwise_tokenization ) return self._loglikelihood_tokens(requests, override_bs=override_bs) diff --git 
a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index 60f022a0..e133d61c 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -63,7 +63,7 @@ class BaseModelConfig: For example, context: "What is the capital of France?" and choices: "Paris", "London". Will be tokenized as: "What is the capital of France? Paris" and "What is the capital of France? London". True adds a space, False strips a space, None does nothing - pair_wise_tokenization (bool): Whether to tokenize the context and continuation as separately or together. + pairwise_tokenization (bool): Whether to tokenize the context and continuation separately or together. subfolder (Optional[str]): The subfolder within the model repository. revision (str): The revision of the model. batch_size (int): The batch size for model training. @@ -100,7 +100,7 @@ class BaseModelConfig: accelerator: "Accelerator" = None tokenizer: Optional[str] = None multichoice_continuations_start_space: Optional[bool] = None - pair_wise_tokenization: bool = False + pairwise_tokenization: bool = False subfolder: Optional[str] = None revision: str = "main" batch_size: int = -1 @@ -226,6 +226,8 @@ class VLLMModelConfig: multichoice_continuations_start_space: bool = ( True # whether to add a space at the start of each continuation in multichoice generation ) + pairwise_tokenization: bool = False # whether to tokenize the context and continuation separately or together. 
+ subfolder: Optional[str] = None diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index 5cf3668f..40c8f237 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -212,7 +212,7 @@ def __init__( self.input_pp_rank, self.output_pp_rank = get_min_max_rank(module=self.model) self.multichoice_continuations_start_space = multichoice_continuations_start_space - self.pair_wise_tokenization = nanotron_config.lighteval_config.tasks.pair_wise_tokenization + self.pairwise_tokenization = nanotron_config.lighteval_config.tasks.pairwise_tokenization self.model_info = ModelInfo( model_name=f"{nanotron_config.nanotron_config.general.run}/{nanotron_config.nanotron_config.general.step}" @@ -447,7 +447,7 @@ def loglikelihood(self, requests: List[LoglikelihoodRequest], override_bs=None) else: # The following line is mandatory for compatibility with the harness request.tokenized_context, request.tokenized_continuation = self.tok_encode_pair( - request.context, request.choice, self.pair_wise_tokenization + request.context, request.choice, self.pairwise_tokenization ) return self._loglikelihood_tokens( diff --git a/src/lighteval/models/vllm_model.py b/src/lighteval/models/vllm_model.py index d07f05a5..50069045 100644 --- a/src/lighteval/models/vllm_model.py +++ b/src/lighteval/models/vllm_model.py @@ -93,6 +93,7 @@ def __init__( self.precision = _get_dtype(config.dtype, config=self._config) self.model_info = ModelInfo(model_name=self.model_name, model_sha=self.model_sha) + self.pairwise_tokenization = config.pairwise_tokenization @property def tokenizer(self): @@ -352,7 +353,7 @@ def loglikelihood( else: # The following line is mandatory for compatibility with the harness request.tokenized_context, request.tokenized_continuation = self.tok_encode_pair( - request.context, request.choice + request.context, request.choice, pairwise=self.pairwise_tokenization ) return self._loglikelihood_tokens(requests, 
override_bs=override_bs) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 00b4763b..bb33535f 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -24,11 +24,11 @@ import inspect import random from dataclasses import asdict, dataclass, field -from multiprocessing import Pool from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple from datasets import DatasetDict from huggingface_hub import TextGenerationInputGrammarType +from multiprocess import Pool from pytablewriter import MarkdownTableWriter from lighteval.logging.hierarchical_logger import hlog, hlog_warn diff --git a/tests/models/test_abstract_mode.py b/tests/models/test_abstract_model.py similarity index 92% rename from tests/models/test_abstract_mode.py rename to tests/models/test_abstract_model.py index b7b2f737..a598bdc4 100644 --- a/tests/models/test_abstract_mode.py +++ b/tests/models/test_abstract_model.py @@ -34,6 +34,7 @@ def test_tok_encode_pair(): continuation = "1" non_pairwise_tokens = model.tok_encode_pair(context, continuation, pairwise=False) pairwise_tokens = model.tok_encode_pair(context, continuation, pairwise=True) - # Problematic case where the completion tokens are empty despite the chars are non-empty - assert non_pairwise_tokens == ([6, 47873, 13], []) + # Non-pairwise merged ":1" to one token + assert non_pairwise_tokens == ([6, 47873], [34871]) + # Pairwise separated ":" and "1" assert pairwise_tokens == ([6, 47873, 13], [82])