Merge remote-tracking branch 'origin/main' into multilnag_nli_tasks
hynky1999 committed Sep 30, 2024
2 parents 4e6100d + 170ed87 commit 3488e7d
Showing 30 changed files with 523 additions and 602 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
@@ -26,7 +26,7 @@ jobs:
cache: 'pip'
- name: Install lighteval in editable mode
run: |
pip install -e .[dev,extended_tasks]
pip install -e .[dev,extended_tasks,multilingual]
- name: Get cached files
uses: actions/cache@v2
id: get-cache
525 changes: 63 additions & 462 deletions README.md

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions assets/lighteval-doc.svg
2 changes: 1 addition & 1 deletion community_tasks/_template.py
@@ -33,7 +33,7 @@

from lighteval.metrics import Metrics
from lighteval.metrics.metrics import SampleLevelMetric
from lighteval.metrics.utils.utils import MetricCategory, MetricUseCase
from lighteval.metrics.utils.metric_utils import MetricCategory, MetricUseCase
from lighteval.tasks.default_prompts import LETTER_INDICES
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -99,7 +99,7 @@ extended_tasks = [
s3 = ["s3fs"]
multilingual = [
"stanza",
"spacy[ja,ko]",
"spacy[ja,ko,th]",
"jieba", # for chinese tokenizer
"pyvi", # for vietnamese tokenizer
]
2 changes: 1 addition & 1 deletion src/lighteval/metrics/dynamic_metrics.py
@@ -37,7 +37,7 @@
LogProbTokenNorm,
get_multilingual_normalizer,
)
from lighteval.metrics.utils.utils import MetricCategory, MetricUseCase, SampleLevelMetric
from lighteval.metrics.utils.metric_utils import MetricCategory, MetricUseCase, SampleLevelMetric
from lighteval.utils.language import Language


1 change: 0 additions & 1 deletion src/lighteval/metrics/imports/bert_scorer.py
@@ -383,7 +383,6 @@ def __init__(

self._baseline_vals = None
self.baseline_path = baseline_path
self.use_custom_baseline = self.baseline_path is not None
if self.baseline_path is None:
self.baseline_path = os.path.join(
os.path.dirname(__file__),
14 changes: 9 additions & 5 deletions src/lighteval/metrics/metrics.py
@@ -40,15 +40,15 @@
ROUGE,
BertScore,
ExactMatches,
Extractiveness,
F1_score,
Faithfulness,
JudgeLLM,
LoglikelihoodAcc,
MajAtK,
Recall,
StringDistance,
acc_golds_likelihood,
extractiveness,
faithfulness,
)
from lighteval.metrics.normalizations import (
LogProbCharNorm,
@@ -61,7 +61,7 @@
remove_braces_and_strip,
)
from lighteval.metrics.sample_preparator import GenerativePreparator, LoglikelihoodPreparator, PerplexityPreparator
from lighteval.metrics.utils.utils import (
from lighteval.metrics.utils.metric_utils import (
CorpusLevelMetric,
CorpusLevelMetricGrouping,
Metric,
@@ -175,7 +175,9 @@ class Metrics(Enum):
)
extractiveness = SampleLevelMetricGrouping(
metric_name=["summarization_coverage", "summarization_density", "summarization_compression"],
sample_level_fn=extractiveness,
sample_level_fn=Extractiveness(
normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text"
).compute,
category=MetricCategory.GENERATIVE,
use_case=MetricUseCase.SUMMARIZATION,
corpus_level_fn={
@@ -223,7 +225,9 @@
)
faithfulness = SampleLevelMetric(
metric_name="summac",
sample_level_fn=faithfulness,
sample_level_fn=Faithfulness(
normalize_input=remove_braces, normalize_pred=remove_braces_and_strip, input_column="text"
).compute,
category=MetricCategory.GENERATIVE,
use_case=MetricUseCase.SUMMARIZATION,
corpus_level_fn=np.mean,
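For context, the hunks above swap the removed module-level extractiveness/faithfulness helpers for configured class instances whose bound compute method is handed to sample_level_fn. A minimal sketch of that wiring (not part of this commit; import paths follow the diff, everything else is illustrative):

```python
from lighteval.metrics.metrics_sample import Extractiveness
from lighteval.metrics.normalizations import remove_braces, remove_braces_and_strip

# A configured instance's bound method is an ordinary callable, so it can be
# passed wherever a sample-level function is expected.
sample_level_fn = Extractiveness(
    normalize_input=remove_braces,
    normalize_pred=remove_braces_and_strip,
    input_column="text",
).compute
```
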
120 changes: 101 additions & 19 deletions src/lighteval/metrics/metrics_sample.py
@@ -159,6 +159,8 @@ def __init__(
Defaults to None if no normalization is applied.
strip_strings (bool, optional): Whether to strip both reference and predictions. Defaults to False.
"""
if aggregation_function is None:
aggregation_function = max

self.aggregation_function = aggregation_function
self.normalize_gold = normalize_gold
@@ -562,7 +564,7 @@ def __init__(
self.normalize_gold = normalize_gold
self.normalize_pred = normalize_pred

def compute(self, golds: list[str], predictions: list[str]) -> dict:
def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict:
"""Computes the prediction, recall and f1 score using the bert scorer.
Args:
@@ -591,24 +593,104 @@ def compute(self, golds: list[str], predictions: list[str]) -> dict:
return {"BERTScore-P": p[0].item(), "BERTScore-R": r[0].item(), "BERTScore-F": f[0].item()}


# todo: make into clean classes with call to normalizer
def extractiveness(formatted_doc: Doc, predictions: list[str], **kwargs):
inp = remove_braces(formatted_doc.specific["text"])
pred = remove_braces_and_strip(predictions[0])
stats = DataStatsMetric().evaluate_example(pred, inp)
return {
"summarization_coverage": stats["coverage"],
"summarization_density": stats["density"],
"summarization_compression": stats["compression"],
}


# todo: make into clean classes with call to normalizer
def faithfulness(formatted_doc: Doc, predictions: list[str], **kwargs):
inp = remove_braces(formatted_doc.specific["text"])
pred = remove_braces_and_strip(predictions[0])
summac = SummaCZS(granularity="sentence", model_name="vitc", imager_load_cache=False) # , device=device)
return summac.score_one(inp, pred)["score"]
class Extractiveness:
def __init__(
self,
normalize_input: callable = remove_braces,
normalize_pred: callable = remove_braces_and_strip,
input_column: str = "text",
):
"""
Extractiveness metric class.
Args:
normalize_input (callable, optional): Function to normalize the input strings.
Defaults to remove_braces from lighteval.metrics.normalizations; pass None to skip normalization.
normalize_pred (callable, optional): Function to use to normalize the predicted strings.
Defaults to remove_braces_and_strip from lighteval.metrics.normalizations; pass None to skip normalization.
input_column (str): Column in the formatted_doc to use for the input. Defaults to "text".
"""
self.stats_metric = None
self.normalize_input = normalize_input
self.normalize_pred = normalize_pred
self.input_column = input_column

def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> dict[str, float]:
"""
Compute the extractiveness of the predictions.
This method calculates coverage, density, and compression scores for a single
prediction against the input text.
Args:
predictions (list[str]): Predicted strings, a list of length 1.
formatted_doc (Doc): The formatted document.
Returns:
dict[str, float]: The extractiveness scores.
"""
if self.stats_metric is None:
self.stats_metric = DataStatsMetric()

inp = formatted_doc.specific[self.input_column]
prediction = predictions[0]
if self.normalize_input:
inp = self.normalize_input(inp)
if self.normalize_pred:
prediction = self.normalize_pred(prediction)

stats = self.stats_metric.evaluate_example(prediction, inp)
return {
"summarization_coverage": stats["coverage"],
"summarization_density": stats["density"],
"summarization_compression": stats["compression"],
}


class Faithfulness:
def __init__(
self,
normalize_input: callable = remove_braces,
normalize_pred: callable = remove_braces_and_strip,
input_column: str = "text",
):
"""
Faithfulness metric class.
Args:
normalize_input (callable, optional): Function to normalize the input strings.
Defaults to remove_braces from lighteval.metrics.normalizations; pass None to skip normalization.
normalize_pred (callable, optional): Function to use to normalize the predicted strings.
Defaults to remove_braces_and_strip from lighteval.metrics.normalizations; pass None to skip normalization.
input_column (str): Column in the formatted_doc to use for the input. Defaults to "text".
"""
self.summac = None
self.normalize_input = normalize_input
self.normalize_pred = normalize_pred
self.input_column = input_column

def compute(self, predictions: list[str], formatted_doc: Doc, **kwargs) -> float:
"""
Compute the faithfulness of the predictions.
The SummaCZS (Summary Consistency, Zero-Shot) model is used with configurable granularity and model variation.
Args:
predictions (list[str]): Predicted strings, a list of length 1.
formatted_doc (Doc): The formatted document.
Returns:
float: The faithfulness score.
"""
if self.summac is None:
self.summac = SummaCZS(granularity="sentence", model_name="vitc", imager_load_cache=False)  # , device=device)
inp = formatted_doc.specific[self.input_column]
prediction = predictions[0]
if self.normalize_input:
inp = self.normalize_input(inp)
if self.normalize_pred:
prediction = self.normalize_pred(prediction)
return self.summac.score_one(inp, prediction)["score"]


class BLEURT:
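A rough usage sketch of the new Extractiveness class (not part of this commit). The doc stand-in below only mimics the `specific` mapping the metric reads; it is not the real Doc constructor:

```python
from types import SimpleNamespace

from lighteval.metrics.metrics_sample import Extractiveness

# Stand-in for a formatted document: only the `specific` attribute used by the metric.
doc = SimpleNamespace(specific={"text": "The cat sat on the mat. Then it slept on the mat."})

metric = Extractiveness(input_column="text")
scores = metric.compute(predictions=["The cat sat on the mat."], formatted_doc=doc)
# scores -> {"summarization_coverage": ..., "summarization_density": ..., "summarization_compression": ...}
```
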
24 changes: 14 additions & 10 deletions src/lighteval/metrics/normalizations.py
@@ -104,17 +104,20 @@ def _remove_boxed(text: str | None) -> str:
"""
if text is None:
return ""
if "\\boxed " in text:
left = "\\boxed "
assert text[: len(left)] == left
return text[len(left) :]
try:
if "\\boxed " in text:
left = "\\boxed "
assert text[: len(left)] == left
return text[len(left) :]

left = "\\boxed{"
left = "\\boxed{"

assert text[: len(left)] == left
assert text[-1] == "}"
assert text[: len(left)] == left
assert text[-1] == "}"

return text[len(left) : -1]
return text[len(left) : -1]
except Exception:
return ""

def _last_boxed_only_string(text: str) -> str | None:
"""Extract the last \\boxed{...} or \\fbox{...} element from a string."""
@@ -386,8 +389,9 @@ def remove_articles(text: str, lang: Language) -> str:
"""
Removes definite and indefinite articles from the text.
Generated using LLM then manually checked by non-expert.
Only languages that don't blend the articles, if you are native speaker,
we would appreciate adding also languages that blend the articles.
We currently only support languages that don't blend articles.
If you are a native speaker of a language where articles are blended,
we would appreciate your contribution!
"""
pattern = _ARTICLE_PATTERNS.get(lang)
return re.sub(pattern, " ", text) if pattern else text
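To make the behavioural change in _remove_boxed concrete, here is a self-contained re-implementation of the patched helper (for exposition only; _remove_boxed itself is module-private and not part of lighteval's public API):

```python
def _remove_boxed_sketch(text: str | None) -> str:
    # Mirrors the patched logic: malformed input now yields "" instead of raising.
    if text is None:
        return ""
    try:
        if "\\boxed " in text:
            left = "\\boxed "
            assert text[: len(left)] == left
            return text[len(left):]
        left = "\\boxed{"
        assert text[: len(left)] == left
        assert text[-1] == "}"
        return text[len(left):-1]
    except Exception:
        return ""


assert _remove_boxed_sketch("\\boxed{42}") == "42"
assert _remove_boxed_sketch("\\boxed 42") == "42"
assert _remove_boxed_sketch("no box here") == ""  # previously raised AssertionError
```
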
File renamed without changes.
5 changes: 2 additions & 3 deletions src/lighteval/models/base_model.py
@@ -126,7 +126,7 @@ def add_special_tokens(self):
def max_length(self) -> int:
return self._max_length

def init_model_parallel(self, model_parallel: bool = None) -> Tuple[bool, Optional[dict], Optional[str]]:
def init_model_parallel(self, model_parallel: bool | None = None) -> Tuple[bool, Optional[dict], Optional[str]]:
"""Compute all the parameters related to model_parallel"""
if not is_accelerate_available():
return False, None, None
@@ -147,7 +147,7 @@ def init_model_parallel(self, model_parallel: bool = None) -> Tuple[bool, Optional[dict], Optional[str]]:
f"the number of local processes is {self.num_local_processes} "
f"and the number of GPUs is {len(max_memory_all_gpus)}"
)
if model_parallel:
if model_parallel is True:
max_memory_all_gpus = get_max_memory() # A dict of the max memory for all the gpus
if "cpu" in max_memory_all_gpus:
del max_memory_all_gpus["cpu"]
@@ -569,7 +569,6 @@ def greedy_until(
if max_new_tokens is None: # If generation size is not set, we go all the way
max_new_tokens = self.max_length - context_size
else:
print(self.max_length, context_size, max_new_tokens)
max_new_tokens = min(self.max_length - context_size, max_new_tokens)
if max_new_tokens < 1:
max_new_tokens = 1
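The deleted debug print sat inside the clamp that keeps generation within the model's context window; the surviving logic behaves like this (illustrative numbers, not from the commit):

```python
# Clamp the requested generation budget to what the context window still allows,
# with a floor of one new token.
max_length, context_size, max_new_tokens = 4096, 4000, 256

max_new_tokens = min(max_length - context_size, max_new_tokens)  # -> 96
if max_new_tokens < 1:
    max_new_tokens = 1
```
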
18 changes: 11 additions & 7 deletions src/lighteval/models/model_config.py
@@ -118,6 +118,8 @@ class BaseModelConfig:
def __post_init__(self):
# Making sure this parameter is a boolean
self.multichoice_continuations_start_space = boolstring_to_bool(self.multichoice_continuations_start_space)
self.model_parallel = boolstring_to_bool(self.model_parallel)
self.compile = boolstring_to_bool(self.compile)

if self.quantization_config is not None and not is_bnb_available():
raise ImportError(NO_BNB_ERROR_MSG)
@@ -209,19 +211,21 @@ def init_configs(self, env_config: EnvConfig):
@dataclass
class VLLMModelConfig:
pretrained: str
gpu_memory_utilisation: float = 0.8
batch_size: int = -1
revision: str = "main"
gpu_memory_utilisation: float = 0.9 # lower this if you are running out of memory
revision: str = "main" # revision of the model
dtype: str | None = None
tensor_parallel_size: int = 1
data_parallel_size: int = 1
max_model_length: int = 1024
tensor_parallel_size: int = 1 # how many GPUs to use for tensor parallelism
pipeline_parallel_size: int = 1 # how many GPUs to use for pipeline parallelism
data_parallel_size: int = 1 # how many GPUs to use for data parallelism
max_model_length: int | None = None # maximum length of the model, usually inferred automatically. Reduce this if you encounter OOM issues; 4096 is usually enough
swap_space: int = 4 # CPU swap space size (GiB) per GPU.
seed: int = 1234
trust_remote_code: bool = False
use_chat_template: bool = False
add_special_tokens: bool = True
multichoice_continuations_start_space: bool = True
multichoice_continuations_start_space: bool = (
True # whether to add a space at the start of each continuation in multichoice generation
)
subfolder: Optional[str] = None


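A hedged sketch of instantiating the updated VLLMModelConfig (not part of this commit; the model id is hypothetical, the field names and defaults come from the diff above):

```python
from lighteval.models.model_config import VLLMModelConfig

config = VLLMModelConfig(
    pretrained="org/some-model",    # hypothetical model id
    gpu_memory_utilisation=0.9,     # lower this if you run out of GPU memory
    tensor_parallel_size=1,         # GPUs used for tensor parallelism
    pipeline_parallel_size=1,       # GPUs used for pipeline parallelism
    data_parallel_size=1,           # GPUs used for data parallelism
    max_model_length=None,          # usually inferred; set e.g. 4096 if you hit OOM
)
```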