Misc-multilingual tasks (#339)

* add multilingual dynamic generative metrics * draft * finish multichoice config * update tokenizers + install nltk reqs * use punkt tab * Update src/lighteval/utils/imports.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> * Update src/lighteval/metrics/normalizations.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> * fix imports * remove unused import * finish implementation of templates + move stuff around * resolve nits * when in rome do as romans do (handle error messages the same way) * fix utils * nicer tests + fix them * nicer todo * add nice docstrings 📃 * add even more docstring * nit * fix test * add multilingual to dev group * merge nli, add languages to literals * translation literals * add nli * add copa tasks + fix translation literals * add hellaswag tasks * remove custom telugu hellaswag * remove hindi hellaswag * add rc tasks + small nits * add rcb + chinese nli * add mcq tasks * add continuations + general qa tasks + missed tasks * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * add two new tasks + docs * add 
nice docs * update hellaswag with docs * move hellaswag to lighteval suite * add desc to tasks * Update src/lighteval/tasks/multilingual/tasks.py Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com> * enable returning none from templates + better typing * add nice docs * fix saving of partial fcs + better tasks doc * change unofficial hellaswag names to have community_prefix + unify hellaswag preprocessing * community rename * let strip be optional in hellaswag * nits * fix remaining tasks in mcq * fixes * nits * fix up agieval + ceval * add comment * hellaswag fixes * hellaswag hindi + mlqa + hindi/swahili arc + mintaka + triviaqa french * add openai mmlu, turkish mmlu, lumi and mgsm * improve qa readability * fix norms * fix few shot splits + add boolq tasks * rename boolqa to boolq * Update src/lighteval/tasks/default_prompts.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> * Update src/lighteval/tasks/default_prompts.py Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --------- Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Co-authored-by: Hynek Kydlicek <kydliceh.hynek@gmail.com> Co-authored-by: Clémentine Fourrier <22726840+clefourrier@users.noreply.github.com>
huggingface · Oct 10, 2024 · b018c9c · b018c9c
1 parent 1dfd77d
commit b018c9c
Show file tree

Hide file tree

Showing 6 changed files with 1,405 additions and 222 deletions.
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
@@ -756,13 +756,16 @@ def headqa(line, task_name: str = None):
 
 
 def hellaswag_preprocess(
-    text: str, wikihow_artifacts: list[str] = [" [title]"], truncate_dots: bool = False, strip_text: bool = False
+    text: str,
+    wikihow_artifacts: list[str] = [" [title]"],
+    truncate_dots: bool = False,
+    strip_text: bool = False,
+    dot_replacement: str = ". ",
 ):
-    """Comes from AiHarness"""
-    # text = text.strip()
+    """Comes from LM Eval Harness"""
     # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
-    for dot_repl in wikihow_artifacts:
-        text = text.replace(dot_repl, ". ")
+    for wikihow_artifact in wikihow_artifacts:
+        text = text.replace(wikihow_artifact, dot_replacement)
     text = re.sub("\\[.*?\\]", "", text)
     text = text.replace("  ", " ")
     if truncate_dots:

diff --git a/src/lighteval/tasks/multilingual/adapters.py b/src/lighteval/tasks/multilingual/adapters.py
@@ -20,17 +20,21 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import os
 import re
 
 import numpy as np
+from langcodes import standardize_tag
 
 from lighteval.tasks.default_prompts import LETTER_INDICES
 from lighteval.tasks.multilingual.utils.adapters_utils import (
     extract_answers_from_string,
     multichoice_join,
     multichoice_to_single_choice,
 )
+from lighteval.tasks.templates.continuation import ContinuationInput
 from lighteval.tasks.templates.multichoice import MCQInput
+from lighteval.tasks.templates.qa import QAInput
 from lighteval.tasks.templates.utils.formatting_utils import PUNCT
 from lighteval.tasks.templates.utils.formulation import CFFormulation, Formulation
 from lighteval.tasks.templates.utils.translation_literals import TranslationLiterals
@@ -60,14 +64,14 @@ def get_m3exam_adapter(lang: Language, line: dict) -> MCQInput | None:
 def thai_exams_adapter(line: dict) -> MCQInput | None:
     pos_letters = [letter.lower() for letter in LETTER_INDICES[:5]]
 
-    lettr_to_choices = {letter: line[letter] for letter in pos_letters if letter in line}
-    if any(opt.strip() == "" for opt in lettr_to_choices.values()):
+    letter_to_choices = {letter: line[letter] for letter in pos_letters if letter in line}
+    if any(opt.strip() == "" for opt in letter_to_choices.values()):
         return None
 
-    gold_index = list(lettr_to_choices.keys()).index(line["answer"])
+    gold_index = list(letter_to_choices.keys()).index(line["answer"])
     return {
         "question": line["question"],
-        "choices": list(lettr_to_choices.values()),
+        "choices": list(letter_to_choices.values()),
         "gold_idx": gold_index,
     }
 
@@ -111,7 +115,7 @@ def ceval_adapter(lang: Language, formulation: Formulation, line: dict) -> MCQIn
 
     parts = line["question"].rsplit("____", maxsplit=1)
     cleaned_question = parts[0].rstrip(PUNCT).strip()
-    possible_answers_part = parts[1].lstrip(PUNCT)
+    possible_answers_part = parts[1].strip().lstrip(PUNCT)
     gold_index = LETTER_INDICES.index(line["answer"])
 
     # We only attempt to extract answers if the answers are a chinese numbers
@@ -207,3 +211,68 @@ def agieval_adapter(lang: Language, formulation: Formulation, line: dict) -> MCQ
         "gold_idx": gold_index,
         "context": context,
     }
+
+
+def xcodah_adapter(lang: Language, line: dict) -> MCQInput | None:
+    """Build an MCQ sample from a row whose choices all start with the same text.
+
+    The row is read through ``line["question"]["choices"]`` (its ``label`` and
+    ``text`` entries) and ``line["answerKey"]``. The text shared by every choice
+    is split off and returned as the question, so only the differing endings
+    remain as choices.
+
+    Args:
+        lang: Language whose ``TranslationLiterals.word_space`` marks word boundaries.
+        line: Dataset row with the keys described above.
+
+    Returns:
+        A dict with ``question``, ``choices`` and ``gold_idx``, or ``None`` when
+        stripping the shared prefix leaves at least one choice blank.
+    """
+    literals = TranslationLiterals(lang)
+
+    choice_block = line["question"]["choices"]
+    gold_index = choice_block["label"].index(line["answerKey"])
+
+    # Every choice already has the shared prefix "baked in"; find it so it can
+    # be removed and the model gets a clearer signal from the endings alone.
+    options = choice_block["text"]
+    shared = os.path.commonprefix(options)
+
+    # Backtrack to the last word space inside the shared prefix so the cut
+    # falls on a word boundary (better tokenization).
+    cut = shared.rfind(literals.word_space)
+    if cut == -1:
+        # No word space found: keep the choices untouched.
+        stem = ""
+    else:
+        stem = shared[:cut]
+
+    skip = len(stem)
+    endings = [option[skip:] for option in options]
+
+    # A blank ending means the row cannot be scored meaningfully.
+    if not all(ending.strip() for ending in endings):
+        return None
+
+    return {"question": stem, "choices": endings, "gold_idx": gold_index}
+
+
+def winogrand_adapter(lang: Language, line: dict) -> ContinuationInput | None:
+    """Turn a fill-in-the-blank sentence into a continuation sample.
+
+    ``line["sentence"]`` must contain exactly one ``_`` placeholder. The text
+    before it becomes the context; each of ``line["option1"]`` and
+    ``line["option2"]`` is joined to the text after it with the language's
+    word space to form a candidate continuation.
+
+    Args:
+        lang: Language whose ``TranslationLiterals.word_space`` joins option and tail.
+        line: Dataset row with ``sentence``, ``option1``, ``option2`` and ``answer``.
+
+    Returns:
+        A dict with ``context``, ``continuations`` and ``gold_idx`` (``answer``
+        converted to int, minus one), or ``None`` when the sentence does not
+        have exactly one placeholder or nothing but whitespace precedes it.
+    """
+    literals = TranslationLiterals(lang)
+
+    # Splitting on the placeholder yields exactly two pieces only when the
+    # sentence contains exactly one "_".
+    pieces = line["sentence"].split("_")
+    if len(pieces) != 2:
+        return None
+
+    before_blank, after_blank = pieces
+    if not before_blank.strip():
+        return None
+
+    candidates = (line["option1"], line["option2"])
+    continuations = [f"{candidate}{literals.word_space}{after_blank}" for candidate in candidates]
+    return {
+        "context": before_blank,
+        "continuations": continuations,
+        "gold_idx": int(line["answer"]) - 1,
+    }
+
+
+def get_mkqa_adapter(lang: Language, line: dict) -> QAInput | None:
+    """Build a QA sample from the first answer entry of a row for ``lang``.
+
+    The row is indexed with ``"zh_cn"`` for ``Language.CHINESE`` and with
+    ``standardize_tag(lang.value)`` otherwise. The first answer's ``text`` and
+    its ``aliases`` together form the accepted answers.
+
+    Args:
+        lang: Target language of the sample.
+        line: Dataset row with ``answers`` and ``queries`` keyed by language.
+
+    Returns:
+        A dict with ``question`` and ``choices``, or ``None`` when the answer
+        text is ``None``, no non-blank answer remains, or more than five remain.
+    """
+    if lang == Language.CHINESE:
+        lang_key = "zh_cn"
+    else:
+        lang_key = standardize_tag(lang.value)
+
+    first_answer = line["answers"][lang_key][0]
+    text = first_answer["text"]
+    if text is None:
+        return None
+
+    candidates = [text] + first_answer["aliases"]
+    answers = [candidate for candidate in candidates if candidate.strip()]
+
+    # Heuristic for broken samples whose aliases are the answer split into
+    # single characters, e.g.
+    #   'text': '七月 20, 1969',
+    #   'aliases': ['1', 'u', ',', '2', ' ', '6', 'l', 'y', '9', '0', 'j']
+    # Rows with no usable answer or with suspiciously many are dropped.
+    if not 1 <= len(answers) <= 5:
+        return None
+
+    return {"question": line["queries"][lang_key], "choices": answers}