Skip to content

Commit

Permalink
Add an automatic system to compute average for tasks with subtasks
Browse files Browse the repository at this point in the history
  • Loading branch information
clefourrier authored Feb 22, 2024
1 parent 77c2016 commit 62abc78
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 15 deletions.
5 changes: 3 additions & 2 deletions src/lighteval/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,9 @@ def make_results_table(result_dict):

values = []

- for k, dic in result_dict["results"].items():
- version = result_dict["versions"][k]
+ for k in sorted(result_dict["results"].keys()):
+ dic = result_dict["results"][k]
+ version = result_dict["versions"][k] if k in result_dict["versions"] else ""
for m, v in dic.items():
if m.endswith("_stderr"):
continue
Expand Down
28 changes: 15 additions & 13 deletions src/lighteval/logging/info_loggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,18 +459,20 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
self.metric_aggregated[task_name][f"{metric_name}_stderr"] = float("nan")
hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when computing stderr.")

- suite_average = {}
- suite_nb = {}
-
- for _, metrics in self.metric_aggregated.items():
- for metric, value in metrics.items():
- suite_average[metric] = suite_average.get(metric, 0) + value
- suite_nb[metric] = suite_nb.get(metric, 0) + 1
-
- for metric, value in suite_average.items():
- suite_average[metric] = value / suite_nb[metric]
-
- self.metric_aggregated["all"] = suite_average
+ # We group subtasks which belong to the same parent task, like MMLU, to compute an average on them
+ grouped_tasks = collections.defaultdict(list)
+ for k in self.metric_aggregated.keys():
+ if "|" in k:
+ suite, task, fewshot = k.split("|")
+ grouped_tasks[f"{suite}|{task.split(':')[0]}:_average|{fewshot}"].append(k)
+
+ for average_task, list_of_subtasks in grouped_tasks.items():
+ if len(list_of_subtasks) > 1:
+ metrics = list(self.metric_aggregated[list_of_subtasks[0]].keys())
+ self.metric_aggregated[average_task] = {
+ metric: sum([self.metric_aggregated[k][metric] for k in list_of_subtasks]) / len(list_of_subtasks)
+ for metric in metrics
+ }


class VersionsLogger:
Expand All @@ -485,7 +487,7 @@ class VersionsLogger:

# the versions dict will be a dict of task_name: task_version
# {"winogrande|winogrande_xl": 0}
- versions: dict[str, int] = {"all": 0}
+ versions: dict[str, int] = {}

def log(self, task_name: str, task_version: int) -> None:
self.versions[task_name] = task_version
Expand Down

0 comments on commit 62abc78

Please sign in to comment.