From 59c3081f60e519c7fc5ff9b031711898dfd044fc Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 8 Apr 2024 14:31:00 +0000 Subject: [PATCH 01/10] init --- src/lighteval/logging/evaluation_tracker.py | 209 +++++++++----------- src/lighteval/logging/info_loggers.py | 16 +- src/lighteval/main_nanotron.py | 7 +- 3 files changed, 97 insertions(+), 135 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 35a835bc..2f672246 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -45,7 +45,7 @@ if is_nanotron_available(): - from nanotron.config import Config + from nanotron.config import GeneralArgs class EnhancedJSONEncoder(json.JSONEncoder): @@ -80,16 +80,37 @@ class EvaluationTracker: task_config_logger: TaskConfigLogger hub_results_org: str - def __init__(self, hub_results_org: str = "", token: str = "") -> None: + def __init__( + self, + output_dir: str = None, + hub_results_org: str = "", + push_results_to_hub: bool = False, + push_details_to_hub: bool = False, + push_results_to_tensorboard: bool = False, + tensorboard_metric_prefix: str = "eval", + tensorboard_org: str = "", + public: bool = False, + token: str = "", + nanotron_run_info: GeneralArgs = None, + ) -> None: """ Creates all the necessary loggers for evaluation tracking. Args: + output_dir (str): Local folder path where you want results to be saved hub_results_org (str): The organisation to push the results to. See more details about the datasets organisation in [`EvaluationTracker.save`] + push_results_to_hub (bool): If True, results are pushed to the hub. + Results will be pushed either to `{hub_results_org}/results`, a public dataset, if `public` is True else to `{hub_results_org}/private-results`, a private dataset. + push_details_to_hub (bool): If True, details are pushed to the hub. + Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset, + if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset. + push_results_to_tensorboard (bool): If True, will create and push the results for a tensorboard folder on the hub + public (bool): If True, results and details are pushed in private orgs token (str): Token to use when pushing to the hub. This token should have write access to `hub_results_org`. + nanotron_run_info (GeneralArgs): Reference to informations about Nanotron models runs """ self.details_logger = DetailsLogger() self.metrics_logger = MetricsLogger() @@ -101,35 +122,22 @@ def __init__(self, hub_results_org: str = "", token: str = "") -> None: self.hub_private_results_repo = f"{hub_results_org}/private-results" self.api = HfApi(token=token) - def save( - self, - output_dir: str, - push_results_to_hub: bool, - push_details_to_hub: bool, - public: bool, - push_results_to_tensorboard: bool = False, - ) -> None: - """Saves the experiment information and results to files, and to the hub if requested. - - Note: - In case of save failure, this function will only print a warning, with the error message. - - Args: - output_dir (str): Local folder path where you want results to be saved - push_results_to_hub (bool): If True, results are pushed to the hub. - Results will be pushed either to `{hub_results_org}/results`, a public dataset, if `public` is True else to `{hub_results_org}/private-results`, a private dataset. 
- push_details_to_hub (bool): If True, details are pushed to the hub. - Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset, - if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset. - public (bool): If True, results and details are pushed in private orgs - - """ + self.output_dir = output_dir + self.push_results_to_hub = push_results_to_hub + self.push_details_to_hub = push_details_to_hub + self.tensorboard_metric_prefix = tensorboard_metric_prefix + self.tensorboard_org = tensorboard_org + self.push_results_to_tensorboard = push_results_to_tensorboard + self.nanotron_run_info = nanotron_run_info + self.public = public + + def save(self) -> None: + """Saves the experiment information and results to files, and to the hub if requested.""" hlog("Saving experiment tracker") - # try: date_id = datetime.now().isoformat().replace(":", "-") - output_dir_results = Path(output_dir) / "results" / self.general_config_logger.model_name - output_dir_details = Path(output_dir) / "details" / self.general_config_logger.model_name + output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name + output_dir_details = Path(self.output_dir) / "details" / self.general_config_logger.model_name output_dir_details_sub_folder = output_dir_details / date_id output_dir_results.mkdir(parents=True, exist_ok=True) output_dir_details_sub_folder.mkdir(parents=True, exist_ok=True) @@ -163,14 +171,8 @@ def save( for task_name, task_details in self.details_logger.details.items(): output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet" - # Create a dataset from the dictionary - try: - dataset = Dataset.from_list([asdict(detail) for detail in task_details]) - except Exception: - # We force cast to str to avoid formatting problems for nested objects - dataset = Dataset.from_list( - [{k: str(v) for k, v in asdict(detail).items()} for detail in task_details] - ) + # Create a dataset from the dictionary - we force cast to str to avoid formatting problems for nested objects + dataset = Dataset.from_list([{k: str(v) for k, v in asdict(detail).items()} for detail in task_details]) # We don't keep 'id' around if it's there column_names = dataset.column_names @@ -182,30 +184,25 @@ def save( # Save the dataset to a Parquet file dataset.to_parquet(output_file_details.as_posix()) - if push_results_to_hub: + if self.push_results_to_hub: self.api.upload_folder( - repo_id=self.hub_results_repo if public else self.hub_private_results_repo, + repo_id=self.hub_results_repo if self.public else self.hub_private_results_repo, folder_path=output_dir_results, path_in_repo=self.general_config_logger.model_name, repo_type="dataset", commit_message=f"Updating model {self.general_config_logger.model_name}", ) - if push_details_to_hub: + if self.push_details_to_hub: self.details_to_hub( - model_name=self.general_config_logger.model_name, results_file_path=output_results_in_details_file, details_folder_path=output_dir_details_sub_folder, - push_as_public=public, ) - if push_results_to_tensorboard: - self.push_results_to_tensorboard( + if self.push_results_to_tensorboard: + self.push_to_tensorboard( results=self.metrics_logger.metric_aggregated, details=self.details_logger.details ) - # except Exception as e: - # hlog("WARNING: Could not save results") - # hlog(repr(e)) def generate_final_dict(self) -> dict: """Aggregates and returns all the logger's experiment 
information in a dictionary. @@ -230,29 +227,25 @@ def generate_final_dict(self) -> dict: def details_to_hub( self, - model_name: str, results_file_path: Path | str, details_folder_path: Path | str, - push_as_public: bool = False, ) -> None: """Pushes the experiment details (all the model predictions for every step) to the hub. Args: - model_name (str): Name of the currently evaluated model results_file_path (str or Path): Local path of the current's experiment aggregated results individual file details_folder_path (str or Path): Local path of the current's experiment details folder. The details folder (created by [`EvaluationTracker.save`]) should contain one parquet file per task used during the evaluation run of the current model. - push_as_public (bool, optional): If True, the results will be pushed publicly, else the datasets will be private. """ results_file_path = str(results_file_path) details_folder_path = str(details_folder_path) - sanitized_model_name = model_name.replace("/", "__") + sanitized_model_name = self.general_config_logger.model_name.replace("/", "__") # "Default" detail names are the public detail names (same as results vs private-results) repo_id = f"{self.hub_results_org}/details_{sanitized_model_name}" - if not push_as_public: # if not public, we add `_private` + if not self.public: # if not public, we add `_private` repo_id = f"{repo_id}_private" sub_folder_path = os.path.basename(results_file_path).replace(".json", "").replace("results_", "") @@ -265,7 +258,7 @@ def details_to_hub( if len(checked_paths) == 0: hlog(f"Repo {repo_id} not found for {results_file_path}. Creating it.") - self.api.create_repo(repo_id, private=not (push_as_public), repo_type="dataset", exist_ok=True) + self.api.create_repo(repo_id, private=not (self.public), repo_type="dataset", exist_ok=True) # Create parquet version of results file as well results = load_dataset("json", data_files=results_file_path) @@ -287,43 +280,45 @@ def details_to_hub( repo_id=repo_id, folder_path=details_folder_path, path_in_repo=sub_folder_path, repo_type="dataset" ) - self.recreate_metadata_card(repo_id, model_name) + self.recreate_metadata_card(repo_id) - def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: # noqa: C901 + def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 """Fully updates the details repository metadata card for the currently evaluated model Args: repo_id (str): Details dataset repository path on the hub (`org/dataset`) - model_name (str): Name of the currently evaluated model. 
- """ # Add a nice dataset card and the configuration YAML files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset") results_files = [f for f in files_in_repo if ".json" in f] - parquet_results_files = [f for f in files_in_repo if ".parquet" in f and "results_" in f] - parquet_files = [f for f in files_in_repo if ".parquet" in f and "results_" not in f] + parquet_files = [f for f in files_in_repo if ".parquet" in f] multiple_results = len(results_files) > 1 # Get last eval results date for each task (evals might be non overlapping) last_eval_date_results = {} for sub_file in parquet_files: + # We focus on details only + if "results_" in sub_file: + continue + # subfile have this general format: # `2023-09-03T10-57-04.203304/details_harness|hendrycksTest-us_foreign_policy|5_2023-09-03T10-57-04.203304.parquet` # in the iso date, the `:` are replaced by `-` because windows does not allow `:` in their filenames - - task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0] + task_name = ( + os.path.basename(sub_file).replace("details_", "").split("_202")[0] + ) # 202 for dates, 2023, 2024, ... # task_name is then equal to `leaderboard|mmlu:us_foreign_policy|5` - iso_date = os.path.dirname(sub_file) # to be able to parse the filename as iso dates, we need to re-replace the `-` with `:` # iso_date[13] = iso_date[16] = ':' - iso_date = iso_date[:13] + ":" + iso_date[14:16] + ":" + iso_date[17:] - + dir_name = os.path.dirname(sub_file) + iso_date = dir_name[:13] + ":" + dir_name[14:16] + ":" + dir_name[17:] eval_date = datetime.fromisoformat(iso_date) last_eval_date_results[task_name] = ( max(last_eval_date_results[task_name], eval_date) if task_name in last_eval_date_results else eval_date ) + max_last_eval_date_results = list(last_eval_date_results.values())[0] # Now we convert them in iso-format for task in last_eval_date_results: @@ -336,43 +331,20 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: card_metadata = MetadataConfigs() # Add the results config and add the result file as a parquet file - for sub_file in parquet_results_files: - eval_date = os.path.basename(sub_file).replace("results_", "").replace(".parquet", "") - sanitized_eval_date = re.sub(r"[^\w\.]", "_", eval_date) - sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", max_last_eval_date_results) - - repo_file_name = os.path.basename(sub_file) - - if multiple_results: - if "results" not in card_metadata: - card_metadata["results"] = { - "data_files": [{"split": sanitized_eval_date, "path": [repo_file_name]}] - } - else: - former_entry = card_metadata["results"] - card_metadata["results"] = { - "data_files": former_entry["data_files"] - + [{"split": sanitized_eval_date, "path": [repo_file_name]}] - } + for sub_file in parquet_files: + if "results_" in sub_file: + eval_date = os.path.basename(sub_file).replace("results_", "").replace(".parquet", "") + sanitized_task = "results" + sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", max_last_eval_date_results) + repo_file_name = os.path.basename(sub_file) else: - if "results" in card_metadata: - raise ValueError( - f"Entry for results already exists in {former_entry} for repo {repo_id} and file {sub_file}" - ) - card_metadata["results"] = {"data_files": [{"split": sanitized_eval_date, "path": [repo_file_name]}]} + task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0] + sanitized_task = re.sub(r"\W", "_", task_name) + eval_date = 
os.path.dirname(sub_file) + sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", last_eval_date_results[task_name]) + repo_file_name = os.path.join("**", os.path.basename(sub_file)) - if sanitized_eval_date == sanitized_last_eval_date_results: - all_entry = card_metadata["results"]["data_files"] - card_metadata["results"] = {"data_files": all_entry + [{"split": "latest", "path": [repo_file_name]}]} - - # Add the tasks details configs - for sub_file in parquet_files: - task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0] - sanitized_task = re.sub(r"\W", "_", task_name) - eval_date = os.path.dirname(sub_file) sanitized_eval_date = re.sub(r"[^\w\.]", "_", eval_date) - repo_file_name = os.path.join("**", os.path.basename(sub_file)) - sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", last_eval_date_results[task_name]) if multiple_results: if sanitized_task not in card_metadata: @@ -400,6 +372,9 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: "data_files": all_entry + [{"split": "latest", "path": [repo_file_name]}] } + if "results_" in sub_file: + continue + # Special case for MMLU with a single split covering it all # We add another config with all MMLU splits results together for easy inspection SPECIAL_TASKS = [ @@ -481,7 +456,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: card_data = DatasetCardData( dataset_summary=f"Dataset automatically created during the evaluation run of model " - f"[{model_name}](https://huggingface.co/{model_name})" + f"[{self.general_config_logger.model_name}](https://huggingface.co/{self.general_config_logger.model_name})" f"{org_string}.\n\n" f"The dataset is composed of {len(card_metadata) - 1} configuration, each one coresponding to one of the evaluated task.\n\n" f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each " @@ -494,8 +469,8 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. " f'You find each in the results and the "latest" split for each eval):\n\n' f"```python\n{results_string}\n```", - repo_url=f"https://huggingface.co/{model_name}", - pretty_name=f"Evaluation run of {model_name}", + repo_url=f"https://huggingface.co/{self.general_config_logger.model_name}", + pretty_name=f"Evaluation run of {self.general_config_logger.model_name}", leaderboard_url=leaderboard_url, point_of_contact=point_of_contact, ) @@ -507,27 +482,26 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: ) card.push_to_hub(repo_id, repo_type="dataset") - def push_results_to_tensorboard( # noqa: C901 + def push_to_tensorboard( # noqa: C901 self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail] ): if not is_nanotron_available(): hlog_warn("You cannot push results to tensorboard with having nanotron installed. 
Skipping") return - config: Config = self.general_config_logger.config - lighteval_config = config.lighteval - try: - global_step = config.general.step - except ValueError: - global_step = 0 - if config.lighteval.logging.tensorboard_metric_prefix is not None: - prefix = config.lighteval.logging.tensorboard_metric_prefix + prefix = self.tensorboard_metric_prefix + + if self.nanotron_run_info is not None: + global_step = self.nanotron_run_info.step + run = f"{self.nanotron_run_info.run}_{prefix}" else: - prefix = "eval" - output_dir_tb = Path(lighteval_config.logging.local_output_path) / "tb" / (config.general.run + "_" + prefix) + global_step = 0 + run = prefix + + output_dir_tb = Path(self.output_dir) / "tb" / run output_dir_tb.mkdir(parents=True, exist_ok=True) tb_context = HFSummaryWriter( logdir=str(output_dir_tb), - repo_id=lighteval_config.logging.hub_repo_tensorboard, + repo_id=self.tensorboard_org, repo_private=True, path_in_repo="tb", commit_every=6000, # Very long time so that we can change our files names and trigger push ourselves (see below) @@ -559,14 +533,13 @@ def push_results_to_tensorboard( # noqa: C901 ) else: tb_context.add_scalar(f"{prefix}/{task_name}/{metric}", value, global_step=global_step) - # e.g. MMLU + # Tasks with subtasks for name, values in bench_averages.items(): for metric, values in values.items(): hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard") tb_context.add_scalar(f"{prefix}/{name}/{metric}", sum(values) / len(values), global_step=global_step) tb_context.add_text("eval_config", obj_to_markdown(results), global_step=global_step) - # tb_context.add_text("eval_sizes", obj_to_markdown(sizes), global_step=global_step) for task_name, task_details in details.items(): tb_context.add_text( @@ -589,8 +562,6 @@ def push_results_to_tensorboard( # noqa: C901 # Now we can push to the hub tb_context.scheduler.trigger() hlog( - f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/" - f" at {output_dir_tb} and global_step {global_step}" + f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_org}/{output_dir_tb}/tensorboard" + f"at global_step {global_step}" ) - # except Exception as e: - # logger.warning(f"Could not push to tensorboard\n{e}") diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index b11c124c..fbfbdbeb 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -37,11 +37,7 @@ from lighteval.models.model_output import ModelReturn from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig from lighteval.tasks.requests import Doc -from lighteval.utils import as_list, is_nanotron_available, sanitize_numpy - - -if is_nanotron_available(): - from nanotron.config import Config +from lighteval.utils import as_list, sanitize_numpy @dataclass(init=False) @@ -86,9 +82,6 @@ class GeneralConfigLogger: model_dtype: str = None model_size: str = None - # Nanotron config - config: "Config" = None - def __init__(self) -> None: """Stores the current lighteval commit for reproducibility, and starts the evaluation timer.""" try: @@ -105,7 +98,6 @@ def log_args_info( override_batch_size: Union[None, int], max_samples: Union[None, int], job_id: str, - config: "Config" = None, ) -> None: """ Logs the information about the arguments passed to the method. @@ -117,17 +109,11 @@ def log_args_info( Else, the batch size is automatically inferred depending on what fits in memory. 
max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available. job_id (str): job ID, used to retrieve logs. - config (optional): Nanotron Config - - Returns: - None - """ self.num_fewshot_seeds = num_fewshot_seeds self.override_batch_size = override_batch_size self.max_samples = max_samples self.job_id = job_id - self.config = config def log_model_info(self, model_info: ModelInfo) -> None: """ diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 4610ea86..9bce875f 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -96,7 +96,12 @@ def main( data_parallel_size=lighteval_config.parallelism.dp, ) - evaluation_tracker = EvaluationTracker(token=TOKEN) + evaluation_tracker = EvaluationTracker( + token=TOKEN, + output_dir=lighteval_config.logging.local_output_path, + tensorboard_org=lighteval_config.logging.hub_repo_tensorboard, + nanotron_run_info=nanotron_config.general, + ) evaluation_tracker.general_config_logger.log_args_info( num_fewshot_seeds=1, override_batch_size=None, From 77a3d8fe3b799bce482e9a89b4fefebce10e4dca Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 8 Apr 2024 14:37:24 +0000 Subject: [PATCH 02/10] adding tensorboard_metric_prefix --- src/lighteval/logging/evaluation_tracker.py | 19 +++++++++++-------- src/lighteval/main_nanotron.py | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 2f672246..496ad250 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -88,7 +88,6 @@ def __init__( push_details_to_hub: bool = False, push_results_to_tensorboard: bool = False, tensorboard_metric_prefix: str = "eval", - tensorboard_org: str = "", public: bool = False, token: str = "", nanotron_run_info: GeneralArgs = None, @@ -117,18 +116,22 @@ def __init__( self.versions_logger = VersionsLogger() self.general_config_logger = GeneralConfigLogger() self.task_config_logger = TaskConfigLogger() - self.hub_results_org = hub_results_org - self.hub_results_repo = f"{hub_results_org}/results" - self.hub_private_results_repo = f"{hub_results_org}/private-results" + self.api = HfApi(token=token) self.output_dir = output_dir + + self.hub_results_org = hub_results_org # will also contain tensorboard results + + self.hub_results_repo = f"{hub_results_org}/results" + self.hub_private_results_repo = f"{hub_results_org}/private-results" self.push_results_to_hub = push_results_to_hub self.push_details_to_hub = push_details_to_hub - self.tensorboard_metric_prefix = tensorboard_metric_prefix - self.tensorboard_org = tensorboard_org + self.push_results_to_tensorboard = push_results_to_tensorboard + self.tensorboard_metric_prefix = tensorboard_metric_prefix self.nanotron_run_info = nanotron_run_info + self.public = public def save(self) -> None: @@ -501,7 +504,7 @@ def push_to_tensorboard( # noqa: C901 output_dir_tb.mkdir(parents=True, exist_ok=True) tb_context = HFSummaryWriter( logdir=str(output_dir_tb), - repo_id=self.tensorboard_org, + repo_id=self.hub_results_org, repo_private=True, path_in_repo="tb", commit_every=6000, # Very long time so that we can change our files names and trigger push ourselves (see below) @@ -562,6 +565,6 @@ def push_to_tensorboard( # noqa: C901 # Now we can push to the hub tb_context.scheduler.trigger() hlog( - f"Pushed to tensorboard at 
https://huggingface.co/{self.tensorboard_org}/{output_dir_tb}/tensorboard" + f"Pushed to tensorboard at https://huggingface.co/{self.hub_results_org}/{output_dir_tb}/tensorboard" f"at global_step {global_step}" ) diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 9bce875f..f479c5d7 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -99,7 +99,8 @@ def main( evaluation_tracker = EvaluationTracker( token=TOKEN, output_dir=lighteval_config.logging.local_output_path, - tensorboard_org=lighteval_config.logging.hub_repo_tensorboard, + hub_results_org=lighteval_config.logging.hub_repo_tensorboard, + tensorboard_metric_prefix=lighteval_config.logging.tensorboard_metric_prefix, nanotron_run_info=nanotron_config.general, ) evaluation_tracker.general_config_logger.log_args_info( From d5494f9b23bf271efcc50c63619df67fda4d7a8e Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 8 Apr 2024 14:52:27 +0000 Subject: [PATCH 03/10] accelerate launcher --- run_evals_accelerate.py | 1 + src/lighteval/main_accelerate.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py index a743cb49..e7c8c1d3 100644 --- a/run_evals_accelerate.py +++ b/run_evals_accelerate.py @@ -47,6 +47,7 @@ def get_parser(): parser.add_argument("--push_results_to_hub", default=False, action="store_true") parser.add_argument("--save_details", action="store_true") parser.add_argument("--push_details_to_hub", default=False, action="store_true") + parser.add_argument("--push_results_to_tensorboard", default=False, action="store_true") parser.add_argument( "--public_run", default=False, action="store_true", help="Push results and details to a public repo" ) diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index d2ffbbe3..aada6c0c 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -56,7 +56,15 @@ @htrack() def main(args): env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir) - evaluation_tracker = EvaluationTracker(hub_results_org=args.results_org, token=TOKEN) + evaluation_tracker = EvaluationTracker( + output_dir=args.output_dir, + hub_results_org=args.results_org, + push_results_to_hub=args.push_results_to_hub, + push_details_to_hub=args.push_details_to_hub, + push_results_to_tensorboard=args.push_results_to_tensorboard, + public_run=args.public_run, + token=TOKEN, + ) evaluation_tracker.general_config_logger.log_args_info( args.num_fewshot_seeds, args.override_batch_size, args.max_samples, args.job_id ) @@ -124,9 +132,7 @@ def main(args): evaluation_tracker.details_logger.aggregate() if args.output_dir: - evaluation_tracker.save( - args.output_dir, args.push_results_to_hub, args.push_details_to_hub, args.public_run - ) + evaluation_tracker.save() final_dict = evaluation_tracker.generate_final_dict() From f5e1506310592a63ec162beb06ec8c042c1fe801 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 8 Apr 2024 15:44:05 +0000 Subject: [PATCH 04/10] Add tensorboardX import - huggingface_hub is atm testing the import of tensrboard instead, we likely will have to wait for a lib update before merging this PR... 
--- README.md | 2 +- pyproject.toml | 1 + src/lighteval/logging/evaluation_tracker.py | 13 +++++-------- src/lighteval/main_accelerate.py | 2 +- src/lighteval/utils.py | 9 +++++++++ 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2ed50829..8143667a 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Install the dependencies. For the default installation, you just need: pip install . ``` -If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`): +If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`,`tensorboardX`): ```bash pip install '.[optional1,optional2]' diff --git a/pyproject.toml b/pyproject.toml index 8adc9aa5..e10a3eec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,7 @@ nanotron = [ "nanotron", "tensorboardX" ] +tensorboardX = ["tensorboardX"] quality = ["ruff==v0.2.2","pre-commit"] tests = ["pytest==7.4.0"] dev = ["lighteval[accelerate,quality,tests]"] diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 496ad250..d3073ca9 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -41,7 +41,7 @@ TaskConfigLogger, VersionsLogger, ) -from lighteval.utils import is_nanotron_available, obj_to_markdown +from lighteval.utils import NO_TENSORBOARDX_WARN_MSG, is_nanotron_available, is_tensorboardX_available, obj_to_markdown if is_nanotron_available(): @@ -90,9 +90,9 @@ def __init__( tensorboard_metric_prefix: str = "eval", public: bool = False, token: str = "", - nanotron_run_info: GeneralArgs = None, + nanotron_run_info: "GeneralArgs" = None, ) -> None: - """ + """) Creates all the necessary loggers for evaluation tracking. Args: @@ -151,9 +151,6 @@ def save(self) -> None: hlog(f"Saving results to {output_results_file} and {output_results_in_details_file}") config_general = copy.deepcopy(self.general_config_logger) - config_general.config = ( - config_general.config.as_dict() if is_dataclass(config_general.config) else config_general.config - ) config_general = asdict(config_general) to_dump = { @@ -488,8 +485,8 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 def push_to_tensorboard( # noqa: C901 self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail] ): - if not is_nanotron_available(): - hlog_warn("You cannot push results to tensorboard with having nanotron installed. 
Skipping") + if not is_tensorboardX_available: + hlog_warn(NO_TENSORBOARDX_WARN_MSG) return prefix = self.tensorboard_metric_prefix diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index aada6c0c..12122c52 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -62,7 +62,7 @@ def main(args): push_results_to_hub=args.push_results_to_hub, push_details_to_hub=args.push_details_to_hub, push_results_to_tensorboard=args.push_results_to_tensorboard, - public_run=args.public_run, + public=args.public_run, token=TOKEN, ) evaluation_tracker.general_config_logger.log_args_info( diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index d3c32e99..21213099 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -191,6 +191,15 @@ def is_peft_available() -> bool: NO_PEFT_ERROR_MSG = "You are trying to use adapter weights models, for which you need `peft`, which is not available in your environment. Please install it using pip." +def is_tensorboardX_available() -> bool: + return importlib.util.find_spec("tensorboardX") is not None + + +NO_TENSORBOARDX_WARN_MSG = ( + "You are trying to log using tensorboardX, which is not installed. Please install it using pip. Skipping." +) + + def can_load_extended_tasks() -> bool: imports = [] for package in ["langdetect"]: From 2577c6fd1724fe29230af6943ac9ab73126027c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 21 May 2024 17:54:34 +0200 Subject: [PATCH 05/10] Update version of the hub --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 831780e8..b771942d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ keywords = ["evaluation", "nlp", "llm"] dependencies = [ # Base dependencies "transformers>=4.38.0", - "huggingface_hub>=0.22.0", + "huggingface_hub>=0.23.0", "torch>=2.0", "GitPython>=3.1.41", # for logging "datasets>=2.14.0", From dd89c0a770fc36fb061c5350bea98b3139760698 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 22 May 2024 14:05:36 +0000 Subject: [PATCH 06/10] last fix --- src/lighteval/logging/evaluation_tracker.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index d3073ca9..16fd3d87 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -122,6 +122,12 @@ def __init__( self.output_dir = output_dir self.hub_results_org = hub_results_org # will also contain tensorboard results + if hub_results_org in ["", None] and any( + [push_details_to_hub, push_results_to_hub, push_results_to_tensorboard] + ): + raise Exception( + "You need to select which org to push to, using `--results_org`, if you want to save information to the hub." 
+ ) self.hub_results_repo = f"{hub_results_org}/results" self.hub_private_results_repo = f"{hub_results_org}/private-results" @@ -129,6 +135,7 @@ def __init__( self.push_details_to_hub = push_details_to_hub self.push_results_to_tensorboard = push_results_to_tensorboard + self.tensorboard_repo = f"{hub_results_org}/tensorboard_logs" self.tensorboard_metric_prefix = tensorboard_metric_prefix self.nanotron_run_info = nanotron_run_info @@ -501,7 +508,7 @@ def push_to_tensorboard( # noqa: C901 output_dir_tb.mkdir(parents=True, exist_ok=True) tb_context = HFSummaryWriter( logdir=str(output_dir_tb), - repo_id=self.hub_results_org, + repo_id=self.tensorboard_repo, repo_private=True, path_in_repo="tb", commit_every=6000, # Very long time so that we can change our files names and trigger push ourselves (see below) @@ -562,6 +569,6 @@ def push_to_tensorboard( # noqa: C901 # Now we can push to the hub tb_context.scheduler.trigger() hlog( - f"Pushed to tensorboard at https://huggingface.co/{self.hub_results_org}/{output_dir_tb}/tensorboard" + f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_repo}/{output_dir_tb}/tensorboard" f"at global_step {global_step}" ) From 48c3857cee457c5355c5d2c9701300cbef72a2f2 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 8 Jul 2024 15:45:57 +0000 Subject: [PATCH 07/10] make style --- src/lighteval/logging/evaluation_tracker.py | 2 +- src/lighteval/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 70905c5e..b0425348 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -495,7 +495,7 @@ def push_to_tensorboard( # noqa: C901 if not is_tensorboardX_available: hlog_warn(NO_TENSORBOARDX_WARN_MSG) return - + if not is_nanotron_available(): hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping") return diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index 3faf837a..16235785 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -191,7 +191,6 @@ def is_peft_available() -> bool: NO_PEFT_ERROR_MSG = "You are trying to use adapter weights models, for which you need `peft`, which is not available in your environment. Please install it using pip." - def is_tensorboardX_available() -> bool: return importlib.util.find_spec("tensorboardX") is not None @@ -200,6 +199,7 @@ def is_tensorboardX_available() -> bool: "You are trying to log using tensorboardX, which is not installed. Please install it using pip. Skipping." 
) + def is_openai_available() -> bool: return importlib.util.find_spec("openai") is not None From da0e81268a891c85c6855c42f289bb13bdb654b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 9 Jul 2024 09:54:59 +0200 Subject: [PATCH 08/10] Apply suggestions from code review Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- src/lighteval/logging/evaluation_tracker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index b0425348..2a14c1f5 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -319,7 +319,7 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 # to be able to parse the filename as iso dates, we need to re-replace the `-` with `:` # iso_date[13] = iso_date[16] = ':' dir_name = os.path.dirname(sub_file) - iso_date = dir_name[:13] + ":" + dir_name[14:16] + ":" + dir_name[17:] + iso_date = ':'.join(dir_name.rsplit('-', 2)) eval_date = datetime.fromisoformat(iso_date) last_eval_date_results[task_name] = ( From 440e9fd2d6f41dd3a2fa151e5fc3963b27ba15d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 9 Jul 2024 11:28:57 +0200 Subject: [PATCH 09/10] restored nanotron config in log --- src/lighteval/logging/info_loggers.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 1eebbf18..c211d2e4 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -37,7 +37,11 @@ from lighteval.models.model_output import ModelReturn from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig from lighteval.tasks.requests import Doc -from lighteval.utils import as_list, sanitize_numpy +from lighteval.utils import as_list, is_nanotron_available, sanitize_numpy + + +if is_nanotron_available(): + from nanotron.config import Config @dataclass(init=False) @@ -82,6 +86,9 @@ class GeneralConfigLogger: model_dtype: str = None model_size: str = None + # Nanotron config + config: "Config" = None + def __init__(self) -> None: """Stores the current lighteval commit for reproducibility, and starts the evaluation timer.""" try: @@ -98,6 +105,7 @@ def log_args_info( override_batch_size: Union[None, int], max_samples: Union[None, int], job_id: str, + config: "Config" = None, ) -> None: """ Logs the information about the arguments passed to the method. @@ -109,11 +117,17 @@ def log_args_info( Else, the batch size is automatically inferred depending on what fits in memory. max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available. job_id (str): job ID, used to retrieve logs. 
+ config (optional): Nanotron Config + + Returns: + None + """ self.num_fewshot_seeds = num_fewshot_seeds self.override_batch_size = override_batch_size self.max_samples = max_samples self.job_id = job_id + self.config = config def log_model_info(self, model_info: ModelInfo) -> None: """ From ba4a8c43691fc9aab6c549e6c261efc4307ffbc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 9 Jul 2024 11:29:10 +0200 Subject: [PATCH 10/10] style --- src/lighteval/logging/evaluation_tracker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 2a14c1f5..b1dbe616 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -319,7 +319,7 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 # to be able to parse the filename as iso dates, we need to re-replace the `-` with `:` # iso_date[13] = iso_date[16] = ':' dir_name = os.path.dirname(sub_file) - iso_date = ':'.join(dir_name.rsplit('-', 2)) + iso_date = ":".join(dir_name.rsplit("-", 2)) eval_date = datetime.fromisoformat(iso_date) last_eval_date_results[task_name] = (
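
The net effect of this series on the public API: `EvaluationTracker` now receives every output and push option at construction time, and `save()` takes no arguments. Below is a minimal usage sketch mirroring the updated `main_accelerate.py` call site in patches 03 and 04; the import path follows the module layout implied by the file paths in the diffs, and every argument value is an illustrative placeholder rather than anything taken from the patches.

```python
from lighteval.logging.evaluation_tracker import EvaluationTracker

# After this series, all push/output options are constructor arguments;
# the values below are placeholders for illustration only.
evaluation_tracker = EvaluationTracker(
    output_dir="./evals",              # local folder receiving results/ and details/
    hub_results_org="my-org",          # org for results, details and tensorboard logs
    push_results_to_hub=True,
    push_details_to_hub=True,
    push_results_to_tensorboard=True,
    tensorboard_metric_prefix="eval",
    public=False,                      # keep the pushed datasets private
    token="hf_xxx",                    # token with write access to hub_results_org
)

# ... run the evaluation so the tracker's loggers are filled ...

evaluation_tracker.save()  # no arguments: it uses the options set at construction time
```

Note that, as of patch 06, leaving `hub_results_org` empty while any push option is enabled raises an exception, and tensorboard logs are pushed to `{hub_results_org}/tensorboard_logs`.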