From 59c3081f60e519c7fc5ff9b031711898dfd044fc Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 8 Apr 2024 14:31:00 +0000 Subject: [PATCH 01/10] init --- src/lighteval/logging/evaluation_tracker.py | 209 +++++++++----------- src/lighteval/logging/info_loggers.py | 16 +- src/lighteval/main_nanotron.py | 7 +- 3 files changed, 97 insertions(+), 135 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 35a835bc..2f672246 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -45,7 +45,7 @@ if is_nanotron_available(): - from nanotron.config import Config + from nanotron.config import GeneralArgs class EnhancedJSONEncoder(json.JSONEncoder): @@ -80,16 +80,37 @@ class EvaluationTracker: task_config_logger: TaskConfigLogger hub_results_org: str - def __init__(self, hub_results_org: str = "", token: str = "") -> None: + def __init__( + self, + output_dir: str = None, + hub_results_org: str = "", + push_results_to_hub: bool = False, + push_details_to_hub: bool = False, + push_results_to_tensorboard: bool = False, + tensorboard_metric_prefix: str = "eval", + tensorboard_org: str = "", + public: bool = False, + token: str = "", + nanotron_run_info: GeneralArgs = None, + ) -> None: """ Creates all the necessary loggers for evaluation tracking. Args: + output_dir (str): Local folder path where you want results to be saved hub_results_org (str): The organisation to push the results to. See more details about the datasets organisation in [`EvaluationTracker.save`] + push_results_to_hub (bool): If True, results are pushed to the hub. + Results will be pushed either to `{hub_results_org}/results`, a public dataset, if `public` is True else to `{hub_results_org}/private-results`, a private dataset. + push_details_to_hub (bool): If True, details are pushed to the hub. + Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset, + if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset. + push_results_to_tensorboard (bool): If True, will create and push the results for a tensorboard folder on the hub + public (bool): If True, results and details are pushed in private orgs token (str): Token to use when pushing to the hub. This token should have write access to `hub_results_org`. + nanotron_run_info (GeneralArgs): Reference to informations about Nanotron models runs """ self.details_logger = DetailsLogger() self.metrics_logger = MetricsLogger() @@ -101,35 +122,22 @@ def __init__(self, hub_results_org: str = "", token: str = "") -> None: self.hub_private_results_repo = f"{hub_results_org}/private-results" self.api = HfApi(token=token) - def save( - self, - output_dir: str, - push_results_to_hub: bool, - push_details_to_hub: bool, - public: bool, - push_results_to_tensorboard: bool = False, - ) -> None: - """Saves the experiment information and results to files, and to the hub if requested. - - Note: - In case of save failure, this function will only print a warning, with the error message. - - Args: - output_dir (str): Local folder path where you want results to be saved - push_results_to_hub (bool): If True, results are pushed to the hub. - Results will be pushed either to `{hub_results_org}/results`, a public dataset, if `public` is True else to `{hub_results_org}/private-results`, a private dataset. 
- push_details_to_hub (bool): If True, details are pushed to the hub. - Results are pushed to `{hub_results_org}/details__{sanitized model_name}` for the model `model_name`, a public dataset, - if `public` is True else `{hub_results_org}/details__{sanitized model_name}_private`, a private dataset. - public (bool): If True, results and details are pushed in private orgs - - """ + self.output_dir = output_dir + self.push_results_to_hub = push_results_to_hub + self.push_details_to_hub = push_details_to_hub + self.tensorboard_metric_prefix = tensorboard_metric_prefix + self.tensorboard_org = tensorboard_org + self.push_results_to_tensorboard = push_results_to_tensorboard + self.nanotron_run_info = nanotron_run_info + self.public = public + + def save(self) -> None: + """Saves the experiment information and results to files, and to the hub if requested.""" hlog("Saving experiment tracker") - # try: date_id = datetime.now().isoformat().replace(":", "-") - output_dir_results = Path(output_dir) / "results" / self.general_config_logger.model_name - output_dir_details = Path(output_dir) / "details" / self.general_config_logger.model_name + output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name + output_dir_details = Path(self.output_dir) / "details" / self.general_config_logger.model_name output_dir_details_sub_folder = output_dir_details / date_id output_dir_results.mkdir(parents=True, exist_ok=True) output_dir_details_sub_folder.mkdir(parents=True, exist_ok=True) @@ -163,14 +171,8 @@ def save( for task_name, task_details in self.details_logger.details.items(): output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet" - # Create a dataset from the dictionary - try: - dataset = Dataset.from_list([asdict(detail) for detail in task_details]) - except Exception: - # We force cast to str to avoid formatting problems for nested objects - dataset = Dataset.from_list( - [{k: str(v) for k, v in asdict(detail).items()} for detail in task_details] - ) + # Create a dataset from the dictionary - we force cast to str to avoid formatting problems for nested objects + dataset = Dataset.from_list([{k: str(v) for k, v in asdict(detail).items()} for detail in task_details]) # We don't keep 'id' around if it's there column_names = dataset.column_names @@ -182,30 +184,25 @@ def save( # Save the dataset to a Parquet file dataset.to_parquet(output_file_details.as_posix()) - if push_results_to_hub: + if self.push_results_to_hub: self.api.upload_folder( - repo_id=self.hub_results_repo if public else self.hub_private_results_repo, + repo_id=self.hub_results_repo if self.public else self.hub_private_results_repo, folder_path=output_dir_results, path_in_repo=self.general_config_logger.model_name, repo_type="dataset", commit_message=f"Updating model {self.general_config_logger.model_name}", ) - if push_details_to_hub: + if self.push_details_to_hub: self.details_to_hub( - model_name=self.general_config_logger.model_name, results_file_path=output_results_in_details_file, details_folder_path=output_dir_details_sub_folder, - push_as_public=public, ) - if push_results_to_tensorboard: - self.push_results_to_tensorboard( + if self.push_results_to_tensorboard: + self.push_to_tensorboard( results=self.metrics_logger.metric_aggregated, details=self.details_logger.details ) - # except Exception as e: - # hlog("WARNING: Could not save results") - # hlog(repr(e)) def generate_final_dict(self) -> dict: """Aggregates and returns all the logger's experiment 
information in a dictionary. @@ -230,29 +227,25 @@ def generate_final_dict(self) -> dict: def details_to_hub( self, - model_name: str, results_file_path: Path | str, details_folder_path: Path | str, - push_as_public: bool = False, ) -> None: """Pushes the experiment details (all the model predictions for every step) to the hub. Args: - model_name (str): Name of the currently evaluated model results_file_path (str or Path): Local path of the current's experiment aggregated results individual file details_folder_path (str or Path): Local path of the current's experiment details folder. The details folder (created by [`EvaluationTracker.save`]) should contain one parquet file per task used during the evaluation run of the current model. - push_as_public (bool, optional): If True, the results will be pushed publicly, else the datasets will be private. """ results_file_path = str(results_file_path) details_folder_path = str(details_folder_path) - sanitized_model_name = model_name.replace("/", "__") + sanitized_model_name = self.general_config_logger.model_name.replace("/", "__") # "Default" detail names are the public detail names (same as results vs private-results) repo_id = f"{self.hub_results_org}/details_{sanitized_model_name}" - if not push_as_public: # if not public, we add `_private` + if not self.public: # if not public, we add `_private` repo_id = f"{repo_id}_private" sub_folder_path = os.path.basename(results_file_path).replace(".json", "").replace("results_", "") @@ -265,7 +258,7 @@ def details_to_hub( if len(checked_paths) == 0: hlog(f"Repo {repo_id} not found for {results_file_path}. Creating it.") - self.api.create_repo(repo_id, private=not (push_as_public), repo_type="dataset", exist_ok=True) + self.api.create_repo(repo_id, private=not (self.public), repo_type="dataset", exist_ok=True) # Create parquet version of results file as well results = load_dataset("json", data_files=results_file_path) @@ -287,43 +280,45 @@ def details_to_hub( repo_id=repo_id, folder_path=details_folder_path, path_in_repo=sub_folder_path, repo_type="dataset" ) - self.recreate_metadata_card(repo_id, model_name) + self.recreate_metadata_card(repo_id) - def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: # noqa: C901 + def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 """Fully updates the details repository metadata card for the currently evaluated model Args: repo_id (str): Details dataset repository path on the hub (`org/dataset`) - model_name (str): Name of the currently evaluated model. 
- """ # Add a nice dataset card and the configuration YAML files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset") results_files = [f for f in files_in_repo if ".json" in f] - parquet_results_files = [f for f in files_in_repo if ".parquet" in f and "results_" in f] - parquet_files = [f for f in files_in_repo if ".parquet" in f and "results_" not in f] + parquet_files = [f for f in files_in_repo if ".parquet" in f] multiple_results = len(results_files) > 1 # Get last eval results date for each task (evals might be non overlapping) last_eval_date_results = {} for sub_file in parquet_files: + # We focus on details only + if "results_" in sub_file: + continue + # subfile have this general format: # `2023-09-03T10-57-04.203304/details_harness|hendrycksTest-us_foreign_policy|5_2023-09-03T10-57-04.203304.parquet` # in the iso date, the `:` are replaced by `-` because windows does not allow `:` in their filenames - - task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0] + task_name = ( + os.path.basename(sub_file).replace("details_", "").split("_202")[0] + ) # 202 for dates, 2023, 2024, ... # task_name is then equal to `leaderboard|mmlu:us_foreign_policy|5` - iso_date = os.path.dirname(sub_file) # to be able to parse the filename as iso dates, we need to re-replace the `-` with `:` # iso_date[13] = iso_date[16] = ':' - iso_date = iso_date[:13] + ":" + iso_date[14:16] + ":" + iso_date[17:] - + dir_name = os.path.dirname(sub_file) + iso_date = dir_name[:13] + ":" + dir_name[14:16] + ":" + dir_name[17:] eval_date = datetime.fromisoformat(iso_date) last_eval_date_results[task_name] = ( max(last_eval_date_results[task_name], eval_date) if task_name in last_eval_date_results else eval_date ) + max_last_eval_date_results = list(last_eval_date_results.values())[0] # Now we convert them in iso-format for task in last_eval_date_results: @@ -336,43 +331,20 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: card_metadata = MetadataConfigs() # Add the results config and add the result file as a parquet file - for sub_file in parquet_results_files: - eval_date = os.path.basename(sub_file).replace("results_", "").replace(".parquet", "") - sanitized_eval_date = re.sub(r"[^\w\.]", "_", eval_date) - sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", max_last_eval_date_results) - - repo_file_name = os.path.basename(sub_file) - - if multiple_results: - if "results" not in card_metadata: - card_metadata["results"] = { - "data_files": [{"split": sanitized_eval_date, "path": [repo_file_name]}] - } - else: - former_entry = card_metadata["results"] - card_metadata["results"] = { - "data_files": former_entry["data_files"] - + [{"split": sanitized_eval_date, "path": [repo_file_name]}] - } + for sub_file in parquet_files: + if "results_" in sub_file: + eval_date = os.path.basename(sub_file).replace("results_", "").replace(".parquet", "") + sanitized_task = "results" + sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", max_last_eval_date_results) + repo_file_name = os.path.basename(sub_file) else: - if "results" in card_metadata: - raise ValueError( - f"Entry for results already exists in {former_entry} for repo {repo_id} and file {sub_file}" - ) - card_metadata["results"] = {"data_files": [{"split": sanitized_eval_date, "path": [repo_file_name]}]} + task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0] + sanitized_task = re.sub(r"\W", "_", task_name) + eval_date = 
os.path.dirname(sub_file) + sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", last_eval_date_results[task_name]) + repo_file_name = os.path.join("**", os.path.basename(sub_file)) - if sanitized_eval_date == sanitized_last_eval_date_results: - all_entry = card_metadata["results"]["data_files"] - card_metadata["results"] = {"data_files": all_entry + [{"split": "latest", "path": [repo_file_name]}]} - - # Add the tasks details configs - for sub_file in parquet_files: - task_name = os.path.basename(sub_file).replace("details_", "").split("_2023")[0].split("_2024")[0] - sanitized_task = re.sub(r"\W", "_", task_name) - eval_date = os.path.dirname(sub_file) sanitized_eval_date = re.sub(r"[^\w\.]", "_", eval_date) - repo_file_name = os.path.join("**", os.path.basename(sub_file)) - sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", last_eval_date_results[task_name]) if multiple_results: if sanitized_task not in card_metadata: @@ -400,6 +372,9 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: "data_files": all_entry + [{"split": "latest", "path": [repo_file_name]}] } + if "results_" in sub_file: + continue + # Special case for MMLU with a single split covering it all # We add another config with all MMLU splits results together for easy inspection SPECIAL_TASKS = [ @@ -481,7 +456,7 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: card_data = DatasetCardData( dataset_summary=f"Dataset automatically created during the evaluation run of model " - f"[{model_name}](https://huggingface.co/{model_name})" + f"[{self.general_config_logger.model_name}](https://huggingface.co/{self.general_config_logger.model_name})" f"{org_string}.\n\n" f"The dataset is composed of {len(card_metadata) - 1} configuration, each one coresponding to one of the evaluated task.\n\n" f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each " @@ -494,8 +469,8 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: f"(note that their might be results for other tasks in the repos if successive evals didn't cover the same tasks. " f'You find each in the results and the "latest" split for each eval):\n\n' f"```python\n{results_string}\n```", - repo_url=f"https://huggingface.co/{model_name}", - pretty_name=f"Evaluation run of {model_name}", + repo_url=f"https://huggingface.co/{self.general_config_logger.model_name}", + pretty_name=f"Evaluation run of {self.general_config_logger.model_name}", leaderboard_url=leaderboard_url, point_of_contact=point_of_contact, ) @@ -507,27 +482,26 @@ def recreate_metadata_card(self, repo_id: str, model_name: str = None) -> None: ) card.push_to_hub(repo_id, repo_type="dataset") - def push_results_to_tensorboard( # noqa: C901 + def push_to_tensorboard( # noqa: C901 self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail] ): if not is_nanotron_available(): hlog_warn("You cannot push results to tensorboard with having nanotron installed. 
Skipping") return - config: Config = self.general_config_logger.config - lighteval_config = config.lighteval - try: - global_step = config.general.step - except ValueError: - global_step = 0 - if config.lighteval.logging.tensorboard_metric_prefix is not None: - prefix = config.lighteval.logging.tensorboard_metric_prefix + prefix = self.tensorboard_metric_prefix + + if self.nanotron_run_info is not None: + global_step = self.nanotron_run_info.step + run = f"{self.nanotron_run_info.run}_{prefix}" else: - prefix = "eval" - output_dir_tb = Path(lighteval_config.logging.local_output_path) / "tb" / (config.general.run + "_" + prefix) + global_step = 0 + run = prefix + + output_dir_tb = Path(self.output_dir) / "tb" / run output_dir_tb.mkdir(parents=True, exist_ok=True) tb_context = HFSummaryWriter( logdir=str(output_dir_tb), - repo_id=lighteval_config.logging.hub_repo_tensorboard, + repo_id=self.tensorboard_org, repo_private=True, path_in_repo="tb", commit_every=6000, # Very long time so that we can change our files names and trigger push ourselves (see below) @@ -559,14 +533,13 @@ def push_results_to_tensorboard( # noqa: C901 ) else: tb_context.add_scalar(f"{prefix}/{task_name}/{metric}", value, global_step=global_step) - # e.g. MMLU + # Tasks with subtasks for name, values in bench_averages.items(): for metric, values in values.items(): hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard") tb_context.add_scalar(f"{prefix}/{name}/{metric}", sum(values) / len(values), global_step=global_step) tb_context.add_text("eval_config", obj_to_markdown(results), global_step=global_step) - # tb_context.add_text("eval_sizes", obj_to_markdown(sizes), global_step=global_step) for task_name, task_details in details.items(): tb_context.add_text( @@ -589,8 +562,6 @@ def push_results_to_tensorboard( # noqa: C901 # Now we can push to the hub tb_context.scheduler.trigger() hlog( - f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/" - f" at {output_dir_tb} and global_step {global_step}" + f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_org}/{output_dir_tb}/tensorboard" + f"at global_step {global_step}" ) - # except Exception as e: - # logger.warning(f"Could not push to tensorboard\n{e}") diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index b11c124c..fbfbdbeb 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -37,11 +37,7 @@ from lighteval.models.model_output import ModelReturn from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig from lighteval.tasks.requests import Doc -from lighteval.utils import as_list, is_nanotron_available, sanitize_numpy - - -if is_nanotron_available(): - from nanotron.config import Config +from lighteval.utils import as_list, sanitize_numpy @dataclass(init=False) @@ -86,9 +82,6 @@ class GeneralConfigLogger: model_dtype: str = None model_size: str = None - # Nanotron config - config: "Config" = None - def __init__(self) -> None: """Stores the current lighteval commit for reproducibility, and starts the evaluation timer.""" try: @@ -105,7 +98,6 @@ def log_args_info( override_batch_size: Union[None, int], max_samples: Union[None, int], job_id: str, - config: "Config" = None, ) -> None: """ Logs the information about the arguments passed to the method. @@ -117,17 +109,11 @@ def log_args_info( Else, the batch size is automatically inferred depending on what fits in memory. 
max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available. job_id (str): job ID, used to retrieve logs. - config (optional): Nanotron Config - - Returns: - None - """ self.num_fewshot_seeds = num_fewshot_seeds self.override_batch_size = override_batch_size self.max_samples = max_samples self.job_id = job_id - self.config = config def log_model_info(self, model_info: ModelInfo) -> None: """ diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 4610ea86..9bce875f 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -96,7 +96,12 @@ def main( data_parallel_size=lighteval_config.parallelism.dp, ) - evaluation_tracker = EvaluationTracker(token=TOKEN) + evaluation_tracker = EvaluationTracker( + token=TOKEN, + output_dir=lighteval_config.logging.local_output_path, + tensorboard_org=lighteval_config.logging.hub_repo_tensorboard, + nanotron_run_info=nanotron_config.general, + ) evaluation_tracker.general_config_logger.log_args_info( num_fewshot_seeds=1, override_batch_size=None, From 77a3d8fe3b799bce482e9a89b4fefebce10e4dca Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 8 Apr 2024 14:37:24 +0000 Subject: [PATCH 02/10] adding tensorboard_metric_prefix --- src/lighteval/logging/evaluation_tracker.py | 19 +++++++++++-------- src/lighteval/main_nanotron.py | 3 ++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 2f672246..496ad250 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -88,7 +88,6 @@ def __init__( push_details_to_hub: bool = False, push_results_to_tensorboard: bool = False, tensorboard_metric_prefix: str = "eval", - tensorboard_org: str = "", public: bool = False, token: str = "", nanotron_run_info: GeneralArgs = None, @@ -117,18 +116,22 @@ def __init__( self.versions_logger = VersionsLogger() self.general_config_logger = GeneralConfigLogger() self.task_config_logger = TaskConfigLogger() - self.hub_results_org = hub_results_org - self.hub_results_repo = f"{hub_results_org}/results" - self.hub_private_results_repo = f"{hub_results_org}/private-results" + self.api = HfApi(token=token) self.output_dir = output_dir + + self.hub_results_org = hub_results_org # will also contain tensorboard results + + self.hub_results_repo = f"{hub_results_org}/results" + self.hub_private_results_repo = f"{hub_results_org}/private-results" self.push_results_to_hub = push_results_to_hub self.push_details_to_hub = push_details_to_hub - self.tensorboard_metric_prefix = tensorboard_metric_prefix - self.tensorboard_org = tensorboard_org + self.push_results_to_tensorboard = push_results_to_tensorboard + self.tensorboard_metric_prefix = tensorboard_metric_prefix self.nanotron_run_info = nanotron_run_info + self.public = public def save(self) -> None: @@ -501,7 +504,7 @@ def push_to_tensorboard( # noqa: C901 output_dir_tb.mkdir(parents=True, exist_ok=True) tb_context = HFSummaryWriter( logdir=str(output_dir_tb), - repo_id=self.tensorboard_org, + repo_id=self.hub_results_org, repo_private=True, path_in_repo="tb", commit_every=6000, # Very long time so that we can change our files names and trigger push ourselves (see below) @@ -562,6 +565,6 @@ def push_to_tensorboard( # noqa: C901 # Now we can push to the hub tb_context.scheduler.trigger() hlog( - f"Pushed to tensorboard at 
https://huggingface.co/{self.tensorboard_org}/{output_dir_tb}/tensorboard" + f"Pushed to tensorboard at https://huggingface.co/{self.hub_results_org}/{output_dir_tb}/tensorboard" f"at global_step {global_step}" ) diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index 9bce875f..f479c5d7 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -99,7 +99,8 @@ def main( evaluation_tracker = EvaluationTracker( token=TOKEN, output_dir=lighteval_config.logging.local_output_path, - tensorboard_org=lighteval_config.logging.hub_repo_tensorboard, + hub_results_org=lighteval_config.logging.hub_repo_tensorboard, + tensorboard_metric_prefix=lighteval_config.logging.tensorboard_metric_prefix, nanotron_run_info=nanotron_config.general, ) evaluation_tracker.general_config_logger.log_args_info( From d5494f9b23bf271efcc50c63619df67fda4d7a8e Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 8 Apr 2024 14:52:27 +0000 Subject: [PATCH 03/10] accelerate launcher --- run_evals_accelerate.py | 1 + src/lighteval/main_accelerate.py | 14 ++++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/run_evals_accelerate.py b/run_evals_accelerate.py index a743cb49..e7c8c1d3 100644 --- a/run_evals_accelerate.py +++ b/run_evals_accelerate.py @@ -47,6 +47,7 @@ def get_parser(): parser.add_argument("--push_results_to_hub", default=False, action="store_true") parser.add_argument("--save_details", action="store_true") parser.add_argument("--push_details_to_hub", default=False, action="store_true") + parser.add_argument("--push_results_to_tensorboard", default=False, action="store_true") parser.add_argument( "--public_run", default=False, action="store_true", help="Push results and details to a public repo" ) diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index d2ffbbe3..aada6c0c 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -56,7 +56,15 @@ @htrack() def main(args): env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir) - evaluation_tracker = EvaluationTracker(hub_results_org=args.results_org, token=TOKEN) + evaluation_tracker = EvaluationTracker( + output_dir=args.output_dir, + hub_results_org=args.results_org, + push_results_to_hub=args.push_results_to_hub, + push_details_to_hub=args.push_details_to_hub, + push_results_to_tensorboard=args.push_results_to_tensorboard, + public_run=args.public_run, + token=TOKEN, + ) evaluation_tracker.general_config_logger.log_args_info( args.num_fewshot_seeds, args.override_batch_size, args.max_samples, args.job_id ) @@ -124,9 +132,7 @@ def main(args): evaluation_tracker.details_logger.aggregate() if args.output_dir: - evaluation_tracker.save( - args.output_dir, args.push_results_to_hub, args.push_details_to_hub, args.public_run - ) + evaluation_tracker.save() final_dict = evaluation_tracker.generate_final_dict() From f5e1506310592a63ec162beb06ec8c042c1fe801 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Mon, 8 Apr 2024 15:44:05 +0000 Subject: [PATCH 04/10] Add tensorboardX import - huggingface_hub is atm testing the import of tensrboard instead, we likely will have to wait for a lib update before merging this PR... 
--- README.md | 2 +- pyproject.toml | 1 + src/lighteval/logging/evaluation_tracker.py | 13 +++++-------- src/lighteval/main_accelerate.py | 2 +- src/lighteval/utils.py | 9 +++++++++ 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2ed50829..8143667a 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Install the dependencies. For the default installation, you just need: pip install . ``` -If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`): +If you want to evaluate models with frameworks like `accelerate` or `peft`, you will need to specify the optional dependencies group that fits your use case (`accelerate`,`tgi`,`optimum`,`quantization`,`adapters`,`nanotron`,`tensorboardX`): ```bash pip install '.[optional1,optional2]' diff --git a/pyproject.toml b/pyproject.toml index 8adc9aa5..e10a3eec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,7 @@ nanotron = [ "nanotron", "tensorboardX" ] +tensorboardX = ["tensorboardX"] quality = ["ruff==v0.2.2","pre-commit"] tests = ["pytest==7.4.0"] dev = ["lighteval[accelerate,quality,tests]"] diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 496ad250..d3073ca9 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -41,7 +41,7 @@ TaskConfigLogger, VersionsLogger, ) -from lighteval.utils import is_nanotron_available, obj_to_markdown +from lighteval.utils import NO_TENSORBOARDX_WARN_MSG, is_nanotron_available, is_tensorboardX_available, obj_to_markdown if is_nanotron_available(): @@ -90,9 +90,9 @@ def __init__( tensorboard_metric_prefix: str = "eval", public: bool = False, token: str = "", - nanotron_run_info: GeneralArgs = None, + nanotron_run_info: "GeneralArgs" = None, ) -> None: - """ + """) Creates all the necessary loggers for evaluation tracking. Args: @@ -151,9 +151,6 @@ def save(self) -> None: hlog(f"Saving results to {output_results_file} and {output_results_in_details_file}") config_general = copy.deepcopy(self.general_config_logger) - config_general.config = ( - config_general.config.as_dict() if is_dataclass(config_general.config) else config_general.config - ) config_general = asdict(config_general) to_dump = { @@ -488,8 +485,8 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 def push_to_tensorboard( # noqa: C901 self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail] ): - if not is_nanotron_available(): - hlog_warn("You cannot push results to tensorboard with having nanotron installed. 
Skipping") + if not is_tensorboardX_available: + hlog_warn(NO_TENSORBOARDX_WARN_MSG) return prefix = self.tensorboard_metric_prefix diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index aada6c0c..12122c52 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -62,7 +62,7 @@ def main(args): push_results_to_hub=args.push_results_to_hub, push_details_to_hub=args.push_details_to_hub, push_results_to_tensorboard=args.push_results_to_tensorboard, - public_run=args.public_run, + public=args.public_run, token=TOKEN, ) evaluation_tracker.general_config_logger.log_args_info( diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index d3c32e99..21213099 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -191,6 +191,15 @@ def is_peft_available() -> bool: NO_PEFT_ERROR_MSG = "You are trying to use adapter weights models, for which you need `peft`, which is not available in your environment. Please install it using pip." +def is_tensorboardX_available() -> bool: + return importlib.util.find_spec("tensorboardX") is not None + + +NO_TENSORBOARDX_WARN_MSG = ( + "You are trying to log using tensorboardX, which is not installed. Please install it using pip. Skipping." +) + + def can_load_extended_tasks() -> bool: imports = [] for package in ["langdetect"]: From 2577c6fd1724fe29230af6943ac9ab73126027c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 21 May 2024 17:54:34 +0200 Subject: [PATCH 05/10] Update version of the hub --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 831780e8..b771942d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ keywords = ["evaluation", "nlp", "llm"] dependencies = [ # Base dependencies "transformers>=4.38.0", - "huggingface_hub>=0.22.0", + "huggingface_hub>=0.23.0", "torch>=2.0", "GitPython>=3.1.41", # for logging "datasets>=2.14.0", From dd89c0a770fc36fb061c5350bea98b3139760698 Mon Sep 17 00:00:00 2001 From: "clementine@huggingface.co" Date: Wed, 22 May 2024 14:05:36 +0000 Subject: [PATCH 06/10] last fix --- src/lighteval/logging/evaluation_tracker.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index d3073ca9..16fd3d87 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -122,6 +122,12 @@ def __init__( self.output_dir = output_dir self.hub_results_org = hub_results_org # will also contain tensorboard results + if hub_results_org in ["", None] and any( + [push_details_to_hub, push_results_to_hub, push_results_to_tensorboard] + ): + raise Exception( + "You need to select which org to push to, using `--results_org`, if you want to save information to the hub." 
+ ) self.hub_results_repo = f"{hub_results_org}/results" self.hub_private_results_repo = f"{hub_results_org}/private-results" @@ -129,6 +135,7 @@ def __init__( self.push_details_to_hub = push_details_to_hub self.push_results_to_tensorboard = push_results_to_tensorboard + self.tensorboard_repo = f"{hub_results_org}/tensorboard_logs" self.tensorboard_metric_prefix = tensorboard_metric_prefix self.nanotron_run_info = nanotron_run_info @@ -501,7 +508,7 @@ def push_to_tensorboard( # noqa: C901 output_dir_tb.mkdir(parents=True, exist_ok=True) tb_context = HFSummaryWriter( logdir=str(output_dir_tb), - repo_id=self.hub_results_org, + repo_id=self.tensorboard_repo, repo_private=True, path_in_repo="tb", commit_every=6000, # Very long time so that we can change our files names and trigger push ourselves (see below) @@ -562,6 +569,6 @@ def push_to_tensorboard( # noqa: C901 # Now we can push to the hub tb_context.scheduler.trigger() hlog( - f"Pushed to tensorboard at https://huggingface.co/{self.hub_results_org}/{output_dir_tb}/tensorboard" + f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_repo}/{output_dir_tb}/tensorboard" f"at global_step {global_step}" ) From 48c3857cee457c5355c5d2c9701300cbef72a2f2 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Mon, 8 Jul 2024 15:45:57 +0000 Subject: [PATCH 07/10] make style --- src/lighteval/logging/evaluation_tracker.py | 2 +- src/lighteval/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 70905c5e..b0425348 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -495,7 +495,7 @@ def push_to_tensorboard( # noqa: C901 if not is_tensorboardX_available: hlog_warn(NO_TENSORBOARDX_WARN_MSG) return - + if not is_nanotron_available(): hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping") return diff --git a/src/lighteval/utils.py b/src/lighteval/utils.py index 3faf837a..16235785 100644 --- a/src/lighteval/utils.py +++ b/src/lighteval/utils.py @@ -191,7 +191,6 @@ def is_peft_available() -> bool: NO_PEFT_ERROR_MSG = "You are trying to use adapter weights models, for which you need `peft`, which is not available in your environment. Please install it using pip." - def is_tensorboardX_available() -> bool: return importlib.util.find_spec("tensorboardX") is not None @@ -200,6 +199,7 @@ def is_tensorboardX_available() -> bool: "You are trying to log using tensorboardX, which is not installed. Please install it using pip. Skipping." 
) + def is_openai_available() -> bool: return importlib.util.find_spec("openai") is not None From da0e81268a891c85c6855c42f289bb13bdb654b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Fourrier?= <22726840+clefourrier@users.noreply.github.com> Date: Tue, 9 Jul 2024 09:54:59 +0200 Subject: [PATCH 08/10] Apply suggestions from code review Co-authored-by: Nathan Habib <30601243+NathanHB@users.noreply.github.com> --- src/lighteval/logging/evaluation_tracker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index b0425348..2a14c1f5 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -319,7 +319,7 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 # to be able to parse the filename as iso dates, we need to re-replace the `-` with `:` # iso_date[13] = iso_date[16] = ':' dir_name = os.path.dirname(sub_file) - iso_date = dir_name[:13] + ":" + dir_name[14:16] + ":" + dir_name[17:] + iso_date = ':'.join(dir_name.rsplit('-', 2)) eval_date = datetime.fromisoformat(iso_date) last_eval_date_results[task_name] = ( From 440e9fd2d6f41dd3a2fa151e5fc3963b27ba15d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 9 Jul 2024 11:28:57 +0200 Subject: [PATCH 09/10] restored nanotron config in log --- src/lighteval/logging/info_loggers.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index 1eebbf18..c211d2e4 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -37,7 +37,11 @@ from lighteval.models.model_output import ModelReturn from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig from lighteval.tasks.requests import Doc -from lighteval.utils import as_list, sanitize_numpy +from lighteval.utils import as_list, is_nanotron_available, sanitize_numpy + + +if is_nanotron_available(): + from nanotron.config import Config @dataclass(init=False) @@ -82,6 +86,9 @@ class GeneralConfigLogger: model_dtype: str = None model_size: str = None + # Nanotron config + config: "Config" = None + def __init__(self) -> None: """Stores the current lighteval commit for reproducibility, and starts the evaluation timer.""" try: @@ -98,6 +105,7 @@ def log_args_info( override_batch_size: Union[None, int], max_samples: Union[None, int], job_id: str, + config: "Config" = None, ) -> None: """ Logs the information about the arguments passed to the method. @@ -109,11 +117,17 @@ def log_args_info( Else, the batch size is automatically inferred depending on what fits in memory. max_samples (Union[None, int]): maximum number of samples, if None, use all the samples available. job_id (str): job ID, used to retrieve logs. 
+ config (optional): Nanotron Config + + Returns: + None + """ self.num_fewshot_seeds = num_fewshot_seeds self.override_batch_size = override_batch_size self.max_samples = max_samples self.job_id = job_id + self.config = config def log_model_info(self, model_info: ModelInfo) -> None: """ From ba4a8c43691fc9aab6c549e6c261efc4307ffbc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine?= Date: Tue, 9 Jul 2024 11:29:10 +0200 Subject: [PATCH 10/10] style --- src/lighteval/logging/evaluation_tracker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 2a14c1f5..b1dbe616 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -319,7 +319,7 @@ def recreate_metadata_card(self, repo_id: str) -> None: # noqa: C901 # to be able to parse the filename as iso dates, we need to re-replace the `-` with `:` # iso_date[13] = iso_date[16] = ':' dir_name = os.path.dirname(sub_file) - iso_date = ':'.join(dir_name.rsplit('-', 2)) + iso_date = ":".join(dir_name.rsplit("-", 2)) eval_date = datetime.fromisoformat(iso_date) last_eval_date_results[task_name] = (
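
The net effect of this series on the public API: `EvaluationTracker` now receives every output and push option at construction time, and `save()` takes no arguments. Below is a minimal usage sketch mirroring the updated `main_accelerate.py` call site in patches 03 and 04; the import path follows the module layout implied by the file paths in the diffs, and every argument value is an illustrative placeholder rather than anything taken from the patches.

```python
from lighteval.logging.evaluation_tracker import EvaluationTracker

# After this series, all push/output options are constructor arguments;
# the values below are placeholders for illustration only.
evaluation_tracker = EvaluationTracker(
    output_dir="./evals",              # local folder receiving results/ and details/
    hub_results_org="my-org",          # org for results, details and tensorboard logs
    push_results_to_hub=True,
    push_details_to_hub=True,
    push_results_to_tensorboard=True,
    tensorboard_metric_prefix="eval",
    public=False,                      # keep the pushed datasets private
    token="hf_xxx",                    # token with write access to hub_results_org
)

# ... run the evaluation so the tracker's loggers are filled ...

evaluation_tracker.save()  # no arguments: it uses the options set at construction time
```

Note that, as of patch 06, leaving `hub_results_org` empty while any push option is enabled raises an exception, and tensorboard logs are pushed to `{hub_results_org}/tensorboard_logs`.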