Enable mlflow logger (#4893)
* Enable mlflow logger

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* fix style

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* Add doc lines.

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* change default value

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* fix doc

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* addressed comments, added dataclass

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* fix style

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* fix doc

Signed-off-by: whrichd <trabeitwrq@gmail.com>

Signed-off-by: whrichd <trabeitwrq@gmail.com>
whrichd authored Nov 9, 2022
1 parent c29d779 commit c5c46ba
Showing 2 changed files with 51 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docs/source/core/exp_manager.rst
@@ -4,7 +4,7 @@
 Experiment Manager
 ==================
 
-NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard Logging, and Weights and Biases logging. The
+NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard logging, Weights and Biases logging, and MLFlow logging. The
 Experiment Manager is included by default in all NeMo example scripts.
 
 To use the experiment manager, simply call :class:`~nemo.utils.exp_manager.exp_manager` and pass in the PyTorch Lightning ``Trainer``.
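As a quick orientation before the code diff, here is a minimal sketch of how the new option is switched on from a training script. The config keys mirror this commit, but the experiment name and Trainer arguments are placeholders:

import pytorch_lightning as pl
from nemo.utils.exp_manager import exp_manager

# Hypothetical usage: enable the MLflow logger via the exp_manager config dict.
# "my_experiment" and the Trainer settings are illustrative, not from the commit.
trainer = pl.Trainer(max_epochs=1)
exp_manager(
    trainer,
    {
        "name": "my_experiment",
        "create_mlflow_logger": True,
        "mlflow_logger_kwargs": {"experiment_name": "my_experiment", "save_dir": "./mlruns"},
    },
)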
59 changes: 50 additions & 9 deletions nemo/utils/exp_manager.py
Expand Up @@ -33,7 +33,7 @@
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.callbacks.timer import Interval, Timer
 from pytorch_lightning.loggers import LoggerCollection as _LoggerCollection
-from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
+from pytorch_lightning.loggers import MLFlowLogger, TensorBoardLogger, WandbLogger
 from pytorch_lightning.loops import TrainingEpochLoop
 from pytorch_lightning.strategies.ddp import DDPStrategy
 from pytorch_lightning.utilities import rank_zero_info
@@ -88,6 +88,21 @@ class CallbackParams:
     model_parallel_size: Optional[int] = None  # tensor parallel size * pipeline parallel size
 
 
+@dataclass
+class MLFlowParams:
+    # name of the experiment; if None, defaults to the globally set experiment name
+    experiment_name: Optional[str] = None
+    # no run_name because it is set by version
+    # local or remote tracking server; if tracking_uri is not set, it defaults to save_dir
+    tracking_uri: Optional[str] = None
+    tags: Optional[Dict[str, Any]] = None
+    save_dir: Optional[str] = "./mlruns"
+    prefix: str = ""
+    artifact_location: Optional[str] = None
+    # provide run_id if resuming a previously started run
+    run_id: Optional[str] = None
+
+
 @dataclass
 class StepTimingParams:
     reduction: Optional[str] = "mean"
@@ -122,6 +137,8 @@ class ExpManagerConfig:
     summary_writer_kwargs: Optional[Dict[Any, Any]] = None
     create_wandb_logger: Optional[bool] = False
     wandb_logger_kwargs: Optional[Dict[Any, Any]] = None
+    create_mlflow_logger: Optional[bool] = False
+    mlflow_logger_kwargs: Optional[MLFlowParams] = MLFlowParams()
     # Checkpointing parameters
     create_checkpoint_callback: Optional[bool] = True
     checkpoint_callback_params: Optional[CallbackParams] = CallbackParams()
@@ -190,7 +207,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
     directory. exp_manager also allows for explicit folder creation via explicit_log_dir.
     The version can be a datetime string or an integer. Datetime version can be disabled if use_datetime_version is set
-    to False. It optionally creates TensorBoardLogger, WandBLogger, ModelCheckpoint objects from pytorch lightning.
+    to False. It optionally creates TensorBoardLogger, WandBLogger, MLFlowLogger, and ModelCheckpoint objects from pytorch lightning.
     It copies sys.argv, and git information if available to the logging directory. It creates a log file for each
     process to log their output into.
@@ -231,6 +248,9 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
         - wandb_logger_kwargs (dict): A dictionary of kwargs that can be passed to lightning's WandBLogger
             class. Note that name and project are required parameters if create_wandb_logger is True.
             Defaults to None.
+        - create_mlflow_logger (bool): Whether to create an MLFlow logger and attach it to the pytorch lightning
+            trainer. Defaults to False.
+        - mlflow_logger_kwargs (dict): Optional parameters for the MLFlow logger; see MLFlowParams and the sketch below.
         - create_checkpoint_callback (bool): Whether to create a ModelCheckpoint callback and attach it to the
             pytorch lightning trainer. The ModelCheckpoint saves the top 3 models with the best "val_loss", the most
             recent checkpoint under ``*last.ckpt``, and the final checkpoint after training completes under ``*end.ckpt``.
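To make the two new options concrete, a hedged sketch of a config dict carrying them; tracking_uri and tags are assumptions, while save_dir, prefix, and run_id mirror the MLFlowParams defaults:

# Sketch of the new exp_manager options; values marked as placeholders are not from the commit.
mlflow_cfg = {
    "create_mlflow_logger": True,
    "mlflow_logger_kwargs": {
        "experiment_name": None,  # None falls back to cfg.name (handled later in exp_manager)
        "tracking_uri": "http://localhost:5000",  # assumed local tracking server
        "tags": {"project": "nemo"},  # placeholder tags
        "save_dir": "./mlruns",  # MLFlowParams default
        "prefix": "",  # MLFlowParams default
        "run_id": None,  # set to resume a previously started run
    },
}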
@@ -297,6 +317,15 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
     # If name returned from get_log_dir is "", use cfg.name for checkpointing
     if checkpoint_name is None or checkpoint_name == '':
         checkpoint_name = cfg.name or "default"
+
+    # Set the mlflow experiment name if it is not set, before the main name is erased
+    if cfg.create_mlflow_logger and (not cfg.mlflow_logger_kwargs.get("experiment_name", None)):
+        cfg.mlflow_logger_kwargs.experiment_name = cfg.name
+        logging.warning(
+            'mlflow logger specified but no experiment name set. Using the same as Tensorboard: %s',
+            cfg.mlflow_logger_kwargs.experiment_name,
+        )
+
     cfg.name = name  # Used for configure_loggers so that the log_dir is properly set even if name is ""
     cfg.version = version
@@ -339,7 +368,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo

     # For some reason, LearningRateLogger requires trainer to have a logger. Safer to create logger on all ranks
     # not just global rank 0.
-    if cfg.create_tensorboard_logger or cfg.create_wandb_logger:
+    if cfg.create_tensorboard_logger or cfg.create_wandb_logger or cfg.create_mlflow_logger:
         configure_loggers(
             trainer,
             exp_dir,
@@ -349,6 +378,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
             cfg.summary_writer_kwargs,
             cfg.create_wandb_logger,
             cfg.wandb_logger_kwargs,
+            cfg.create_mlflow_logger,
+            cfg.mlflow_logger_kwargs,
         )
 
     # add loggers timing callbacks
@@ -404,20 +435,22 @@ def error_checks(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictC
"""
Checks that the passed trainer is compliant with NeMo and exp_manager's passed configuration. Checks that:
- Throws error when hydra has changed the working directory. This causes issues with lightning's DDP
- Throws error when trainer has loggers defined but create_tensorboard_logger or create_WandB_logger is True
- Throws error when trainer has loggers defined but create_tensorboard_logger or create_WandB_logger or create_mlflow_logger is True
- Prints error messages when 1) run on multi-node and not Slurm, and 2) run on multi-gpu without DDP
"""
if HydraConfig.initialized() and get_original_cwd() != os.getcwd():
raise ValueError(
"Hydra changed the working directory. This interferes with ExpManger's functionality. Please pass "
"hydra.run.dir=. to your python script."
)
if trainer.logger is not None and (cfg.create_tensorboard_logger or cfg.create_wandb_logger):
if trainer.logger is not None and (
cfg.create_tensorboard_logger or cfg.create_wandb_logger or cfg.create_mlflow_logger
):
raise LoggerMisconfigurationError(
"The pytorch lightning trainer that was passed to exp_manager contained a logger, and either "
f"create_tensorboard_logger: {cfg.create_tensorboard_logger} or create_wandb_logger: "
f"{cfg.create_wandb_logger} was set to True. These can only be used if trainer does not already have a"
" logger."
f"{cfg.create_wandb_logger} or create_mlflow_logger: {cfg.create_mlflow_logger} was set to True. "
"These can only be used if trainer does not already have a logger."
)
if trainer.num_nodes > 1 and not check_slurm(trainer):
logging.error(
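For illustration, a sketch of the misconfiguration this check guards against; the Trainer setup is assumed, not taken from the commit:

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from nemo.utils.exp_manager import exp_manager

# The trainer already owns a logger, so asking exp_manager to create another
# (here the new MLflow one) raises LoggerMisconfigurationError.
trainer = pl.Trainer(logger=TensorBoardLogger(save_dir="tb_logs"))
exp_manager(trainer, {"create_mlflow_logger": True})  # raises LoggerMisconfigurationError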
@@ -685,11 +718,13 @@ def configure_loggers(
     summary_writer_kwargs: dict,
     create_wandb_logger: bool,
     wandb_kwargs: dict,
+    create_mlflow_logger: bool,
+    mlflow_kwargs: dict,
 ):
-    """ Creates TensorboardLogger and/or WandBLogger and attach them to trainer. Raises ValueError if
+    """ Creates TensorboardLogger, WandBLogger, and/or MLFlowLogger and attaches them to the trainer. Raises ValueError if
     summary_writer_kwargs or wandb_kwargs are misconfigured.
     """
-    # Potentially create tensorboard logger and/or WandBLogger
+    # Potentially create tensorboard logger, WandBLogger, and/or MLFlowLogger
     logger_list = []
     if create_tensorboard_logger:
         if summary_writer_kwargs is None:
@@ -718,6 +753,12 @@ def configure_loggers(
         logger_list.append(wandb_logger)
         logging.info("WandBLogger has been set up")
 
+    if create_mlflow_logger:
+        mlflow_logger = MLFlowLogger(run_name=version, **mlflow_kwargs)
+
+        logger_list.append(mlflow_logger)
+        logging.info('MLFlowLogger has been set up')
+
     logger_list = (
         LoggerList(logger_list, nemo_name=name, nemo_version=version) if len(logger_list) > 1 else logger_list[0]
     )
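In effect, the new branch boils down to the standalone call below. MLFlowLogger is PyTorch Lightning's built-in logger, and exp_manager passes the experiment version as run_name; the values shown here are placeholders:

from pytorch_lightning.loggers import MLFlowLogger

# Roughly what configure_loggers now does for MLflow, written standalone.
mlflow_logger = MLFlowLogger(
    run_name="2022-11-09_10-00-00",  # exp_manager supplies the version string here
    experiment_name="my_experiment",  # placeholder
    save_dir="./mlruns",
)
# The logger is then appended to logger_list and attached to the trainer.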
