Enable mlflow logger (#4893)
* Enable mlflow logger

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* fix style

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* Add doc lines.

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* change default value

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* fix doc

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* addressed comments, added dataclass

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* fix style

Signed-off-by: whrichd <trabeitwrq@gmail.com>

* fix doc

Signed-off-by: whrichd <trabeitwrq@gmail.com>

Signed-off-by: whrichd <trabeitwrq@gmail.com>
whrichd authored Nov 9, 2022
1 parent c29d779 commit c5c46ba
Showing 2 changed files with 51 additions and 10 deletions.
2 changes: 1 addition & 1 deletion docs/source/core/exp_manager.rst
@@ -4,7 +4,7 @@
 Experiment Manager
 ==================
 
-NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard Logging, and Weights and Biases logging. The
+NeMo's Experiment Manager leverages PyTorch Lightning for model checkpointing, TensorBoard logging, Weights and Biases logging, and MLFlow logging. The
 Experiment Manager is included by default in all NeMo example scripts.
 
 To use the experiment manager, simply call :class:`~nemo.utils.exp_manager.exp_manager` and pass in the PyTorch Lightning ``Trainer``.
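As a quick orientation before the code diff, here is a minimal sketch of how the new option is switched on from a training script. The config keys mirror this commit, but the experiment name and Trainer arguments are placeholders:

import pytorch_lightning as pl
from nemo.utils.exp_manager import exp_manager

# Hypothetical usage: enable the MLflow logger via the exp_manager config dict.
# "my_experiment" and the Trainer settings are illustrative, not from the commit.
trainer = pl.Trainer(max_epochs=1)
exp_manager(
    trainer,
    {
        "name": "my_experiment",
        "create_mlflow_logger": True,
        "mlflow_logger_kwargs": {"experiment_name": "my_experiment", "save_dir": "./mlruns"},
    },
)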
59 changes: 50 additions & 9 deletions nemo/utils/exp_manager.py
Expand Up @@ -33,7 +33,7 @@
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.callbacks.timer import Interval, Timer
 from pytorch_lightning.loggers import LoggerCollection as _LoggerCollection
-from pytorch_lightning.loggers import TensorBoardLogger, WandbLogger
+from pytorch_lightning.loggers import MLFlowLogger, TensorBoardLogger, WandbLogger
 from pytorch_lightning.loops import TrainingEpochLoop
 from pytorch_lightning.strategies.ddp import DDPStrategy
 from pytorch_lightning.utilities import rank_zero_info
@@ -88,6 +88,21 @@ class CallbackParams:
     model_parallel_size: Optional[int] = None  # tensor parallel size * pipeline parallel size
 
 
+@dataclass
+class MLFlowParams:
+    # name of the experiment; if None, defaults to the globally set experiment name
+    experiment_name: Optional[str] = None
+    # no run_name because it is set by version
+    # local or remote tracking server; if tracking_uri is not set, it defaults to save_dir
+    tracking_uri: Optional[str] = None
+    tags: Optional[Dict[str, Any]] = None
+    save_dir: Optional[str] = "./mlruns"
+    prefix: str = ""
+    artifact_location: Optional[str] = None
+    # provide run_id if resuming a previously started run
+    run_id: Optional[str] = None
+
+
 @dataclass
 class StepTimingParams:
     reduction: Optional[str] = "mean"
@@ -122,6 +137,8 @@ class ExpManagerConfig:
     summary_writer_kwargs: Optional[Dict[Any, Any]] = None
     create_wandb_logger: Optional[bool] = False
     wandb_logger_kwargs: Optional[Dict[Any, Any]] = None
+    create_mlflow_logger: Optional[bool] = False
+    mlflow_logger_kwargs: Optional[MLFlowParams] = MLFlowParams()
     # Checkpointing parameters
     create_checkpoint_callback: Optional[bool] = True
     checkpoint_callback_params: Optional[CallbackParams] = CallbackParams()
@@ -190,7 +207,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
     directory. exp_manager also allows for explicit folder creation via explicit_log_dir.
     The version can be a datetime string or an integer. Datetime version can be disabled if use_datetime_version is set
-    to False. It optionally creates TensorBoardLogger, WandBLogger, ModelCheckpoint objects from pytorch lightning.
+    to False. It optionally creates TensorBoardLogger, WandBLogger, MLFlowLogger, and ModelCheckpoint objects from pytorch lightning.
     It copies sys.argv, and git information if available to the logging directory. It creates a log file for each
     process to log their output into.
@@ -231,6 +248,9 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
         - wandb_logger_kwargs (dict): A dictionary of kwargs that can be passed to lightning's WandBLogger
             class. Note that name and project are required parameters if create_wandb_logger is True.
             Defaults to None.
+        - create_mlflow_logger (bool): Whether to create an MLFlow logger and attach it to the pytorch lightning
+            trainer. Defaults to False.
+        - mlflow_logger_kwargs (dict): Optional parameters for the MLFlow logger; see MLFlowParams and the sketch below.
         - create_checkpoint_callback (bool): Whether to create a ModelCheckpoint callback and attach it to the
             pytorch lightning trainer. The ModelCheckpoint saves the top 3 models with the best "val_loss", the most
             recent checkpoint under ``*last.ckpt``, and the final checkpoint after training completes under ``*end.ckpt``.
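To make the two new options concrete, a hedged sketch of a config dict carrying them; tracking_uri and tags are assumptions, while save_dir, prefix, and run_id mirror the MLFlowParams defaults:

# Sketch of the new exp_manager options; values marked as placeholders are not from the commit.
mlflow_cfg = {
    "create_mlflow_logger": True,
    "mlflow_logger_kwargs": {
        "experiment_name": None,  # None falls back to cfg.name (handled later in exp_manager)
        "tracking_uri": "http://localhost:5000",  # assumed local tracking server
        "tags": {"project": "nemo"},  # placeholder tags
        "save_dir": "./mlruns",  # MLFlowParams default
        "prefix": "",  # MLFlowParams default
        "run_id": None,  # set to resume a previously started run
    },
}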
@@ -297,6 +317,15 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
     # If name returned from get_log_dir is "", use cfg.name for checkpointing
     if checkpoint_name is None or checkpoint_name == '':
         checkpoint_name = cfg.name or "default"
+
+    # Set the mlflow experiment name if it is not set, before the main name is erased
+    if cfg.create_mlflow_logger and (not cfg.mlflow_logger_kwargs.get("experiment_name", None)):
+        cfg.mlflow_logger_kwargs.experiment_name = cfg.name
+        logging.warning(
+            'mlflow logger specified but no experiment name set. Using the same as Tensorboard: %s',
+            cfg.mlflow_logger_kwargs.experiment_name,
+        )
+
     cfg.name = name  # Used for configure_loggers so that the log_dir is properly set even if name is ""
     cfg.version = version
@@ -339,7 +368,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo

     # For some reason, LearningRateLogger requires trainer to have a logger. Safer to create logger on all ranks
     # not just global rank 0.
-    if cfg.create_tensorboard_logger or cfg.create_wandb_logger:
+    if cfg.create_tensorboard_logger or cfg.create_wandb_logger or cfg.create_mlflow_logger:
         configure_loggers(
             trainer,
             exp_dir,
@@ -349,6 +378,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo
             cfg.summary_writer_kwargs,
             cfg.create_wandb_logger,
             cfg.wandb_logger_kwargs,
+            cfg.create_mlflow_logger,
+            cfg.mlflow_logger_kwargs,
         )
 
     # add loggers timing callbacks
@@ -404,20 +435,22 @@ def error_checks(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictC
"""
Checks that the passed trainer is compliant with NeMo and exp_manager's passed configuration. Checks that:
- Throws error when hydra has changed the working directory. This causes issues with lightning's DDP
- Throws error when trainer has loggers defined but create_tensorboard_logger or create_WandB_logger is True
- Throws error when trainer has loggers defined but create_tensorboard_logger or create_WandB_logger or create_mlflow_logger is True
- Prints error messages when 1) run on multi-node and not Slurm, and 2) run on multi-gpu without DDP
"""
if HydraConfig.initialized() and get_original_cwd() != os.getcwd():
raise ValueError(
"Hydra changed the working directory. This interferes with ExpManger's functionality. Please pass "
"hydra.run.dir=. to your python script."
)
if trainer.logger is not None and (cfg.create_tensorboard_logger or cfg.create_wandb_logger):
if trainer.logger is not None and (
cfg.create_tensorboard_logger or cfg.create_wandb_logger or cfg.create_mlflow_logger
):
raise LoggerMisconfigurationError(
"The pytorch lightning trainer that was passed to exp_manager contained a logger, and either "
f"create_tensorboard_logger: {cfg.create_tensorboard_logger} or create_wandb_logger: "
f"{cfg.create_wandb_logger} was set to True. These can only be used if trainer does not already have a"
" logger."
f"{cfg.create_wandb_logger} or create_mlflow_logger: {cfg.create_mlflow_logger} was set to True. "
"These can only be used if trainer does not already have a logger."
)
if trainer.num_nodes > 1 and not check_slurm(trainer):
logging.error(
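For illustration, a sketch of the misconfiguration this check guards against; the Trainer setup is assumed, not taken from the commit:

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from nemo.utils.exp_manager import exp_manager

# The trainer already owns a logger, so asking exp_manager to create another
# (here the new MLflow one) raises LoggerMisconfigurationError.
trainer = pl.Trainer(logger=TensorBoardLogger(save_dir="tb_logs"))
exp_manager(trainer, {"create_mlflow_logger": True})  # raises LoggerMisconfigurationError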
@@ -685,11 +718,13 @@ def configure_loggers(
     summary_writer_kwargs: dict,
     create_wandb_logger: bool,
     wandb_kwargs: dict,
+    create_mlflow_logger: bool,
+    mlflow_kwargs: dict,
 ):
-    """ Creates TensorboardLogger and/or WandBLogger and attach them to trainer. Raises ValueError if
+    """ Creates TensorboardLogger, WandBLogger, and/or MLFlowLogger and attaches them to the trainer. Raises ValueError if
     summary_writer_kwargs or wandb_kwargs are misconfigured.
     """
-    # Potentially create tensorboard logger and/or WandBLogger
+    # Potentially create tensorboard logger, WandBLogger, and/or MLFlowLogger
     logger_list = []
     if create_tensorboard_logger:
         if summary_writer_kwargs is None:
@@ -718,6 +753,12 @@ def configure_loggers(
         logger_list.append(wandb_logger)
         logging.info("WandBLogger has been set up")
 
+    if create_mlflow_logger:
+        mlflow_logger = MLFlowLogger(run_name=version, **mlflow_kwargs)
+
+        logger_list.append(mlflow_logger)
+        logging.info('MLFlowLogger has been set up')
+
     logger_list = (
         LoggerList(logger_list, nemo_name=name, nemo_version=version) if len(logger_list) > 1 else logger_list[0]
     )
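In effect, the new branch boils down to the standalone call below. MLFlowLogger is PyTorch Lightning's built-in logger, and exp_manager passes the experiment version as run_name; the values shown here are placeholders:

from pytorch_lightning.loggers import MLFlowLogger

# Roughly what configure_loggers now does for MLflow, written standalone.
mlflow_logger = MLFlowLogger(
    run_name="2022-11-09_10-00-00",  # exp_manager supplies the version string here
    experiment_name="my_experiment",  # placeholder
    save_dir="./mlruns",
)
# The logger is then appended to logger_list and attached to the trainer.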
