Skip to content

Commit

Permalink
Try to fix the "experiment already exists" issue in MLFlowHandler (#7916)
Browse files Browse the repository at this point in the history
Tries to fix NVIDIA/NVFlare#2698.


### Types of changes
<!--- Put an `x` in all the boxes that apply, and remove the not
applicable items -->
- [x] Non-breaking change (fix or new feature that would not break
existing functionality).
- [ ] Breaking change (fix or new feature that would cause existing
functionality to change).
- [ ] New tests added to cover the changes.
- [ ] Integration tests passed locally by running `./runtests.sh -f -u
--net --coverage`.
- [ ] Quick tests passed locally by running `./runtests.sh --quick
--unittests --disttests`.
- [ ] In-line docstrings updated.
- [ ] Documentation updated, tested `make html` command in the `docs/`
folder.

---------

Signed-off-by: YunLiu <55491388+KumoLiu@users.noreply.github.com>
Co-authored-by: Eric Kerfoot <17726042+ericspod@users.noreply.github.com>
  • Loading branch information
KumoLiu and ericspod authored Jul 18, 2024
1 parent bdbfa3e commit 85ab9f4
Showing 1 changed file with 21 additions and 4 deletions.
25 changes: 21 additions & 4 deletions monai/handlers/mlflow_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import torch
from torch.utils.data import Dataset

from monai.apps.utils import get_logger
from monai.config import IgniteInfo
from monai.utils import CommonKeys, ensure_tuple, min_version, optional_import

Expand All @@ -29,6 +30,9 @@
mlflow.entities, _ = optional_import(
"mlflow.entities", descriptor="Please install mlflow.entities before using MLFlowHandler."
)
MlflowException, _ = optional_import(
"mlflow.exceptions", name="MlflowException", descriptor="Please install mlflow before using MLFlowHandler."
)
pandas, _ = optional_import("pandas", descriptor="Please install pandas for recording the dataset.")
tqdm, _ = optional_import("tqdm", "4.47.0", min_version, "tqdm")

Expand All @@ -41,6 +45,8 @@

DEFAULT_TAG = "Loss"

logger = get_logger(module_name=__name__)


class MLFlowHandler:
"""
Expand Down Expand Up @@ -236,10 +242,21 @@ def start(self, engine: Engine) -> None:
def _set_experiment(self):
experiment = self.experiment
if not experiment:
experiment = self.client.get_experiment_by_name(self.experiment_name)
if not experiment:
experiment_id = self.client.create_experiment(self.experiment_name)
experiment = self.client.get_experiment(experiment_id)
for _retry_time in range(3):
try:
experiment = self.client.get_experiment_by_name(self.experiment_name)
if not experiment:
experiment_id = self.client.create_experiment(self.experiment_name)
experiment = self.client.get_experiment(experiment_id)
break
except MlflowException as e:
if "RESOURCE_ALREADY_EXISTS" in str(e):
logger.warning("Experiment already exists; delaying before retrying.")
time.sleep(1)
if _retry_time == 2:
raise e
else:
raise e

if experiment.lifecycle_stage != mlflow.entities.LifecycleStage.ACTIVE:
raise ValueError(f"Cannot set a deleted experiment '{self.experiment_name}' as the active experiment")
Expand Down

0 comments on commit 85ab9f4

Please sign in to comment.