[tune] Limit maximum number of pending trials. Add convergence test. (#…
krfricke authored Mar 24, 2021
1 parent 5d763b3 commit 898243d
Showing 17 changed files with 257 additions and 118 deletions.
5 changes: 4 additions & 1 deletion doc/source/tune/user-guide.rst
@@ -759,14 +759,17 @@ These are the environment variables Ray Tune currently considers:
* **TUNE_MAX_LEN_IDENTIFIER**: Maximum length of trial subdirectory names (those
with the parameter values in them)
* **TUNE_MAX_PENDING_TRIALS_PG**: Maximum number of pending trials when placement groups are used. Defaults
to ``1000``.
to ``auto``, which will be updated to ``1000`` for random/grid search and ``1`` for all other search algorithms.
* **TUNE_PLACEMENT_GROUP_AUTO_DISABLED**: Ray Tune automatically uses placement groups
instead of the legacy resource requests. Setting this to 1 enables legacy placement.
* **TUNE_PLACEMENT_GROUP_CLEANUP_DISABLED**: Ray Tune cleans up existing placement groups
with the ``_tune__`` prefix in their name before starting a run. This is used to make sure
that scheduled placement groups are removed when multiple calls to ``tune.run()`` are
done in the same script. You might want to disable this if you run multiple Tune runs in
parallel from different scripts. Set to 1 to disable.
* **TUNE_PLACEMENT_GROUP_PREFIX**: Prefix for placement groups created by Ray Tune. This prefix is used
e.g. to identify placement groups that should be cleaned up on start/stop of the tuning run. This is
initialized to a unique name at the start of the first run.
* **TUNE_PLACEMENT_GROUP_WAIT_S**: Default time the trial executor waits for placement
groups to be placed before continuing the tuning loop. Setting this to a float
will block for that many seconds. This is mostly used for testing purposes. Defaults
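For illustration, a minimal sketch of how TUNE_MAX_PENDING_TRIALS_PG can be set before starting a run; the objective function and the value "8" are arbitrary examples, not recommendations:

import os
from ray import tune

# Cap the number of pending placement-group trials before tune.run() starts.
# The default "auto" resolves to 1000 for random/grid search and 1 for all
# other search algorithms, as documented above.
os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "8"

def objective(config):
    tune.report(loss=config["x"] ** 2)

tune.run(objective, config={"x": tune.uniform(0, 20)}, num_samples=10)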
6 changes: 3 additions & 3 deletions python/ray/tune/BUILD
@@ -50,9 +50,9 @@ py_test(
)

py_test(
name = "test_convergence_gaussian_process",
size = "small",
srcs = ["tests/test_convergence_gaussian_process.py"],
name = "test_convergence",
size = "medium",
srcs = ["tests/test_convergence.py"],
deps = [":tune_lib"],
tags = ["exclusive"],
)
12 changes: 9 additions & 3 deletions python/ray/tune/ray_trial_executor.py
@@ -19,7 +19,8 @@
from ray.tune.logger import NoopLogger
from ray.tune.result import TRIAL_INFO, STDOUT_FILE, STDERR_FILE
from ray.tune.resources import Resources
from ray.tune.utils.placement_groups import PlacementGroupManager
from ray.tune.utils.placement_groups import PlacementGroupManager, \
get_tune_pg_prefix
from ray.tune.utils.trainable import TrainableUtil
from ray.tune.trial import Trial, Checkpoint, Location, TrialInfo
from ray.tune.trial_executor import TrialExecutor
@@ -160,7 +161,7 @@ def __init__(self,

self._avail_resources = Resources(cpu=0, gpu=0)
self._committed_resources = Resources(cpu=0, gpu=0)
self._pg_manager = PlacementGroupManager()
self._pg_manager = PlacementGroupManager(prefix=get_tune_pg_prefix())
self._staged_trials = set()
self._just_staged_trials = set()
self._trial_just_finished = False
@@ -197,6 +198,9 @@ def in_staging_grace_period(self) -> bool:
"""Returns True if trials have recently been staged."""
return self._pg_manager.in_staging_grace_period()

def set_max_pending_trials(self, max_pending: int):
self._pg_manager.set_max_staging(max_pending)

def stage_and_update_status(self, trials: List[Trial]):
"""Check and update statuses of scheduled placement groups.
@@ -783,7 +787,9 @@ def has_resources_for_trial(self, trial: Trial):
"""
if trial.uses_placement_groups:
return trial in self._staged_trials or self._pg_manager.can_stage()
return trial in self._staged_trials or self._pg_manager.can_stage(
) or self._pg_manager.has_ready(
trial, update=True)

return self.has_resources(trial.resources)

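A rough sketch of how a caller could resolve TUNE_MAX_PENDING_TRIALS_PG and forward it to the new set_max_pending_trials() hook; the helper below and its flag are assumptions made for illustration, not the actual trial runner code:

import os

def resolve_max_pending_trials(uses_random_or_grid_search: bool) -> int:
    # Hypothetical helper mirroring the documented behavior: "auto" becomes
    # 1000 for random/grid search and 1 for all other search algorithms.
    value = os.environ.get("TUNE_MAX_PENDING_TRIALS_PG", "auto")
    if value == "auto":
        return 1000 if uses_random_or_grid_search else 1
    return int(value)

# executor.set_max_pending_trials(resolve_max_pending_trials(True))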
1 change: 1 addition & 0 deletions python/ray/tune/stopper.py
@@ -213,6 +213,7 @@ def stop_all(self):
return self.has_plateaued() and self._iterations >= self._patience


# Deprecate: 1.4
class EarlyStopping(ExperimentPlateauStopper):
def __init__(self, *args, **kwargs):
warnings.warn(
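Since EarlyStopping is being deprecated in favor of ExperimentPlateauStopper (used by the new convergence test below), a brief usage sketch; the parameter values are illustrative only:

from ray.tune.stopper import ExperimentPlateauStopper

# Stop the whole experiment once the mean of the top-3 results for the
# "loss" metric has plateaued for 10 consecutive iterations.
stopper = ExperimentPlateauStopper(metric="loss", top=3, mode="min", patience=10)
# tune.run(trainable, stop=stopper, ...)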
5 changes: 3 additions & 2 deletions python/ray/tune/suggest/ax.py
@@ -17,10 +17,11 @@

# This exception only exists in newer Ax releases for python 3.7
try:
from ax.exceptions.core import DataRequiredError
from ax.exceptions.generation_strategy import \
MaxParallelismReachedException
except ImportError:
MaxParallelismReachedException = Exception
MaxParallelismReachedException = DataRequiredError = Exception

import logging

@@ -262,7 +263,7 @@ def suggest(self, trial_id: str) -> Optional[Dict]:
else:
try:
parameters, trial_index = self._ax.get_next_trial()
except MaxParallelismReachedException:
except (MaxParallelismReachedException, DataRequiredError):
return None

self._live_trial_mapping[trial_id] = trial_index
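On the caller's side, bounding parallelism up front avoids relying on this fallback; an illustrative sketch (the max_concurrent value is an example, not a recommendation):

from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.suggest.ax import AxSearch

# If Ax cannot produce a new trial yet, AxSearch.suggest() now returns None
# and Tune will simply ask again later instead of raising.
searcher = ConcurrencyLimiter(AxSearch(metric="loss", mode="min"), max_concurrent=4)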
12 changes: 0 additions & 12 deletions python/ray/tune/suggest/dragonfly.py
@@ -4,7 +4,6 @@

import inspect
import logging
import pickle
from typing import Dict, List, Optional, Union

from ray.tune.result import DEFAULT_METRIC
@@ -331,17 +330,6 @@ def on_trial_complete(self,
self._opt.tell([(trial_info,
self._metric_op * result[self._metric])])

def save(self, checkpoint_path: str):
trials_object = (self._initial_points, self._opt)
with open(checkpoint_path, "wb") as outputFile:
pickle.dump(trials_object, outputFile)

def restore(self, checkpoint_dir: str):
with open(checkpoint_dir, "rb") as inputFile:
trials_object = pickle.load(inputFile)
self._initial_points = trials_object[0]
self._opt = trials_object[1]

@staticmethod
def convert_search_space(spec: Dict) -> List[Dict]:
resolved_vars, domain_vars, grid_vars = parse_spec_vars(spec)
10 changes: 7 additions & 3 deletions python/ray/tune/suggest/nevergrad.py
@@ -1,6 +1,7 @@
import inspect
import logging
import pickle
from typing import Dict, Optional, Union, List, Sequence
from typing import Dict, Optional, Type, Union, List, Sequence

from ray.tune.result import DEFAULT_METRIC
from ray.tune.sample import Categorical, Domain, Float, Integer, LogUniform, \
@@ -108,7 +109,8 @@ class NevergradSearch(Searcher):
"""

def __init__(self,
optimizer: Union[None, Optimizer, ConfiguredOptimizer] = None,
optimizer: Union[None, Optimizer, Type[Optimizer],
ConfiguredOptimizer] = None,
space: Optional[Union[Dict, Parameter]] = None,
metric: Optional[str] = None,
mode: Optional[str] = None,
@@ -154,7 +156,9 @@ def __init__(self,
"parameter.")
self._parameters = space
self._nevergrad_opt = optimizer
elif isinstance(optimizer, ConfiguredOptimizer):
elif (inspect.isclass(optimizer)
and issubclass(optimizer, Optimizer)) or isinstance(
optimizer, ConfiguredOptimizer):
self._opt_factory = optimizer
self._parameters = None
self._space = space
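With the widened optimizer type, an Optimizer subclass can now be passed directly, as the new convergence test does with ng.optimizers.PSO; a brief sketch of that usage:

import nevergrad as ng
from ray.tune.suggest.nevergrad import NevergradSearch

# Passing the optimizer class lets NevergradSearch instantiate it internally
# once the search space (and hence the parametrization) is known.
searcher = NevergradSearch(optimizer=ng.optimizers.PSO, metric="loss", mode="min")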
4 changes: 2 additions & 2 deletions python/ray/tune/tests/test_cluster.py
@@ -364,11 +364,11 @@ def test_trial_migration(start_connected_emptyhead_cluster, trainable_id):

@pytest.mark.parametrize("trainable_id", ["__fake", "__fake_durable"])
@pytest.mark.parametrize("with_pg", [True, False])
@patch("ray.tune.trial_runner.TUNE_MAX_PENDING_TRIALS_PG", 1)
@patch("ray.tune.utils.placement_groups.TUNE_MAX_PENDING_TRIALS_PG", 1)
def test_trial_requeue(start_connected_emptyhead_cluster, trainable_id,
with_pg):
"""Removing a node in full cluster causes Trial to be requeued."""
os.environ["TUNE_MAX_PENDING_TRIALS_PG"] = "1"

if not with_pg:
os.environ["TUNE_PLACEMENT_GROUP_AUTO_DISABLED"] = "1"

156 changes: 156 additions & 0 deletions python/ray/tune/tests/test_convergence.py
@@ -0,0 +1,156 @@
import math
import numpy as np

import ray
from ray import tune
from ray.tune.stopper import ExperimentPlateauStopper
from ray.tune.suggest import ConcurrencyLimiter
import unittest


def loss(config, reporter):
x = config.get("x")
reporter(loss=x**2) # A simple function to optimize


class ConvergenceTest(unittest.TestCase):
"""Test convergence in gaussian process."""

@classmethod
def setUpClass(cls) -> None:
ray.init(local_mode=False, num_cpus=1, num_gpus=0)

@classmethod
def tearDownClass(cls) -> None:
ray.shutdown()

def _testConvergence(self, searcher, top=3, patience=20):
# This is the space of parameters to explore
space = {"x": tune.uniform(0, 20)}

resources_per_trial = {"cpu": 1, "gpu": 0}

analysis = tune.run(
loss,
metric="loss",
mode="min",
stop=ExperimentPlateauStopper(
metric="loss", top=top, patience=patience),
search_alg=searcher,
config=space,
num_samples=100, # Number of iterations
resources_per_trial=resources_per_trial,
raise_on_failed_trial=False,
fail_fast=True,
reuse_actors=True,
verbose=1)
print(f"Num trials: {len(analysis.trials)}. "
f"Best result: {analysis.best_config['x']}")

return analysis

def testConvergenceAx(self):
from ray.tune.suggest.ax import AxSearch

np.random.seed(0)

searcher = AxSearch()
analysis = self._testConvergence(searcher, patience=10)

assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-5)

def testConvergenceBayesOpt(self):
from ray.tune.suggest.bayesopt import BayesOptSearch

np.random.seed(0)

# Following bayesian optimization
searcher = BayesOptSearch(random_search_steps=10)
searcher.repeat_float_precision = 5
searcher = ConcurrencyLimiter(searcher, 1)

analysis = self._testConvergence(searcher, patience=100)

assert len(analysis.trials) < 50
assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-5)

def testConvergenceDragonfly(self):
from ray.tune.suggest.dragonfly import DragonflySearch

np.random.seed(0)
searcher = DragonflySearch(domain="euclidean", optimizer="bandit")
analysis = self._testConvergence(searcher)

assert len(analysis.trials) < 100
assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-5)

def testConvergenceHEBO(self):
from ray.tune.suggest.hebo import HEBOSearch

np.random.seed(0)
searcher = HEBOSearch()
analysis = self._testConvergence(searcher)

assert len(analysis.trials) < 100
assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-2)

def testConvergenceHyperopt(self):
from ray.tune.suggest.hyperopt import HyperOptSearch

np.random.seed(0)
searcher = HyperOptSearch(random_state_seed=1234)
analysis = self._testConvergence(searcher, patience=50, top=5)

assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-2)

def testConvergenceNevergrad(self):
from ray.tune.suggest.nevergrad import NevergradSearch
import nevergrad as ng

np.random.seed(0)
searcher = NevergradSearch(optimizer=ng.optimizers.PSO)
analysis = self._testConvergence(searcher, patience=50, top=5)

assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-3)

def testConvergenceOptuna(self):
from ray.tune.suggest.optuna import OptunaSearch

np.random.seed(1)
searcher = OptunaSearch()
analysis = self._testConvergence(
searcher,
top=5,
)

# This assertion is much weaker than in the BO case, but TPE
# doesn't converge as closely. It is still unlikely to reach this
# tolerance with random search (~0.01% chance)
assert len(analysis.trials) < 100
assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-2)

def testConvergenceSkOpt(self):
from ray.tune.suggest.skopt import SkOptSearch

np.random.seed(0)
searcher = SkOptSearch()
analysis = self._testConvergence(searcher)

assert len(analysis.trials) < 100
assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-3)

def testConvergenceZoopt(self):
from ray.tune.suggest.zoopt import ZOOptSearch

np.random.seed(0)
searcher = ZOOptSearch(budget=100)
analysis = self._testConvergence(searcher)

assert len(analysis.trials) < 100
assert math.isclose(analysis.best_config["x"], 0, abs_tol=1e-3)


if __name__ == "__main__":
import pytest
import sys
sys.exit(pytest.main(["-v", __file__]))
56 changes: 0 additions & 56 deletions python/ray/tune/tests/test_convergence_gaussian_process.py

This file was deleted.

