Skip to content

Commit

Permalink
[Serve] Skip test_max_replicas_per_node on Windows (ray-project#40030)
Browse files Browse the repository at this point in the history
ray-project#36926 makes the test flaky by launching more worker nodes than needed. The flakiness seems worse on Windows, so the test is disabled on Windows for now, until the underlying issue is fixed.

Also make sure we clean up everything even when tests fail.

Signed-off-by: Jiajun Yao <jeromeyjj@gmail.com>
Signed-off-by: Victor <vctr.y.m@example.com>
  • Loading branch information
jjyao authored and Victor committed Oct 11, 2023
1 parent 715de75 commit 0292c2f
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 57 deletions.
20 changes: 20 additions & 0 deletions python/ray/serve/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ray import serve
from ray._private.test_utils import wait_for_condition
from ray._private.usage import usage_lib
from ray.cluster_utils import AutoscalingCluster, Cluster
from ray.serve.context import _get_global_client
from ray.serve.tests.common.utils import TELEMETRY_ROUTE_PREFIX, check_ray_stopped
from ray.tests.conftest import propagate_logs, pytest_runtest_makereport # noqa
Expand All @@ -29,6 +30,25 @@ def ray_shutdown():
ray.shutdown()


@pytest.fixture
def ray_cluster():
    """Provide a fresh Ray ``Cluster`` and tear everything down after the test.

    Yields:
        The ``Cluster`` instance the test should add nodes to / connect to.
    """
    cluster = Cluster()
    # Bug fix: yield the cluster created above, not a second `Cluster()`.
    # The original `yield Cluster()` handed the test a different, never-started
    # instance and left it without a matching shutdown.
    yield cluster
    # pytest resumes the generator even when the test fails, so this cleanup
    # always runs.
    serve.shutdown()
    ray.shutdown()
    cluster.shutdown()


@pytest.fixture
def ray_autoscaling_cluster(request):
    """Start an ``AutoscalingCluster`` configured via indirect parametrization.

    ``request.param`` supplies the ``AutoscalingCluster`` keyword arguments
    (use ``@pytest.mark.parametrize(..., indirect=True)``). Nothing is
    yielded; tests attach with ``ray.init()``. Serve, Ray, and the cluster
    are shut down once the test finishes.
    """
    autoscaling_cluster = AutoscalingCluster(**request.param)
    autoscaling_cluster.start()
    try:
        yield
    finally:
        # Cleanup runs regardless of test outcome.
        serve.shutdown()
        ray.shutdown()
        autoscaling_cluster.shutdown()


@pytest.fixture
def ray_start(scope="module"):
port = random.randint(MIN_DYNAMIC_PORT, MAX_DYNAMIC_PORT)
Expand Down
9 changes: 0 additions & 9 deletions python/ray/serve/tests/test_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,6 @@
from ray.serve.handle import RayServeHandle


@pytest.fixture
def ray_cluster():
    """Yield a fresh Ray ``Cluster``; shut down Serve, Ray, and the cluster afterwards."""
    local_cluster = Cluster()
    try:
        yield local_cluster
    finally:
        # Teardown order mirrors setup in reverse: Serve first, then Ray,
        # then the cluster processes themselves.
        serve.shutdown()
        ray.shutdown()
        local_cluster.shutdown()


def get_pids(expected, deployment_name="D", app_name="default", timeout=30):
handle = serve.get_deployment_handle(deployment_name, app_name)
refs = []
Expand Down
9 changes: 0 additions & 9 deletions python/ray/serve/tests/test_grpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,6 @@
from ray.serve.tests.test_config_files.grpc_deployment import g, g2


@pytest.fixture
def ray_cluster():
    """Provide a fresh Ray ``Cluster`` and tear everything down after the test.

    Yields:
        The ``Cluster`` instance the test should use.
    """
    cluster = Cluster()
    # Bug fix: the original `yield Cluster()` yielded a brand-new, second
    # cluster object while the one created above was the one shut down below.
    yield cluster
    serve.shutdown()
    ray.shutdown()
    cluster.shutdown()


def test_serving_request_through_grpc_proxy(ray_cluster):
"""Test serving request through gRPC proxy.
Expand Down
85 changes: 46 additions & 39 deletions python/ray/serve/tests/test_max_replicas_per_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import ray
from ray import serve
from ray.cluster_utils import AutoscalingCluster
from ray.serve.drivers import DAGDriver
from ray.util.state import list_actors

Expand All @@ -32,24 +31,32 @@ def get_node_to_deployment_to_num_replicas():
return node_to_deployment_to_num_replicas


def test_basic():
"""Test that max_replicas_per_node is honored."""

cluster = AutoscalingCluster(
head_resources={"CPU": 0},
worker_node_types={
"cpu_node": {
"resources": {
"CPU": 9999,
@pytest.mark.skipif(
sys.platform == "win32",
reason="Flaky on Windows due to https://github.com/ray-project/ray/issues/36926.",
)
@pytest.mark.parametrize(
"ray_autoscaling_cluster",
[
{
"head_resources": {"CPU": 0},
"worker_node_types": {
"cpu_node": {
"resources": {
"CPU": 9999,
},
"node_config": {},
"min_workers": 0,
"max_workers": 100,
},
"node_config": {},
"min_workers": 0,
"max_workers": 100,
},
},
)
}
],
indirect=True,
)
def test_basic(ray_autoscaling_cluster):
"""Test that max_replicas_per_node is honored."""

cluster.start()
ray.init()

@serve.deployment
Expand Down Expand Up @@ -78,29 +85,33 @@ def __call__(self):
assert deployment_to_num_replicas["deploy1"] == 3
assert deployment_to_num_replicas["deploy2"] == 1

serve.shutdown()
ray.shutdown()
cluster.shutdown()


def test_update_max_replicas_per_node():
"""Test re-deploying a deployment with different max_replicas_per_node."""

cluster = AutoscalingCluster(
head_resources={"CPU": 0},
worker_node_types={
"cpu_node": {
"resources": {
"CPU": 9999,
@pytest.mark.skipif(
sys.platform == "win32",
reason="Flaky on Windows due to https://github.com/ray-project/ray/issues/36926.",
)
@pytest.mark.parametrize(
"ray_autoscaling_cluster",
[
{
"head_resources": {"CPU": 0},
"worker_node_types": {
"cpu_node": {
"resources": {
"CPU": 9999,
},
"node_config": {},
"min_workers": 0,
"max_workers": 100,
},
"node_config": {},
"min_workers": 0,
"max_workers": 100,
},
},
)
}
],
indirect=True,
)
def test_update_max_replicas_per_node(ray_autoscaling_cluster):
"""Test re-deploying a deployment with different max_replicas_per_node."""

cluster.start()
ray.init()

@serve.deployment
Expand Down Expand Up @@ -136,10 +147,6 @@ def __call__(self):
# Every node has 1 replica.
assert deployment_to_num_replicas["deploy1"] == 1

serve.shutdown()
ray.shutdown()
cluster.shutdown()


if __name__ == "__main__":
sys.exit(pytest.main(["-v", "-s", __file__]))

0 comments on commit 0292c2f

Please sign in to comment.