skypilot-org · Michaelvll · Aug 31, 2023 · Aug 30, 2023 · Aug 30, 2023 · Aug 30, 2023
diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py
@@ -2014,14 +2014,24 @@ def run_ray_status_to_check_ray_cluster_healthy() -> bool:
             # triggered.
             if external_ips is None or len(external_ips) == 0:
                 logger.debug(f'Refreshing status ({cluster_name!r}): No cached '
-                             f'IPs found. External IPs: {external_ips}')
+                             f'IPs found. Handle: {handle}')
                 raise exceptions.FetchIPError(
                     reason=exceptions.FetchIPError.Reason.HEAD)
 
+            if handle.head_ssh_port is None:
+                # Refresh the ssh ports. It is ok to refresh as it is fast.
+                handle.external_ssh_ports()
+                if handle.head_ssh_port is None:
+                    logger.debug(
+                        f'Refreshing status ({cluster_name!r}): failed '
+                        f'to get the ssh ports. Handle: {handle}')
+                    raise exceptions.FetchIPError(
+                        reason=exceptions.FetchIPError.Reason.HEAD)
+
             # Check if ray cluster status is healthy.
             ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml,
                                                        handle.docker_user)
-            assert handle.head_ssh_port is not None, handle
+
             runner = command_runner.SSHCommandRunner(external_ips[0],
                                                      **ssh_credentials,
                                                      port=handle.head_ssh_port)

diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py
@@ -1524,9 +1524,10 @@ def _retry_zones(
             if zones and len(zones) == 1:
                 launched_resources = launched_resources.copy(zone=zones[0].name)
 
-            prev_cluster_ips = None
+            prev_cluster_ips, prev_ssh_ports = None, None
             if prev_handle is not None:
                 prev_cluster_ips = prev_handle.stable_internal_external_ips
+                prev_ssh_ports = prev_handle.stable_ssh_ports
             # Record early, so if anything goes wrong, 'sky status' will show
             # the cluster name and users can appropriately 'sky down'.  It also
             # means a second 'sky launch -c <name>' will attempt to reuse.
@@ -1543,10 +1544,11 @@ def _retry_zones(
                 launched_resources=launched_resources,
                 tpu_create_script=config_dict.get('tpu-create-script'),
                 tpu_delete_script=config_dict.get('tpu-delete-script'),
-                # Use the previous cluster's IPs if available to optimize
-                # the case where the cluster is restarted, i.e., no need to
-                # query the IPs from the cloud provider.
-                stable_internal_external_ips=prev_cluster_ips)
+                # Use the previous cluster's IPs and ports if available to
+                # optimize the case where the cluster is restarted, i.e., no
+                # need to query the IPs and ports from the cloud provider.
+                stable_internal_external_ips=prev_cluster_ips,
+                stable_ssh_ports=prev_ssh_ports)
             usage_lib.messages.usage.update_final_cluster_status(
                 status_lib.ClusterStatus.INIT)