diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py
index b8daa6adabe..237bc9aa16f 100644
--- a/sky/backends/cloud_vm_ray_backend.py
+++ b/sky/backends/cloud_vm_ray_backend.py
@@ -36,6 +36,7 @@
 from sky import spot as spot_lib
 from sky import status_lib
 from sky import task as task_lib
+from sky.clouds.cloud import ExcludableFeatureCheckConfig
 from sky.data import data_utils
 from sky.data import storage as storage_lib
 from sky.backends import backend_utils
@@ -2016,8 +2017,20 @@ def provision_with_retries(
                 else:
                     cloud_user = to_provision.cloud.get_current_user_identity()
                 # Skip if to_provision.cloud does not support requested features
-                to_provision.cloud.check_features_are_supported(
-                    self._requested_features)
+                # (features the cloud declares excludable for this cluster are
+                # dropped by grant_features() before that check).
+                granted_features = to_provision.cloud.grant_features(
+                    self._requested_features,
+                    ExcludableFeatureCheckConfig(cluster_name=cluster_name))
+                skipped_features = self._requested_features - granted_features
+                if skipped_features:
+                    # Sorted so the log line is stable across runs.
+                    skipped = ', '.join(
+                        sorted(feature.value for feature in skipped_features))
+                    logger.info(
+                        f'{colorama.Fore.CYAN}The following features will be '
+                        'skipped since they are not supported by '
+                        f'{to_provision.cloud} and were deemed optional : '
+                        f'{skipped}{colorama.Style.RESET_ALL}')
 
                 config_dict = self._retry_zones(
                     to_provision,
@@ -3768,6 +3781,18 @@ def set_autostop(self,
                      idle_minutes_to_autostop: Optional[int],
                      down: bool = False,
                      stream_logs: bool = True) -> None:
+        # Return early, leaving autostop unset, when the cluster's cloud lists
+        # AUTOSTOP among its unsupported features.
+        cloud = handle.launched_resources.cloud
+        unsupported = (cloud._cloud_unsupported_features()
+                       if cloud is not None else {})
+        if clouds.CloudImplementationFeatures.AUTOSTOP in unsupported:
+            logger.info(
+                f'{colorama.Fore.YELLOW}Skipping set_autostop since it is not '
+                f'supported for {handle.get_cluster_name()}.'
+                f'{colorama.Style.RESET_ALL}')
+            return
+
         if idle_minutes_to_autostop is not None:
             code = autostop_lib.AutostopCodeGen.set_autostop(
                 idle_minutes_to_autostop, self.NAME, down)
diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py
index 5e4fdec042f..553bbfd4c91 100644
--- a/sky/clouds/cloud.py
+++ b/sky/clouds/cloud.py
@@ -15,14 +15,27 @@
     from sky import resources as resources_lib
 
 
+class ExcludableFeatureCheckConfig(typing.NamedTuple):
+    """Inputs handed to a feature's exclusion-condition callable."""
+    # Name of the cluster the features are being requested for.
+    cluster_name: str
+
+
+class ExcludableFeatureCheckProtocol(typing.Protocol):
+    """Callable deciding whether a feature may be skipped for a config."""
+
+    def __call__(self, config: ExcludableFeatureCheckConfig) -> bool:
+        ...
+
+
 class CloudImplementationFeatures(enum.Enum):
     """Features that might not be implemented for all clouds.
 
-    Used by Cloud.check_features_are_supported().
+    Used by Cloud.check_features_are_supported() and Cloud.grant_features().
 
     Note: If any new feature is added, please check and update
-    _cloud_unsupported_features in all clouds to make sure the
-    check_features_are_supported() works as expected.
+    _cloud_unsupported_features and _cloud_excludable_features in all clouds
+    so check_features_are_supported() and grant_features() work as expected.
     """
     STOP = 'stop'
     AUTOSTOP = 'autostop'
@@ -93,6 +106,21 @@ def _cloud_unsupported_features(
         """
         raise NotImplementedError
 
+    @classmethod
+    def _cloud_excludable_features(
+        cls
+    ) -> Dict[CloudImplementationFeatures, ExcludableFeatureCheckProtocol]:
+        """Returns the features this cloud may skip, with their conditions.
+
+        Used by get_excludable_features(): a requested feature is excludable
+        only when the condition mapped to it returns True for the config.
+
+        Returns:
+            A dict of {feature: condition} for the features excludable by the
+            cloud implementation; empty by default (nothing is excludable).
+        """
+        return {}
+
     @classmethod
     def _max_cluster_name_length(cls) -> Optional[int]:
         """Returns the maximum length limit of a cluster name.
@@ -449,6 +477,51 @@ def check_features_are_supported(
                     f'The following features are not supported by {cls._REPR}:'
                     '\n\t' + table.get_string().replace('\n', '\n\t'))
 
+    @classmethod
+    def get_excludable_features(
+        cls, requested_features: Set[CloudImplementationFeatures],
+        config: ExcludableFeatureCheckConfig
+    ) -> Set[CloudImplementationFeatures]:
+        """Returns the requested features this cloud may skip for `config`.
+
+        A feature is excludable when the cloud lists it in
+        _cloud_excludable_features() and its condition returns True for
+        `config`.
+
+        For instance, Kubernetes can exclude autostop for the spot controller,
+        so
+            Kubernetes.get_excludable_features(
+                {CloudImplementationFeatures.AUTOSTOP},
+                ExcludableFeatureCheckConfig(cluster_name=cluster_name))
+        returns {CloudImplementationFeatures.AUTOSTOP} if the cluster is a
+        spot controller, else an empty set.
+        """
+        return {
+            feature
+            for feature, condition in cls._cloud_excludable_features().items()
+            if feature in requested_features and condition(config)
+        }
+
+    @classmethod
+    def grant_features(
+        cls, requested_features: Set[CloudImplementationFeatures],
+        config: ExcludableFeatureCheckConfig
+    ) -> Set[CloudImplementationFeatures]:
+        """Returns the requested features left after dropping excludable ones.
+
+        This function performs the following steps:
+        1. Get excludable features from the cloud implementation.
+        2. Remove excludable features from the requested features.
+        3. Run check_features_are_supported() on the remaining (mandatory)
+           features.
+        4. Return the remaining features.
+        """
+        excludable_features = cls.get_excludable_features(
+            requested_features, config)
+        remaining_features = requested_features - excludable_features
+        cls.check_features_are_supported(remaining_features)
+        return remaining_features
+
     @classmethod
     def check_cluster_name_is_valid(cls, cluster_name: str) -> None:
         """Errors out on invalid cluster names not supported by cloud providers.
diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py
index a2c339f9791..6d39e0a4a12 100644
--- a/sky/clouds/kubernetes.py
+++ b/sky/clouds/kubernetes.py
@@ -9,10 +9,11 @@
 from sky import exceptions
 from sky import status_lib
 from sky.adaptors import kubernetes
+from sky.clouds.cloud import ExcludableFeatureCheckProtocol
 from sky.utils import common_utils
 from sky.utils import ux_utils
 from sky.skylet.providers.kubernetes import utils as kubernetes_utils
 
 if typing.TYPE_CHECKING:
     # Renaming to avoid shadowing variables.
     from sky import resources as resources_lib
@@ -164,6 +165,14 @@ class Kubernetes(clouds.Cloud):
                                                           'supported in '
                                                           'Kubernetes.',
     }
+    # Features grant_features() may drop: autostop is excludable for clusters
+    # whose name starts with the spot-controller prefix.
+    _CLOUD_EXCLUDABLE_FEATURES: Dict[
+        clouds.CloudImplementationFeatures, ExcludableFeatureCheckProtocol] = {
+            clouds.CloudImplementationFeatures.AUTOSTOP:
+                lambda config: config.cluster_name.startswith(
+                    'sky-spot-controller-')
+        }
 
     IMAGE = 'us-central1-docker.pkg.dev/' \
             'skypilot-375900/skypilotk8s/skypilot:latest'
@@ -173,6 +182,14 @@ def _cloud_unsupported_features(
             cls) -> Dict[clouds.CloudImplementationFeatures, str]:
         return cls._CLOUD_UNSUPPORTED_FEATURES
 
+    @classmethod
+    def _cloud_excludable_features(
+        cls
+    ) -> Dict[clouds.CloudImplementationFeatures,
+              ExcludableFeatureCheckProtocol]:
+        """Returns the {feature: condition} map of skippable features."""
+        return cls._CLOUD_EXCLUDABLE_FEATURES
+
     @classmethod
     def regions(cls) -> List[clouds.Region]:
         return cls._regions