Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for conditional exclusion of features (specific to a cloud) during provisioning #2348

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from sky import spot as spot_lib
from sky import status_lib
from sky import task as task_lib
from sky.clouds.cloud import ExcludableFeatureCheckConfig
from sky.data import data_utils
from sky.data import storage as storage_lib
from sky.backends import backend_utils
Expand Down Expand Up @@ -2016,8 +2017,15 @@ def provision_with_retries(
else:
cloud_user = to_provision.cloud.get_current_user_identity()
# Skip if to_provision.cloud does not support requested features
to_provision.cloud.check_features_are_supported(
self._requested_features)
granted_features = to_provision.cloud.grant_features(
self._requested_features,
ExcludableFeatureCheckConfig(cluster_name=cluster_name))

if self._requested_features != granted_features:
logger.info(
f'{colorama.Fore.CYAN}The following features will be skipped since they are not supported by {to_provision.cloud} and were deemed optional : '
f'{", ".join(map(lambda x: x.value, self._requested_features - granted_features))}{style.RESET_ALL}'
)

config_dict = self._retry_zones(
to_provision,
Expand Down Expand Up @@ -3768,6 +3776,14 @@ def set_autostop(self,
idle_minutes_to_autostop: Optional[int],
down: bool = False,
stream_logs: bool = True) -> None:
if (handle.launched_resources.cloud and
clouds.CloudImplementationFeatures.AUTOSTOP in
handle.launched_resources.cloud._cloud_unsupported_features()):
logger.info(
f'{colorama.Fore.YELLOW}Skipping set_autostop since it is not supported for '
f'{handle.get_cluster_name()}.{colorama.Style.RESET_ALL}')
return

if idle_minutes_to_autostop is not None:
code = autostop_lib.AutostopCodeGen.set_autostop(
idle_minutes_to_autostop, self.NAME, down)
Expand Down
72 changes: 69 additions & 3 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,27 @@
from sky import resources as resources_lib


class ExcludableFeatureCheckConfig(typing.NamedTuple):
    """Config for excludable feature check.

    An instance is passed to every exclusion condition so the condition can
    decide whether a requested feature may be skipped.

    Declared with typing.NamedTuple so that the field annotation is bound to
    the actual tuple field and instances stay dict-free, which a hand-written
    subclass of collections.namedtuple without ``__slots__`` does not give.

    Fields:
        cluster_name: Name of the cluster the check is evaluated for.
    """
    cluster_name: str


class ExcludableFeatureCheckProtocol(typing.Protocol):
    """Structural type of an exclusion condition.

    Any callable that takes an ExcludableFeatureCheckConfig and returns a
    bool satisfies this protocol; plain functions and lambdas qualify.
    """

    def __call__(self, config: ExcludableFeatureCheckConfig) -> bool:
        """Evaluates the condition for the given config."""
        ...


class CloudImplementationFeatures(enum.Enum):
"""Features that might not be implemented for all clouds.

Used by Cloud.check_features_are_supported().
Used by Cloud.check_features_are_supported() and Cloud.grant_features().

Note: If any new feature is added, please check and update
_cloud_unsupported_features in all clouds to make sure the
check_features_are_supported() works as expected.
_cloud_unsupported_features and _cloud_excludable_features in all clouds to make sure
check_features_are_supported() and grant_features() work as expected.
"""
STOP = 'stop'
AUTOSTOP = 'autostop'
Expand Down Expand Up @@ -93,6 +106,21 @@ def _cloud_unsupported_features(
"""
raise NotImplementedError

@classmethod
def _cloud_excludable_features(
    cls
) -> Dict[CloudImplementationFeatures, ExcludableFeatureCheckProtocol]:
    """Maps each conditionally skippable feature to its exclusion check.

    get_excludable_features() consults this mapping: a requested feature is
    treated as excludable only when its check returns True for the config
    it is given.

    Returns:
        A dict of {feature: condition}. This implementation returns an
        empty dict, i.e. no feature is excludable.
    """
    nothing_excludable: Dict[CloudImplementationFeatures,
                             ExcludableFeatureCheckProtocol] = {}
    return nothing_excludable

@classmethod
def _max_cluster_name_length(cls) -> Optional[int]:
"""Returns the maximum length limit of a cluster name.
Expand Down Expand Up @@ -449,6 +477,44 @@ def check_features_are_supported(
f'The following features are not supported by {cls._REPR}:'
'\n\t' + table.get_string().replace('\n', '\n\t'))

@classmethod
def get_excludable_features(
    cls, requested_features: Set[CloudImplementationFeatures],
    config: ExcludableFeatureCheckConfig
) -> Set[CloudImplementationFeatures]:
    """Selects the requested features whose exclusion condition holds.

    A feature is returned when it is both in ``requested_features`` and
    listed by cls._cloud_excludable_features() with a condition that
    evaluates to True for ``config``.

    For instance, Kubernetes can exclude autostop for the spot controller,
    so
        Kubernetes.get_excludable_features(
            {CloudImplementationFeatures.AUTOSTOP},
            ExcludableFeatureCheckConfig(cluster_name=cluster_name))
    returns {CloudImplementationFeatures.AUTOSTOP} if the cluster is a spot
    controller, and an empty set otherwise.

    Args:
        requested_features: Features the caller asked for.
        config: Context passed to each exclusion condition.

    Returns:
        A new set holding the excludable subset of ``requested_features``.
    """
    conditions = cls._cloud_excludable_features()
    return {
        feature for feature, should_exclude in conditions.items()
        if feature in requested_features and should_exclude(config)
    }

@classmethod
def grant_features(
    cls, requested_features: Set[CloudImplementationFeatures],
    config: ExcludableFeatureCheckConfig
) -> Set[CloudImplementationFeatures]:
    """Drops excludable features, validates the rest, and returns them.

    Steps:
      1. Ask get_excludable_features() which requested features may be
         skipped for ``config``.
      2. Subtract those from ``requested_features``.
      3. Run check_features_are_supported() on what is left; any error it
         raises propagates to the caller.
      4. Return what is left.

    Args:
        requested_features: Features the caller asked for.
        config: Context passed to each exclusion condition.

    Returns:
        The requested features minus the excludable ones.
    """
    mandatory_features = requested_features - cls.get_excludable_features(
        requested_features, config)
    # Only the non-excludable features have to be supported by the cloud.
    cls.check_features_are_supported(mandatory_features)
    return mandatory_features

@classmethod
def check_cluster_name_is_valid(cls, cluster_name: str) -> None:
"""Errors out on invalid cluster names not supported by cloud providers.
Expand Down
16 changes: 16 additions & 0 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@
from sky import exceptions
from sky import status_lib
from sky.adaptors import kubernetes
from sky.clouds.cloud import ExcludableFeatureCheckProtocol
from sky.utils import common_utils
from sky.utils import ux_utils
from sky.skylet.providers.kubernetes import utils as kubernetes_utils

from sky.utils import common_utils

if typing.TYPE_CHECKING:
# Renaming to avoid shadowing variables.
from sky import resources as resources_lib
Expand Down Expand Up @@ -164,6 +167,12 @@ class Kubernetes(clouds.Cloud):
'supported in '
'Kubernetes.',
}
# Features that may be skipped rather than rejected, each keyed to the
# condition under which skipping is allowed.
# AUTOSTOP is skippable only when the cluster name starts with
# 'sky-spot-controller-'.
# NOTE(review): the prefix is hard-coded here; presumably it should track
# the spot controller's naming constant -- confirm.
_CLOUD_EXCLUDABLE_FEATURES: Dict[
    clouds.CloudImplementationFeatures, ExcludableFeatureCheckProtocol] = {
        clouds.CloudImplementationFeatures.AUTOSTOP:
            lambda config: config.cluster_name.startswith(
                'sky-spot-controller-')
    }

IMAGE = 'us-central1-docker.pkg.dev/' \
'skypilot-375900/skypilotk8s/skypilot:latest'
Expand All @@ -173,6 +182,13 @@ def _cloud_unsupported_features(
cls) -> Dict[clouds.CloudImplementationFeatures, str]:
return cls._CLOUD_UNSUPPORTED_FEATURES

@classmethod
def _cloud_excludable_features(
    cls
) -> Dict[clouds.CloudImplementationFeatures,
          ExcludableFeatureCheckProtocol]:
    """Returns the class-level {feature: condition} exclusion table."""
    excludable = cls._CLOUD_EXCLUDABLE_FEATURES
    return excludable

@classmethod
def regions(cls) -> List[clouds.Region]:
return cls._regions
Expand Down