-
Notifications
You must be signed in to change notification settings - Fork 589
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* [GCP] initial take for dws support with migs * fix lint errors * dependency and format fix * refactor mig instance creation * fix * remove unnecessary instance creation code for mig * Fix deletion * Fix instance template logic * Restart * format * format * move to REST APIs instead of python APIs * add multi-node back * Fix multi-node * Avoid spot * format * format * fix scheduling * fix cancel * Add smoke test * revert some changes * fix smoke * Fix * fix * Fix smoke * [GCP] Changing the config name for DWS support and fix for resize request cancellation (#5) * Fix config fields * fix cancel * Add loggings * remove useless codes --------- Co-authored-by: Zhanghao Wu <zhangaho.wu@outlook.com> Co-authored-by: Zhanghao Wu <zhanghao.wu@outlook.com>
- Loading branch information
1 parent
02b2053
commit 3cd26e2
Showing
10 changed files
with
662 additions
and
71 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,7 @@ | |
from sky import clouds | ||
from sky import exceptions | ||
from sky import sky_logging | ||
from sky import skypilot_config | ||
from sky.adaptors import gcp | ||
from sky.clouds import service_catalog | ||
from sky.clouds.utils import gcp_utils | ||
|
@@ -181,20 +182,31 @@ class GCP(clouds.Cloud): | |
def _unsupported_features_for_resources( | ||
cls, resources: 'resources.Resources' | ||
) -> Dict[clouds.CloudImplementationFeatures, str]: | ||
unsupported = {} | ||
if gcp_utils.is_tpu_vm_pod(resources): | ||
return { | ||
unsupported = { | ||
clouds.CloudImplementationFeatures.STOP: ( | ||
'TPU VM pods cannot be stopped. Please refer to: https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_your_resources' | ||
'TPU VM pods cannot be stopped. Please refer to: ' | ||
'https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_your_resources' | ||
) | ||
} | ||
if gcp_utils.is_tpu(resources) and not gcp_utils.is_tpu_vm(resources): | ||
# TPU node does not support multi-node. | ||
return { | ||
clouds.CloudImplementationFeatures.MULTI_NODE: | ||
('TPU node does not support multi-node. Please set ' | ||
'num_nodes to 1.') | ||
} | ||
return {} | ||
unsupported[clouds.CloudImplementationFeatures.MULTI_NODE] = ( | ||
'TPU node does not support multi-node. Please set ' | ||
'num_nodes to 1.') | ||
# TODO(zhwu): We probably need to store the MIG requirement in resources | ||
# because `skypilot_config` may change for an existing cluster. | ||
# Clusters created with MIG (only GPU clusters) cannot be stopped. | ||
if (skypilot_config.get_nested( | ||
('gcp', 'managed_instance_group'), None) is not None and | ||
resources.accelerators): | ||
unsupported[clouds.CloudImplementationFeatures.STOP] = ( | ||
'Managed Instance Group (MIG) does not support stopping yet.') | ||
unsupported[clouds.CloudImplementationFeatures.SPOT_INSTANCE] = ( | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
binarycrayon
|
||
'Managed Instance Group with DWS does not support ' | ||
'spot instances.') | ||
return unsupported | ||
|
||
@classmethod | ||
def max_cluster_name_length(cls) -> Optional[int]: | ||
|
@@ -495,6 +507,12 @@ def make_deploy_resources_variables( | |
|
||
resources_vars['tpu_node_name'] = tpu_node_name | ||
|
||
managed_instance_group_config = skypilot_config.get_nested( | ||
('gcp', 'managed_instance_group'), None) | ||
use_mig = managed_instance_group_config is not None | ||
resources_vars['gcp_use_managed_instance_group'] = use_mig | ||
if use_mig: | ||
resources_vars.update(managed_instance_group_config) | ||
return resources_vars | ||
|
||
def _get_feasible_launchable_resources( | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oh, interesting. I thought DWS does support preemptible instances? Can you clarify? Thank you, @Michaelvll