Skip to content

Commit

Permalink
Speed up Azure A10 VM creation
Browse files Browse the repository at this point in the history
  • Loading branch information
yika-luo committed Oct 29, 2024
1 parent f267893 commit c474acf
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 41 deletions.
9 changes: 0 additions & 9 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1950,17 +1950,8 @@ def provision_with_retries(

failover_history: List[Exception] = list()

style = colorama.Style
fore = colorama.Fore
# Retrying launchable resources.
while True:
if (isinstance(to_provision.cloud, clouds.Azure) and
to_provision.accelerators is not None and
'A10' in to_provision.accelerators and prev_handle is None):
logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch '
'an A10 cluster on Azure. This may take ~20 '
'minutes due to driver installation.'
f'{style.RESET_ALL}')
try:
# Recheck cluster name as the 'except:' block below may
# change the cloud assignment.
Expand Down
9 changes: 4 additions & 5 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
_DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
_FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
# This is used by Azure GPU VMs that use grid drivers (e.g. A10).
_DEFAULT_GPU_GRID_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2-grid'

_COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries'

Expand Down Expand Up @@ -220,6 +222,8 @@ def _get_default_image_tag(self, gen_version, instance_type) -> str:
acc_name = list(acc.keys())[0]
if acc_name == 'K80':
return _DEFAULT_GPU_K80_IMAGE_ID
if acc_name == 'A10':
return _DEFAULT_GPU_GRID_IMAGE_ID
# About Gen V1 vs V2:
# In Azure, all instances with K80 (Standard_NC series), some
# instances with M60 (Standard_NV series) and some cpu instances
Expand Down Expand Up @@ -350,10 +354,6 @@ def make_deploy_resources_variables(
'image_version': version,
}

# Setup the A10 nvidia driver.
need_nvidia_driver_extension = (acc_dict is not None and
'A10' in acc_dict)

# Determine resource group for deploying the instance.
resource_group_name = skypilot_config.get_nested(
('azure', 'resource_group_vm'), None)
Expand Down Expand Up @@ -413,7 +413,6 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
# Azure does not support specific zones.
'zones': None,
**image_config,
'need_nvidia_driver_extension': need_nvidia_driver_extension,
'disk_tier': Azure._get_disk_type(disk_tier),
'cloud_init_setup_commands': cloud_init_setup_commands,
'azure_subscription_id': self.get_project_id(dryrun),
Expand Down
27 changes: 27 additions & 0 deletions sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
# Packer shell provisioner for Azure GPU images that need the GRID driver.
#
# Steps:
#   1. Install build-essential (compiler toolchain for the driver installer).
#   2. Download and silently install the Azure GRID NVIDIA driver 535.161.08.
#   3. Download and silently install the CUDA 12.2 toolkit from its runfile.
#   4. Persist the CUDA paths in ~/.bashrc and verify the driver with nvidia-smi.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline. Without this, a failed download or driver
# install would be ignored and the build would only fail (if at all) at the
# final nvidia-smi check, far from the real cause.
set -euo pipefail

# apt-get has a stable command-line interface intended for scripts.
sudo apt-get update
sudo apt-get install -y build-essential

echo "Installing GRID driver..."
GRID_DRIVER_URL="https://download.microsoft.com/download/8/d/a/8da4fb8e-3a9b-4e6a-bc9a-72ff64d7a13c/NVIDIA-Linux-x86_64-535.161.08-grid-azure.run"
# Local file name is the last path component of the URL, so the two can
# never drift apart when the driver version is bumped.
GRID_DRIVER_FILE="${GRID_DRIVER_URL##*/}"

wget -nv "$GRID_DRIVER_URL" -O "$GRID_DRIVER_FILE"
sudo chmod +x "$GRID_DRIVER_FILE"
sudo sh "$GRID_DRIVER_FILE" --silent
# The installer is not needed once the driver is installed; removing it keeps
# it out of the captured image.
sudo rm -f "$GRID_DRIVER_FILE"


echo "Installing CUDA toolkit..."
CUDA_TOOLKIT_URL="https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
CUDA_TOOLKIT_FILE="${CUDA_TOOLKIT_URL##*/}"
wget -nv "$CUDA_TOOLKIT_URL" -O "$CUDA_TOOLKIT_FILE"
# NOTE(review): --toolkit is expected to install only the toolkit so the
# driver bundled in the runfile does not replace the GRID driver installed
# above — confirm against the CUDA runfile installer documentation.
sudo sh "$CUDA_TOOLKIT_FILE" --silent --toolkit --override
sudo rm -f "$CUDA_TOOLKIT_FILE"

# Set environment variables
# Persist for future login shells. The single quotes are deliberate: the
# expansions must happen when ~/.bashrc is read, not now. LD_LIBRARY_PATH
# keeps any value that is already set instead of overwriting it.
echo 'export PATH=$PATH:/usr/local/cuda-12.2/bin' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}' >> ~/.bashrc
# Apply the same settings to this shell directly rather than sourcing
# ~/.bashrc: that file may return early in a non-interactive shell
# (presumably the case for the stock Ubuntu .bashrc — verify) and may
# reference unset variables, which would abort the script under `set -u`.
export PATH="$PATH:/usr/local/cuda-12.2/bin"
export LD_LIBRARY_PATH="/usr/local/cuda-12.2/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# Verify installations
nvidia-smi
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ variable "vm_generation" {
description = "Azure's VM generation, currently support 1 or 2"
}

# Selects the GRID-driver variant of the GPU image build. When true, the
# build uses a Standard_NV36ads_A10_v5 VM in eastus, publishes the image as
# "skypilot-gpu-gen2-grid" in the "skypilot_images" gallery, and runs
# ./provisioners/cuda-azure-grid.sh instead of ./provisioners/cuda.sh.
# Defaults to false, which keeps the vm_generation-based settings.
variable "use_grid_driver" {
type = bool
default = false
description = "Whether to use the Azure GRID driver. Currently only A10 GPU VMs need this."
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
version = formatdate("YY.MM.DD", timestamp())
Expand All @@ -26,8 +32,8 @@ source "azure-arm" "gpu-ubuntu" {
image_publisher = "Canonical"
image_offer = "0001-com-ubuntu-server-jammy"
image_sku = var.vm_generation == 1 ? "22_04-lts" : "22_04-lts-gen2"
location = var.vm_generation == 1 ? "eastus" : "centralus"
vm_size = var.vm_generation == 1 ? "Standard_NC4as_T4_v3" : "Standard_NC24ads_A100_v4"
location = var.use_grid_driver || var.vm_generation == 1 ? "eastus" : "centralus"
vm_size = var.use_grid_driver ? "Standard_NV36ads_A10_v5" : (var.vm_generation == 1 ? "Standard_NC4as_T4_v3" : "Standard_NC24ads_A100_v4")
ssh_username = "azureuser"
azure_tags = {
Created_by = "packer"
Expand All @@ -37,8 +43,8 @@ source "azure-arm" "gpu-ubuntu" {
shared_image_gallery_destination {
subscription = "59d8c23c-7ef5-42c7-b2f3-a919ad8026a7"
resource_group = "skypilot-images"
gallery_name = var.vm_generation == 1 ? "skypilot_images": "skypilot_image_gallery"
image_name = "skypilot-gpu-gen${var.vm_generation}"
gallery_name = var.use_grid_driver || var.vm_generation == 1 ? "skypilot_images" : "skypilot_image_gallery"
image_name = var.use_grid_driver ? "skypilot-gpu-gen2-grid" : "skypilot-gpu-gen${var.vm_generation}"
image_version = "${local.version}"
replication_regions = [
"centralus",
Expand All @@ -61,7 +67,7 @@ build {
script = "./provisioners/docker.sh"
}
provisioner "shell" {
script = "./provisioners/cuda.sh"
script = var.use_grid_driver ? "./provisioners/cuda-azure-grid.sh" : "./provisioners/cuda.sh"
}
provisioner "shell" {
script = "./provisioners/nvidia-container-toolkit.sh"
Expand Down
21 changes: 0 additions & 21 deletions sky/provision/azure/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,27 +313,6 @@ def _create_vm(
)
# poller.result() will block on async operation until it's done.
logger.info(f'Created VM {vm_poller.result().name}.')
# Configure driver extension for A10 GPUs. A10 GPUs requires a
# special type of drivers which is available at Microsoft HPC
# extension. Reference:
# https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2
# This can take more than 20mins for setting up the A10 GPUs
if node_config.get('need_nvidia_driver_extension', False):
ext_poller = compute_client.virtual_machine_extensions.\
begin_create_or_update(
resource_group_name=provider_config['resource_group'],
vm_name=vm_name,
vm_extension_name='NvidiaGpuDriverLinux',
extension_parameters=compute.VirtualMachineExtension(
location=provider_config['location'],
publisher='Microsoft.HpcCompute',
type_properties_type='NvidiaGpuDriverLinux',
type_handler_version='1.9',
auto_upgrade_minor_version=True,
settings='{}'))
logger.info(
f'Created VM extension {ext_poller.result().name} for VM {vm_name}.'
)
return vm_poller.result()


Expand Down
1 change: 0 additions & 1 deletion sky/templates/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ available_node_types:
{%- for cmd in cloud_init_setup_commands %}
{{ cmd }}
{%- endfor %}
need_nvidia_driver_extension: {{need_nvidia_driver_extension}}
{%- if disk_performance_tier is not none %}
disk_performance_tier: {{disk_performance_tier}}
{%- endif %}
Expand Down

0 comments on commit c474acf

Please sign in to comment.