Skip to content

Commit

Permalink
[Performance] Speed up Azure A10 instance creation (skypilot-org#4205)
Browse files Browse the repository at this point in the history
* Use date instead of timestamp in skypilot image names

* Speed up Azure A10 VM creation

* disable nouveau and use smaller instance

* address comments

* address comments

* add todo
  • Loading branch information
yika-luo authored and AlexCuadron committed Nov 7, 2024
1 parent 30136b7 commit 83fade9
Show file tree
Hide file tree
Showing 12 changed files with 69 additions and 59 deletions.
9 changes: 0 additions & 9 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1950,17 +1950,8 @@ def provision_with_retries(

failover_history: List[Exception] = list()

style = colorama.Style
fore = colorama.Fore
# Retrying launchable resources.
while True:
if (isinstance(to_provision.cloud, clouds.Azure) and
to_provision.accelerators is not None and
'A10' in to_provision.accelerators and prev_handle is None):
logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch '
'an A10 cluster on Azure. This may take ~20 '
'minutes due to driver installation.'
f'{style.RESET_ALL}')
try:
# Recheck cluster name as the 'except:' block below may
# change the cloud assignment.
Expand Down
9 changes: 4 additions & 5 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
_DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
_FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
# This is used by Azure GPU VMs that use grid drivers (e.g. A10).
_DEFAULT_GPU_GRID_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2-grid'

_COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries'

Expand Down Expand Up @@ -220,6 +222,8 @@ def _get_default_image_tag(self, gen_version, instance_type) -> str:
acc_name = list(acc.keys())[0]
if acc_name == 'K80':
return _DEFAULT_GPU_K80_IMAGE_ID
if acc_name == 'A10':
return _DEFAULT_GPU_GRID_IMAGE_ID
# About Gen V1 vs V2:
# In Azure, all instances with K80 (Standard_NC series), some
# instances with M60 (Standard_NV series) and some cpu instances
Expand Down Expand Up @@ -350,10 +354,6 @@ def make_deploy_resources_variables(
'image_version': version,
}

# Setup the A10 nvidia driver.
need_nvidia_driver_extension = (acc_dict is not None and
'A10' in acc_dict)

# Determine resource group for deploying the instance.
resource_group_name = skypilot_config.get_nested(
('azure', 'resource_group_vm'), None)
Expand Down Expand Up @@ -413,7 +413,6 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
# Azure does not support specific zones.
'zones': None,
**image_config,
'need_nvidia_driver_extension': need_nvidia_driver_extension,
'disk_tier': Azure._get_disk_type(disk_tier),
'cloud_init_setup_commands': cloud_init_setup_commands,
'azure_subscription_id': self.get_project_id(dryrun),
Expand Down
7 changes: 4 additions & 3 deletions sky/clouds/service_catalog/images/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,10 @@ export SECRET=xxxxxx # Update this
```
2. Build and copy images for all regions for GPU (gen 1 & 2, plus the gen 2 GRID-driver image used by A10 VMs) and CPU (gen 2 only).
```bash
export TYPE=gpu # Update this
export VM_GENERATION=1 # Update this
packer build --var vm_generation=${VM_GENERATION} --var client_secret=${SECRET} skypilot-azure-${TYPE}-ubuntu.pkr.hcl
packer build --var vm_generation=2 --var client_secret=${SECRET} skypilot-azure-cpu-ubuntu.pkr.hcl
packer build --var vm_generation=2 --var client_secret=${SECRET} skypilot-azure-gpu-ubuntu.pkr.hcl
packer build --var vm_generation=1 --var client_secret=${SECRET} skypilot-azure-gpu-ubuntu.pkr.hcl
packer build --var vm_generation=2 --var client_secret=${SECRET} --var use_grid_driver=true skypilot-azure-gpu-ubuntu.pkr.hcl
```

## Test Images
Expand Down
33 changes: 33 additions & 0 deletions sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
# Provisioner for the Azure GRID-driver GPU image.
#
# Steps, in order:
#   1. Install build tools needed by the NVIDIA installers.
#   2. Download and install the Azure GRID driver.
#   3. Write the vGPU licensing daemon config (gridd.conf).
#   4. Download and install the CUDA 12.2 toolkit (toolkit only, no driver).
#   5. Persist CUDA paths in ~/.bashrc, delete the installers, and run
#      nvidia-smi as the final check (its exit status is the script's status).

# apt-get is used instead of apt because apt warns that its CLI is not
# stable for use in scripts.
sudo apt-get update
sudo apt-get install -y build-essential

echo "Installing GRID driver..."
GRID_DRIVER_URL="https://download.microsoft.com/download/8/d/a/8da4fb8e-3a9b-4e6a-bc9a-72ff64d7a13c/NVIDIA-Linux-x86_64-535.161.08-grid-azure.run"
# Derive the local filename from the URL so the two cannot drift apart.
GRID_DRIVER_FILE="${GRID_DRIVER_URL##*/}"

# `wget -O` creates the output file even when the download fails, and running
# an empty installer with `sh` succeeds silently. Stop here instead.
if ! wget -nv "$GRID_DRIVER_URL" -O "$GRID_DRIVER_FILE"; then
  echo "Failed to download GRID driver from $GRID_DRIVER_URL" >&2
  exit 1
fi
sudo chmod +x "$GRID_DRIVER_FILE"
sudo sh "$GRID_DRIVER_FILE" --silent --disable-nouveau

echo "Set vGPU Licensing Daemon config..."
sudo cp /etc/nvidia/gridd.conf.template /etc/nvidia/gridd.conf
# Comment out the template's FeatureType=0 line, then append our overrides.
sudo sed -i '/^FeatureType=0/s/^/# /' /etc/nvidia/gridd.conf
echo "IgnoreSP=FALSE" | sudo tee -a /etc/nvidia/gridd.conf
echo "EnableUI=FALSE" | sudo tee -a /etc/nvidia/gridd.conf

echo "Installing CUDA toolkit..."
CUDA_TOOLKIT_URL="https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
CUDA_TOOLKIT_FILE="${CUDA_TOOLKIT_URL##*/}"
if ! wget -nv "$CUDA_TOOLKIT_URL" -O "$CUDA_TOOLKIT_FILE"; then
  echo "Failed to download CUDA toolkit from $CUDA_TOOLKIT_URL" >&2
  exit 1
fi
# --toolkit installs only the toolkit, leaving the GRID driver installed above
# in place.
sudo sh "$CUDA_TOOLKIT_FILE" --silent --toolkit --override

# Set environment variables for future login shells.
echo 'export PATH=$PATH:/usr/local/cuda-12.2/bin' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.2/lib64' >> ~/.bashrc
# NOTE(review): a stock .bashrc may return early in non-interactive shells, in
# which case this line has no effect on the current script -- confirm.
source ~/.bashrc

# Remove the downloaded installers so they are not baked into the image.
rm -f "$GRID_DRIVER_FILE" "$CUDA_TOOLKIT_FILE"

# Verify the driver installation.
nvidia-smi
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ variable "region" {
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
}

source "amazon-ebs" "cpu-ubuntu" {
ami_name = "skypilot-aws-cpu-ubuntu-${local.timestamp}"
ami_name = "skypilot-aws-cpu-ubuntu-${local.date}"
instance_type = "t2.micro"
region = var.region
ssh_username = "ubuntu"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ variable "region" {
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
}

source "amazon-ebs" "gpu-ubuntu" {
ami_name = "skypilot-aws-gpu-ubuntu-${local.timestamp}"
ami_name = "skypilot-aws-gpu-ubuntu-${local.date}"
instance_type = "g6.xlarge"
region = var.region
ssh_username = "ubuntu"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@ variable "vm_generation" {
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
version = formatdate("YY.MM.DD", timestamp())
}

source "azure-arm" "cpu-ubuntu" {
managed_image_resource_group_name = "skypilot-images"
managed_image_name = "skypilot-azure-cpu-ubuntu-${local.timestamp}"
// TODO(yika): these fields may not be required as we use community images below instead. We need to double-check if these can be removed.
managed_image_name = "skypilot-azure-cpu-ubuntu-${local.date}"

subscription_id = "59d8c23c-7ef5-42c7-b2f3-a919ad8026a7"
tenant_id = "7c81f068-46f8-4b26-9a46-2fbec2287e3d"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,20 @@ variable "vm_generation" {
description = "Azure's VM generation, currently support 1 or 2"
}

# Opt-in switch for building the GRID-driver variant of the GPU image.
# Defaults to false so existing invocations keep building the regular image.
variable "use_grid_driver" {
  description = "Whether to use the Azure GRID driver. Currently only A10 GPU VMs need this."
  type        = bool
  default     = false
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
version = formatdate("YY.MM.DD", timestamp())
}

source "azure-arm" "gpu-ubuntu" {
managed_image_resource_group_name = "skypilot-images"
managed_image_name = "skypilot-azure-gpu-ubuntu-${local.timestamp}"
managed_image_name = "skypilot-azure-gpu-ubuntu-${local.date}"

subscription_id = "59d8c23c-7ef5-42c7-b2f3-a919ad8026a7"
tenant_id = "7c81f068-46f8-4b26-9a46-2fbec2287e3d"
Expand All @@ -26,8 +32,8 @@ source "azure-arm" "gpu-ubuntu" {
image_publisher = "Canonical"
image_offer = "0001-com-ubuntu-server-jammy"
image_sku = var.vm_generation == 1 ? "22_04-lts" : "22_04-lts-gen2"
location = var.vm_generation == 1 ? "eastus" : "centralus"
vm_size = var.vm_generation == 1 ? "Standard_NC4as_T4_v3" : "Standard_NC24ads_A100_v4"
location = var.use_grid_driver || var.vm_generation == 1 ? "eastus" : "centralus"
vm_size = var.use_grid_driver ? "Standard_NV12ads_A10_v5" : (var.vm_generation == 1 ? "Standard_NC4as_T4_v3" : "Standard_NC24ads_A100_v4")
ssh_username = "azureuser"
azure_tags = {
Created_by = "packer"
Expand All @@ -37,8 +43,8 @@ source "azure-arm" "gpu-ubuntu" {
shared_image_gallery_destination {
subscription = "59d8c23c-7ef5-42c7-b2f3-a919ad8026a7"
resource_group = "skypilot-images"
gallery_name = var.vm_generation == 1 ? "skypilot_images": "skypilot_image_gallery"
image_name = "skypilot-gpu-gen${var.vm_generation}"
gallery_name = var.use_grid_driver || var.vm_generation == 1 ? "skypilot_images" : "skypilot_image_gallery"
image_name = var.use_grid_driver ? "skypilot-gpu-gen2-grid" : "skypilot-gpu-gen${var.vm_generation}"
image_version = "${local.version}"
replication_regions = [
"centralus",
Expand All @@ -61,7 +67,7 @@ build {
script = "./provisioners/docker.sh"
}
provisioner "shell" {
script = "./provisioners/cuda.sh"
script = var.use_grid_driver ? "./provisioners/cuda-azure-grid.sh" : "./provisioners/cuda.sh"
}
provisioner "shell" {
script = "./provisioners/nvidia-container-toolkit.sh"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
}

source "googlecompute" "cpu-ubuntu" {
project_id = "sky-dev-465"
image_name = "skypilot-gcp-cpu-ubuntu-${local.timestamp}"
image_name = "skypilot-gcp-cpu-ubuntu-${local.date}"
source_image_family = "ubuntu-2204-lts"
zone = "us-west1-a"
image_description = "SkyPilot custom image for launching GCP CPU instances."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ variable "zone" {
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
}

source "googlecompute" "gpu-ubuntu" {
image_name = "skypilot-gcp-gpu-ubuntu-${local.timestamp}"
image_name = "skypilot-gcp-gpu-ubuntu-${local.date}"
project_id = "sky-dev-465"
source_image_family = "ubuntu-2204-lts"
zone = var.zone
Expand Down
28 changes: 4 additions & 24 deletions sky/provision/azure/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,30 +311,10 @@ def _create_vm(
vm_name=vm_name,
parameters=vm_instance,
)
# poller.result() will block on async operation until it's done.
logger.info(f'Created VM {vm_poller.result().name}.')
# Configure driver extension for A10 GPUs. A10 GPUs requires a
# special type of drivers which is available at Microsoft HPC
# extension. Reference:
# https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2
# This can take more than 20mins for setting up the A10 GPUs
if node_config.get('need_nvidia_driver_extension', False):
ext_poller = compute_client.virtual_machine_extensions.\
begin_create_or_update(
resource_group_name=provider_config['resource_group'],
vm_name=vm_name,
vm_extension_name='NvidiaGpuDriverLinux',
extension_parameters=compute.VirtualMachineExtension(
location=provider_config['location'],
publisher='Microsoft.HpcCompute',
type_properties_type='NvidiaGpuDriverLinux',
type_handler_version='1.9',
auto_upgrade_minor_version=True,
settings='{}'))
logger.info(
f'Created VM extension {ext_poller.result().name} for VM {vm_name}.'
)
return vm_poller.result()
# This line will block until the VM is created or the operation times out.
vm = vm_poller.result()
logger.info(f'Created VM {vm.name}.')
return vm


def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
Expand Down
1 change: 0 additions & 1 deletion sky/templates/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ available_node_types:
{%- for cmd in cloud_init_setup_commands %}
{{ cmd }}
{%- endfor %}
need_nvidia_driver_extension: {{need_nvidia_driver_extension}}
{%- if disk_performance_tier is not none %}
disk_performance_tier: {{disk_performance_tier}}
{%- endif %}
Expand Down

0 comments on commit 83fade9

Please sign in to comment.