Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Performance] Speed up Azure A10 instance creation #4205

Merged
merged 6 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1950,17 +1950,8 @@ def provision_with_retries(

failover_history: List[Exception] = list()

style = colorama.Style
fore = colorama.Fore
# Retrying launchable resources.
while True:
if (isinstance(to_provision.cloud, clouds.Azure) and
to_provision.accelerators is not None and
'A10' in to_provision.accelerators and prev_handle is None):
logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch '
'an A10 cluster on Azure. This may take ~20 '
'minutes due to driver installation.'
f'{style.RESET_ALL}')
try:
# Recheck cluster name as the 'except:' block below may
# change the cloud assignment.
Expand Down
9 changes: 4 additions & 5 deletions sky/clouds/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
_DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
_FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
# This is used by Azure GPU VMs that use grid drivers (e.g. A10).
_DEFAULT_GPU_GRID_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2-grid'

_COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries'

Expand Down Expand Up @@ -220,6 +222,8 @@ def _get_default_image_tag(self, gen_version, instance_type) -> str:
acc_name = list(acc.keys())[0]
if acc_name == 'K80':
return _DEFAULT_GPU_K80_IMAGE_ID
if acc_name == 'A10':
return _DEFAULT_GPU_GRID_IMAGE_ID
# About Gen V1 vs V2:
# In Azure, all instances with K80 (Standard_NC series), some
# instances with M60 (Standard_NV series) and some cpu instances
Expand Down Expand Up @@ -350,10 +354,6 @@ def make_deploy_resources_variables(
'image_version': version,
}

# Setup the A10 nvidia driver.
need_nvidia_driver_extension = (acc_dict is not None and
'A10' in acc_dict)

# Determine resource group for deploying the instance.
resource_group_name = skypilot_config.get_nested(
('azure', 'resource_group_vm'), None)
Expand Down Expand Up @@ -413,7 +413,6 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
# Azure does not support specific zones.
'zones': None,
**image_config,
'need_nvidia_driver_extension': need_nvidia_driver_extension,
'disk_tier': Azure._get_disk_type(disk_tier),
'cloud_init_setup_commands': cloud_init_setup_commands,
'azure_subscription_id': self.get_project_id(dryrun),
Expand Down
7 changes: 4 additions & 3 deletions sky/clouds/service_catalog/images/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,10 @@ export SECRET=xxxxxx # Update this
```
2. Build and copy images for all regions for GPU (gen 1 & 2, plus the gen 2 GRID-driver image used by A10 VMs) and CPU (gen 2 only).
```bash
export TYPE=gpu # Update this
export VM_GENERATION=1 # Update this
packer build --var vm_generation=${VM_GENERATION} --var client_secret=${SECRET} skypilot-azure-${TYPE}-ubuntu.pkr.hcl
packer build --var vm_generation=2 --var client_secret=${SECRET} skypilot-azure-cpu-ubuntu.pkr.hcl
packer build --var vm_generation=2 --var client_secret=${SECRET} skypilot-azure-gpu-ubuntu.pkr.hcl
packer build --var vm_generation=1 --var client_secret=${SECRET} skypilot-azure-gpu-ubuntu.pkr.hcl
packer build --var vm_generation=2 --var client_secret=${SECRET} --var use_grid_driver=true skypilot-azure-gpu-ubuntu.pkr.hcl
```

## Test Images
Expand Down
33 changes: 33 additions & 0 deletions sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
#
# Packer shell provisioner for the Azure GRID-driver GPU image.
#
# Steps:
#   1. Install build tools needed by the NVIDIA installer.
#   2. Download and install the Azure GRID NVIDIA driver (535.161.08).
#   3. Write the vGPU licensing daemon config (/etc/nvidia/gridd.conf).
#   4. Download and install the CUDA 12.2 toolkit (toolkit only, no driver).
#   5. Persist CUDA paths in ~/.bashrc, clean up installers, verify the driver.
#
# Abort on the first failing command, on use of an unset variable, and on
# failures inside pipelines. Without this, a failed download or installer
# would be ignored and the image build could still succeed, because the
# final `nvidia-smi` check only covers the driver, not the CUDA toolkit.
set -euo pipefail

# apt-get has a stable CLI for scripts; `apt` is intended for interactive use.
sudo apt-get update
sudo apt-get install -y build-essential

echo "Installing GRID driver..."
GRID_DRIVER_URL="https://download.microsoft.com/download/8/d/a/8da4fb8e-3a9b-4e6a-bc9a-72ff64d7a13c/NVIDIA-Linux-x86_64-535.161.08-grid-azure.run"
# Derive the local file name from the URL so the two cannot drift apart.
GRID_DRIVER_FILE="${GRID_DRIVER_URL##*/}"

wget -nv "$GRID_DRIVER_URL" -O "$GRID_DRIVER_FILE"
sudo chmod +x "$GRID_DRIVER_FILE"
sudo sh "$GRID_DRIVER_FILE" --silent --disable-nouveau

echo "Set vGPU Licensing Daemon config..."
sudo cp /etc/nvidia/gridd.conf.template /etc/nvidia/gridd.conf
# Comment out the template's default FeatureType=0 line, then append the
# settings used for this image.
sudo sed -i '/^FeatureType=0/s/^/# /' /etc/nvidia/gridd.conf
echo "IgnoreSP=FALSE" | sudo tee -a /etc/nvidia/gridd.conf
echo "EnableUI=FALSE" | sudo tee -a /etc/nvidia/gridd.conf

echo "Installing CUDA toolkit..."
CUDA_TOOLKIT_URL="https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
CUDA_TOOLKIT_FILE="${CUDA_TOOLKIT_URL##*/}"
wget -nv "$CUDA_TOOLKIT_URL" -O "$CUDA_TOOLKIT_FILE"
# --toolkit installs only the toolkit, keeping the GRID driver installed above.
sudo sh "$CUDA_TOOLKIT_FILE" --silent --toolkit --override

# Persist the CUDA paths for future login shells.
echo 'export PATH=$PATH:/usr/local/cuda-12.2/bin' >> ~/.bashrc
echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.2/lib64' >> ~/.bashrc
# Apply the same paths to this shell directly instead of `source ~/.bashrc`:
# a stock ~/.bashrc may return early in non-interactive shells, in which case
# the lines appended above would never be reached.
export PATH="$PATH:/usr/local/cuda-12.2/bin"
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:/usr/local/cuda-12.2/lib64"

# Remove the downloaded installers so they do not end up in the image.
rm -f "$GRID_DRIVER_FILE" "$CUDA_TOOLKIT_FILE"

# Verify the driver installation; a non-zero exit fails the build.
nvidia-smi
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ variable "region" {
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
}

source "amazon-ebs" "cpu-ubuntu" {
ami_name = "skypilot-aws-cpu-ubuntu-${local.timestamp}"
ami_name = "skypilot-aws-cpu-ubuntu-${local.date}"
instance_type = "t2.micro"
region = var.region
ssh_username = "ubuntu"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ variable "region" {
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
}

source "amazon-ebs" "gpu-ubuntu" {
ami_name = "skypilot-aws-gpu-ubuntu-${local.timestamp}"
ami_name = "skypilot-aws-gpu-ubuntu-${local.date}"
instance_type = "g6.xlarge"
region = var.region
ssh_username = "ubuntu"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@ variable "vm_generation" {
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
version = formatdate("YY.MM.DD", timestamp())
}

source "azure-arm" "cpu-ubuntu" {
managed_image_resource_group_name = "skypilot-images"
managed_image_name = "skypilot-azure-cpu-ubuntu-${local.timestamp}"
managed_image_name = "skypilot-azure-cpu-ubuntu-${local.date}"

subscription_id = "59d8c23c-7ef5-42c7-b2f3-a919ad8026a7"
tenant_id = "7c81f068-46f8-4b26-9a46-2fbec2287e3d"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,20 @@ variable "vm_generation" {
description = "Azure's VM generation, currently support 1 or 2"
}

# Build switch for the GRID-driver variant of the Azure GPU image.
# When true, this template:
#   - builds on a Standard_NV12ads_A10_v5 VM in eastus,
#   - runs ./provisioners/cuda-azure-grid.sh instead of ./provisioners/cuda.sh,
#   - publishes the result as image "skypilot-gpu-gen2-grid" in the
#     "skypilot_images" gallery.
# Defaults to false so existing gen 1 / gen 2 GPU builds are unaffected.
variable "use_grid_driver" {
type = bool
default = false
description = "Whether to use the Azure GRID driver. Currently only A10 GPU VMs need this."
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
version = formatdate("YY.MM.DD", timestamp())
}

source "azure-arm" "gpu-ubuntu" {
managed_image_resource_group_name = "skypilot-images"
managed_image_name = "skypilot-azure-gpu-ubuntu-${local.timestamp}"
managed_image_name = "skypilot-azure-gpu-ubuntu-${local.date}"

subscription_id = "59d8c23c-7ef5-42c7-b2f3-a919ad8026a7"
tenant_id = "7c81f068-46f8-4b26-9a46-2fbec2287e3d"
Expand All @@ -26,8 +32,8 @@ source "azure-arm" "gpu-ubuntu" {
image_publisher = "Canonical"
image_offer = "0001-com-ubuntu-server-jammy"
image_sku = var.vm_generation == 1 ? "22_04-lts" : "22_04-lts-gen2"
location = var.vm_generation == 1 ? "eastus" : "centralus"
vm_size = var.vm_generation == 1 ? "Standard_NC4as_T4_v3" : "Standard_NC24ads_A100_v4"
location = var.use_grid_driver || var.vm_generation == 1 ? "eastus" : "centralus"
vm_size = var.use_grid_driver ? "Standard_NV12ads_A10_v5" : (var.vm_generation == 1 ? "Standard_NC4as_T4_v3" : "Standard_NC24ads_A100_v4")
ssh_username = "azureuser"
azure_tags = {
Created_by = "packer"
Expand All @@ -37,8 +43,8 @@ source "azure-arm" "gpu-ubuntu" {
shared_image_gallery_destination {
subscription = "59d8c23c-7ef5-42c7-b2f3-a919ad8026a7"
resource_group = "skypilot-images"
gallery_name = var.vm_generation == 1 ? "skypilot_images": "skypilot_image_gallery"
image_name = "skypilot-gpu-gen${var.vm_generation}"
gallery_name = var.use_grid_driver || var.vm_generation == 1 ? "skypilot_images" : "skypilot_image_gallery"
image_name = var.use_grid_driver ? "skypilot-gpu-gen2-grid" : "skypilot-gpu-gen${var.vm_generation}"
image_version = "${local.version}"
replication_regions = [
"centralus",
Expand All @@ -61,7 +67,7 @@ build {
script = "./provisioners/docker.sh"
}
provisioner "shell" {
script = "./provisioners/cuda.sh"
script = var.use_grid_driver ? "./provisioners/cuda-azure-grid.sh" : "./provisioners/cuda.sh"
}
provisioner "shell" {
script = "./provisioners/nvidia-container-toolkit.sh"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
}

source "googlecompute" "cpu-ubuntu" {
project_id = "sky-dev-465"
image_name = "skypilot-gcp-cpu-ubuntu-${local.timestamp}"
image_name = "skypilot-gcp-cpu-ubuntu-${local.date}"
source_image_family = "ubuntu-2204-lts"
zone = "us-west1-a"
image_description = "SkyPilot custom image for launching GCP CPU instances."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ variable "zone" {
}

locals {
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
date = formatdate("YYMMDD", timestamp())
}

source "googlecompute" "gpu-ubuntu" {
image_name = "skypilot-gcp-gpu-ubuntu-${local.timestamp}"
image_name = "skypilot-gcp-gpu-ubuntu-${local.date}"
project_id = "sky-dev-465"
source_image_family = "ubuntu-2204-lts"
zone = var.zone
Expand Down
28 changes: 4 additions & 24 deletions sky/provision/azure/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,30 +311,10 @@ def _create_vm(
vm_name=vm_name,
parameters=vm_instance,
)
# poller.result() will block on async operation until it's done.
logger.info(f'Created VM {vm_poller.result().name}.')
# Configure driver extension for A10 GPUs. A10 GPUs requires a
# special type of drivers which is available at Microsoft HPC
# extension. Reference:
# https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2
# This can take more than 20mins for setting up the A10 GPUs
if node_config.get('need_nvidia_driver_extension', False):
ext_poller = compute_client.virtual_machine_extensions.\
begin_create_or_update(
resource_group_name=provider_config['resource_group'],
vm_name=vm_name,
vm_extension_name='NvidiaGpuDriverLinux',
extension_parameters=compute.VirtualMachineExtension(
location=provider_config['location'],
publisher='Microsoft.HpcCompute',
type_properties_type='NvidiaGpuDriverLinux',
type_handler_version='1.9',
auto_upgrade_minor_version=True,
settings='{}'))
logger.info(
f'Created VM extension {ext_poller.result().name} for VM {vm_name}.'
)
return vm_poller.result()
# This line will block until the VM is created or the operation times out.
vm = vm_poller.result()
logger.info(f'Created VM {vm.name}.')
return vm


def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
Expand Down
1 change: 0 additions & 1 deletion sky/templates/azure-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ available_node_types:
{%- for cmd in cloud_init_setup_commands %}
{{ cmd }}
{%- endfor %}
need_nvidia_driver_extension: {{need_nvidia_driver_extension}}
{%- if disk_performance_tier is not none %}
disk_performance_tier: {{disk_performance_tier}}
{%- endif %}
Expand Down
Loading