From 98a38ecc7f3eae3fb95ed43e5474a9812f51572b Mon Sep 17 00:00:00 2001 From: Yika Date: Wed, 30 Oct 2024 13:33:29 -0700 Subject: [PATCH] [Performance] Speed up Azure A10 instance creation (#4205) * Use date instead of timestamp in skypilot image names * Speed up Azure A10 VM creation * disable nouveau and use smaller instance * address comments * address comments * add todo --- sky/backends/cloud_vm_ray_backend.py | 9 ----- sky/clouds/azure.py | 9 +++-- sky/clouds/service_catalog/images/README.md | 7 ++-- .../images/provisioners/cuda-azure-grid.sh | 33 +++++++++++++++++++ .../images/skypilot-aws-cpu-ubuntu.pkr.hcl | 4 +-- .../images/skypilot-aws-gpu-ubuntu.pkr.hcl | 4 +-- .../images/skypilot-azure-cpu-ubuntu.pkr.hcl | 5 +-- .../images/skypilot-azure-gpu-ubuntu.pkr.hcl | 20 +++++++---- .../images/skypilot-gcp-cpu-ubuntu.pkr.hcl | 4 +-- .../images/skypilot-gcp-gpu-ubuntu.pkr.hcl | 4 +-- sky/provision/azure/instance.py | 28 +++------------- sky/templates/azure-ray.yml.j2 | 1 - 12 files changed, 69 insertions(+), 59 deletions(-) create mode 100644 sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 27e61b5b371..b0a064afe7c 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1950,17 +1950,8 @@ def provision_with_retries( failover_history: List[Exception] = list() - style = colorama.Style - fore = colorama.Fore # Retrying launchable resources. while True: - if (isinstance(to_provision.cloud, clouds.Azure) and - to_provision.accelerators is not None and - 'A10' in to_provision.accelerators and prev_handle is None): - logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch ' - 'an A10 cluster on Azure. This may take ~20 ' - 'minutes due to driver installation.' - f'{style.RESET_ALL}') try: # Recheck cluster name as the 'except:' block below may # change the cloud assignment. diff --git a/sky/clouds/azure.py b/sky/clouds/azure.py index cc90f273dd9..edd5840d271 100644 --- a/sky/clouds/azure.py +++ b/sky/clouds/azure.py @@ -44,6 +44,8 @@ _DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1' _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004' _FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204' +# This is used by Azure GPU VMs that use grid drivers (e.g. A10). +_DEFAULT_GPU_GRID_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2-grid' _COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries' @@ -220,6 +222,8 @@ def _get_default_image_tag(self, gen_version, instance_type) -> str: acc_name = list(acc.keys())[0] if acc_name == 'K80': return _DEFAULT_GPU_K80_IMAGE_ID + if acc_name == 'A10': + return _DEFAULT_GPU_GRID_IMAGE_ID # About Gen V1 vs V2: # In Azure, all instances with K80 (Standard_NC series), some # instances with M60 (Standard_NV series) and some cpu instances @@ -350,10 +354,6 @@ def make_deploy_resources_variables( 'image_version': version, } - # Setup the A10 nvidia driver. - need_nvidia_driver_extension = (acc_dict is not None and - 'A10' in acc_dict) - # Determine resource group for deploying the instance. resource_group_name = skypilot_config.get_nested( ('azure', 'resource_group_vm'), None) @@ -413,7 +413,6 @@ def _failover_disk_tier() -> Optional[resources_utils.DiskTier]: # Azure does not support specific zones. 'zones': None, **image_config, - 'need_nvidia_driver_extension': need_nvidia_driver_extension, 'disk_tier': Azure._get_disk_type(disk_tier), 'cloud_init_setup_commands': cloud_init_setup_commands, 'azure_subscription_id': self.get_project_id(dryrun), diff --git a/sky/clouds/service_catalog/images/README.md b/sky/clouds/service_catalog/images/README.md index 2323c1b0104..32f91fc67fb 100644 --- a/sky/clouds/service_catalog/images/README.md +++ b/sky/clouds/service_catalog/images/README.md @@ -59,9 +59,10 @@ export SECRET=xxxxxx # Update this ``` 2. Build and copy images for all regions for GPU (gen 1 & 2) and CPU (gen 2 only). ```bash -export TYPE=gpu # Update this -export VM_GENERATION=1 # Update this -packer build --var vm_generation=${VM_GENERATION} --var client_secret=${SECRET} skypilot-azure-${TYPE}-ubuntu.pkr.hcl +packer build --var vm_generation=2 --var client_secret=${SECRET} skypilot-azure-cpu-ubuntu.pkr.hcl +packer build --var vm_generation=2 --var client_secret=${SECRET} skypilot-azure-gpu-ubuntu.pkr.hcl +packer build --var vm_generation=1 --var client_secret=${SECRET} skypilot-azure-gpu-ubuntu.pkr.hcl +packer build --var vm_generation=2 --var client_secret=${SECRET} --var use_grid_driver=true skypilot-azure-gpu-ubuntu.pkr.hcl ``` ## Test Images diff --git a/sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh b/sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh new file mode 100644 index 00000000000..6177dfa5d53 --- /dev/null +++ b/sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +sudo apt update +sudo apt install -y build-essential + +echo "Installing GRID driver..." +GRID_DRIVER_URL="https://download.microsoft.com/download/8/d/a/8da4fb8e-3a9b-4e6a-bc9a-72ff64d7a13c/NVIDIA-Linux-x86_64-535.161.08-grid-azure.run" +GRID_DRIVER_FILE="NVIDIA-Linux-x86_64-535.161.08-grid-azure.run" + +wget -nv $GRID_DRIVER_URL -O $GRID_DRIVER_FILE +sudo chmod +x $GRID_DRIVER_FILE +sudo sh $GRID_DRIVER_FILE --silent --disable-nouveau + +echo "Set vGPU Licensing Daemon config..." +sudo cp /etc/nvidia/gridd.conf.template /etc/nvidia/gridd.conf +sudo sed -i '/^FeatureType=0/s/^/# /' /etc/nvidia/gridd.conf +echo "IgnoreSP=FALSE" | sudo tee -a /etc/nvidia/gridd.conf +echo "EnableUI=FALSE" | sudo tee -a /etc/nvidia/gridd.conf + +echo "Installing CUDA toolkit..." +CUDA_TOOLKIT_URL="https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run" +CUDA_TOOLKIT_FILE="cuda_12.2.0_535.54.03_linux.run" +wget -nv $CUDA_TOOLKIT_URL -O $CUDA_TOOLKIT_FILE +sudo sh $CUDA_TOOLKIT_FILE --silent --toolkit --override + +# Set environment variables +echo 'export PATH=$PATH:/usr/local/cuda-12.2/bin' >> ~/.bashrc +echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.2/lib64' >> ~/.bashrc +source ~/.bashrc + +# Verify installations +rm -f NVIDIA-Linux-x86_64-535.161.08-grid-azure.run cuda_12.2.0_535.54.03_linux.run +nvidia-smi diff --git a/sky/clouds/service_catalog/images/skypilot-aws-cpu-ubuntu.pkr.hcl b/sky/clouds/service_catalog/images/skypilot-aws-cpu-ubuntu.pkr.hcl index 0e2f74c5355..5611b589cf9 100644 --- a/sky/clouds/service_catalog/images/skypilot-aws-cpu-ubuntu.pkr.hcl +++ b/sky/clouds/service_catalog/images/skypilot-aws-cpu-ubuntu.pkr.hcl @@ -4,11 +4,11 @@ variable "region" { } locals { - timestamp = regex_replace(timestamp(), "[- TZ:]", "") + date = formatdate("YYMMDD", timestamp()) } source "amazon-ebs" "cpu-ubuntu" { - ami_name = "skypilot-aws-cpu-ubuntu-${local.timestamp}" + ami_name = "skypilot-aws-cpu-ubuntu-${local.date}" instance_type = "t2.micro" region = var.region ssh_username = "ubuntu" diff --git a/sky/clouds/service_catalog/images/skypilot-aws-gpu-ubuntu.pkr.hcl b/sky/clouds/service_catalog/images/skypilot-aws-gpu-ubuntu.pkr.hcl index 40ca77f49a0..91aad2aa71e 100644 --- a/sky/clouds/service_catalog/images/skypilot-aws-gpu-ubuntu.pkr.hcl +++ b/sky/clouds/service_catalog/images/skypilot-aws-gpu-ubuntu.pkr.hcl @@ -4,11 +4,11 @@ variable "region" { } locals { - timestamp = regex_replace(timestamp(), "[- TZ:]", "") + date = formatdate("YYMMDD", timestamp()) } source "amazon-ebs" "gpu-ubuntu" { - ami_name = "skypilot-aws-gpu-ubuntu-${local.timestamp}" + ami_name = "skypilot-aws-gpu-ubuntu-${local.date}" instance_type = "g6.xlarge" region = var.region ssh_username = "ubuntu" diff --git a/sky/clouds/service_catalog/images/skypilot-azure-cpu-ubuntu.pkr.hcl b/sky/clouds/service_catalog/images/skypilot-azure-cpu-ubuntu.pkr.hcl index 0b7dd18188d..dd08e1baef7 100644 --- a/sky/clouds/service_catalog/images/skypilot-azure-cpu-ubuntu.pkr.hcl +++ b/sky/clouds/service_catalog/images/skypilot-azure-cpu-ubuntu.pkr.hcl @@ -9,13 +9,14 @@ variable "vm_generation" { } locals { - timestamp = regex_replace(timestamp(), "[- TZ:]", "") + date = formatdate("YYMMDD", timestamp()) version = formatdate("YY.MM.DD", timestamp()) } source "azure-arm" "cpu-ubuntu" { managed_image_resource_group_name = "skypilot-images" - managed_image_name = "skypilot-azure-cpu-ubuntu-${local.timestamp}" + // TODO(yika): these fields may not be required as we use community images below instead. We need to double-check if these can be removed. + managed_image_name = "skypilot-azure-cpu-ubuntu-${local.date}" subscription_id = "59d8c23c-7ef5-42c7-b2f3-a919ad8026a7" tenant_id = "7c81f068-46f8-4b26-9a46-2fbec2287e3d" diff --git a/sky/clouds/service_catalog/images/skypilot-azure-gpu-ubuntu.pkr.hcl b/sky/clouds/service_catalog/images/skypilot-azure-gpu-ubuntu.pkr.hcl index e5901d4107f..d66125cfffb 100644 --- a/sky/clouds/service_catalog/images/skypilot-azure-gpu-ubuntu.pkr.hcl +++ b/sky/clouds/service_catalog/images/skypilot-azure-gpu-ubuntu.pkr.hcl @@ -8,14 +8,20 @@ variable "vm_generation" { description = "Azure's VM generation, currently support 1 or 2" } +variable "use_grid_driver" { + type = bool + default = false + description = "Whether to use the Azure GRID driver. Currently only A10 GPU VMs need this." +} + locals { - timestamp = regex_replace(timestamp(), "[- TZ:]", "") + date = formatdate("YYMMDD", timestamp()) version = formatdate("YY.MM.DD", timestamp()) } source "azure-arm" "gpu-ubuntu" { managed_image_resource_group_name = "skypilot-images" - managed_image_name = "skypilot-azure-gpu-ubuntu-${local.timestamp}" + managed_image_name = "skypilot-azure-gpu-ubuntu-${local.date}" subscription_id = "59d8c23c-7ef5-42c7-b2f3-a919ad8026a7" tenant_id = "7c81f068-46f8-4b26-9a46-2fbec2287e3d" @@ -26,8 +32,8 @@ source "azure-arm" "gpu-ubuntu" { image_publisher = "Canonical" image_offer = "0001-com-ubuntu-server-jammy" image_sku = var.vm_generation == 1 ? "22_04-lts" : "22_04-lts-gen2" - location = var.vm_generation == 1 ? "eastus" : "centralus" - vm_size = var.vm_generation == 1 ? "Standard_NC4as_T4_v3" : "Standard_NC24ads_A100_v4" + location = var.use_grid_driver || var.vm_generation == 1 ? "eastus" : "centralus" + vm_size = var.use_grid_driver ? "Standard_NV12ads_A10_v5" : (var.vm_generation == 1 ? "Standard_NC4as_T4_v3" : "Standard_NC24ads_A100_v4") ssh_username = "azureuser" azure_tags = { Created_by = "packer" @@ -37,8 +43,8 @@ source "azure-arm" "gpu-ubuntu" { shared_image_gallery_destination { subscription = "59d8c23c-7ef5-42c7-b2f3-a919ad8026a7" resource_group = "skypilot-images" - gallery_name = var.vm_generation == 1 ? "skypilot_images": "skypilot_image_gallery" - image_name = "skypilot-gpu-gen${var.vm_generation}" + gallery_name = var.use_grid_driver || var.vm_generation == 1 ? "skypilot_images" : "skypilot_image_gallery" + image_name = var.use_grid_driver ? "skypilot-gpu-gen2-grid" : "skypilot-gpu-gen${var.vm_generation}" image_version = "${local.version}" replication_regions = [ "centralus", @@ -61,7 +67,7 @@ build { script = "./provisioners/docker.sh" } provisioner "shell" { - script = "./provisioners/cuda.sh" + script = var.use_grid_driver ? "./provisioners/cuda-azure-grid.sh" : "./provisioners/cuda.sh" } provisioner "shell" { script = "./provisioners/nvidia-container-toolkit.sh" diff --git a/sky/clouds/service_catalog/images/skypilot-gcp-cpu-ubuntu.pkr.hcl b/sky/clouds/service_catalog/images/skypilot-gcp-cpu-ubuntu.pkr.hcl index 3c56e3d5af6..056f2d1239e 100644 --- a/sky/clouds/service_catalog/images/skypilot-gcp-cpu-ubuntu.pkr.hcl +++ b/sky/clouds/service_catalog/images/skypilot-gcp-cpu-ubuntu.pkr.hcl @@ -1,11 +1,11 @@ locals { - timestamp = regex_replace(timestamp(), "[- TZ:]", "") + date = formatdate("YYMMDD", timestamp()) } source "googlecompute" "cpu-ubuntu" { project_id = "sky-dev-465" - image_name = "skypilot-gcp-cpu-ubuntu-${local.timestamp}" + image_name = "skypilot-gcp-cpu-ubuntu-${local.date}" source_image_family = "ubuntu-2204-lts" zone = "us-west1-a" image_description = "SkyPilot custom image for launching GCP CPU instances." diff --git a/sky/clouds/service_catalog/images/skypilot-gcp-gpu-ubuntu.pkr.hcl b/sky/clouds/service_catalog/images/skypilot-gcp-gpu-ubuntu.pkr.hcl index 578c0b1269f..8d57456b29b 100644 --- a/sky/clouds/service_catalog/images/skypilot-gcp-gpu-ubuntu.pkr.hcl +++ b/sky/clouds/service_catalog/images/skypilot-gcp-gpu-ubuntu.pkr.hcl @@ -4,11 +4,11 @@ variable "zone" { } locals { - timestamp = regex_replace(timestamp(), "[- TZ:]", "") + date = formatdate("YYMMDD", timestamp()) } source "googlecompute" "gpu-ubuntu" { - image_name = "skypilot-gcp-gpu-ubuntu-${local.timestamp}" + image_name = "skypilot-gcp-gpu-ubuntu-${local.date}" project_id = "sky-dev-465" source_image_family = "ubuntu-2204-lts" zone = var.zone diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index 24ba012ea9e..60159232787 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -311,30 +311,10 @@ def _create_vm( vm_name=vm_name, parameters=vm_instance, ) - # poller.result() will block on async operation until it's done. - logger.info(f'Created VM {vm_poller.result().name}.') - # Configure driver extension for A10 GPUs. A10 GPUs requires a - # special type of drivers which is available at Microsoft HPC - # extension. Reference: - # https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2 - # This can take more than 20mins for setting up the A10 GPUs - if node_config.get('need_nvidia_driver_extension', False): - ext_poller = compute_client.virtual_machine_extensions.\ - begin_create_or_update( - resource_group_name=provider_config['resource_group'], - vm_name=vm_name, - vm_extension_name='NvidiaGpuDriverLinux', - extension_parameters=compute.VirtualMachineExtension( - location=provider_config['location'], - publisher='Microsoft.HpcCompute', - type_properties_type='NvidiaGpuDriverLinux', - type_handler_version='1.9', - auto_upgrade_minor_version=True, - settings='{}')) - logger.info( - f'Created VM extension {ext_poller.result().name} for VM {vm_name}.' - ) - return vm_poller.result() + # This line will block until the VM is created or the operation times out. + vm = vm_poller.result() + logger.info(f'Created VM {vm.name}.') + return vm def _create_instances(compute_client: 'azure_compute.ComputeManagementClient', diff --git a/sky/templates/azure-ray.yml.j2 b/sky/templates/azure-ray.yml.j2 index 36bf9468b23..7b9737748d3 100644 --- a/sky/templates/azure-ray.yml.j2 +++ b/sky/templates/azure-ray.yml.j2 @@ -83,7 +83,6 @@ available_node_types: {%- for cmd in cloud_init_setup_commands %} {{ cmd }} {%- endfor %} - need_nvidia_driver_extension: {{need_nvidia_driver_extension}} {%- if disk_performance_tier is not none %} disk_performance_tier: {{disk_performance_tier}} {%- endif %}