From dbfb3d837e02f616dd969ded20f895d0b454bee6 Mon Sep 17 00:00:00 2001 From: Yika Luo Date: Tue, 29 Oct 2024 19:23:27 -0700 Subject: [PATCH] disable nouveau and use smaller instance --- sky/clouds/service_catalog/images/README.md | 7 ++++--- .../images/provisioners/cuda-azure-grid.sh | 2 +- .../images/skypilot-azure-gpu-ubuntu.pkr.hcl | 15 ++------------- sky/provision/azure/instance.py | 2 +- 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/sky/clouds/service_catalog/images/README.md b/sky/clouds/service_catalog/images/README.md index 2323c1b01043..32f91fc67fb2 100644 --- a/sky/clouds/service_catalog/images/README.md +++ b/sky/clouds/service_catalog/images/README.md @@ -59,9 +59,10 @@ export SECRET=xxxxxx # Update this ``` 2. Build and copy images for all regions for GPU (gen 1 & 2) and CPU (gen 2 only). ```bash -export TYPE=gpu # Update this -export VM_GENERATION=1 # Update this -packer build --var vm_generation=${VM_GENERATION} --var client_secret=${SECRET} skypilot-azure-${TYPE}-ubuntu.pkr.hcl +packer build --var vm_generation=2 --var client_secret=${SECRET} skypilot-azure-cpu-ubuntu.pkr.hcl +packer build --var vm_generation=2 --var client_secret=${SECRET} skypilot-azure-gpu-ubuntu.pkr.hcl +packer build --var vm_generation=1 --var client_secret=${SECRET} skypilot-azure-gpu-ubuntu.pkr.hcl +packer build --var vm_generation=2 --var client_secret=${SECRET} --var use_grid_driver=true skypilot-azure-gpu-ubuntu.pkr.hcl ``` ## Test Images diff --git a/sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh b/sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh index 79539ea5dae8..c5d1e0c2d9a9 100644 --- a/sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh +++ b/sky/clouds/service_catalog/images/provisioners/cuda-azure-grid.sh @@ -9,7 +9,7 @@ GRID_DRIVER_FILE="NVIDIA-Linux-x86_64-535.161.08-grid-azure.run" wget -nv $GRID_DRIVER_URL -O $GRID_DRIVER_FILE sudo chmod +x $GRID_DRIVER_FILE -sudo sh $GRID_DRIVER_FILE --silent +sudo sh $GRID_DRIVER_FILE --silent --disable-nouveau echo "Installing CUDA toolkit..." diff --git a/sky/clouds/service_catalog/images/skypilot-azure-gpu-ubuntu.pkr.hcl b/sky/clouds/service_catalog/images/skypilot-azure-gpu-ubuntu.pkr.hcl index 4ddd307e6912..977d538539bd 100644 --- a/sky/clouds/service_catalog/images/skypilot-azure-gpu-ubuntu.pkr.hcl +++ b/sky/clouds/service_catalog/images/skypilot-azure-gpu-ubuntu.pkr.hcl @@ -33,7 +33,7 @@ source "azure-arm" "gpu-ubuntu" { image_offer = "0001-com-ubuntu-server-jammy" image_sku = var.vm_generation == 1 ? "22_04-lts" : "22_04-lts-gen2" location = var.use_grid_driver || var.vm_generation == 1 ? "eastus" : "centralus" - vm_size = var.use_grid_driver ? "Standard_NV36ads_A10_v5" : (var.vm_generation == 1 ? "Standard_NC4as_T4_v3" : "Standard_NC24ads_A100_v4") + vm_size = var.use_grid_driver ? "Standard_NV6ads_A10_v5" : (var.vm_generation == 1 ? "Standard_NC4as_T4_v3" : "Standard_NC24ads_A100_v4") ssh_username = "azureuser" azure_tags = { Created_by = "packer" @@ -45,18 +45,7 @@ source "azure-arm" "gpu-ubuntu" { resource_group = "skypilot-images" gallery_name = var.use_grid_driver || var.vm_generation == 1 ? "skypilot_images" : "skypilot_image_gallery" image_name = var.use_grid_driver ? "skypilot-gpu-gen2-grid" : "skypilot-gpu-gen${var.vm_generation}" - image_version = "${local.version}" - replication_regions = [ - "centralus", - "eastus", - "eastus2", - "northcentralus", - "southcentralus", - "westcentralus", - "westus", - "westus2", - "westus3" - ] + image_version = "${local.version}2" } } diff --git a/sky/provision/azure/instance.py b/sky/provision/azure/instance.py index a8c053f8a51a..71fb9e8284d1 100644 --- a/sky/provision/azure/instance.py +++ b/sky/provision/azure/instance.py @@ -311,7 +311,7 @@ def _create_vm( vm_name=vm_name, parameters=vm_instance, ) - # poller.result() will block on async operation until it's done. + vm_poller.wait() logger.info(f'Created VM {vm_poller.result().name}.') return vm_poller.result()