Commit 98118c7

Migrate existing notebooks to the azureml-examples repo from azureml-foundation-models (#2432)

Co-authored-by: grajguru <grajguru@microsoft.com>
gauravrajguru and grajguru authored Jul 7, 2023
1 parent 2cee822
Showing 20 changed files with 3,742 additions and 0 deletions.
deepspeed_configs/zero1.json
@@ -0,0 +1,42 @@
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 1,
        "allgather_partitions": true,
        "allgather_bucket_size": 200000000,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 200000000,
        "contiguous_gradients": false,
        "cpu_offload": false
    },
    "zero_allow_untested_optimizer": true,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "wall_clock_breakdown": false
}
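Note that the "auto" placeholders in this config are not literal values: when the file is handed to the Hugging Face Trainer, they are resolved from the corresponding TrainingArguments at launch time. A minimal sketch of that hand-off (the argument values shown are illustrative assumptions, not part of this commit):

# Sketch: the Trainer fills every "auto" field (lr, betas, warmup,
# micro batch size, gradient accumulation) from these arguments.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs",                      # hypothetical output folder
    deepspeed="deepspeed_configs/zero1.json",  # the file above
    learning_rate=5e-5,                        # -> optimizer.params.lr
    per_device_train_batch_size=4,             # -> train_micro_batch_size_per_gpu
    gradient_accumulation_steps=1,             # -> gradient_accumulation_steps
    fp16=True,                                 # -> fp16.enabled
)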
deploy.yaml
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: demo
instance_type: Standard_DS3_v2
instance_count: 1
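deploy.yaml deliberately omits endpoint_name and model; the driver script injects them at deploy time via --set (see step 7 below). For reference, a hedged azure-ai-ml SDK equivalent of the same deployment; the workspace coordinates, endpoint name, and model reference here are placeholders, not values from this commit:

# Minimal sketch (assumption): SDK equivalent of deploy.yaml above.
from azure.ai.ml import MLClient
from azure.ai.ml.entities import ManagedOnlineDeployment
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)

deployment = ManagedOnlineDeployment(
    name="demo",
    endpoint_name="<ENDPOINT_NAME>",         # supplied at deploy time, like --set in the script
    model="azureml:<MODEL_NAME>:<VERSION>",  # placeholder; the script passes the fine-tuned model here
    instance_type="Standard_DS3_v2",
    instance_count=1,
)
# long-running operation; block until the deployment is provisioned
ml_client.online_deployments.begin_create_or_update(deployment).result()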
hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml
@@ -0,0 +1,96 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

experiment_name: AzureML-Train-Finetune-Vision-MultiClass-Samples

inputs:
  # model - specify the foundation model available in the azureml system registry
  mlflow_model_path:
    path: azureml://registries/azureml-preview/models/google-vit-base-patch16-224/versions/1
    type: mlflow_model
  # model_name: microsoft/beit-base-patch16-224-pt22k-ft22k
  # dataset files
  training_data:
    path: ./data/training-mltable-folder
    type: mltable
  validation_data:
    path: ./data/validation-mltable-folder
    type: mltable
  # deepspeed config file
  ds_finetune:
    path: ./deepspeed_configs/zero1.json
    type: uri_file
  # compute
  compute_model_import: sample-model-import-cluster
  compute_finetune: sample-finetune-cluster-gpu-nc6

outputs:
  # map the output of the fine-tuning job to the output of the pipeline job so that we can easily register the fine-tuned model;
  # registering the model is required to deploy it to an online or batch endpoint
  trained_model:
    type: mlflow_model

settings:
  force_rerun: true
  default_compute: azureml:sample-finetune-cluster-gpu-nc6

jobs:
  huggingface_transformers_model_finetune_job:
    type: pipeline
    component: azureml://registries/azureml-preview/components/image_classification_pipeline/labels/latest
    inputs:
      # compute
      compute_model_import: ${{parent.inputs.compute_model_import}}
      compute_finetune: ${{parent.inputs.compute_finetune}}
      number_of_gpu_to_use_finetuning: 1
      num_nodes_finetune: 1

      # model
      task_name: image-classification
      model_family: HuggingFaceImage
      # specify model_name instead of mlflow_model if you want to use a model from the huggingface hub
      mlflow_model: ${{parent.inputs.mlflow_model_path}}
      # model_name: ${{parent.inputs.model_name}}

      # data
      training_data: ${{parent.inputs.training_data}}
      validation_data: ${{parent.inputs.validation_data}}

      image_width: 224
      image_height: 224
      number_of_workers: 8
      apply_augmentations: True
      apply_deepspeed: False
      deepspeed_config: ${{parent.inputs.ds_finetune}}
      apply_ort: False
      number_of_epochs: 15
      max_steps: -1
      training_batch_size: 4
      validation_batch_size: 4
      auto_find_batch_size: False
      learning_rate: 5e-5
      learning_rate_scheduler: warmup_linear
      warmup_steps: 0
      optimizer: adamw_hf
      weight_decay: 0.0
      gradient_accumulation_step: 1
      precision: 32
      metric_for_best_model: accuracy
      label_smoothing_factor: 0.0
      random_seed: 42
      evaluation_strategy: epoch
      evaluation_steps: 500
      logging_strategy: epoch
      logging_steps: 500
      save_strategy: epoch
      save_steps: 500
      save_total_limit: -1
      early_stopping: False
      early_stopping_patience: 1
      max_grad_norm: 1.0
      resume_from_checkpoint: False
      save_as_mlflow_model: True

    outputs:
      mlflow_model_folder: ${{parent.outputs.trained_model}}
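The driver script below submits this file with az ml job create. As a hedged alternative, the same YAML can be loaded and submitted through the azure-ai-ml SDK; a sketch only, with placeholder workspace coordinates:

# Minimal sketch (assumption): submit the pipeline YAML above via the
# azure-ai-ml SDK instead of `az ml job create`.
from azure.ai.ml import MLClient, load_job
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)

pipeline_job = load_job("./hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml")
submitted = ml_client.jobs.create_or_update(pipeline_job)
ml_client.jobs.stream(submitted.name)  # follow logs, like `az ml job stream`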
@@ -0,0 +1,214 @@
#!/bin/bash
set -x

# script inputs
registry_name="azureml-preview"
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"

compute_cluster_model_import="sample-model-import-cluster"
compute_cluster_finetune="sample-finetune-cluster-gpu-nc6"
# if the above compute clusters do not exist, create them with the following VM sizes
compute_model_import_sku="Standard_D12"
compute_finetune_sku="Standard_NC6"
# This is the number of GPUs in a single node of the selected 'vm_size' compute.
# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
# Setting this to more than the number of GPUs will result in an error.
gpus_per_node=1

# huggingFace model
huggingface_model_name="microsoft/beit-base-patch16-224-pt22k-ft22k"
# This is the foundation model for fine-tuning, from the azureml system registry
# (resolving the latest version of the model is not working yet, so a version is pinned below)
aml_registry_model_name="microsoft-beit-base-patch16-224-pt22k-ft22k"
model_version=1

version=$(date +%s)
finetuned_huggingface_model_name="microsoft-beit-base-patch16-224-pt22k-ft22k-fridge-objects-multiclass-classification"
huggingface_endpoint_name="hf-mc-fridge-items-$version"
deployment_sku="Standard_DS3_V2"

# Deepspeed config
ds_finetune="./deepspeed_configs/zero1.json"

# Scoring file
huggingface_sample_request_data="./huggingface_sample_request_data.json"

# finetuning job parameters
finetuning_pipeline_component="transformers_image_classification_pipeline"
# Training settings
number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute

# 1. Install dependencies
pip install azure-ai-ml==1.0.0
pip install azure-identity
pip install datasets==2.3.2

unameOut=$(uname -a)
case "${unameOut}" in
    *Microsoft*)  OS="WSL";;  # must be first, since Windows Subsystem for Linux also has "Linux" in the name
    *microsoft*)  OS="WSL2";; # WARNING: verified on Ubuntu 20.04; a slightly different name may not always match
    Linux*)       OS="Linux";;
    Darwin*)      OS="Mac";;
    CYGWIN*)      OS="Cygwin";;
    MINGW*)       OS="Windows";;
    *Msys)        OS="Windows";;
    *)            OS="UNKNOWN:${unameOut}"
esac
if [[ ${OS} == "Mac" ]] && sysctl -n machdep.cpu.brand_string | grep -q 'Apple M1'; then
    OS="MacM1"
fi
echo ${OS}

# check the exit status of `jq --version` itself (an intervening echo would reset $?)
if jq_version=$(jq --version); then
    echo ${jq_version}
    echo "jq already installed"
else
    echo "jq not installed, installing now..."
    # Install jq
    if [[ ${OS} == "Mac" ]] || [[ ${OS} == "MacM1" ]]; then
        # Install jq on Mac
        brew install jq
    elif [[ ${OS} == "WSL" ]] || [[ ${OS} == "WSL2" ]] || [[ ${OS} == "Linux" ]]; then
        # Install jq on WSL or Linux
        sudo apt-get install jq
    elif [[ ${OS} == "Windows" ]] || [[ ${OS} == "Cygwin" ]]; then
        # Install jq on Windows
        curl -L -o ./jq.exe https://github.com/stedolan/jq/releases/latest/download/jq-win64.exe
    else
        echo "Failed to install jq! This might cause issues"
    fi
fi


# 2. Setup pre-requisites
az account set -s $subscription_id
workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"

# check if $compute_cluster_model_import exists, else create it
if az ml compute show --name $compute_cluster_model_import $workspace_info
then
    echo "Compute cluster $compute_cluster_model_import already exists"
else
    echo "Creating compute cluster $compute_cluster_model_import"
    az ml compute create --name $compute_cluster_model_import --type amlcompute --min-instances 0 --max-instances 2 --size $compute_model_import_sku $workspace_info || {
        echo "Failed to create compute cluster $compute_cluster_model_import"
        exit 1
    }
fi

# check if $compute_cluster_finetune exists, else create it
if az ml compute show --name $compute_cluster_finetune $workspace_info
then
    echo "Compute cluster $compute_cluster_finetune already exists"
else
    echo "Creating compute cluster $compute_cluster_finetune"
    az ml compute create --name $compute_cluster_finetune --type amlcompute --min-instances 0 --max-instances 2 --size $compute_finetune_sku $workspace_info || {
        echo "Failed to create compute cluster $compute_cluster_finetune"
        exit 1
    }
fi

# check if the finetuning pipeline component exists
if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
then
    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
    exit 1
fi

# 3. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $aml_registry_model_name --version $model_version --registry-name $registry_name
then
    echo "Model $aml_registry_model_name:$model_version does not exist in registry $registry_name"
    exit 1
fi

# 4. Prepare data
python prepare_data.py
# training data
train_data="./data/training-mltable-folder"
# validation data
validation_data="./data/validation-mltable-folder"

# Check if training and validation data exist
if [ ! -d $train_data ]; then
    echo "Training data $train_data does not exist"
    exit 1
fi
if [ ! -d $validation_data ]; then
    echo "Validation data $validation_data does not exist"
    exit 1
fi

# 5. Submit finetuning job using pipeline.yaml for a HuggingFace Transformers model

# Need to switch to using the latest version for the model; currently blocked by a bug.

# If you want to use a HuggingFace model, specify inputs.model_name instead of inputs.mlflow_model_path.path, like below:
# inputs.model_name=$huggingface_model_name

huggingface_parent_job=$( az ml job create \
    --file "./hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml" \
    $workspace_info \
    --set jobs.huggingface_transformers_model_finetune_job.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
        inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$aml_registry_model_name/versions/$model_version" \
        inputs.training_data.path=$train_data \
        inputs.validation_data.path=$validation_data \
        inputs.compute_model_import=$compute_cluster_model_import \
        inputs.compute_finetune=$compute_cluster_finetune
) || {
    echo "Failed to submit finetuning job"
    exit 1
}

huggingface_parent_job_name=$(echo "$huggingface_parent_job" | jq -r ".display_name")
az ml job stream --name $huggingface_parent_job_name $workspace_info || {
    echo "job stream failed"; exit 1;
}

# 6. Create model in workspace from train job output for fine-tuned HuggingFace Transformers model
az ml model create --name $finetuned_huggingface_model_name --version $version --type mlflow_model \
    --path azureml://jobs/$huggingface_parent_job_name/outputs/trained_model $workspace_info || {
    echo "model create in workspace failed"; exit 1;
}

# 7. Deploy the fine-tuned HuggingFace Transformers model to an endpoint
# create online endpoint
az ml online-endpoint create --name $huggingface_endpoint_name $workspace_info || {
    echo "endpoint create failed"; exit 1;
}

# deploy model from registry to endpoint in workspace
az ml online-deployment create --file ./deploy.yaml $workspace_info --all-traffic --set \
    endpoint_name=$huggingface_endpoint_name model=azureml:$finetuned_huggingface_model_name:$version \
    instance_type=$deployment_sku || {
    echo "deployment create failed"; exit 1;
}

# 8. Try a sample scoring request on the deployed HuggingFace Transformers model

# Check if scoring data file exists
if [ -f $huggingface_sample_request_data ]; then
    # use echo -e so the \n escapes are actually interpreted
    echo -e "Invoking endpoint $huggingface_endpoint_name with the following input:\n\n"
    cat $huggingface_sample_request_data
    echo -e "\n\n"
else
    echo "Scoring file $huggingface_sample_request_data does not exist"
    exit 1
fi

az ml online-endpoint invoke --name $huggingface_endpoint_name --request-file $huggingface_sample_request_data $workspace_info || {
    echo "endpoint invoke failed"; exit 1;
}

# 9. Delete the endpoint
az ml online-endpoint delete --name $huggingface_endpoint_name $workspace_info --yes || {
    echo "endpoint delete failed"; exit 1;
}

# 10. Delete the request data file
rm $huggingface_sample_request_data
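The sample request file consumed in step 8 is generated by prepare_data.py and is not part of this diff. For MLflow image-classification models of this kind, the payload is typically a base64-encoded image in a columnar frame; the sketch below is an illustrative assumption of that shape, not the file's actual contents:

# Illustrative assumption only: build a request file like the one
# prepare_data.py writes for step 8. The schema shown (input_data with
# an "image" column of base64 strings) is assumed, not taken from this diff.
import base64
import json

with open("sample_image.jpg", "rb") as f:  # hypothetical local image
    encoded = base64.b64encode(f.read()).decode("utf-8")

payload = {"input_data": {"columns": ["image"], "data": [encoded]}}
with open("huggingface_sample_request_data.json", "w") as f:
    json.dump(payload, f)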