Commit 98118c7

Migrate existing notebooks to the azureml-examples repo from azureml-foundation-models (#2432)

Co-authored-by: grajguru <grajguru@microsoft.com>
gauravrajguru and grajguru authored Jul 7, 2023
1 parent 2cee822
Showing 20 changed files with 3,742 additions and 0 deletions.
deepspeed_configs/zero1.json
@@ -0,0 +1,42 @@
{
    "fp16": {
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 1,
        "allgather_partitions": true,
        "allgather_bucket_size": 200000000,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 200000000,
        "contiguous_gradients": false,
        "cpu_offload": false
    },
    "zero_allow_untested_optimizer": true,
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "wall_clock_breakdown": false
}
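Note that the "auto" placeholders in this config are not literal values: when the file is handed to the Hugging Face Trainer, they are resolved from the corresponding TrainingArguments at launch time. A minimal sketch of that hand-off (the argument values shown are illustrative assumptions, not part of this commit):

# Sketch: the Trainer fills every "auto" field (lr, betas, warmup,
# micro batch size, gradient accumulation) from these arguments.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs",                      # hypothetical output folder
    deepspeed="deepspeed_configs/zero1.json",  # the file above
    learning_rate=5e-5,                        # -> optimizer.params.lr
    per_device_train_batch_size=4,             # -> train_micro_batch_size_per_gpu
    gradient_accumulation_steps=1,             # -> gradient_accumulation_steps
    fp16=True,                                 # -> fp16.enabled
)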
deploy.yaml
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
name: demo
instance_type: Standard_DS3_v2
instance_count: 1
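deploy.yaml deliberately omits endpoint_name and model; the driver script injects them at deploy time via --set (see step 7 below). For reference, a hedged azure-ai-ml SDK equivalent of the same deployment; the workspace coordinates, endpoint name, and model reference here are placeholders, not values from this commit:

# Minimal sketch (assumption): SDK equivalent of deploy.yaml above.
from azure.ai.ml import MLClient
from azure.ai.ml.entities import ManagedOnlineDeployment
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)

deployment = ManagedOnlineDeployment(
    name="demo",
    endpoint_name="<ENDPOINT_NAME>",         # supplied at deploy time, like --set in the script
    model="azureml:<MODEL_NAME>:<VERSION>",  # placeholder; the script passes the fine-tuned model here
    instance_type="Standard_DS3_v2",
    instance_count=1,
)
# long-running operation; block until the deployment is provisioned
ml_client.online_deployments.begin_create_or_update(deployment).result()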
hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml
@@ -0,0 +1,96 @@
$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json
type: pipeline

experiment_name: AzureML-Train-Finetune-Vision-MultiClass-Samples

inputs:
  # model - specify the foundation model available in the azureml system registry
  mlflow_model_path:
    path: azureml://registries/azureml-preview/models/google-vit-base-patch16-224/versions/1
    type: mlflow_model
  # model_name: microsoft/beit-base-patch16-224-pt22k-ft22k
  # dataset files
  training_data:
    path: ./data/training-mltable-folder
    type: mltable
  validation_data:
    path: ./data/validation-mltable-folder
    type: mltable
  # deepspeed config file
  ds_finetune:
    path: ./deepspeed_configs/zero1.json
    type: uri_file
  # compute
  compute_model_import: sample-model-import-cluster
  compute_finetune: sample-finetune-cluster-gpu-nc6

outputs:
  # map the output of the fine-tuning job to the output of the pipeline job so that we can easily register the fine-tuned model;
  # registering the model is required to deploy it to an online or batch endpoint
  trained_model:
    type: mlflow_model

settings:
  force_rerun: true
  default_compute: azureml:sample-finetune-cluster-gpu-nc6

jobs:
  huggingface_transformers_model_finetune_job:
    type: pipeline
    component: azureml://registries/azureml-preview/components/image_classification_pipeline/labels/latest
    inputs:
      # compute
      compute_model_import: ${{parent.inputs.compute_model_import}}
      compute_finetune: ${{parent.inputs.compute_finetune}}
      number_of_gpu_to_use_finetuning: 1
      num_nodes_finetune: 1

      # model
      task_name: image-classification
      model_family: HuggingFaceImage
      # specify model_name instead of mlflow_model if you want to use a model from the huggingface hub
      mlflow_model: ${{parent.inputs.mlflow_model_path}}
      # model_name: ${{parent.inputs.model_name}}

      # data
      training_data: ${{parent.inputs.training_data}}
      validation_data: ${{parent.inputs.validation_data}}

      image_width: 224
      image_height: 224
      number_of_workers: 8
      apply_augmentations: True
      apply_deepspeed: False
      deepspeed_config: ${{parent.inputs.ds_finetune}}
      apply_ort: False
      number_of_epochs: 15
      max_steps: -1
      training_batch_size: 4
      validation_batch_size: 4
      auto_find_batch_size: False
      learning_rate: 5e-5
      learning_rate_scheduler: warmup_linear
      warmup_steps: 0
      optimizer: adamw_hf
      weight_decay: 0.0
      gradient_accumulation_step: 1
      precision: 32
      metric_for_best_model: accuracy
      label_smoothing_factor: 0.0
      random_seed: 42
      evaluation_strategy: epoch
      evaluation_steps: 500
      logging_strategy: epoch
      logging_steps: 500
      save_strategy: epoch
      save_steps: 500
      save_total_limit: -1
      early_stopping: False
      early_stopping_patience: 1
      max_grad_norm: 1.0
      resume_from_checkpoint: False
      save_as_mlflow_model: True

    outputs:
      mlflow_model_folder: ${{parent.outputs.trained_model}}
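The driver script below submits this file with az ml job create. As a hedged alternative, the same YAML can be loaded and submitted through the azure-ai-ml SDK; a sketch only, with placeholder workspace coordinates:

# Minimal sketch (assumption): submit the pipeline YAML above via the
# azure-ai-ml SDK instead of `az ml job create`.
from azure.ai.ml import MLClient, load_job
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)

pipeline_job = load_job("./hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml")
submitted = ml_client.jobs.create_or_update(pipeline_job)
ml_client.jobs.stream(submitted.name)  # follow logs, like `az ml job stream`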
@@ -0,0 +1,214 @@
#!/bin/bash
set -x

# script inputs
registry_name="azureml-preview"
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"

compute_cluster_model_import="sample-model-import-cluster"
compute_cluster_finetune="sample-finetune-cluster-gpu-nc6"
# if the above compute clusters do not exist, create them with the following VM sizes
compute_model_import_sku="Standard_D12"
compute_finetune_sku="Standard_NC6"
# This is the number of GPUs in a single node of the selected 'vm_size' compute.
# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
# Setting this to more than the number of GPUs will result in an error.
gpus_per_node=1

# huggingFace model
huggingface_model_name="microsoft/beit-base-patch16-224-pt22k-ft22k"
# This is the foundation model for fine-tuning, from the azureml system registry
# (resolving the latest version of the model is not working yet, so a version is pinned below)
aml_registry_model_name="microsoft-beit-base-patch16-224-pt22k-ft22k"
model_version=1

version=$(date +%s)
finetuned_huggingface_model_name="microsoft-beit-base-patch16-224-pt22k-ft22k-fridge-objects-multiclass-classification"
huggingface_endpoint_name="hf-mc-fridge-items-$version"
deployment_sku="Standard_DS3_V2"

# Deepspeed config
ds_finetune="./deepspeed_configs/zero1.json"

# Scoring file
huggingface_sample_request_data="./huggingface_sample_request_data.json"

# finetuning job parameters
finetuning_pipeline_component="transformers_image_classification_pipeline"
# Training settings
number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute

# 1. Install dependencies
pip install azure-ai-ml==1.0.0
pip install azure-identity
pip install datasets==2.3.2

unameOut=$(uname -a)
case "${unameOut}" in
    *Microsoft*)  OS="WSL";;  # must be first, since Windows Subsystem for Linux also has "Linux" in the name
    *microsoft*)  OS="WSL2";; # WARNING: verified on Ubuntu 20.04; a slightly different name may not always match
    Linux*)       OS="Linux";;
    Darwin*)      OS="Mac";;
    CYGWIN*)      OS="Cygwin";;
    MINGW*)       OS="Windows";;
    *Msys)        OS="Windows";;
    *)            OS="UNKNOWN:${unameOut}"
esac
if [[ ${OS} == "Mac" ]] && sysctl -n machdep.cpu.brand_string | grep -q 'Apple M1'; then
    OS="MacM1"
fi
echo ${OS}

# check the exit status of `jq --version` itself (an intervening echo would reset $?)
if jq_version=$(jq --version); then
    echo ${jq_version}
    echo "jq already installed"
else
    echo "jq not installed, installing now..."
    # Install jq
    if [[ ${OS} == "Mac" ]] || [[ ${OS} == "MacM1" ]]; then
        # Install jq on Mac
        brew install jq
    elif [[ ${OS} == "WSL" ]] || [[ ${OS} == "WSL2" ]] || [[ ${OS} == "Linux" ]]; then
        # Install jq on WSL or Linux
        sudo apt-get install jq
    elif [[ ${OS} == "Windows" ]] || [[ ${OS} == "Cygwin" ]]; then
        # Install jq on Windows
        curl -L -o ./jq.exe https://github.com/stedolan/jq/releases/latest/download/jq-win64.exe
    else
        echo "Failed to install jq! This might cause issues"
    fi
fi


# 2. Setup pre-requisites
az account set -s $subscription_id
workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"

# check if $compute_cluster_model_import exists, else create it
if az ml compute show --name $compute_cluster_model_import $workspace_info
then
    echo "Compute cluster $compute_cluster_model_import already exists"
else
    echo "Creating compute cluster $compute_cluster_model_import"
    az ml compute create --name $compute_cluster_model_import --type amlcompute --min-instances 0 --max-instances 2 --size $compute_model_import_sku $workspace_info || {
        echo "Failed to create compute cluster $compute_cluster_model_import"
        exit 1
    }
fi

# check if $compute_cluster_finetune exists, else create it
if az ml compute show --name $compute_cluster_finetune $workspace_info
then
    echo "Compute cluster $compute_cluster_finetune already exists"
else
    echo "Creating compute cluster $compute_cluster_finetune"
    az ml compute create --name $compute_cluster_finetune --type amlcompute --min-instances 0 --max-instances 2 --size $compute_finetune_sku $workspace_info || {
        echo "Failed to create compute cluster $compute_cluster_finetune"
        exit 1
    }
fi

# check if the finetuning pipeline component exists
if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name
then
    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
    exit 1
fi

# 3. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $aml_registry_model_name --version $model_version --registry-name $registry_name
then
    echo "Model $aml_registry_model_name:$model_version does not exist in registry $registry_name"
    exit 1
fi

# 4. Prepare data
python prepare_data.py
# training data
train_data="./data/training-mltable-folder"
# validation data
validation_data="./data/validation-mltable-folder"

# Check if training and validation data exist
if [ ! -d $train_data ]; then
    echo "Training data $train_data does not exist"
    exit 1
fi
if [ ! -d $validation_data ]; then
    echo "Validation data $validation_data does not exist"
    exit 1
fi

# 5. Submit finetuning job using pipeline.yaml for a HuggingFace Transformers model

# Need to switch to using the latest version for the model; currently blocked by a bug.

# If you want to use a HuggingFace model, specify inputs.model_name instead of inputs.mlflow_model_path.path, like below:
# inputs.model_name=$huggingface_model_name

huggingface_parent_job=$( az ml job create \
    --file "./hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml" \
    $workspace_info \
    --set jobs.huggingface_transformers_model_finetune_job.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
        inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$aml_registry_model_name/versions/$model_version" \
        inputs.training_data.path=$train_data \
        inputs.validation_data.path=$validation_data \
        inputs.compute_model_import=$compute_cluster_model_import \
        inputs.compute_finetune=$compute_cluster_finetune
) || {
    echo "Failed to submit finetuning job"
    exit 1
}

huggingface_parent_job_name=$(echo "$huggingface_parent_job" | jq -r ".display_name")
az ml job stream --name $huggingface_parent_job_name $workspace_info || {
    echo "job stream failed"; exit 1;
}

# 6. Create model in workspace from train job output for fine-tuned HuggingFace Transformers model
az ml model create --name $finetuned_huggingface_model_name --version $version --type mlflow_model \
    --path azureml://jobs/$huggingface_parent_job_name/outputs/trained_model $workspace_info || {
    echo "model create in workspace failed"; exit 1;
}

# 7. Deploy the fine-tuned HuggingFace Transformers model to an endpoint
# create online endpoint
az ml online-endpoint create --name $huggingface_endpoint_name $workspace_info || {
    echo "endpoint create failed"; exit 1;
}

# deploy model from registry to endpoint in workspace
az ml online-deployment create --file ./deploy.yaml $workspace_info --all-traffic --set \
    endpoint_name=$huggingface_endpoint_name model=azureml:$finetuned_huggingface_model_name:$version \
    instance_type=$deployment_sku || {
    echo "deployment create failed"; exit 1;
}

# 8. Try a sample scoring request on the deployed HuggingFace Transformers model

# Check if scoring data file exists
if [ -f $huggingface_sample_request_data ]; then
    # use echo -e so the \n escapes are actually interpreted
    echo -e "Invoking endpoint $huggingface_endpoint_name with the following input:\n\n"
    cat $huggingface_sample_request_data
    echo -e "\n\n"
else
    echo "Scoring file $huggingface_sample_request_data does not exist"
    exit 1
fi

az ml online-endpoint invoke --name $huggingface_endpoint_name --request-file $huggingface_sample_request_data $workspace_info || {
    echo "endpoint invoke failed"; exit 1;
}

# 9. Delete the endpoint
az ml online-endpoint delete --name $huggingface_endpoint_name $workspace_info --yes || {
    echo "endpoint delete failed"; exit 1;
}

# 10. Delete the request data file
rm $huggingface_sample_request_data
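The sample request file consumed in step 8 is generated by prepare_data.py and is not part of this diff. For MLflow image-classification models of this kind, the payload is typically a base64-encoded image in a columnar frame; the sketch below is an illustrative assumption of that shape, not the file's actual contents:

# Illustrative assumption only: build a request file like the one
# prepare_data.py writes for step 8. The schema shown (input_data with
# an "image" column of base64 strings) is assumed, not taken from this diff.
import base64
import json

with open("sample_image.jpg", "rb") as f:  # hypothetical local image
    encoded = base64.b64encode(f.read()).decode("utf-8")

payload = {"input_data": {"columns": ["image"], "data": [encoded]}}
with open("huggingface_sample_request_data.json", "w") as f:
    json.dump(payload, f)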