From 98118c752c42cb197c3875237a06bb3ef0d087e2 Mon Sep 17 00:00:00 2001 From: Gaurav Rajguru Date: Fri, 7 Jul 2023 14:54:25 +0530 Subject: [PATCH] Migrate existing notebooks to the azureml-examples repo from azureml-foundation-models (#2432) Co-authored-by: grajguru --- .../deepspeed_configs/zero1.json | 42 + .../multiclass-classification/deploy.yaml | 4 + ...ts-multiclass-classification-pipeline.yaml | 96 ++ ...fridgeobjects-multiclass-classification.sh | 214 ++++ .../multiclass-classification/prepare_data.py | 193 ++++ .../multiclass-classification/readme.md | 14 + .../deepspeed_configs/zero1.json | 42 + .../multilabel-classification/deploy.yaml | 4 + ...ts-multilabel-classification-pipeline.yaml | 97 ++ ...fridgeobjects-multilabel-classification.sh | 212 ++++ .../multilabel-classification/prepare_data.py | 196 ++++ .../multilabel-classification/readme.md | 14 + .../image-classification/deploy.yaml | 6 + .../image-classification-online-endpoint.sh | 80 ++ .../image-classification/prepare_data.py | 76 ++ .../deepspeed_configs/zero1.json | 42 + ...dgeobjects-multiclass-classification.ipynb | 1006 +++++++++++++++++ .../deepspeed_configs/zero1.json | 42 + ...dgeobjects-multilabel-classification.ipynb | 998 ++++++++++++++++ ...image-classification-online-endpoint.ipynb | 364 ++++++ 20 files changed, 3742 insertions(+) create mode 100644 cli/foundation-models/system/finetune/image-classification/multiclass-classification/deepspeed_configs/zero1.json create mode 100644 cli/foundation-models/system/finetune/image-classification/multiclass-classification/deploy.yaml create mode 100644 cli/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml create mode 100644 cli/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification.sh create mode 100644 cli/foundation-models/system/finetune/image-classification/multiclass-classification/prepare_data.py create mode 100644 cli/foundation-models/system/finetune/image-classification/multiclass-classification/readme.md create mode 100644 cli/foundation-models/system/finetune/image-classification/multilabel-classification/deepspeed_configs/zero1.json create mode 100644 cli/foundation-models/system/finetune/image-classification/multilabel-classification/deploy.yaml create mode 100644 cli/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification-pipeline.yaml create mode 100644 cli/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification.sh create mode 100644 cli/foundation-models/system/finetune/image-classification/multilabel-classification/prepare_data.py create mode 100644 cli/foundation-models/system/finetune/image-classification/multilabel-classification/readme.md create mode 100644 cli/foundation-models/system/inference/image-classification/deploy.yaml create mode 100644 cli/foundation-models/system/inference/image-classification/image-classification-online-endpoint.sh create mode 100644 cli/foundation-models/system/inference/image-classification/prepare_data.py create mode 100644 sdk/python/foundation-models/system/finetune/image-classification/multiclass-classification/deepspeed_configs/zero1.json create mode 100644 
sdk/python/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification.ipynb create mode 100644 sdk/python/foundation-models/system/finetune/image-classification/multilabel-classification/deepspeed_configs/zero1.json create mode 100644 sdk/python/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification.ipynb create mode 100644 sdk/python/foundation-models/system/inference/image-classification/image-classification-online-endpoint.ipynb diff --git a/cli/foundation-models/system/finetune/image-classification/multiclass-classification/deepspeed_configs/zero1.json b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/deepspeed_configs/zero1.json new file mode 100644 index 0000000000..1d2b843bf9 --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/deepspeed_configs/zero1.json @@ -0,0 +1,42 @@ +{ + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 200000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000, + "contiguous_gradients": false, + "cpu_offload": false + }, + "zero_allow_untested_optimizer": true, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "wall_clock_breakdown": false +} diff --git a/cli/foundation-models/system/finetune/image-classification/multiclass-classification/deploy.yaml b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/deploy.yaml new file mode 100644 index 0000000000..b5884aa6cb --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/deploy.yaml @@ -0,0 +1,4 @@ +$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json +name: demo +instance_type: Standard_DS3_v2 +instance_count: 1 \ No newline at end of file diff --git a/cli/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml new file mode 100644 index 0000000000..ecd6cd5307 --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml @@ -0,0 +1,96 @@ +$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json +type: pipeline + +experiment_name: AzureML-Train-Finetune-Vision-MultiClass-Samples + +inputs: + # # model - specify the foundation model available in the azureml system registry + mlflow_model_path: + path: azureml://registries/azureml-preview/models/google-vit-base-patch16-224/versions/1 + type: mlflow_model + # model_name: microsoft/beit-base-patch16-224-pt22k-ft22k + # dataset files + training_data: + path: 
./data/training-mltable-folder + type: mltable + validation_data: + path: ./data/validation-mltable-folder + type: mltable + # deepspeed config file + ds_finetune: + path: ./deepspeed_configs/zero1.json + type: uri_file + # compute + compute_model_import: sample-model-import-cluster + compute_finetune: sample-finetune-cluster-gpu-nc6 + +outputs: + # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model + # registering the model is required to deploy the model to an online or batch endpoint + trained_model: + type: mlflow_model + +settings: + force_rerun: true + default_compute: azureml:sample-finetune-cluster-gpu-nc6 + +jobs: + huggingface_transformers_model_finetune_job: + type: pipeline + component: azureml://registries/azureml-preview/components/image_classification_pipeline/labels/latest + inputs: + + # Compute + compute_model_import: ${{parent.inputs.compute_model_import}} + compute_finetune: ${{parent.inputs.compute_finetune}} + number_of_gpu_to_use_finetuning: 1 + num_nodes_finetune: 1 + + # model + task_name: image-classification + model_family: HuggingFaceImage + # # specify the model_name instead of mlflow_model if you want to use a model from the huggingface hub + mlflow_model: ${{parent.inputs.mlflow_model_path}} + # model_name: ${{parent.inputs.model_name}} + + # data + training_data: ${{parent.inputs.training_data}} + validation_data: ${{parent.inputs.validation_data}} + + image_width: 224 + image_height: 224 + number_of_workers: 8 + apply_augmentations: True + apply_deepspeed: False + deepspeed_config: ${{parent.inputs.ds_finetune}} + apply_ort: False + number_of_epochs: 15 + max_steps: -1 + training_batch_size: 4 + validation_batch_size: 4 + auto_find_batch_size: False + learning_rate: 5e-5 + learning_rate_scheduler: warmup_linear + warmup_steps: 0 + optimizer: adamw_hf + weight_decay: 0.0 + gradient_accumulation_step: 1 + precision: 32 + metric_for_best_model: accuracy + label_smoothing_factor: 0.0 + random_seed: 42 + evaluation_strategy: epoch + evaluation_steps: 500 + logging_strategy: epoch + logging_steps: 500 + save_strategy: epoch + save_steps: 500 + save_total_limit: -1 + early_stopping: False + early_stopping_patience: 1 + max_grad_norm: 1.0 + resume_from_checkpoint: False + save_as_mlflow_model: True + + outputs: + mlflow_model_folder: ${{parent.outputs.trained_model}} diff --git a/cli/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification.sh b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification.sh new file mode 100644 index 0000000000..cee5fbc773 --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification.sh @@ -0,0 +1,214 @@ +#!/bin/bash +set -x + +# script inputs +registry_name="azureml-preview" +subscription_id="" +resource_group_name="" +workspace_name="" + +compute_cluster_model_import="sample-model-import-cluster" +compute_cluster_finetune="sample-finetune-cluster-gpu-nc6" +# if above compute cluster does not exist, create it with the following vm size +compute_model_import_sku="Standard_D12" +compute_finetune_sku="Standard_NC6" +# This is the number of GPUs in a single node of the selected 'vm_size' compute. +# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train. 
+# Setting this to more than the number of GPUs will result in an error. +gpus_per_node=1 + +# huggingFace model +huggingface_model_name="microsoft/beit-base-patch16-224-pt22k-ft22k" +# This is the foundation model for finetuning from azureml system registry +# using the latest version of the model - not working yet +aml_registry_model_name="microsoft-beit-base-patch16-224-pt22k-ft22k" +model_version=1 + +version=$(date +%s) +finetuned_huggingface_model_name="microsoft-beit-base-patch16-224-pt22k-ft22k-fridge-objects-multiclass-classification" +huggingface_endpoint_name="hf-mc-fridge-items-$version" +deployment_sku="Standard_DS3_V2" + +# Deepspeed config +ds_finetune="./deepspeed_configs/zero1.json" + +# Scoring file +huggingface_sample_request_data="./huggingface_sample_request_data.json" + +# finetuning job parameters +finetuning_pipeline_component="transformers_image_classification_pipeline" +# Training settings +number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute + +# 1. Install dependencies +pip install azure-ai-ml==1.0.0 +pip install azure-identity +pip install datasets==2.3.2 + +unameOut=$(uname -a) +case "${unameOut}" in + *Microsoft*) OS="WSL";; #must be first since Windows subsystem for linux will have Linux in the name too + *microsoft*) OS="WSL2";; #WARNING: My v2 uses ubuntu 20.4 at the moment slightly different name may not always work + Linux*) OS="Linux";; + Darwin*) OS="Mac";; + CYGWIN*) OS="Cygwin";; + MINGW*) OS="Windows";; + *Msys) OS="Windows";; + *) OS="UNKNOWN:${unameOut}" +esac +if [[ ${OS} == "Mac" ]] && sysctl -n machdep.cpu.brand_string | grep -q 'Apple M1'; then + OS="MacM1" +fi +echo ${OS}; + +jq_version=$(jq --version) +echo ${jq_version}; +if [[ $? == 0 ]]; then + echo "jq already installed" +else + echo "jq not installed, installing now..." + # Install jq + if [[ ${OS} == "Mac" ]] || [[ ${OS} == "MacM1" ]]; then + # Install jq on mac + brew install jq + elif [[ ${OS} == "WSL" ]] || [[ ${OS} == "WSL2" ]] || [[ ${OS} == "Linux" ]]; then + # Install jq on WSL + sudo apt-get install jq + elif [[ ${OS} == "Windows" ]] || [[ ${OS} == "Cygwin" ]]; then + # Install jq on windows + curl -L -o ./jq.exe https://github.com/stedolan/jq/releases/latest/download/jq-win64.exe + else + echo "Failed to install jq! This might cause issues" + fi +fi + + +# 2. 
Setup pre-requisites +az account set -s $subscription_id +workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name" + +# check if $compute_cluster_model_import exists, else create it +if az ml compute show --name $compute_cluster_model_import $workspace_info +then + echo "Compute cluster $compute_cluster_model_import already exists" +else + echo "Creating compute cluster $compute_cluster_model_import" + az ml compute create --name $compute_cluster_model_import --type amlcompute --min-instances 0 --max-instances 2 --size $compute_model_import_sku $workspace_info || { + echo "Failed to create compute cluster $compute_cluster_model_import" + exit 1 + } +fi + +# check if $compute_cluster_finetune exists, else create it +if az ml compute show --name $compute_cluster_finetune $workspace_info +then + echo "Compute cluster $compute_cluster_finetune already exists" +else + echo "Creating compute cluster $compute_cluster_finetune" + az ml compute create --name $compute_cluster_finetune --type amlcompute --min-instances 0 --max-instances 2 --size $compute_finetune_sku $workspace_info || { + echo "Failed to create compute cluster $compute_cluster_finetune" + exit 1 + } +fi + +# check if the finetuning pipeline component exists +if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name +then + echo "Finetuning pipeline component $finetuning_pipeline_component does not exist" + exit 1 +fi + +# 3. Check if the model exists in the registry +# need to confirm model show command works for registries outside the tenant (aka system registry) +if ! az ml model show --name $aml_registry_model_name --version $model_version --registry-name $registry_name +then + echo "Model $aml_registry_model_name:$model_version does not exist in registry $registry_name" + exit 1 +fi + +# 4. Prepare data +python prepare_data.py +# training data +train_data="./data/training-mltable-folder" +# validation data +validation_data="./data/validation-mltable-folder" + +# Check if training data, validation data exist +if [ ! -d $train_data ]; then + echo "Training data $train_data does not exist" + exit 1 +fi +if [ ! -d $validation_data ]; then + echo "Validation data $validation_data does not exist" + exit 1 +fi + +# 5. Submit finetuning job using pipeline.yaml for a HuggingFace Transformers model + +# # Need to switch to using latest version for model, currently blocked with a bug. 
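+# # Until then, the job below pins an explicit model version. For reference, these are the two
+# # registry asset URI forms used in this sample (syntax sketch; angle-bracket values are placeholders):
+# #   azureml://registries/<registry_name>/models/<model_name>/versions/<model_version>
+# #   azureml://registries/<registry_name>/components/<component_name>/labels/latest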
+ +# # If you want to use a HuggingFace model, specify the inputs.model_name instead of inputs.mlflow_model_path.path like below +# inputs.model_name=$huggingface_model_name + +huggingface_parent_job=$( az ml job create \ + --file "./hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml" \ + $workspace_info \ + --set jobs.huggingface_transformers_model_finetune_job.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \ + inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$aml_registry_model_name/versions/$model_version" \ + inputs.training_data.path=$train_data \ + inputs.validation_data.path=$validation_data \ + inputs.compute_model_import=$compute_cluster_model_import \ + inputs.compute_finetune=$compute_cluster_finetune + ) || { + echo "Failed to submit finetuning job" + exit 1 + } + +huggingface_parent_job_name=$(echo "$huggingface_parent_job" | jq -r ".display_name") +az ml job stream --name $huggingface_parent_job_name $workspace_info || { + echo "job stream failed"; exit 1; +} + +# 6. Create model in workspace from train job output for fine-tuned HuggingFace Transformers model +az ml model create --name $finetuned_huggingface_model_name --version $version --type mlflow_model \ + --path azureml://jobs/$huggingface_parent_job_name/outputs/trained_model $workspace_info || { + echo "model create in workspace failed"; exit 1; +} + +# 7. Deploy the fine-tuned HuggingFace Transformers model to an endpoint +# create online endpoint +az ml online-endpoint create --name $huggingface_endpoint_name $workspace_info || { + echo "endpoint create failed"; exit 1; +} + +# deploy model from registry to endpoint in workspace +az ml online-deployment create --file ./deploy.yaml $workspace_info --all-traffic --set \ + endpoint_name=$huggingface_endpoint_name model=azureml:$finetuned_huggingface_model_name:$version \ + instance_type=$deployment_sku || { + echo "deployment create failed"; exit 1; +} + +# 8. Try a sample scoring request on the deployed HuggingFace Transformers model + +# Check if scoring data file exists +if [ -f $huggingface_sample_request_data ]; then + echo "Invoking endpoint $huggingface_endpoint_name with following input:\n\n" + cat $huggingface_sample_request_data + echo "\n\n" +else + echo "Scoring file $huggingface_sample_request_data does not exist" + exit 1 +fi + +az ml online-endpoint invoke --name $huggingface_endpoint_name --request-file $huggingface_sample_request_data $workspace_info || { + echo "endpoint invoke failed"; exit 1; +} + +# 9. Delete the endpoint +az ml online-endpoint delete --name $huggingface_endpoint_name $workspace_info --yes || { + echo "endpoint delete failed"; exit 1; +} + +# 10. 
Delete the request data file + +rm $huggingface_sample_request_data diff --git a/cli/foundation-models/system/finetune/image-classification/multiclass-classification/prepare_data.py b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/prepare_data.py new file mode 100644 index 0000000000..2cff013fe8 --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/prepare_data.py @@ -0,0 +1,193 @@ +import argparse +import base64 +import json +import os +import urllib +from zipfile import ZipFile + +from azure.identity import InteractiveBrowserCredential +from azure.ai.ml import MLClient +from azure.ai.ml.entities import Data +from azure.ai.ml.constants import AssetTypes + + +def create_ml_table_file(filename): + """Create ML Table definition""" + + return ( + "paths:\n" + " - file: ./{0}\n" + "transformations:\n" + " - read_json_lines:\n" + " encoding: utf8\n" + " invalid_lines: error\n" + " include_path_column: false\n" + " - convert_column_types:\n" + " - columns: image_url\n" + " column_type: stream_info" + ).format(filename) + + +def save_ml_table_file(output_path, mltable_file_contents): + with open(os.path.join(output_path, "MLTable"), "w") as f: + f.write(mltable_file_contents) + + +def create_jsonl_and_mltable_files(uri_folder_data_path, dataset_dir): + print("Creating jsonl files") + + dataset_parent_dir = os.path.dirname(dataset_dir) + + # We'll copy each JSONL file within its related MLTable folder + training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder") + validation_mltable_path = os.path.join( + dataset_parent_dir, "validation-mltable-folder" + ) + + # Create MLTable folders, if they don't exist + os.makedirs(training_mltable_path, exist_ok=True) + os.makedirs(validation_mltable_path, exist_ok=True) + + train_validation_ratio = 5 + + # Path to the training and validation files + train_annotations_file = os.path.join( + training_mltable_path, "train_annotations.jsonl" + ) + validation_annotations_file = os.path.join( + validation_mltable_path, "validation_annotations.jsonl" + ) + + # Baseline of json line dictionary + json_line_sample = {"image_url": uri_folder_data_path, "label": ""} + + index = 0 + # Scan each sub directary and generate a jsonl line per image, distributed on train and valid JSONL files + with open(train_annotations_file, "w") as train_f: + with open(validation_annotations_file, "w") as validation_f: + for class_name in os.listdir(dataset_dir): + sub_dir = os.path.join(dataset_dir, class_name) + if not os.path.isdir(sub_dir): + continue + + # Scan each sub directary + print(f"Parsing {sub_dir}") + for image in os.listdir(sub_dir): + json_line = dict(json_line_sample) + json_line["image_url"] += f"{class_name}/{image}" + json_line["label"] = class_name + + if index % train_validation_ratio == 0: + # validation annotation + validation_f.write(json.dumps(json_line) + "\n") + else: + # train annotation + train_f.write(json.dumps(json_line) + "\n") + index += 1 + print("done") + + # Create and save train mltable + train_mltable_file_contents = create_ml_table_file( + os.path.basename(train_annotations_file) + ) + save_ml_table_file(training_mltable_path, train_mltable_file_contents) + + # Create and save validation mltable + validation_mltable_file_contents = create_ml_table_file( + os.path.basename(validation_annotations_file) + ) + save_ml_table_file(validation_mltable_path, validation_mltable_file_contents) + + +def 
upload_data_and_create_jsonl_mltable_files(ml_client, dataset_parent_dir):
+
+    # Create directory, if it does not exist
+    os.makedirs(dataset_parent_dir, exist_ok=True)
+
+    # download data
+    print("Downloading data.")
+    download_url = "https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/fridgeObjects.zip"
+
+    # Extract current dataset name from dataset url
+    dataset_name = os.path.basename(download_url).split(".")[0]
+    # Get dataset path for later use
+    dataset_dir = os.path.join(dataset_parent_dir, dataset_name)
+
+    # Get the name of zip file
+    data_file = os.path.join(dataset_parent_dir, f"{dataset_name}.zip")
+
+    # Download data from public url
+    urllib.request.urlretrieve(download_url, filename=data_file)
+
+    # extract files
+    with ZipFile(data_file, "r") as zip:
+        print("extracting files...")
+        zip.extractall(path=dataset_parent_dir)
+        print("done")
+    # delete zip file
+    os.remove(data_file)
+
+    # Upload data and create a data asset URI folder
+    print("Uploading data to blob storage")
+    my_data = Data(
+        path=dataset_dir,
+        type=AssetTypes.URI_FOLDER,
+        description="Fridge-items images",
+        name="fridge-items-images-2",
+    )
+
+    uri_folder_data_asset = ml_client.data.create_or_update(my_data)
+
+    print(uri_folder_data_asset)
+    print("")
+    print("Path to folder in Blob Storage:")
+    print(uri_folder_data_asset.path)
+    create_jsonl_and_mltable_files(
+        uri_folder_data_path=uri_folder_data_asset.path, dataset_dir=dataset_dir
+    )
+
+
+def read_image(image_path):
+    with open(image_path, "rb") as f:
+        return f.read()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Prepare data for image classification"
+    )
+
+    parser.add_argument("--subscription", type=str, help="Subscription ID")
+    parser.add_argument("--resource_group", type=str, help="Resource group name")
+    parser.add_argument("--workspace", type=str, help="Workspace name")
+    parser.add_argument(
+        "--data_path", type=str, default="./data", help="Dataset location"
+    )
+
+    args, unknown = parser.parse_known_args()
+    args_dict = vars(args)
+
+    credential = InteractiveBrowserCredential()
+    ml_client = None
+    try:
+        ml_client = MLClient.from_config(credential)
+    except Exception as ex:
+        # Enter details of your AML workspace
+        subscription_id = args.subscription
+        resource_group = args.resource_group
+        workspace = args.workspace
+        ml_client = MLClient(credential, subscription_id, resource_group, workspace)
+
+    upload_data_and_create_jsonl_mltable_files(
+        ml_client=ml_client, dataset_parent_dir=args.data_path
+    )
+
+    sample_image = os.path.join(args.data_path, "fridgeObjects", "milk_bottle", "99.jpg")
+    huggingface_request_json = {
+        "inputs": {
+            "image": [base64.encodebytes(read_image(sample_image)).decode("utf-8")],
+        }
+    }
+    huggingface_request_file_name = "huggingface_sample_request_data.json"
+    with open(huggingface_request_file_name, "w") as huggingface_request_file:
+        json.dump(huggingface_request_json, huggingface_request_file)
diff --git a/cli/foundation-models/system/finetune/image-classification/multiclass-classification/readme.md b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/readme.md
new file mode 100644
index 0000000000..5401ecd2d8
--- /dev/null
+++ b/cli/foundation-models/system/finetune/image-classification/multiclass-classification/readme.md
@@ -0,0 +1,14 @@
+# Fine-tuning a model for Image Multi-class Classification task
+
+You can launch a sample pipeline for image multi-class classification using the 
`transformers_image_classification_pipeline` component. + +For using this component, run the shell script file `bash ./hftransformers-fridgeobjects-multiclass-classification.sh`. + +Currently following models are supported: +| Model Name | Source | +| ------ | ---------- | +| [microsoft-beit-base-patch16-224-pt22k-ft22k](https://ml.azure.com/registries/azureml-preview/models/microsoft-beit-base-patch16-224-pt22k-ft22k/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry | +| [microsoft-swinv2-base-patch4-window12-192-22k](https://ml.azure.com/registries/azureml-preview/models/microsoft-swinv2-base-patch4-window12-192-22k/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry | +| [facebook-deit-base-patch16-224](https://ml.azure.com/registries/azureml-preview/models/facebook-deit-base-patch16-224/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry | +| [google-vit-base-patch16-224](https://ml.azure.com/registries/azureml-preview/models/google-vit-base-patch16-224/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry | +| [Image classification models from Huggingface](https://huggingface.co/models?pipeline_tag=image-classification&sort=downloads) | HuggingFace | \ No newline at end of file diff --git a/cli/foundation-models/system/finetune/image-classification/multilabel-classification/deepspeed_configs/zero1.json b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/deepspeed_configs/zero1.json new file mode 100644 index 0000000000..1d2b843bf9 --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/deepspeed_configs/zero1.json @@ -0,0 +1,42 @@ +{ + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 200000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000, + "contiguous_gradients": false, + "cpu_offload": false + }, + "zero_allow_untested_optimizer": true, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "wall_clock_breakdown": false +} diff --git a/cli/foundation-models/system/finetune/image-classification/multilabel-classification/deploy.yaml b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/deploy.yaml new file mode 100644 index 0000000000..b5884aa6cb --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/deploy.yaml @@ -0,0 +1,4 @@ +$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json +name: demo +instance_type: Standard_DS3_v2 +instance_count: 1 \ No newline at end of file diff --git a/cli/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification-pipeline.yaml 
b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification-pipeline.yaml new file mode 100644 index 0000000000..83d4a9a96d --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification-pipeline.yaml @@ -0,0 +1,97 @@ +$schema: https://azuremlschemas.azureedge.net/latest/pipelineJob.schema.json +type: pipeline + +experiment_name: AzureML-Train-Finetune-Vision-MultiLabel-Samples + +inputs: + # # model - specify the foundation model available in the azureml system registry + mlflow_model_path: + path: azureml://registries/azureml-preview/models/google-vit-base-patch16-224/versions/1 + type: mlflow_model + # model_name: microsoft/beit-base-patch16-224-pt22k-ft22k + # dataset files + training_data: + path: ./data/training-mltable-folder + type: mltable + validation_data: + path: ./data/validation-mltable-folder + type: mltable + # deepspeed config file + ds_finetune: + path: ./deepspeed_configs/zero1.json + type: uri_file + # compute + compute_model_import: sample-model-import-cluster + compute_finetune: sample-finetune-cluster-gpu-nc6 + + +outputs: + # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model + # registering the model is required to deploy the model to an online or batch endpoint + trained_model: + type: mlflow_model + +settings: + force_rerun: true + default_compute: azureml:sample-finetune-cluster-gpu-nc6 + +jobs: + huggingface_transformers_model_finetune_job: + type: pipeline + component: azureml://registries/azureml-preview/components/image_classification_pipeline/labels/latest + inputs: + + # Compute + compute_model_import: ${{parent.inputs.compute_model_import}} + compute_finetune: ${{parent.inputs.compute_finetune}} + number_of_gpu_to_use_finetuning: 1 + num_nodes_finetune: 1 + + # model + task_name: image-classification-multilabel + model_family: HuggingFaceImage + # # specify the model_name instead of mlflow_model if you want to use a model from the huggingface hub + mlflow_model: ${{parent.inputs.mlflow_model_path}} + # model_name: ${{parent.inputs.model_name}} + + # data + training_data: ${{parent.inputs.training_data}} + validation_data: ${{parent.inputs.validation_data}} + + image_width: 224 + image_height: 224 + number_of_workers: 8 + apply_augmentations: True + apply_deepspeed: False + deepspeed_config: ${{parent.inputs.ds_finetune}} + apply_ort: False + number_of_epochs: 15 + max_steps: -1 + training_batch_size: 4 + validation_batch_size: 4 + auto_find_batch_size: False + learning_rate: 5e-5 + learning_rate_scheduler: warmup_linear + warmup_steps: 0 + optimizer: adamw_hf + weight_decay: 0.0 + gradient_accumulation_step: 1 + precision: 32 + metric_for_best_model: accuracy + label_smoothing_factor: 0.0 + random_seed: 42 + evaluation_strategy: epoch + evaluation_steps: 500 + logging_strategy: epoch + logging_steps: 500 + save_strategy: epoch + save_steps: 500 + save_total_limit: -1 + early_stopping: False + early_stopping_patience: 1 + max_grad_norm: 1.0 + resume_from_checkpoint: False + save_as_mlflow_model: True + + outputs: + mlflow_model_folder: ${{parent.outputs.trained_model}} \ No newline at end of file diff --git a/cli/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification.sh 
b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification.sh new file mode 100644 index 0000000000..8078872a60 --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification.sh @@ -0,0 +1,212 @@ +#!/bin/bash +set -x + +# script inputs +registry_name="azureml-preview" +subscription_id="" +resource_group_name="" +workspace_name="" + +compute_cluster_model_import="sample-model-import-cluster" +compute_cluster_finetune="sample-finetune-cluster-gpu-nc6" +# if above compute cluster does not exist, create it with the following vm size +compute_model_import_sku="Standard_D12" +compute_finetune_sku="Standard_NC6" +# This is the number of GPUs in a single node of the selected 'vm_size' compute. +# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train. +# Setting this to more than the number of GPUs will result in an error. +gpus_per_node=1 + +# huggingFace model +huggingface_model_name="microsoft/beit-base-patch16-224-pt22k-ft22k" +# This is the foundation model for finetuning from azureml system registry +# using the latest version of the model - not working yet +aml_registry_model_name="microsoft-beit-base-patch16-224-pt22k-ft22k" +model_version=1 + +version=$(date +%s) +finetuned_huggingface_model_name="microsoft-beit-base-patch16-224-pt22k-ft22k-fridge-objects-multilabel-classification" +huggingface_endpoint_name="hf-ml-fridge-items-$version" +deployment_sku="Standard_DS3_V2" + +# Deepspeed config +ds_finetune="./deepspeed_configs/zero1.json" + +# Scoring file +huggingface_sample_request_data="./huggingface_sample_request_data.json" + +# finetuning job parameters +finetuning_pipeline_component="transformers_image_classification_pipeline" +# Training settings +number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs available in the compute + +# 1. Install dependencies +pip install azure-ai-ml==1.0.0 +pip install azure-identity +pip install datasets==2.3.2 + +unameOut=$(uname -a) +case "${unameOut}" in + *Microsoft*) OS="WSL";; #must be first since Windows subsystem for linux will have Linux in the name too + *microsoft*) OS="WSL2";; #WARNING: My v2 uses ubuntu 20.4 at the moment slightly different name may not always work + Linux*) OS="Linux";; + Darwin*) OS="Mac";; + CYGWIN*) OS="Cygwin";; + MINGW*) OS="Windows";; + *Msys) OS="Windows";; + *) OS="UNKNOWN:${unameOut}" +esac +if [[ ${OS} == "Mac" ]] && sysctl -n machdep.cpu.brand_string | grep -q 'Apple M1'; then + OS="MacM1" +fi +echo ${OS}; + +jq_version=$(jq --version) +echo ${jq_version}; +if [[ $? == 0 ]]; then + echo "jq already installed" +else + echo "jq not installed" + # Install jq + if [[ ${OS} == "Mac" ]] || [[ ${OS} == "MacM1" ]]; then + # Install jq on mac + brew install jq + elif [[ ${OS} == "WSL" ]] || [[ ${OS} == "WSL2" ]] || [[ ${OS} == "Linux" ]]; then + # Install jq on WSL + sudo apt-get install jq + elif [[ ${OS} == "Windows" ]] || [[ ${OS} == "Cygwin" ]]; then + # Install jq on windows + curl -L -o ./jq.exe https://github.com/stedolan/jq/releases/latest/download/jq-win64.exe + else + echo "Failed to install jq! This might cause issues" + fi +fi + +# 2. 
Setup pre-requisites +az account set -s $subscription_id +workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name" + +# check if $compute_cluster_model_import exists, else create it +if az ml compute show --name $compute_cluster_model_import $workspace_info +then + echo "Compute cluster $compute_cluster_model_import already exists" +else + echo "Creating compute cluster $compute_cluster_model_import" + az ml compute create --name $compute_cluster_model_import --type amlcompute --min-instances 0 --max-instances 2 --size $compute_model_import_sku $workspace_info || { + echo "Failed to create compute cluster $compute_cluster_model_import" + exit 1 + } +fi + +# check if $compute_cluster_finetune exists, else create it +if az ml compute show --name $compute_cluster_finetune $workspace_info +then + echo "Compute cluster $compute_cluster_finetune already exists" +else + echo "Creating compute cluster $compute_cluster_finetune" + az ml compute create --name $compute_cluster_finetune --type amlcompute --min-instances 0 --max-instances 2 --size $compute_finetune_sku $workspace_info || { + echo "Failed to create compute cluster $compute_cluster_finetune" + exit 1 + } +fi + +# check if the finetuning pipeline component exists +if ! az ml component show --name $finetuning_pipeline_component --label latest --registry-name $registry_name +then + echo "Finetuning pipeline component $finetuning_pipeline_component does not exist" + exit 1 +fi + +# 3. Check if the model exists in the registry +# need to confirm model show command works for registries outside the tenant (aka system registry) +if ! az ml model show --name $aml_registry_model_name --version $model_version --registry-name $registry_name +then + echo "Model $aml_registry_model_name:$model_version does not exist in registry $registry_name" + exit 1 +fi +# 4. Prepare data +python prepare_data.py +# training data +train_data="./data/training-mltable-folder" +# validation data +validation_data="./data/validation-mltable-folder" + +# Check if training data, validation data exist +if [ ! -d $train_data ]; then + echo "Training data $train_data does not exist" + exit 1 +fi +if [ ! -d $validation_data ]; then + echo "Validation data $validation_data does not exist" + exit 1 +fi + +# 5. Submit finetuning job using pipeline.yaml for a HuggingFace Transformers model + +# # Need to switch to using latest version for model, currently blocked with a bug. + +# # If you want to use a HuggingFace model, specify the inputs.model_name instead of inputs.mlflow_model_path.path like below +# inputs.model_name=$huggingface_model_name + +huggingface_parent_job=$( az ml job create \ + --file "./hftransformers-fridgeobjects-multilabel-classification-pipeline.yaml" \ + $workspace_info \ + --set jobs.huggingface_transformers_model_finetune_job.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \ + inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$aml_registry_model_name/versions/$model_version" \ + inputs.training_data.path=$train_data \ + inputs.validation_data.path=$validation_data \ + inputs.compute_model_import=$compute_cluster_model_import \ + inputs.compute_finetune=$compute_cluster_finetune + ) || { + echo "Failed to submit finetuning job" + exit 1 + } + +huggingface_parent_job_name=$(echo "$huggingface_parent_job" | jq -r ".display_name") +az ml job stream --name $huggingface_parent_job_name $workspace_info || { + echo "job stream failed"; exit 1; +} + +# 6. 
Create model in workspace from train job output for fine-tuned HuggingFace Transformers model +az ml model create --name $finetuned_huggingface_model_name --version $version --type mlflow_model \ + --path azureml://jobs/$huggingface_parent_job_name/outputs/trained_model $workspace_info || { + echo "model create in workspace failed"; exit 1; +} + +# 7. Deploy the fine-tuned HuggingFace Transformers model to an endpoint +# create online endpoint +az ml online-endpoint create --name $huggingface_endpoint_name $workspace_info || { + echo "endpoint create failed"; exit 1; +} + +# deploy model from registry to endpoint in workspace +az ml online-deployment create --file ./deploy.yaml $workspace_info --all-traffic --set \ + endpoint_name=$huggingface_endpoint_name model=azureml:$finetuned_huggingface_model_name:$version \ + instance_type=$deployment_sku || { + echo "deployment create failed"; exit 1; +} + +# 8. Try a sample scoring request on the deployed HuggingFace Transformers model + +# Check if scoring data file exists +if [ -f $huggingface_sample_request_data ]; then + echo "Invoking endpoint $huggingface_endpoint_name with following input:\n\n" + cat $huggingface_sample_request_data + echo "\n\n" +else + echo "Scoring file $huggingface_sample_request_data does not exist" + exit 1 +fi + +az ml online-endpoint invoke --name $huggingface_endpoint_name --request-file $huggingface_sample_request_data $workspace_info || { + echo "endpoint invoke failed"; exit 1; +} + +# 9. Delete the endpoint +az ml online-endpoint delete --name $huggingface_endpoint_name $workspace_info --yes || { + echo "endpoint delete failed"; exit 1; +} + +# 10. Delete the request data file + +rm $huggingface_sample_request_data diff --git a/cli/foundation-models/system/finetune/image-classification/multilabel-classification/prepare_data.py b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/prepare_data.py new file mode 100644 index 0000000000..a93169242c --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/prepare_data.py @@ -0,0 +1,196 @@ +import argparse +import base64 +import json +import os +import urllib +from zipfile import ZipFile + +from azure.identity import InteractiveBrowserCredential +from azure.ai.ml import MLClient +from azure.ai.ml.entities import Data +from azure.ai.ml.constants import AssetTypes + + +def create_ml_table_file(filename): + """Create ML Table definition""" + + return ( + "paths:\n" + " - file: ./{0}\n" + "transformations:\n" + " - read_json_lines:\n" + " encoding: utf8\n" + " invalid_lines: error\n" + " include_path_column: false\n" + " - convert_column_types:\n" + " - columns: image_url\n" + " column_type: stream_info" + ).format(filename) + + +def save_ml_table_file(output_path, mltable_file_contents): + with open(os.path.join(output_path, "MLTable"), "w") as f: + f.write(mltable_file_contents) + + +def create_jsonl_and_mltable_files(uri_folder_data_path, dataset_dir): + print("Creating jsonl files") + + dataset_parent_dir = os.path.dirname(dataset_dir) + + # We'll copy each JSONL file within its related MLTable folder + training_mltable_path = os.path.join(dataset_parent_dir, "training-mltable-folder") + validation_mltable_path = os.path.join( + dataset_parent_dir, "validation-mltable-folder" + ) + + # Create MLTable folders, if they don't exist + os.makedirs(training_mltable_path, exist_ok=True) + os.makedirs(validation_mltable_path, exist_ok=True) + + train_validation_ratio = 5 + + # Path to the 
training and validation files + train_annotations_file = os.path.join( + training_mltable_path, "train_annotations.jsonl" + ) + validation_annotations_file = os.path.join( + validation_mltable_path, "validation_annotations.jsonl" + ) + + # Path to the labels file. + label_file = os.path.join(dataset_dir, "labels.csv") + + # Baseline of json line dictionary + json_line_sample = {"image_url": uri_folder_data_path, "label": ""} + + index = 0 + # Read each annotation and convert it to jsonl line + with open(train_annotations_file, "w") as train_f: + with open(validation_annotations_file, "w") as validation_f: + with open(label_file, "r") as labels: + for i, line in enumerate(labels): + # Skipping the title line and any empty lines. + if i == 0 or len(line.strip()) == 0: + continue + line_split = line.strip().split(",") + if len(line_split) != 2: + print("Skipping the invalid line: {}".format(line)) + continue + json_line = dict(json_line_sample) + json_line["image_url"] += f"images/{line_split[0]}" + json_line["label"] = line_split[1].strip().split(" ") + + if i % train_validation_ratio == 0: + # validation annotation + validation_f.write(json.dumps(json_line) + "\n") + else: + # train annotation + train_f.write(json.dumps(json_line) + "\n") + print("done") + + # Create and save train mltable + train_mltable_file_contents = create_ml_table_file( + os.path.basename(train_annotations_file) + ) + save_ml_table_file(training_mltable_path, train_mltable_file_contents) + + # Create and save validation mltable + validation_mltable_file_contents = create_ml_table_file( + os.path.basename(validation_annotations_file) + ) + save_ml_table_file(validation_mltable_path, validation_mltable_file_contents) + + +def upload_data_and_create_jsonl_mltable_files(ml_client, dataset_parent_dir): + + # Create directory, if it does not exist + os.makedirs(dataset_parent_dir, exist_ok=True) + + # download data + print("Downloading data.") + download_url = "https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/multilabelFridgeObjects.zip" + + # Extract current dataset name from dataset url + dataset_name = os.path.basename(download_url).split(".")[0] + # Get dataset path for later use + dataset_dir = os.path.join(dataset_parent_dir, dataset_name) + + # Get the name of zip file + data_file = os.path.join(dataset_parent_dir, f"{dataset_name}.zip") + + # Download data from public url + urllib.request.urlretrieve(download_url, filename=data_file) + + # extract files + with ZipFile(data_file, "r") as zip: + print("extracting files...") + zip.extractall(path=dataset_parent_dir) + print("done") + # delete zip file + os.remove(data_file) + + # Upload data and create a data asset URI folder + print("Uploading data to blob storage") + my_data = Data( + path=dataset_dir, + type=AssetTypes.URI_FOLDER, + description="Fridge-items images", + name="fridge-items-images-2", + ) + + uri_folder_data_asset = ml_client.data.create_or_update(my_data) + + print(uri_folder_data_asset) + print("") + print("Path to folder in Blob Storage:") + print(uri_folder_data_asset.path) + create_jsonl_and_mltable_files( + uri_folder_data_path=uri_folder_data_asset.path, dataset_dir=dataset_dir + ) + + +def read_image(image_path): + with open(image_path, "rb") as f: + return f.read() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Prepare data for image classification" + ) + + parser.add_argument("--subscription", type=str, help="Subscription ID") + parser.add_argument("--group", type=str, 
help="Resource group name") + parser.add_argument("--workspace", type=str, help="Workspace name") + parser.add_argument( + "--data_path", type=str, default="./data", help="Dataset location" + ) + + args, unknown = parser.parse_known_args() + args_dict = vars(args) + + credential = InteractiveBrowserCredential() + ml_client = None + try: + ml_client = MLClient.from_config(credential) + except Exception as ex: + # Enter details of your AML workspace + subscription_id = args.subscription + resource_group = args.group + workspace = args.workspace + ml_client = MLClient(credential, subscription_id, resource_group, workspace) + + upload_data_and_create_jsonl_mltable_files( + ml_client=ml_client, dataset_parent_dir=args.data_path + ) + + sample_image = os.path.join(args.data_path, "multilabelFridgeObjects", "images", "56.jpg") + huggingface_request_json = { + "inputs": { + "image": [base64.encodebytes(read_image(sample_image)).decode("utf-8")], + } + } + huggingface_request_file_name = "huggingface_sample_request_data.json" + with open(huggingface_request_file_name, "w") as huggingface_request_file: + json.dump(huggingface_request_json, huggingface_request_file) diff --git a/cli/foundation-models/system/finetune/image-classification/multilabel-classification/readme.md b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/readme.md new file mode 100644 index 0000000000..91d2a893ee --- /dev/null +++ b/cli/foundation-models/system/finetune/image-classification/multilabel-classification/readme.md @@ -0,0 +1,14 @@ +# Fine-tuning a model for Image Multi-label Classification task + +You can launch a sample pipeline for image multi-label classification using `transformers_image_classification_pipeline` component. + +For using this component, run the shell script file `bash ./hftransformers-fridgeobjects-multilabel-classification.sh`. 
+
+Currently, the following models are supported:
+| Model Name | Source |
+| ------ | ---------- |
+| [microsoft-beit-base-patch16-224-pt22k-ft22k](https://ml.azure.com/registries/azureml-preview/models/microsoft-beit-base-patch16-224-pt22k-ft22k/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |
+| [microsoft-swinv2-base-patch4-window12-192-22k](https://ml.azure.com/registries/azureml-preview/models/microsoft-swinv2-base-patch4-window12-192-22k/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |
+| [facebook-deit-base-patch16-224](https://ml.azure.com/registries/azureml-preview/models/facebook-deit-base-patch16-224/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |
+| [google-vit-base-patch16-224](https://ml.azure.com/registries/azureml-preview/models/google-vit-base-patch16-224/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |
+| [Image classification models from Huggingface](https://huggingface.co/models?pipeline_tag=image-classification&sort=downloads) | HuggingFace |
diff --git a/cli/foundation-models/system/inference/image-classification/deploy.yaml b/cli/foundation-models/system/inference/image-classification/deploy.yaml
new file mode 100644
index 0000000000..336e5519f5
--- /dev/null
+++ b/cli/foundation-models/system/inference/image-classification/deploy.yaml
@@ -0,0 +1,6 @@
+$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json
+name: demo
+instance_type: Standard_DS3_v2
+instance_count: 1
+request_settings:
+  request_timeout_ms: 60000
\ No newline at end of file
diff --git a/cli/foundation-models/system/inference/image-classification/image-classification-online-endpoint.sh b/cli/foundation-models/system/inference/image-classification/image-classification-online-endpoint.sh
new file mode 100644
index 0000000000..da32880ce7
--- /dev/null
+++ b/cli/foundation-models/system/inference/image-classification/image-classification-online-endpoint.sh
@@ -0,0 +1,80 @@
+set -x
+# the commands in this file map to steps in this notebook: https://aka.ms/azureml-infer-sdk-image-classification
+# the sample scoring file is available in the same folder as the above notebook
+
+# script inputs
+registry_name="azureml-preview"
+subscription_id=""
+resource_group_name=""
+workspace_name=""
+
+# This is the model from system registry that needs to be deployed
+model_name="microsoft-beit-base-patch16-224-pt22k-ft22k"
+# using the latest version of the model - not working yet
+model_version=1
+
+version=$(date +%s)
+endpoint_name="image-classification-$version"
+
+# todo: fetch deployment_sku from the min_inference_sku tag of the model
+deployment_sku="Standard_DS3_v2"
+
+# Prepare data for deployment
+python ./prepare_data.py --is_multilabel 0
+# sample_request_data
+sample_request_data="./sample_request_data.json"
+
+# 1. Set up prerequisites
+if [ "$subscription_id" = "" ] || \
+   [ "$resource_group_name" = "" ] || \
+   [ "$workspace_name" = "" ]; then
+    echo "Please update the script with the subscription_id, resource_group_name and workspace_name"
+    exit 1
+fi
+
+az account set -s $subscription_id
+workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"
+
+# 2. Check if the model exists in the registry
+# need to confirm model show command works for registries outside the tenant (aka system registry)
+if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name
+then
+    echo "Model $model_name:$model_version does not exist in registry $registry_name"
+    exit 1
+fi
+
+# 3. Deploy the model to an endpoint
+# create online endpoint
+az ml online-endpoint create --name $endpoint_name $workspace_info || {
+    echo "endpoint create failed"; exit 1;
+}
+
+# deploy model from registry to endpoint in workspace
+az ml online-deployment create --file deploy.yaml $workspace_info --all-traffic --set \
+  endpoint_name=$endpoint_name model=azureml://registries/$registry_name/models/$model_name/versions/$model_version \
+  instance_type=$deployment_sku || {
+    echo "deployment create failed"; exit 1;
+}
+
+# 4. Try a sample scoring request
+
+# Check if scoring data file exists
+if [ -f $sample_request_data ]; then
+    echo -e "Invoking endpoint $endpoint_name with the following input:\n\n"
+    cat $sample_request_data
+    echo -e "\n\n"
+else
+    echo "Scoring file $sample_request_data does not exist"
+    exit 1
+fi
+
+az ml online-endpoint invoke --name $endpoint_name --request-file $sample_request_data $workspace_info || {
+    echo "endpoint invoke failed"; exit 1;
+}
+
+# 5. Delete the endpoint and sample_request_data.json
+az ml online-endpoint delete --name $endpoint_name $workspace_info --yes || {
+    echo "endpoint delete failed"; exit 1;
+}
+
+rm $sample_request_data
\ No newline at end of file
diff --git a/cli/foundation-models/system/inference/image-classification/prepare_data.py b/cli/foundation-models/system/inference/image-classification/prepare_data.py
new file mode 100644
index 0000000000..db4893e78b
--- /dev/null
+++ b/cli/foundation-models/system/inference/image-classification/prepare_data.py
@@ -0,0 +1,76 @@
+import argparse
+import base64
+import json
+import os
+import urllib.request
+from zipfile import ZipFile
+
+
+def download_and_unzip(dataset_parent_dir: str, is_multilabel_dataset: int):
+
+    # Create directory, if it does not exist
+    os.makedirs(dataset_parent_dir, exist_ok=True)
+
+    # download data
+    if is_multilabel_dataset == 0:
+        download_url = "https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/fridgeObjects.zip"
+    else:
+        download_url = "https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/multilabelFridgeObjects.zip"
+    print(f"Downloading data from {download_url}")
+
+    # Extract current dataset name from dataset url
+    dataset_name = os.path.basename(download_url).split(".")[0]
+
+    # Get the name of zip file
+    data_file = os.path.join(dataset_parent_dir, f"{dataset_name}.zip")
+
+    # Download data from public url
+    urllib.request.urlretrieve(download_url, filename=data_file)
+
+    # extract files
+    with ZipFile(data_file, "r") as zip:
+        print("extracting files...")
+        zip.extractall(path=dataset_parent_dir)
+        print("done")
+    # delete zip file
+    os.remove(data_file)
+
+
+def read_image(image_path):
+    with open(image_path, "rb") as f:
+        return f.read()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Prepare data for image classification"
+    )
+    parser.add_argument(
+        "--data_path", type=str, default="./data", help="Dataset location"
+    )
+    parser.add_argument(
+        "--is_multilabel", type=int, default=0, help="Is multilabel dataset"
+    )
+
+    args, unknown = parser.parse_known_args()
+    args_dict = vars(args)
+
+    download_and_unzip(
+        dataset_parent_dir=args.data_path,
+        is_multilabel_dataset=args.is_multilabel,
+    )
+
+    if args.is_multilabel == 0:
+        sample_image = os.path.join(args.data_path, "fridgeObjects", 
"milk_bottle", "99.jpg") + else: + sample_image = os.path.join(args.data_path, "multilabelFridgeObjects", "images", "56.jpg") + + request_json = { + "inputs": { + "image": [base64.encodebytes(read_image(sample_image)).decode("utf-8")], + } + } + + request_file_name = "sample_request_data.json" + + with open(request_file_name, "w") as request_file: + json.dump(request_json, request_file) \ No newline at end of file diff --git a/sdk/python/foundation-models/system/finetune/image-classification/multiclass-classification/deepspeed_configs/zero1.json b/sdk/python/foundation-models/system/finetune/image-classification/multiclass-classification/deepspeed_configs/zero1.json new file mode 100644 index 0000000000..1d2b843bf9 --- /dev/null +++ b/sdk/python/foundation-models/system/finetune/image-classification/multiclass-classification/deepspeed_configs/zero1.json @@ -0,0 +1,42 @@ +{ + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 200000000, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 200000000, + "contiguous_gradients": false, + "cpu_offload": false + }, + "zero_allow_untested_optimizer": true, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "gradient_accumulation_steps": "auto", + "wall_clock_breakdown": false +} diff --git a/sdk/python/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification.ipynb b/sdk/python/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification.ipynb new file mode 100644 index 0000000000..c581ce498d --- /dev/null +++ b/sdk/python/foundation-models/system/finetune/image-classification/multiclass-classification/hftransformers-fridgeobjects-multiclass-classification.ipynb @@ -0,0 +1,1006 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multi-class Image Classification using transformers specific pipeline component\n", + "\n", + "This sample shows how to use `transformers_image_classification_pipeline` component from the `azureml-preview` system registry to fine tune a model for multi-class image classification task using fridgeObjects Dataset. We then deploy the fine tuned model to an online endpoint for real time inference.\n", + "\n", + "### Training data\n", + "We will use the [fridgeObjects](https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/fridgeObjects.zip) dataset.\n", + "\n", + "### Model\n", + "We will use the `microsoft-beit-base-patch16-224-pt22k-ft22k` model in this notebook. If you need to fine tune a model that is available on HuggingFace, but not available in `azureml-preview` system registry, you can either register the model and use the registered model or use the `model_name` parameter to instruct the components to pull the model directly from HuggingFace.\n", + "\n", + "### Outline\n", + "1. Install dependencies\n", + "2. Setup pre-requisites such as compute\n", + "3. 
Pick a model to fine tune\n",
+ "4. Prepare dataset for finetuning the model\n",
+ "5. Submit the fine tuning job using transformers specific image-classification component\n",
+ "6. Review training and evaluation metrics\n",
+ "7. Register the fine tuned model\n",
+ "8. Deploy the fine tuned model for real time inference\n",
+ "9. Test the deployed endpoint\n",
+ "10. Clean up resources"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Install dependencies\n",
+ "Before starting off, if you are running the notebook on Azure Machine Learning Studio or running it locally for the first time, you will need the following packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! pip install azure-ai-ml==1.0.0\n",
+ "! pip install azure-identity\n",
+ "! pip install datasets==2.3.2"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Setup pre-requisites"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.1 Connect to Azure Machine Learning workspace\n",
+ "\n",
+ "Before we dive into the code, you'll need to connect to your workspace. The workspace is the top-level resource for Azure Machine Learning, providing a centralized place to work with all the artifacts you create when you use Azure Machine Learning.\n",
+ "\n",
+ "We are using `DefaultAzureCredential` to get access to the workspace. `DefaultAzureCredential` should be capable of handling most scenarios. If you want to learn more about other available credentials, go to [set up authentication doc](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk), [azure-identity reference doc](https://learn.microsoft.com/en-us/python/api/azure-identity/azure.identity?view=azure-python).\n",
+ "\n",
+ "Replace `AML_WORKSPACE_NAME`, `RESOURCE_GROUP` and `SUBSCRIPTION_ID` with their respective values in the below cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml import MLClient\n",
+ "from azure.identity import DefaultAzureCredential\n",
+ "\n",
+ "\n",
+ "experiment_name = (\n",
+ "    \"AzureML-Train-Finetune-Vision-MultiClass-Samples\"  # can rename to any valid name\n",
+ ")\n",
+ "\n",
+ "credential = DefaultAzureCredential()\n",
+ "workspace_ml_client = None\n",
+ "try:\n",
+ "    workspace_ml_client = MLClient.from_config(credential)\n",
+ "    subscription_id = workspace_ml_client.subscription_id\n",
+ "    resource_group = workspace_ml_client.resource_group_name\n",
+ "    workspace_name = workspace_ml_client.workspace_name\n",
+ "except Exception as ex:\n",
+ "    print(ex)\n",
+ "    # Enter details of your AML workspace\n",
+ "    subscription_id = \"SUBSCRIPTION_ID\"\n",
+ "    resource_group = \"RESOURCE_GROUP\"\n",
+ "    workspace_name = \"AML_WORKSPACE_NAME\"\n",
+ "\n",
+ "workspace_ml_client = MLClient(\n",
+ "    credential, subscription_id, resource_group, workspace_name\n",
+ ")\n",
+ "registry_ml_client = MLClient(\n",
+ "    credential,\n",
+ "    subscription_id,\n",
+ "    resource_group,\n",
+ "    registry_name=\"azureml-preview\",\n",
+ ")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.2 Create compute\n",
+ "\n",
+ "In order to finetune a model on Azure Machine Learning studio, you will need to create a compute resource first. 
**Creating a compute will take 3-4 minutes.** \n", + "\n", + "For additional references, see [Azure Machine Learning in a Day](https://github.com/Azure/azureml-examples/blob/main/tutorials/azureml-in-a-day/azureml-in-a-day.ipynb). " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Create CPU compute for model selection component" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.entities import AmlCompute\n", + "from azure.core.exceptions import ResourceNotFoundError\n", + "\n", + "model_import_cluster_name = \"sample-model-import-cluster\"\n", + "try:\n", + " _ = workspace_ml_client.compute.get(model_import_cluster_name)\n", + " print(\"Found existing compute target.\")\n", + "except ResourceNotFoundError:\n", + " print(\"Creating a new compute target...\")\n", + " compute_config = AmlCompute(\n", + " name=model_import_cluster_name,\n", + " type=\"amlcompute\",\n", + " size=\"Standard_D12_v2\",\n", + " idle_time_before_scale_down=120,\n", + " min_instances=0,\n", + " max_instances=4,\n", + " )\n", + " workspace_ml_client.begin_create_or_update(compute_config).result()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Create GPU compute for finetune component\n", + "\n", + "The list of GPU machines can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "finetune_cluster_name = \"sample-finetune-cluster-gpu-nc6\"\n", + "\n", + "try:\n", + " _ = workspace_ml_client.compute.get(finetune_cluster_name)\n", + " print(\"Found existing compute target.\")\n", + "except ResourceNotFoundError:\n", + " print(\"Creating a new compute target...\")\n", + " compute_config = AmlCompute(\n", + " name=finetune_cluster_name,\n", + " type=\"amlcompute\",\n", + " size=\"Standard_NC6\",\n", + " idle_time_before_scale_down=120,\n", + " min_instances=0,\n", + " max_instances=4,\n", + " )\n", + " workspace_ml_client.begin_create_or_update(compute_config).result()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Pick a foundation model to fine tune\n", + "\n", + "We will use the `microsoft-beit-base-patch16-224-pt22k-ft22k` model in this notebook. 
If you need to fine tune a model that is available on HuggingFace, but not available in `azureml` system registry, you can either register the model and use the registered model or use the `model_name` parameter to instruct the components to pull the model directly from HuggingFace.\n", + "\n", + "Currently following models are supported:\n", + "\n", + "| Model Name | Source |\n", + "| ------ | ---------- |\n", + "| [microsoft-beit-base-patch16-224-pt22k-ft22k](https://ml.azure.com/registries/azureml-preview/models/microsoft-beit-base-patch16-224-pt22k-ft22k/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |\n", + "| [microsoft-swinv2-base-patch4-window12-192-22k](https://ml.azure.com/registries/azureml-preview/models/microsoft-swinv2-base-patch4-window12-192-22k/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |\n", + "| [facebook-deit-base-patch16-224](https://ml.azure.com/registries/azureml-preview/models/facebook-deit-base-patch16-224/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |\n", + "| [google-vit-base-patch16-224](https://ml.azure.com/registries/azureml-preview/models/google-vit-base-patch16-224/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |\n", + "| [Image classification models from Huggingface](https://huggingface.co/models?pipeline_tag=image-classification&sort=downloads)| HuggingFace |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "huggingface_model_name = \"microsoft/beit-base-patch16-224-pt22k-ft22k\"\n", + "\n", + "aml_registry_model_name = \"microsoft-beit-base-patch16-224-pt22k-ft22k\"\n", + "model_version = \"1\"\n", + "foundation_model = registry_ml_client.models.get(aml_registry_model_name, model_version)\n", + "print(f\"\\n\\nUsing model name: {foundation_model.name}, version: {foundation_model.version}, id: {foundation_model.id} for fine tuning\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Prepare the dataset for fine-tuning the model\n", + "\n", + "We will use the [fridgeObjects](https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/fridgeObjects.zip) dataset. The fridge object dataset is stored in a directory. There are four different folders inside:\n", + "- /water_bottle\n", + "- /milk_bottle\n", + "- /carton\n", + "- /can\n", + "\n", + "This is the most common data format for multiclass image classification. Each folder title corresponds to the image label for the images contained inside. \n", + "\n", + "#### 4.1 Download the Data\n", + "We first download and unzip the data locally. By default, the data would be downloaded in `./data` folder in current directory. \n", + "If you prefer to download the data at a different location, update it in `dataset_parent_dir = ...` in the next cell." 
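+ ,
+ "\n",
+ "Once the download cell below has run, you can sanity-check the extracted layout; a quick sketch (assuming the default `./data` location):\n",
+ "\n",
+ "```python\n",
+ "import os\n",
+ "\n",
+ "# expect one sub-folder per class: can, carton, milk_bottle, water_bottle\n",
+ "print(sorted(os.listdir(os.path.join(\"./data\", \"fridgeObjects\"))))\n",
+ "```"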
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import urllib.request\n",
+ "from zipfile import ZipFile\n",
+ "\n",
+ "# Change to a different location if you prefer\n",
+ "dataset_parent_dir = \"./data\"\n",
+ "\n",
+ "# create data folder if it doesn't exist.\n",
+ "os.makedirs(dataset_parent_dir, exist_ok=True)\n",
+ "\n",
+ "# download data\n",
+ "download_url = \"https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/fridgeObjects.zip\"\n",
+ "\n",
+ "# Extract current dataset name from dataset url\n",
+ "dataset_name = os.path.split(download_url)[-1].split(\".\")[0]\n",
+ "# Get dataset path for later use\n",
+ "dataset_dir = os.path.join(dataset_parent_dir, dataset_name)\n",
+ "\n",
+ "# Get the data zip file path\n",
+ "data_file = os.path.join(dataset_parent_dir, f\"{dataset_name}.zip\")\n",
+ "\n",
+ "# Download the dataset\n",
+ "urllib.request.urlretrieve(download_url, filename=data_file)\n",
+ "\n",
+ "# extract files\n",
+ "with ZipFile(data_file, \"r\") as zip:\n",
+ "    print(\"extracting files...\")\n",
+ "    zip.extractall(path=dataset_parent_dir)\n",
+ "    print(\"done\")\n",
+ "# delete zip file\n",
+ "os.remove(data_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from IPython.display import Image\n",
+ "\n",
+ "sample_image = os.path.join(dataset_dir, \"milk_bottle\", \"99.jpg\")\n",
+ "Image(filename=sample_image)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 4.2 Upload the images to Datastore through an AML Data asset (URI Folder)\n",
+ "\n",
+ "In order to use the data for training in Azure ML, we upload it to our default Azure Blob Storage of our Azure ML Workspace."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Uploading image files by creating a 'data asset URI FOLDER':\n",
+ "\n",
+ "from azure.ai.ml.entities import Data\n",
+ "from azure.ai.ml.constants import AssetTypes\n",
+ "\n",
+ "my_data = Data(\n",
+ "    path=dataset_dir,\n",
+ "    type=AssetTypes.URI_FOLDER,\n",
+ "    description=\"Fridge-items images\",\n",
+ "    name=\"fridge-items-images-multiclass\",\n",
+ ")\n",
+ "\n",
+ "uri_folder_data_asset = workspace_ml_client.data.create_or_update(my_data)\n",
+ "\n",
+ "print(uri_folder_data_asset)\n",
+ "print(\"\")\n",
+ "print(\"Path to folder in Blob Storage:\")\n",
+ "print(uri_folder_data_asset.path)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 4.3 Convert the downloaded data to JSONL\n",
+ "\n",
+ "For documentation on preparing the datasets beyond this notebook, please refer to the [documentation on how to prepare datasets](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-prepare-datasets-for-automl-images).\n",
+ "\n",
+ "In order to use this data to create an AzureML MLTable, we first need to convert it to the required JSONL format. The following script creates two `.jsonl` files (one for training and one for validation) in the corresponding MLTable folder. The train / validation ratio corresponds to 20% of the data going into the validation file. 
For further details on the jsonl files used for the image classification task in automated ML, please refer to the [data schema documentation for multi-class image classification task](https://learn.microsoft.com/en-us/azure/machine-learning/reference-automl-images-schema#image-classification-binarymulti-class)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "import os\n",
+ "\n",
+ "# We'll copy each JSONL file within its related MLTable folder\n",
+ "training_mltable_path = os.path.join(dataset_parent_dir, \"training-mltable-folder\")\n",
+ "validation_mltable_path = os.path.join(dataset_parent_dir, \"validation-mltable-folder\")\n",
+ "\n",
+ "# First, let's create the folders if they don't exist\n",
+ "os.makedirs(training_mltable_path, exist_ok=True)\n",
+ "os.makedirs(validation_mltable_path, exist_ok=True)\n",
+ "\n",
+ "train_validation_ratio = 5\n",
+ "\n",
+ "# Path to the training and validation files\n",
+ "train_annotations_file = os.path.join(training_mltable_path, \"train_annotations.jsonl\")\n",
+ "validation_annotations_file = os.path.join(\n",
+ "    validation_mltable_path, \"validation_annotations.jsonl\"\n",
+ ")\n",
+ "\n",
+ "# Baseline of json line dictionary\n",
+ "json_line_sample = {\n",
+ "    \"image_url\": uri_folder_data_asset.path,\n",
+ "    \"label\": \"\",\n",
+ "}\n",
+ "\n",
+ "index = 0\n",
+ "# Scan each subdirectory and generate a jsonl line per image, distributed across train and valid JSONL files\n",
+ "with open(train_annotations_file, \"w\") as train_f:\n",
+ "    with open(validation_annotations_file, \"w\") as validation_f:\n",
+ "        for class_name in os.listdir(dataset_dir):\n",
+ "            sub_dir = os.path.join(dataset_dir, class_name)\n",
+ "            if not os.path.isdir(sub_dir):\n",
+ "                continue\n",
+ "\n",
+ "            # Scan each subdirectory\n",
+ "            print(f\"Parsing {sub_dir}\")\n",
+ "            for image in os.listdir(sub_dir):\n",
+ "                json_line = dict(json_line_sample)\n",
+ "                json_line[\"image_url\"] += f\"{class_name}/{image}\"\n",
+ "                json_line[\"label\"] = class_name\n",
+ "\n",
+ "                if index % train_validation_ratio == 0:\n",
+ "                    # validation annotation\n",
+ "                    validation_f.write(json.dumps(json_line) + \"\\n\")\n",
+ "                else:\n",
+ "                    # train annotation\n",
+ "                    train_f.write(json.dumps(json_line) + \"\\n\")\n",
+ "                index += 1"
+ ]
+ },
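+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For illustration, a generated annotation line looks roughly like the following (the datastore prefix in `image_url` is abbreviated here):\n",
+ "\n",
+ "```json\n",
+ "{\"image_url\": \"azureml://subscriptions/.../paths/fridgeObjects/milk_bottle/99.jpg\", \"label\": \"milk_bottle\"}\n",
+ "```"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 4.4 Create MLTable data input\n",
+ "\n",
+ "Create MLTable data input using the jsonl files created above.\n",
+ "\n",
+ "For documentation on creating your own MLTable assets for jobs beyond this notebook, please refer to below resources\n",
+ "- [MLTable YAML Schema](https://learn.microsoft.com/en-us/azure/machine-learning/reference-yaml-mltable) - covers how to write MLTable YAML, which is required for each MLTable asset.\n",
+ "- [Create MLTable data asset](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-data-assets?tabs=Python-SDK#create-a-mltable-data-asset) - covers how to create MLTable data asset. 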
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def create_ml_table_file(filename):\n", + " \"\"\"Create ML Table definition\"\"\"\n", + "\n", + " return (\n", + " \"paths:\\n\"\n", + " \" - file: ./{0}\\n\"\n", + " \"transformations:\\n\"\n", + " \" - read_json_lines:\\n\"\n", + " \" encoding: utf8\\n\"\n", + " \" invalid_lines: error\\n\"\n", + " \" include_path_column: false\\n\"\n", + " \" - convert_column_types:\\n\"\n", + " \" - columns: image_url\\n\"\n", + " \" column_type: stream_info\"\n", + " ).format(filename)\n", + "\n", + "\n", + "def save_ml_table_file(output_path, mltable_file_contents):\n", + " with open(os.path.join(output_path, \"MLTable\"), \"w\") as f:\n", + " f.write(mltable_file_contents)\n", + "\n", + "\n", + "# Create and save train mltable\n", + "train_mltable_file_contents = create_ml_table_file(\n", + " os.path.basename(train_annotations_file)\n", + ")\n", + "save_ml_table_file(training_mltable_path, train_mltable_file_contents)\n", + "\n", + "# Create and save validation mltable\n", + "validation_mltable_file_contents = create_ml_table_file(\n", + " os.path.basename(validation_annotations_file)\n", + ")\n", + "save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Submit the fine tuning job using `transformers_image_classification_pipeline` component\n", + " \n", + "Create the job that uses the `transformers_image_classification_pipeline` component for multi-class image-classification task. [Learn more]() about all the parameters supported for fine tuning." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.1 Create component" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "FINETUNE_PIPELINE_COMPONENT_NAME = \"transformers_image_classification_pipeline\"\n", + "pipeline_component_transformers_func = registry_ml_client.components.get(\n", + " name=FINETUNE_PIPELINE_COMPONENT_NAME, label=\"latest\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.2 Create arguments to be passed to `transformers_image_classification_pipeline` component\n", + "\n", + "The `transformers_image_classification_pipeline` component consists of model selection and finetuning components. 
The detailed arguments for each component can be found at following README files:\n", + "- [Model Import Component](../../docs/component_docs/image_finetune/transformers_model_import_component.md)\n", + "- [Finetune Component](../../docs/component_docs/image_finetune/transformers_finetune_component.md)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_component_args = {\n", + " # model_selection_args\n", + " \"model_family\": \"HuggingFaceImage\",\n", + " # # specify the model_name instead of mlflow_model if you want to use a model from the huggingface hub\n", + " \"mlflow_model\": foundation_model,\n", + " # \"model_name\": huggingface_model_name,\n", + " # finetune_args\n", + " \"auto_hyperparameter_selection\": False,\n", + " \"image_width\": 224,\n", + " \"image_height\": 224,\n", + " \"task_name\": \"image-classification\",\n", + " \"apply_augmentations\": True,\n", + " \"number_of_workers\": 8,\n", + " \"apply_deepspeed\": False,\n", + " \"deepspeed_config\": \"./deepspeed_configs/zero1.json\",\n", + " \"apply_ort\": False,\n", + " \"number_of_epochs\": 15,\n", + " \"max_steps\": -1,\n", + " \"training_batch_size\": 4,\n", + " \"validation_batch_size\": 4,\n", + " \"auto_find_batch_size\": False,\n", + " \"learning_rate\": 5e-5,\n", + " \"learning_rate_scheduler\": \"warmup_linear\",\n", + " \"warmup_steps\": 0,\n", + " \"optimizer\": \"adamw_hf\",\n", + " \"weight_decay\": 0.0,\n", + " \"gradient_accumulation_step\": 1,\n", + " \"precision\": \"32\",\n", + " \"metric_for_best_model\": \"accuracy\",\n", + " \"label_smoothing_factor\": 0.0,\n", + " \"random_seed\": 42,\n", + " \"evaluation_strategy\": \"epoch\",\n", + " \"evaluation_steps\": 500,\n", + " \"logging_strategy\": \"epoch\",\n", + " \"logging_steps\": 500,\n", + " \"save_strategy\": \"epoch\",\n", + " \"save_steps\": 500,\n", + " \"save_total_limit\": -1,\n", + " \"early_stopping\": False,\n", + " \"early_stopping_patience\": 1,\n", + " \"max_grad_norm\": 1.0,\n", + " \"resume_from_checkpoint\": False,\n", + " \"save_as_mlflow_model\": True,\n", + "}\n", + "number_of_gpu_to_use_finetuning = 1\n", + "num_nodes_finetune = 1\n", + "\n", + "# Ensure that the user provides only one of mlflow_model or model_name\n", + "if pipeline_component_args.get(\"mlflow_model\") is None and pipeline_component_args.get(\"model_name\") is None:\n", + " raise ValueError(\n", + " \"You must specify either mlflow_model or model_name for the model to finetune\"\n", + " )\n", + "if pipeline_component_args.get(\"mlflow_model\") is not None and pipeline_component_args.get(\"model_name\") is not None:\n", + " raise ValueError(\n", + " \"You must specify ONLY one of mlflow_model and model_name for the model to finetune\"\n", + " )\n", + "elif pipeline_component_args.get(\"mlflow_model\") is None and pipeline_component_args.get(\"model_name\") is not None:\n", + " use_model_name = huggingface_model_name\n", + "elif pipeline_component_args.get(\"mlflow_model\") is not None and pipeline_component_args.get(\"model_name\") is None:\n", + " use_model_name = aml_registry_model_name\n", + "print(f\"Finetuning model {use_model_name}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.3 Utility function to create pipeline using `transformers_image_classification_pipeline` component" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.dsl import 
pipeline\n",
+ "from azure.ai.ml.entities import PipelineComponent\n",
+ "from azure.ai.ml import Input\n",
+ "from azure.ai.ml.constants import AssetTypes\n",
+ "\n",
+ "\n",
+ "@pipeline()\n",
+ "def create_pipeline_transformers():\n",
+ "    \"\"\"Create pipeline.\"\"\"\n",
+ "\n",
+ "    transformers_pipeline_component: PipelineComponent = pipeline_component_transformers_func(\n",
+ "        compute_model_import=model_import_cluster_name,\n",
+ "        compute_finetune=finetune_cluster_name,\n",
+ "        training_data=Input(type=AssetTypes.MLTABLE, path=training_mltable_path),\n",
+ "        validation_data=Input(type=AssetTypes.MLTABLE, path=validation_mltable_path),\n",
+ "        number_of_gpu_to_use_finetuning=number_of_gpu_to_use_finetuning,\n",
+ "        num_nodes_finetune=num_nodes_finetune,\n",
+ "        **pipeline_component_args,\n",
+ "    )\n",
+ "    return {\n",
+ "        # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model\n",
+ "        # registering the model is required to deploy the model to an online or batch endpoint\n",
+ "        \"trained_model\": transformers_pipeline_component.outputs.mlflow_model_folder,\n",
+ "    }"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 5.4 Run the fine tuning job using `transformers_image_classification_pipeline` component"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "transformers_pipeline_object = create_pipeline_transformers()\n",
+ "\n",
+ "transformers_pipeline_object.display_name = (\n",
+ "    use_model_name + \"_transformers_pipeline_component_run_\" + \"multiclass\"\n",
+ ")\n",
+ "# Don't use cached results from previous jobs\n",
+ "transformers_pipeline_object.settings.force_rerun = True\n",
+ "\n",
+ "print(\"Submitting pipeline\")\n",
+ "\n",
+ "transformers_pipeline_run = workspace_ml_client.jobs.create_or_update(\n",
+ "    transformers_pipeline_object, experiment_name=experiment_name\n",
+ ")\n",
+ "\n",
+ "print(f\"Pipeline created. URL: {transformers_pipeline_run.studio_url}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "workspace_ml_client.jobs.stream(transformers_pipeline_run.name)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 6. Get metrics from finetune component\n",
+ "\n",
+ "The model training happens as part of the finetune component. Please follow the steps below to extract validation metrics from the run."
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 6.1 Initialize MLFlow Client\n",
+ "\n",
+ "The models and artifacts that are produced by the fine tuning job can be accessed via the MLFlow interface.\n",
+ "Initialize the MLFlow client here, and set the backend as Azure ML via 
the MLFlow Client.\n", + "\n", + "IMPORTANT - You need to have installed the latest MLFlow packages with:\n", + "\n", + " pip install azureml-mlflow\n", + " pip install mlflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow\n", + "\n", + "# Obtain the tracking URL from MLClient\n", + "MLFLOW_TRACKING_URI = workspace_ml_client.workspaces.get(\n", + " name=workspace_ml_client.workspace_name\n", + ").mlflow_tracking_uri\n", + "\n", + "print(MLFLOW_TRACKING_URI)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set the MLFLOW TRACKING URI\n", + "mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)\n", + "print(f\"\\nCurrent tracking uri: {mlflow.get_tracking_uri()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mlflow.tracking.client import MlflowClient\n", + "\n", + "# Initialize MLFlow client\n", + "mlflow_client = MlflowClient()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6.2 Get the training and evaluation run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# concat 'tags.mlflow.rootRunId=' and pipeline_job.name in single quotes as filter variable\n", + "filter = \"tags.mlflow.rootRunId='\" + transformers_pipeline_run.name + \"'\"\n", + "runs = mlflow.search_runs(experiment_names=[experiment_name], filter_string = filter, output_format=\"list\")\n", + "# get the training and evaluation runs. \n", + "# using a hacky way till 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n", + "for run in runs:\n", + " # check if run.data.metrics.epoch exists\n", + " if 'epoch' in run.data.metrics:\n", + " training_run = run\n", + " # else, check if run.data.metrics.accuracy exists\n", + " elif 'accuracy' in run.data.metrics:\n", + " evaluation_run = run" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 6.3 Get training metrics\n", + "\n", + "Access the results (such as Models, Artifacts, Metrics) of a previously completed run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "pd.DataFrame(training_run.data.metrics, index=[0]).T" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 7. Register the fine tuned model with the workspace\n", + "\n", + "We will register the model from the output of the fine tuning job. This will track lineage between the fine tuned model and the fine tuning job. The fine tuning job, further, tracks lineage to the foundation model, data and training code." 
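+ ,
+ "\n",
+ "After the registration cells below have run, you can verify the result; a sketch (the names are defined in the next cells):\n",
+ "\n",
+ "```python\n",
+ "# List all registered versions of the fine tuned model\n",
+ "for m in workspace_ml_client.models.list(name=finetuned_model_name):\n",
+ "    print(m.name, m.version)\n",
+ "```"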
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "\n",
+ "# generating a unique timestamp that can be used for names and versions that need to be unique\n",
+ "timestamp = str(int(time.time()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml.entities import Model\n",
+ "from azure.ai.ml.constants import AssetTypes\n",
+ "\n",
+ "# Check if the `trained_model` output is available\n",
+ "print(f\"Pipeline job outputs: {workspace_ml_client.jobs.get(transformers_pipeline_run.name).outputs}\")\n",
+ "\n",
+ "# Fetch the model from the pipeline job output\n",
+ "model_path_from_job = f\"azureml://jobs/{transformers_pipeline_run.name}/outputs/trained_model\"\n",
+ "print(f\"Path to register model: {model_path_from_job}\")\n",
+ "\n",
+ "finetuned_model_name = f\"{use_model_name.replace('/', '-')}-fridge-objects-multiclass-classification\"\n",
+ "finetuned_model_description = f\"{use_model_name.replace('/', '-')} fine tuned model for fridge objects multiclass classification\"\n",
+ "prepare_to_register_model = Model(\n",
+ "    path=model_path_from_job,\n",
+ "    type=AssetTypes.MLFLOW_MODEL,\n",
+ "    name=finetuned_model_name,\n",
+ "    version=timestamp,  # use timestamp as version to avoid version conflict\n",
+ "    description=finetuned_model_description\n",
+ ")\n",
+ "print(f\"Prepare to register model: \\n{prepare_to_register_model}\")\n",
+ "\n",
+ "# Register the model from pipeline job output\n",
+ "registered_model = workspace_ml_client.models.create_or_update(prepare_to_register_model)\n",
+ "print(f\"Registered model: {registered_model}\")\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 8. Deploy the fine tuned model to an online endpoint\n",
+ "Online endpoints provide a durable REST API that can be used to integrate with applications that need to use the model."
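+ ,
+ "\n",
+ "Once the deployment below is up, client applications can also call the endpoint's `scoring_uri` directly over REST. A sketch, assuming key auth (attribute names may differ across SDK versions):\n",
+ "\n",
+ "```python\n",
+ "keys = workspace_ml_client.online_endpoints.get_keys(name=online_endpoint_name)\n",
+ "print(keys.primary_key)  # send as 'Authorization: Bearer <key>' header\n",
+ "```"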
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import datetime\n",
+ "from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment\n",
+ "\n",
+ "# Endpoint names need to be unique in a region, hence using timestamp to create unique endpoint name\n",
+ "online_endpoint_name = \"hf-mc-fridge-items-\" + datetime.datetime.now().strftime(\n",
+ "    \"%m%d%H%M\"\n",
+ ")\n",
+ "online_endpoint_description = f\"Online endpoint for {registered_model.name}, finetuned for fridge objects multiclass classification\"\n",
+ "# Create an online endpoint\n",
+ "endpoint = ManagedOnlineEndpoint(\n",
+ "    name=online_endpoint_name,\n",
+ "    description=online_endpoint_description,\n",
+ "    auth_mode=\"key\",\n",
+ "    tags={\"foo\": \"bar\"},\n",
+ ")\n",
+ "workspace_ml_client.begin_create_or_update(endpoint).result()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from azure.ai.ml.entities import OnlineRequestSettings, ProbeSettings\n",
+ "\n",
+ "deployment_name = \"hf-mc-fridge-mlflow-deploy\"\n",
+ "print(registered_model.id)\n",
+ "print(online_endpoint_name)\n",
+ "print(deployment_name)\n",
+ "\n",
+ "# Create a deployment\n",
+ "demo_deployment = ManagedOnlineDeployment(\n",
+ "    name=deployment_name,\n",
+ "    endpoint_name=online_endpoint_name,\n",
+ "    model=registered_model.id,\n",
+ "    # use a GPU instance type like Standard_NC6s_v3 for faster inference\n",
+ "    instance_type=\"Standard_DS3_V2\",\n",
+ "    instance_count=1,\n",
+ "    request_settings=OnlineRequestSettings(\n",
+ "        max_concurrent_requests_per_instance=1,\n",
+ "        request_timeout_ms=5000,\n",
+ "        max_queue_wait_ms=500\n",
+ "    ),\n",
+ "    liveness_probe=ProbeSettings(\n",
+ "        failure_threshold=30,\n",
+ "        success_threshold=1,\n",
+ "        timeout=10,\n",
+ "        period=10,\n",
+ "        initial_delay=10,\n",
+ "    ),\n",
+ "    readiness_probe=ProbeSettings(\n",
+ "        failure_threshold=10,\n",
+ "        success_threshold=1,\n",
+ "        timeout=10,\n",
+ "        period=10,\n",
+ "        initial_delay=10,\n",
+ "    ),\n",
+ ")\n",
+ "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n",
+ "endpoint.traffic = {deployment_name: 100}\n",
+ "workspace_ml_client.begin_create_or_update(endpoint).result()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 9. Test the endpoint with sample data\n",
+ "\n",
+ "We will fetch some sample data from the test dataset and submit it to the online endpoint for inference. 
We will then display the scored labels alongside the ground truth labels."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "demo_deployment = workspace_ml_client.online_deployments.get(\n",
+ "    name=deployment_name,\n",
+ "    endpoint_name=online_endpoint_name,\n",
+ ")\n",
+ "\n",
+ "# Get the details for online endpoint\n",
+ "endpoint = workspace_ml_client.online_endpoints.get(name=online_endpoint_name)\n",
+ "\n",
+ "# existing traffic details\n",
+ "print(endpoint.traffic)\n",
+ "# Get the scoring URI\n",
+ "print(endpoint.scoring_uri)\n",
+ "print(demo_deployment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create request json\n",
+ "import base64\n",
+ "import json\n",
+ "\n",
+ "sample_image = os.path.join(dataset_dir, \"milk_bottle\", \"99.jpg\")\n",
+ "\n",
+ "\n",
+ "def read_image(image_path):\n",
+ "    with open(image_path, \"rb\") as f:\n",
+ "        return f.read()\n",
+ "\n",
+ "\n",
+ "request_json = {\n",
+ "    \"inputs\": {\n",
+ "        \"image\": [base64.encodebytes(read_image(sample_image)).decode(\"utf-8\")],\n",
+ "    }\n",
+ "}\n",
+ "\n",
+ "request_file_name = \"sample_request_data.json\"\n",
+ "with open(request_file_name, \"w\") as request_file:\n",
+ "    json.dump(request_json, request_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "resp = workspace_ml_client.online_endpoints.invoke(\n",
+ "    endpoint_name=online_endpoint_name,\n",
+ "    deployment_name=demo_deployment.name,\n",
+ "    request_file=request_file_name,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "resp"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 10. 
Clean up resources - delete the online endpoint\n",
+ "Don't forget to delete the online endpoint; otherwise you will leave the billing meter running for the compute used by the endpoint."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sdk/python/foundation-models/system/finetune/image-classification/multilabel-classification/deepspeed_configs/zero1.json b/sdk/python/foundation-models/system/finetune/image-classification/multilabel-classification/deepspeed_configs/zero1.json
new file mode 100644
index 0000000000..1d2b843bf9
--- /dev/null
+++ b/sdk/python/foundation-models/system/finetune/image-classification/multilabel-classification/deepspeed_configs/zero1.json
@@ -0,0 +1,42 @@
+{
+    "fp16": {
+        "enabled": true,
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+    "zero_optimization": {
+        "stage": 1,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 200000000,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 200000000,
+        "contiguous_gradients": false,
+        "cpu_offload": false
+    },
+    "zero_allow_untested_optimizer": true,
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "wall_clock_breakdown": false
+}
diff --git a/sdk/python/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification.ipynb b/sdk/python/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification.ipynb
new file mode 100644
index 0000000000..88bde6c4ed
--- /dev/null
+++ b/sdk/python/foundation-models/system/finetune/image-classification/multilabel-classification/hftransformers-fridgeobjects-multilabel-classification.ipynb
@@ -0,0 +1,998 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Multi-label Image Classification using transformers specific pipeline component\n",
+ "\n",
+ "This sample shows how to use the `transformers_image_classification_pipeline` component from the `azureml-preview` system registry to fine tune a model for a multi-label image classification task using the fridgeObjects dataset. We then deploy the fine tuned model to an online endpoint for real time inference.\n",
+ "\n",
+ "### Training data\n",
+ "We will use the [multi-label fridgeObjects](https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/multilabelFridgeObjects.zip) dataset.\n",
+ "\n",
+ "### Model\n",
+ "We will use the `microsoft-beit-base-patch16-224-pt22k-ft22k` model in this notebook. 
If you need to fine tune a model that is available on HuggingFace, but not available in `azureml-preview` system registry, you can either register the model and use the registered model or use the `model_name` parameter to instruct the components to pull the model directly from HuggingFace.\n",
+ "\n",
+ "### Outline\n",
+ "1. Install dependencies\n",
+ "2. Setup pre-requisites such as compute\n",
+ "3. Pick a model to fine tune\n",
+ "4. Prepare dataset for finetuning the model\n",
+ "5. Submit the fine tuning job using transformers specific image-classification component\n",
+ "6. Review training and evaluation metrics\n",
+ "7. Register the fine tuned model\n",
+ "8. Deploy the fine tuned model for real time inference\n",
+ "9. Test the deployed endpoint\n",
+ "10. Clean up resources"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Install dependencies\n",
+ "Before starting off, if you are running the notebook on Azure Machine Learning Studio or running it locally for the first time, you will need the following packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! pip install azure-ai-ml==1.0.0\n",
+ "! pip install azure-identity\n",
+ "! pip install datasets==2.3.2"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Setup pre-requisites"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.1 Connect to Azure Machine Learning workspace\n",
+ "\n",
+ "Before we dive into the code, you'll need to connect to your workspace. The workspace is the top-level resource for Azure Machine Learning, providing a centralized place to work with all the artifacts you create when you use Azure Machine Learning.\n",
+ "\n",
+ "We are using `DefaultAzureCredential` to get access to the workspace. `DefaultAzureCredential` should be capable of handling most scenarios. If you want to learn more about other available credentials, go to [set up authentication doc](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk), [azure-identity reference doc](https://learn.microsoft.com/en-us/python/api/azure-identity/azure.identity?view=azure-python).\n",
+ "\n",
+ "Replace `AML_WORKSPACE_NAME`, `RESOURCE_GROUP` and `SUBSCRIPTION_ID` with their respective values in the below cell."
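+ ,
+ "\n",
+ "If `DefaultAzureCredential` does not work in your environment, a simple fallback (a sketch; it opens a browser window for login) is:\n",
+ "\n",
+ "```python\n",
+ "from azure.identity import InteractiveBrowserCredential\n",
+ "\n",
+ "credential = InteractiveBrowserCredential()\n",
+ "```"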
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml import MLClient\n", + "from azure.identity import DefaultAzureCredential\n", + "\n", + "\n", + "experiment_name = (\n", + " \"AzureML-Train-Finetune-Vision-MultiLabel-Samples\" # can rename to any valid name\n", + ")\n", + "\n", + "credential = DefaultAzureCredential()\n", + "workspace_ml_client = None\n", + "try:\n", + " workspace_ml_client = MLClient.from_config(credential)\n", + " subscription_id = workspace_ml_client.subscription_id\n", + " resource_group = workspace_ml_client.resource_group_name\n", + " workspace_name = workspace_ml_client.workspace_name\n", + "except Exception as ex:\n", + " print(ex)\n", + " # Enter details of your AML workspace\n", + " subscription_id = \"SUBSCRIPTION_ID\"\n", + " resource_group = \"RESOURCE_GROUP\"\n", + " workspace_name = \"AML_WORKSPACE_NAME\"\n", + " workspace_ml_client = MLClient(\n", + " credential, subscription_id, resource_group, workspace_name\n", + " )\n", + "\n", + "registry_ml_client = MLClient(\n", + " credential,\n", + " subscription_id,\n", + " resource_group,\n", + " registry_name=\"azureml-preview\",\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.2 Create compute\n", + "\n", + "In order to finetune a model on Azure Machine Learning studio, you will need to create a compute resource first. **Creating a compute will take 3-4 minutes.** \n", + "\n", + "For additional references, see [Azure Machine Learning in a Day](https://github.com/Azure/azureml-examples/blob/main/tutorials/azureml-in-a-day/azureml-in-a-day.ipynb). " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Create CPU compute for model selection component" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azure.ai.ml.entities import AmlCompute\n", + "from azure.core.exceptions import ResourceNotFoundError\n", + "\n", + "model_import_cluster_name = \"sample-model-import-cluster\"\n", + "try:\n", + " _ = workspace_ml_client.compute.get(model_import_cluster_name)\n", + " print(\"Found existing compute target.\")\n", + "except ResourceNotFoundError:\n", + " print(\"Creating a new compute target...\")\n", + " compute_config = AmlCompute(\n", + " name=model_import_cluster_name,\n", + " type=\"amlcompute\",\n", + " size=\"Standard_D12_v2\",\n", + " idle_time_before_scale_down=120,\n", + " min_instances=0,\n", + " max_instances=4,\n", + " )\n", + " workspace_ml_client.begin_create_or_update(compute_config).result()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Create GPU compute for finetune component\n", + "\n", + "The list of GPU machines can be found [here](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes-gpu)." 
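+ ,
+ "\n",
+ "To see which sizes are available to your workspace region, you can enumerate them; a sketch (field names may vary by SDK version):\n",
+ "\n",
+ "```python\n",
+ "# List GPU-capable VM sizes visible to the workspace\n",
+ "for s in workspace_ml_client.compute.list_sizes():\n",
+ "    if s.gpus:\n",
+ "        print(s.name, s.gpus)\n",
+ "```"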
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "finetune_cluster_name = \"sample-finetune-cluster-gpu-nc6\"\n", + "\n", + "try:\n", + " _ = workspace_ml_client.compute.get(finetune_cluster_name)\n", + " print(\"Found existing compute target.\")\n", + "except ResourceNotFoundError:\n", + " print(\"Creating a new compute target...\")\n", + " compute_config = AmlCompute(\n", + " name=finetune_cluster_name,\n", + " type=\"amlcompute\",\n", + " size=\"Standard_NC6\",\n", + " idle_time_before_scale_down=120,\n", + " min_instances=0,\n", + " max_instances=4,\n", + " )\n", + " workspace_ml_client.begin_create_or_update(compute_config).result()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Pick a foundation model to fine tune\n", + "\n", + "We will use the `microsoft-beit-base-patch16-224-pt22k-ft22k` model in this notebook. If you need to fine tune a model that is available on HuggingFace, but not available in `azureml` system registry, you can either register the model and use the registered model or use the `model_name` parameter to instruct the components to pull the model directly from HuggingFace.\n", + "\n", + "Currently following models are supported:\n", + "\n", + "| Model Name | Source |\n", + "| ------ | ---------- |\n", + "| [microsoft-beit-base-patch16-224-pt22k-ft22k](https://ml.azure.com/registries/azureml-preview/models/microsoft-beit-base-patch16-224-pt22k-ft22k/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |\n", + "| [microsoft-swinv2-base-patch4-window12-192-22k](https://ml.azure.com/registries/azureml-preview/models/microsoft-swinv2-base-patch4-window12-192-22k/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |\n", + "| [facebook-deit-base-patch16-224](https://ml.azure.com/registries/azureml-preview/models/facebook-deit-base-patch16-224/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |\n", + "| [google-vit-base-patch16-224](https://ml.azure.com/registries/azureml-preview/models/google-vit-base-patch16-224/version/1?tid=72f988bf-86f1-41af-91ab-2d7cd011db47#overview) | azureml-preview registry |\n", + "| [Image classification models from Huggingface](https://huggingface.co/models?pipeline_tag=image-classification&sort=downloads)| HuggingFace |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "huggingface_model_name = \"microsoft/beit-base-patch16-224-pt22k-ft22k\"\n", + "\n", + "aml_registry_model_name = \"microsoft-beit-base-patch16-224-pt22k-ft22k\"\n", + "model_version = \"1\"\n", + "foundation_model = registry_ml_client.models.get(aml_registry_model_name, model_version)\n", + "print(f\"\\n\\nUsing model name: {foundation_model.name}, version: {foundation_model.version}, id: {foundation_model.id} for fine tuning\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Prepare the dataset for fine-tuning the model\n", + "\n", + "We will use the [multi-label fridgeObjects](https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/multilabelFridgeObjects.zip) dataset. The fridge object dataset is annotated in the CSV file, where each image corresponds to a line. It defines a mapping of the filename to the labels. 
Since this is a multi-label classification problem, each image can be associated with multiple labels.\n",
+ "\n",
+ "This is a common data format for multilabel image classification: a flat folder of images, with a `labels.csv` file that maps each image to its set of labels. \n",
+ "\n",
+ "#### 4.1 Download the Data\n",
+ "We first download and unzip the data locally. By default, the data would be downloaded in `./data` folder in current directory. \n",
+ "If you prefer to download the data at a different location, update it in `dataset_parent_dir = ...` in the next cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import urllib.request\n",
+ "from zipfile import ZipFile\n",
+ "\n",
+ "# Change to a different location if you prefer\n",
+ "dataset_parent_dir = \"./data\"\n",
+ "\n",
+ "# create data folder if it doesn't exist.\n",
+ "os.makedirs(dataset_parent_dir, exist_ok=True)\n",
+ "\n",
+ "# download data\n",
+ "download_url = \"https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/multilabelFridgeObjects.zip\"\n",
+ "\n",
+ "# Extract current dataset name from dataset url\n",
+ "dataset_name = os.path.split(download_url)[-1].split(\".\")[0]\n",
+ "# Get dataset path for later use\n",
+ "dataset_dir = os.path.join(dataset_parent_dir, dataset_name)\n",
+ "\n",
+ "# Get the data zip file path\n",
+ "data_file = os.path.join(dataset_parent_dir, f\"{dataset_name}.zip\")\n",
+ "\n",
+ "# Download the dataset\n",
+ "urllib.request.urlretrieve(download_url, filename=data_file)\n",
+ "\n",
+ "# extract files\n",
+ "with ZipFile(data_file, \"r\") as zip:\n",
+ "    print(\"extracting files...\")\n",
+ "    zip.extractall(path=dataset_parent_dir)\n",
+ "    print(\"done\")\n",
+ "# delete zip file\n",
+ "os.remove(data_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from IPython.display import Image\n",
+ "\n",
+ "sample_image = os.path.join(dataset_dir, \"images\", \"56.jpg\")\n",
+ "Image(filename=sample_image)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 4.2 Upload the images to Datastore through an AML Data asset (URI Folder)\n",
+ "\n",
+ "In order to use the data for training in Azure ML, we upload it to our default Azure Blob Storage of our Azure ML Workspace."
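+ ,
+ "\n",
+ "The asset created in the next cell can be fetched again later by name; a sketch:\n",
+ "\n",
+ "```python\n",
+ "asset = workspace_ml_client.data.get(name=\"fridge-items-images-multilabel\", label=\"latest\")\n",
+ "print(asset.path)\n",
+ "```"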
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Uploading image files by creating a 'data asset URI FOLDER':\n", + "\n", + "from azure.ai.ml.entities import Data\n", + "from azure.ai.ml.constants import AssetTypes\n", + "\n", + "my_data = Data(\n", + " path=dataset_dir,\n", + " type=AssetTypes.URI_FOLDER,\n", + " description=\"Fridge-items images multilabel\",\n", + " name=\"fridge-items-images-multilabel\",\n", + ")\n", + "\n", + "uri_folder_data_asset = workspace_ml_client.data.create_or_update(my_data)\n", + "\n", + "print(uri_folder_data_asset)\n", + "print(\"\")\n", + "print(\"Path to folder in Blob Storage:\")\n", + "print(uri_folder_data_asset.path)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 4.3 Convert the downloaded data to JSONL\n", + "\n", + "For documentation on preparing the datasets beyond this notebook, please refer to the [documentation on how to prepare datasets](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-prepare-datasets-for-automl-images).\n", + "\n", + "In order to use this data to create an AzureML MLTable, we first need to convert it to the required JSONL format. The following script is creating two `.jsonl` files (one for training and one for validation) in the corresponding MLTable folder. The train / validation ratio corresponds to 20% of the data going into the validation file. For further details on jsonl file used for image classification task in automated ml, please refer to the [data schema documentation for multi-label image classification task](https://learn.microsoft.com/en-us/azure/machine-learning/reference-automl-images-schema#image-classification-multi-label)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "\n", + "# We'll copy each JSONL file within its related MLTable folder\n", + "training_mltable_path = os.path.join(dataset_parent_dir, \"training-mltable-folder\")\n", + "validation_mltable_path = os.path.join(dataset_parent_dir, \"validation-mltable-folder\")\n", + "\n", + "# First, let's create the folders if they don't exist\n", + "os.makedirs(training_mltable_path, exist_ok=True)\n", + "os.makedirs(validation_mltable_path, exist_ok=True)\n", + "\n", + "train_validation_ratio = 5\n", + "\n", + "# Path to the training and validation files\n", + "train_annotations_file = os.path.join(training_mltable_path, \"train_annotations.jsonl\")\n", + "validation_annotations_file = os.path.join(\n", + " validation_mltable_path, \"validation_annotations.jsonl\"\n", + ")\n", + "\n", + "# Baseline of json line dictionary\n", + "json_line_sample = {\n", + " \"image_url\": uri_folder_data_asset.path,\n", + " \"label\": [],\n", + "}\n", + "\n", + "# Path to the labels file.\n", + "labelFile = os.path.join(dataset_dir, \"labels.csv\")\n", + "\n", + "# Read each annotation and convert it to jsonl line\n", + "with open(train_annotations_file, \"w\") as train_f:\n", + " with open(validation_annotations_file, \"w\") as validation_f:\n", + " with open(labelFile, \"r\") as labels:\n", + " for i, line in enumerate(labels):\n", + " # Skipping the title line and any empty lines.\n", + " if i == 0 or len(line.strip()) == 0:\n", + " continue\n", + " line_split = line.strip().split(\",\")\n", + " if len(line_split) != 2:\n", + " print(f\"Skipping the invalid line: {line}\")\n", + " continue\n", + " json_line = 
dict(json_line_sample)\n",
+ "                json_line[\"image_url\"] += f\"images/{line_split[0]}\"\n",
+ "                json_line[\"label\"] = line_split[1].strip().split(\" \")\n",
+ "\n",
+ "                if i % train_validation_ratio == 0:\n",
+ "                    # validation annotation\n",
+ "                    validation_f.write(json.dumps(json_line) + \"\\n\")\n",
+ "                else:\n",
+ "                    # train annotation\n",
+ "                    train_f.write(json.dumps(json_line) + \"\\n\")"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 4.4 Create MLTable data input\n",
+ "\n",
+ "Create MLTable data input using the jsonl files created above.\n",
+ "\n",
+ "For documentation on creating your own MLTable assets for jobs beyond this notebook, please refer to below resources\n",
+ "- [MLTable YAML Schema](https://learn.microsoft.com/en-us/azure/machine-learning/reference-yaml-mltable) - covers how to write MLTable YAML, which is required for each MLTable asset.\n",
+ "- [Create MLTable data asset](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-data-assets?tabs=Python-SDK#create-a-mltable-data-asset) - covers how to create MLTable data asset. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_ml_table_file(filename):\n",
+ "    \"\"\"Create ML Table definition\"\"\"\n",
+ "\n",
+ "    return (\n",
+ "        \"paths:\\n\"\n",
+ "        \"  - file: ./{0}\\n\"\n",
+ "        \"transformations:\\n\"\n",
+ "        \"  - read_json_lines:\\n\"\n",
+ "        \"        encoding: utf8\\n\"\n",
+ "        \"        invalid_lines: error\\n\"\n",
+ "        \"        include_path_column: false\\n\"\n",
+ "        \"  - convert_column_types:\\n\"\n",
+ "        \"      - columns: image_url\\n\"\n",
+ "        \"        column_type: stream_info\"\n",
+ "    ).format(filename)\n",
+ "\n",
+ "\n",
+ "def save_ml_table_file(output_path, mltable_file_contents):\n",
+ "    with open(os.path.join(output_path, \"MLTable\"), \"w\") as f:\n",
+ "        f.write(mltable_file_contents)\n",
+ "\n",
+ "\n",
+ "# Create and save train mltable\n",
+ "train_mltable_file_contents = create_ml_table_file(\n",
+ "    os.path.basename(train_annotations_file)\n",
+ ")\n",
+ "save_ml_table_file(training_mltable_path, train_mltable_file_contents)\n",
+ "\n",
+ "# Create and save validation mltable\n",
+ "validation_mltable_file_contents = create_ml_table_file(\n",
+ "    os.path.basename(validation_annotations_file)\n",
+ ")\n",
+ "save_ml_table_file(validation_mltable_path, validation_mltable_file_contents)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 5. Submit the fine tuning job using `transformers_image_classification_pipeline` component\n",
+ " \n",
+ "Create the job that uses the `transformers_image_classification_pipeline` component for the multi-label image-classification task. [Learn more]() about all the parameters supported for fine tuning."
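+ ,
+ "\n",
+ "After fetching the component in the next cell, you can inspect which inputs it exposes; a sketch:\n",
+ "\n",
+ "```python\n",
+ "# Print the input names the pipeline component accepts\n",
+ "print(list(pipeline_component_transformers_func.inputs.keys()))\n",
+ "```"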
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.1 Create component" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "FINETUNE_PIPELINE_COMPONENT_NAME = \"transformers_image_classification_pipeline\"\n", + "pipeline_component_transformers_func = registry_ml_client.components.get(\n", + " name=FINETUNE_PIPELINE_COMPONENT_NAME, label=\"latest\"\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 5.2 Create arguments to be passed to `transformers_image_classification_pipeline` component\n", + "\n", + "The `transformers_image_classification_pipeline` component consists of model selection and finetuning components. The detailed arguments for each component can be found at following README files:\n", + "- [Model Import Component](../../docs/component_docs/image_finetune/transformers_model_import_component.md)\n", + "- [Finetune Component](../../docs/component_docs/image_finetune/transformers_finetune_component.md)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_component_args = {\n", + " # model_selection_args\n", + " \"model_family\": \"HuggingFaceImage\",\n", + " # # specify the model_name instead of mlflow_model if you want to use a model from the huggingface hub\n", + " \"mlflow_model\": foundation_model,\n", + " # \"model_name\": huggingface_model_name,\n", + " # finetune_args\n", + " \"auto_hyperparameter_selection\": False,\n", + " \"image_width\": 224,\n", + " \"image_height\": 224,\n", + " \"task_name\": \"image-classification-multilabel\",\n", + " \"apply_augmentations\": True,\n", + " \"number_of_workers\": 8,\n", + " \"apply_deepspeed\": False,\n", + " \"deepspeed_config\": \"./deepspeed_configs/zero1.json\",\n", + " \"apply_ort\": False,\n", + " \"number_of_epochs\": 15,\n", + " \"max_steps\": -1,\n", + " \"training_batch_size\": 4,\n", + " \"validation_batch_size\": 4,\n", + " \"auto_find_batch_size\": False,\n", + " \"learning_rate\": 5e-5,\n", + " \"learning_rate_scheduler\": \"warmup_linear\",\n", + " \"warmup_steps\": 0,\n", + " \"optimizer\": \"adamw_hf\",\n", + " \"weight_decay\": 0.0,\n", + " \"gradient_accumulation_step\": 1,\n", + " \"precision\": \"32\",\n", + " \"metric_for_best_model\": \"accuracy\",\n", + " \"label_smoothing_factor\": 0.0,\n", + " \"random_seed\": 42,\n", + " \"evaluation_strategy\": \"epoch\",\n", + " \"evaluation_steps\": 500,\n", + " \"logging_strategy\": \"epoch\",\n", + " \"logging_steps\": 500,\n", + " \"save_strategy\": \"epoch\",\n", + " \"save_steps\": 500,\n", + " \"save_total_limit\": -1,\n", + " \"early_stopping\": False,\n", + " \"early_stopping_patience\": 1,\n", + " \"max_grad_norm\": 1.0,\n", + " \"resume_from_checkpoint\": False,\n", + " \"save_as_mlflow_model\": True,\n", + "}\n", + "number_of_gpu_to_use_finetuning = 1\n", + "num_nodes_finetune = 1\n", + "\n", + "# Ensure that the user provides only one of mlflow_model or model_name\n", + "if pipeline_component_args.get(\"mlflow_model\") is None and pipeline_component_args.get(\"model_name\") is None:\n", + " raise ValueError(\n", + " \"You must specify either mlflow_model or model_name for the model to finetune\"\n", + " )\n", + "if pipeline_component_args.get(\"mlflow_model\") is not None and pipeline_component_args.get(\"model_name\") is not None:\n", + " raise ValueError(\n", + " \"You must specify ONLY one of mlflow_model and 
model_name for the model to finetune\"\n",
+    "    )\n",
+    "elif pipeline_component_args.get(\"mlflow_model\") is None and pipeline_component_args.get(\"model_name\") is not None:\n",
+    "    use_model_name = huggingface_model_name\n",
+    "elif pipeline_component_args.get(\"mlflow_model\") is not None and pipeline_component_args.get(\"model_name\") is None:\n",
+    "    use_model_name = aml_registry_model_name\n",
+    "print(f\"Finetuning model {use_model_name}\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.3 Utility function to create pipeline using `transformers_image_classification_pipeline` component"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.ml.dsl import pipeline\n",
+    "from azure.ai.ml.entities import PipelineComponent\n",
+    "from azure.ai.ml import Input\n",
+    "from azure.ai.ml.constants import AssetTypes\n",
+    "\n",
+    "\n",
+    "@pipeline()\n",
+    "def create_pipeline_transformers():\n",
+    "    \"\"\"Create pipeline.\"\"\"\n",
+    "\n",
+    "    transformers_pipeline_component: PipelineComponent = pipeline_component_transformers_func(\n",
+    "        compute_model_import=model_import_cluster_name,\n",
+    "        compute_finetune=finetune_cluster_name,\n",
+    "        training_data=Input(type=AssetTypes.MLTABLE, path=training_mltable_path),\n",
+    "        validation_data=Input(type=AssetTypes.MLTABLE, path=validation_mltable_path),\n",
+    "        number_of_gpu_to_use_finetuning=number_of_gpu_to_use_finetuning,\n",
+    "        num_nodes_finetune=num_nodes_finetune,\n",
+    "        **pipeline_component_args,\n",
+    "    )\n",
+    "    return {\n",
+    "        # map the output of the fine tuning job to the output of pipeline job so that we can easily register the fine tuned model\n",
+    "        # registering the model is required to deploy the model to an online or batch endpoint\n",
+    "        \"trained_model\": transformers_pipeline_component.outputs.mlflow_model_folder,\n",
+    "    }"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 5.4 Run the fine tuning job using `transformers_image_classification_pipeline` component"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transformers_pipeline_object = create_pipeline_transformers()\n",
+    "\n",
+    "transformers_pipeline_object.display_name = (\n",
+    "    use_model_name + \"_transformers_pipeline_component_run_\" + \"multilabel\"\n",
+    ")\n",
+    "# Don't use cached results from previous jobs\n",
+    "transformers_pipeline_object.settings.force_rerun = True\n",
+    "\n",
+    "print(\"Submitting pipeline\")\n",
+    "\n",
+    "transformers_pipeline_run = workspace_ml_client.jobs.create_or_update(\n",
+    "    transformers_pipeline_object, experiment_name=experiment_name\n",
+    ")\n",
+    "\n",
+    "print(f\"Pipeline created. URL: {transformers_pipeline_run.studio_url}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "workspace_ml_client.jobs.stream(transformers_pipeline_run.name)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 6. Get metrics from finetune component\n",
+    "\n",
+    "The model training happens as part of the finetune component. 
Please follow the steps below to extract validation metrics from the run."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 6.1 Initialize MLFlow client\n",
+    "\n",
+    "The models and artifacts produced by the fine tuning job can be accessed via the MLFlow interface.\n",
+    "Initialize the MLFlow client here, and set the backend to Azure ML via the MLFlow client.\n",
+    "\n",
+    "IMPORTANT - You need to have installed the latest MLFlow packages with:\n",
+    "\n",
+    "    pip install azureml-mlflow\n",
+    "    pip install mlflow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mlflow\n",
+    "\n",
+    "# Obtain the tracking URI from MLClient\n",
+    "MLFLOW_TRACKING_URI = workspace_ml_client.workspaces.get(\n",
+    "    name=workspace_ml_client.workspace_name\n",
+    ").mlflow_tracking_uri\n",
+    "\n",
+    "print(MLFLOW_TRACKING_URI)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set the MLFLOW TRACKING URI\n",
+    "mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)\n",
+    "print(f\"\\nCurrent tracking uri: {mlflow.get_tracking_uri()}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlflow.tracking.client import MlflowClient\n",
+    "\n",
+    "# Initialize MLFlow client\n",
+    "mlflow_client = MlflowClient()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 6.2 Get the training and evaluation run"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# concat 'tags.mlflow.rootRunId=' and the pipeline run name in single quotes as the filter string\n",
+    "filter_string = \"tags.mlflow.rootRunId='\" + transformers_pipeline_run.name + \"'\"\n",
+    "runs = mlflow.search_runs(\n",
+    "    experiment_names=[experiment_name], filter_string=filter_string, output_format=\"list\"\n",
+    ")\n",
+    "# get the training and evaluation runs.\n",
+    "# using a hacky way until 'Bug 2320997: not able to show eval metrics in FT notebooks - mlflow client now showing display names' is fixed\n",
+    "for run in runs:\n",
+    "    # check if run.data.metrics.epoch exists\n",
+    "    if \"epoch\" in run.data.metrics:\n",
+    "        training_run = run\n",
+    "    # else, check if run.data.metrics.accuracy exists\n",
+    "    elif \"accuracy\" in run.data.metrics:\n",
+    "        evaluation_run = run"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### 6.3 Get training metrics\n",
+    "\n",
+    "Access the results (such as models, artifacts and metrics) of a previously completed run."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "pd.DataFrame(training_run.data.metrics, index=[0]).T"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 7. Register the fine tuned model with the workspace\n",
+    "\n",
+    "We will register the model from the output of the fine tuning job. This will track lineage between the fine tuned model and the fine tuning job. The fine tuning job, in turn, tracks lineage to the foundation model, data and training code.\n",
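+    "\n",
+    "After the registration cells below run, you can optionally confirm the registration by listing the versions of the model. A minimal sketch (assuming the `finetuned_model_name` variable defined in the cells below):\n",
+    "\n",
+    "    # List all registered versions of the fine tuned model\n",
+    "    for model in workspace_ml_client.models.list(name=finetuned_model_name):\n",
+    "        print(model.name, model.version)"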
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "# generating a unique timestamp that can be used for names and versions that need to be unique\n",
+    "timestamp = str(int(time.time()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.ml.entities import Model\n",
+    "from azure.ai.ml.constants import AssetTypes\n",
+    "\n",
+    "# Check if the `trained_model` output is available\n",
+    "print(f\"Pipeline job outputs: {workspace_ml_client.jobs.get(transformers_pipeline_run.name).outputs}\")\n",
+    "\n",
+    "# Fetch the model from the pipeline job output\n",
+    "model_path_from_job = f\"azureml://jobs/{transformers_pipeline_run.name}/outputs/trained_model\"\n",
+    "print(f\"Path to register model: {model_path_from_job}\")\n",
+    "\n",
+    "finetuned_model_name = f\"{use_model_name.replace('/', '-')}-fridge-objects-multilabel-classification\"\n",
+    "finetuned_model_description = f\"{use_model_name.replace('/', '-')} fine tuned model for fridge objects multilabel classification\"\n",
+    "prepare_to_register_model = Model(\n",
+    "    path=model_path_from_job,\n",
+    "    type=AssetTypes.MLFLOW_MODEL,\n",
+    "    name=finetuned_model_name,\n",
+    "    version=timestamp,  # use timestamp as version to avoid version conflict\n",
+    "    description=finetuned_model_description,\n",
+    ")\n",
+    "print(f\"Prepare to register model: \\n{prepare_to_register_model}\")\n",
+    "\n",
+    "# Register the model from the pipeline job output\n",
+    "registered_model = workspace_ml_client.models.create_or_update(prepare_to_register_model)\n",
+    "print(f\"Registered model: {registered_model}\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 8. Deploy the fine tuned model to an online endpoint\n",
+    "Online endpoints give a durable REST API that can be used to integrate with applications that need to use the model.\n",
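+    "\n",
+    "Besides the SDK `invoke` method used later in this notebook, the endpoint can also be called with plain REST once it is deployed. A minimal sketch (assuming the `requests` package is installed and the `online_endpoint_name`, `endpoint` and `request_json` objects created in the cells below and in section 9):\n",
+    "\n",
+    "    import requests\n",
+    "\n",
+    "    # Fetch the endpoint auth key and POST the JSON payload to the scoring URI\n",
+    "    keys = workspace_ml_client.online_endpoints.get_keys(name=online_endpoint_name)\n",
+    "    headers = {\n",
+    "        \"Authorization\": f\"Bearer {keys.primary_key}\",\n",
+    "        \"Content-Type\": \"application/json\",\n",
+    "    }\n",
+    "    response = requests.post(endpoint.scoring_uri, headers=headers, json=request_json)\n",
+    "    print(response.json())"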
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment\n",
+    "\n",
+    "# Endpoint names need to be unique in a region, hence using timestamp to create unique endpoint name\n",
+    "online_endpoint_name = \"hf-ml-fridge-items-\" + datetime.datetime.now().strftime(\n",
+    "    \"%m%d%H%M\"\n",
+    ")\n",
+    "online_endpoint_description = f\"Online endpoint for {registered_model.name}, finetuned for fridge objects multilabel classification\"\n",
+    "# Create an online endpoint\n",
+    "endpoint = ManagedOnlineEndpoint(\n",
+    "    name=online_endpoint_name,\n",
+    "    description=online_endpoint_description,\n",
+    "    auth_mode=\"key\",\n",
+    "    tags={\"foo\": \"bar\"},\n",
+    ")\n",
+    "workspace_ml_client.begin_create_or_update(endpoint).result()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.ml.entities import OnlineRequestSettings, ProbeSettings\n",
+    "\n",
+    "deployment_name = \"hf-ml-fridge-items-mlflow-deploy\"\n",
+    "# Create a deployment\n",
+    "req_timeout = OnlineRequestSettings(request_timeout_ms=90000)\n",
+    "demo_deployment = ManagedOnlineDeployment(\n",
+    "    name=deployment_name,\n",
+    "    endpoint_name=online_endpoint_name,\n",
+    "    model=registered_model.id,\n",
+    "    # use a GPU instance type like Standard_NC6s_v3 for faster inference\n",
+    "    instance_type=\"Standard_DS3_V2\",\n",
+    "    instance_count=1,\n",
+    "    request_settings=req_timeout,\n",
+    "    liveness_probe=ProbeSettings(\n",
+    "        failure_threshold=30,\n",
+    "        success_threshold=1,\n",
+    "        timeout=2,\n",
+    "        period=10,\n",
+    "        initial_delay=2000,\n",
+    "    ),\n",
+    "    readiness_probe=ProbeSettings(\n",
+    "        failure_threshold=10,\n",
+    "        success_threshold=1,\n",
+    "        timeout=10,\n",
+    "        period=10,\n",
+    "        initial_delay=2000,\n",
+    "    ),\n",
+    ")\n",
+    "\n",
+    "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n",
+    "endpoint.traffic = {deployment_name: 100}\n",
+    "workspace_ml_client.begin_create_or_update(endpoint).result()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 9. Test the endpoint with sample data\n",
+    "\n",
+    "We will fetch some sample data from the test dataset and submit it to the online endpoint for inference. 
We will then display the scored labels alongside the ground truth labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "demo_deployment = workspace_ml_client.online_deployments.get(\n",
+    "    name=deployment_name,\n",
+    "    endpoint_name=online_endpoint_name,\n",
+    ")\n",
+    "\n",
+    "# Get the details for online endpoint\n",
+    "endpoint = workspace_ml_client.online_endpoints.get(name=online_endpoint_name)\n",
+    "\n",
+    "# existing traffic details\n",
+    "print(endpoint.traffic)\n",
+    "# Get the scoring URI\n",
+    "print(endpoint.scoring_uri)\n",
+    "print(demo_deployment)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create request json\n",
+    "import base64\n",
+    "import json\n",
+    "\n",
+    "sample_image = os.path.join(dataset_dir, \"images\", \"56.jpg\")\n",
+    "\n",
+    "\n",
+    "def read_image(image_path):\n",
+    "    with open(image_path, \"rb\") as f:\n",
+    "        return f.read()\n",
+    "\n",
+    "\n",
+    "request_json = {\n",
+    "    \"inputs\": {\n",
+    "        \"image\": [base64.encodebytes(read_image(sample_image)).decode(\"utf-8\")],\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "request_file_name = \"sample_request_data.json\"\n",
+    "with open(request_file_name, \"w\") as request_file:\n",
+    "    json.dump(request_json, request_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "resp = workspace_ml_client.online_endpoints.invoke(\n",
+    "    endpoint_name=online_endpoint_name,\n",
+    "    deployment_name=demo_deployment.name,\n",
+    "    request_file=request_file_name,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "resp"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 10. Clean up resources - delete the online endpoint\n",
+    "Don't forget to delete the online endpoint; otherwise, you will leave the billing meter running for the compute used by the endpoint."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/sdk/python/foundation-models/system/inference/image-classification/image-classification-online-endpoint.ipynb b/sdk/python/foundation-models/system/inference/image-classification/image-classification-online-endpoint.ipynb
new file mode 100644
index 0000000000..380349b231
--- /dev/null
+++ b/sdk/python/foundation-models/system/inference/image-classification/image-classification-online-endpoint.ipynb
@@ -0,0 +1,364 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Image Classification Inference using Online Endpoints\n",
+    "\n",
+    "This sample shows how to deploy `image-classification` type models to an online endpoint for inference.\n",
+    "\n",
+    "### Task\n",
+    "`image-classification` tasks assign label(s) or class(es) to an image. 
There are two common types of `image-classification` tasks:\n",
+    "\n",
+    "* MultiClass: An image is categorised into one of three or more classes.\n",
+    "* MultiLabel: An image can be categorised into more than one class.\n",
+    " \n",
+    "### Model\n",
+    "Models that can perform the `image-classification` task are tagged with `image-classification`. We will use the `microsoft-beit-base-patch16-224-pt22k-ft22k` model in this notebook. If you opened this notebook from a specific model card, remember to replace the specific model name. If you don't find a model that suits your scenario or domain, you can discover and [import models from HuggingFace hub](../../import/import-model-from-huggingface.ipynb) and then use them for inference.\n",
+    "\n",
+    "### Inference data\n",
+    "We will use the [fridgeObjects](https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/fridgeObjects.zip) dataset.\n",
+    "\n",
+    "\n",
+    "### Outline\n",
+    "* Setup pre-requisites.\n",
+    "* Pick a model to deploy.\n",
+    "* Prepare data for inference.\n",
+    "* Deploy the model for real time inference.\n",
+    "* Test the endpoint.\n",
+    "* Clean up resources."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 1. Setup pre-requisites\n",
+    "* Install dependencies\n",
+    "* Connect to AzureML Workspace. Learn more at [set up SDK authentication](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk). Replace `<SUBSCRIPTION_ID>`, `<RESOURCE_GROUP>` and `<WORKSPACE_NAME>` below.\n",
+    "* Connect to `azureml` system registry"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.ml import MLClient\n",
+    "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential, ClientSecretCredential\n",
+    "from azure.ai.ml.entities import AmlCompute\n",
+    "import time\n",
+    "\n",
+    "try:\n",
+    "    credential = DefaultAzureCredential()\n",
+    "    credential.get_token(\"https://management.azure.com/.default\")\n",
+    "except Exception as ex:\n",
+    "    credential = InteractiveBrowserCredential()\n",
+    "\n",
+    "try:\n",
+    "    workspace_ml_client = MLClient.from_config(credential)\n",
+    "    subscription_id = workspace_ml_client.subscription_id\n",
+    "    resource_group = workspace_ml_client.resource_group_name\n",
+    "    workspace_name = workspace_ml_client.workspace_name\n",
+    "except Exception as ex:\n",
+    "    print(ex)\n",
+    "    # Enter details of your AML workspace\n",
+    "    subscription_id = \"<SUBSCRIPTION_ID>\"\n",
+    "    resource_group = \"<RESOURCE_GROUP>\"\n",
+    "    workspace_name = \"<WORKSPACE_NAME>\"\n",
+    "workspace_ml_client = MLClient(\n",
+    "    credential, subscription_id, resource_group, workspace_name\n",
+    ")\n",
+    "\n",
+    "# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml-preview\"\n",
+    "registry_ml_client = MLClient(\n",
+    "    credential,\n",
+    "    subscription_id,\n",
+    "    resource_group,\n",
+    "    # workspace_name\n",
+    "    registry_name=\"azureml-preview\",\n",
+    ")\n",
+    "# generating a unique timestamp that can be used for names and versions that need to be unique\n",
+    "timestamp = str(int(time.time()))"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Pick a model to deploy\n",
+    "\n",
+    "Browse models in the Model Catalog in the AzureML Studio, filtering by the `image-classification` task. In this example, we use the `microsoft-beit-base-patch16-224-pt22k-ft22k` model. 
If you have opened this notebook for a different model, replace the model name and version accordingly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_name = \"microsoft-beit-base-patch16-224-pt22k-ft22k\"\n",
+    "model_version = \"1\"\n",
+    "foundation_model = registry_ml_client.models.get(model_name, model_version)\n",
+    "\n",
+    "print(f\"\\n\\nUsing model name: {foundation_model.name}, version: {foundation_model.version}, id: {foundation_model.id} for inferencing\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Prepare multi-class classification data for inference\n",
+    "\n",
+    "We will use the [fridgeObjects](https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/fridgeObjects.zip) dataset. The fridge object dataset is stored in a directory. There are four different folders inside:\n",
+    "- /water_bottle\n",
+    "- /milk_bottle\n",
+    "- /carton\n",
+    "- /can\n",
+    "\n",
+    "This is the most common data format for multiclass image classification. Each folder title corresponds to the image label for the images contained inside."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import urllib.request\n",
+    "from zipfile import ZipFile\n",
+    "\n",
+    "# Change to a different location if you prefer\n",
+    "dataset_parent_dir = \"./data\"\n",
+    "\n",
+    "# create data folder if it doesn't exist.\n",
+    "os.makedirs(dataset_parent_dir, exist_ok=True)\n",
+    "\n",
+    "# download data\n",
+    "download_url = \"https://cvbp-secondary.z19.web.core.windows.net/datasets/image_classification/fridgeObjects.zip\"\n",
+    "\n",
+    "# Extract current dataset name from dataset url\n",
+    "dataset_name = os.path.split(download_url)[-1].split(\".\")[0]\n",
+    "# Get dataset path for later use\n",
+    "dataset_dir = os.path.join(dataset_parent_dir, dataset_name)\n",
+    "\n",
+    "# Get the data zip file path\n",
+    "data_file = os.path.join(dataset_parent_dir, f\"{dataset_name}.zip\")\n",
+    "\n",
+    "# Download the dataset\n",
+    "urllib.request.urlretrieve(download_url, filename=data_file)\n",
+    "\n",
+    "# extract files\n",
+    "with ZipFile(data_file, \"r\") as zip_ref:\n",
+    "    print(\"extracting files...\")\n",
+    "    zip_ref.extractall(path=dataset_parent_dir)\n",
+    "    print(\"done\")\n",
+    "# delete zip file\n",
+    "os.remove(data_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import Image\n",
+    "\n",
+    "sample_image = os.path.join(dataset_dir, \"milk_bottle\", \"99.jpg\")\n",
+    "Image(filename=sample_image)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4. Deploy the model to an online endpoint\n",
+    "Online endpoints give a durable REST API that can be used to integrate with applications that need to use the model.\n",
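+    "\n",
+    "If the deployment created below fails to provision or does not become healthy, the container logs usually explain why. A minimal sketch for fetching them (assuming the `deployment_name` and `online_endpoint_name` variables defined in the cells below):\n",
+    "\n",
+    "    # Fetch the last 50 lines of the scoring container's logs\n",
+    "    logs = workspace_ml_client.online_deployments.get_logs(\n",
+    "        name=deployment_name, endpoint_name=online_endpoint_name, lines=50\n",
+    "    )\n",
+    "    print(logs)"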
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment, OnlineRequestSettings\n",
+    "\n",
+    "# Create online endpoint - endpoint names need to be unique in a region, hence using timestamp to create unique endpoint name\n",
+    "timestamp = int(time.time())\n",
+    "online_endpoint_name = \"hf-image-classif-\" + str(timestamp)\n",
+    "# create an online endpoint\n",
+    "endpoint = ManagedOnlineEndpoint(\n",
+    "    name=online_endpoint_name,\n",
+    "    description=\"Online endpoint for \" + foundation_model.name + \", for image-classification task\",\n",
+    "    auth_mode=\"key\",\n",
+    ")\n",
+    "workspace_ml_client.begin_create_or_update(endpoint).wait()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from azure.ai.ml.entities import OnlineRequestSettings, ProbeSettings\n",
+    "\n",
+    "deployment_name = \"hf-image-classif-mlflow-deploy\"\n",
+    "\n",
+    "print(foundation_model.id)\n",
+    "print(online_endpoint_name)\n",
+    "print(deployment_name)\n",
+    "\n",
+    "# Create a deployment\n",
+    "demo_deployment = ManagedOnlineDeployment(\n",
+    "    name=deployment_name,\n",
+    "    endpoint_name=online_endpoint_name,\n",
+    "    model=foundation_model.id,\n",
+    "    # use a GPU instance type like Standard_NC6s_v3 for faster inference\n",
+    "    instance_type=\"Standard_DS3_V2\",\n",
+    "    instance_count=1,\n",
+    "    request_settings=OnlineRequestSettings(\n",
+    "        max_concurrent_requests_per_instance=1,\n",
+    "        request_timeout_ms=5000,  # increase (e.g. to 90000) if scoring needs more time\n",
+    "        max_queue_wait_ms=500,\n",
+    "    ),\n",
+    "    liveness_probe=ProbeSettings(\n",
+    "        failure_threshold=30,\n",
+    "        success_threshold=1,\n",
+    "        timeout=10,\n",
+    "        period=10,\n",
+    "        initial_delay=10,\n",
+    "    ),\n",
+    "    readiness_probe=ProbeSettings(\n",
+    "        failure_threshold=10,\n",
+    "        success_threshold=1,\n",
+    "        timeout=10,\n",
+    "        period=10,\n",
+    "        initial_delay=10,\n",
+    "    ),\n",
+    ")\n",
+    "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n",
+    "endpoint.traffic = {deployment_name: 100}\n",
+    "workspace_ml_client.begin_create_or_update(endpoint).result()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 5. Test the endpoint with sample data\n",
+    "\n",
+    "We will fetch some sample data from the test dataset and submit it to the online endpoint for inference. 
We will then display the scored labels alongside the ground truth labels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "demo_deployment = workspace_ml_client.online_deployments.get(\n",
+    "    name=deployment_name,\n",
+    "    endpoint_name=online_endpoint_name,\n",
+    ")\n",
+    "\n",
+    "# Get the details for online endpoint\n",
+    "endpoint = workspace_ml_client.online_endpoints.get(name=online_endpoint_name)\n",
+    "\n",
+    "# existing traffic details\n",
+    "print(endpoint.traffic)\n",
+    "# Get the scoring URI\n",
+    "print(endpoint.scoring_uri)\n",
+    "print(demo_deployment)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create request json\n",
+    "import base64\n",
+    "import json\n",
+    "\n",
+    "sample_image = os.path.join(dataset_dir, \"milk_bottle\", \"99.jpg\")\n",
+    "\n",
+    "\n",
+    "def read_image(image_path):\n",
+    "    with open(image_path, \"rb\") as f:\n",
+    "        return f.read()\n",
+    "\n",
+    "\n",
+    "# Request payload format: {\"inputs\": {\"image\": [\"<base64-encoded image>\"]}}\n",
+    "request_json = {\n",
+    "    \"inputs\": {\n",
+    "        \"image\": [base64.encodebytes(read_image(sample_image)).decode(\"utf-8\")],\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "request_file_name = \"sample_request_data.json\"\n",
+    "\n",
+    "with open(request_file_name, \"w\") as request_file:\n",
+    "    json.dump(request_json, request_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# score the sample_request_data.json file using the online endpoint with the azureml endpoint invoke method\n",
+    "response = workspace_ml_client.online_endpoints.invoke(\n",
+    "    endpoint_name=online_endpoint_name,\n",
+    "    deployment_name=demo_deployment.name,\n",
+    "    request_file=request_file_name,\n",
+    ")\n",
+    "print(f\"raw response: {response}\\n\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 6. Delete the online endpoint\n",
+    "Don't forget to delete the online endpoint; otherwise, you will leave the billing meter running for the compute used by the endpoint."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}