Update HF Azure ML deployment managed compute to allow liveness probe configuration #52

Merged · 6 commits · Feb 21, 2024
1 change: 1 addition & 0 deletions .env_example
@@ -47,6 +47,7 @@ AZURE_ML_MODEL_VERSION_TO_DEPLOY=4
AZURE_ML_MODEL_DEPLOY_INSTANCE_SIZE="Standard_DS3_v2"
AZURE_ML_MODEL_DEPLOY_INSTANCE_COUNT=1
AZURE_ML_MODEL_DEPLOY_REQUEST_TIMEOUT_MS=90000
AZURE_ML_MODEL_DEPLOY_LIVENESS_PROBE_INIT_DELAY_SECS=1200

# AZURE ML Inference Configuration
AZURE_ML_SCORE_DEPLOYMENT_NAME="mistralai-mixtral-8x7b-instru-1"
8 files renamed without changes.
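A small robustness note, outside the scope of this diff: `os.getenv` returns `None` for a missing variable, so the `int(os.getenv(...))` calls added below raise a `TypeError` for anyone running with an older `.env`. A hedged sketch of a fallback to the 600-second default that the PR's own docs cite:

```python
import os

# Fall back to Azure ML managed compute's default liveness probe delay
# (600 seconds, per the docs added in this PR) when the variable is unset.
liveness_probe_initial_delay = int(
    os.getenv("AZURE_ML_MODEL_DEPLOY_LIVENESS_PROBE_INIT_DELAY_SECS", "600")
)
```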
121 changes: 104 additions & 17 deletions doc/deployment/deploy_hf_model_aml.ipynb
@@ -63,12 +63,14 @@
"\n",
"10. **AZURE_ML_MODEL_DEPLOY_REQUEST_TIMEOUT_MS**\n",
" - Set the AZURE ML inference endpoint request timeout, recommended value is 60000 (in millis).\n",
"\n"
"\n",
"11. **AZURE_ML_MODEL_DEPLOY_LIVENESS_PROBE_INIT_DELAY_SECS**\n",
" - Configure the liveness probe initial delay value for the Azure ML container hosting your model. The default `initial_delay` value for the liveness probe, as established by Azure ML managed compute, is 600 seconds. Consider raising this value for the deployment of larger models.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -87,14 +89,31 @@
"model_version = os.getenv(\"AZURE_ML_MODEL_VERSION_TO_DEPLOY\")\n",
"instance_type = os.getenv('AZURE_ML_MODEL_DEPLOY_INSTANCE_SIZE')\n",
"instance_count = int(os.getenv('AZURE_ML_MODEL_DEPLOY_INSTANCE_COUNT'))\n",
"request_timeout_ms = os.getenv('AZURE_ML_MODEL_DEPLOY_REQUEST_TIMEOUT_MS')"
"request_timeout_ms = int(os.getenv('AZURE_ML_MODEL_DEPLOY_REQUEST_TIMEOUT_MS'))\n",
"liveness_probe_initial_delay = int(os.getenv('AZURE_ML_MODEL_DEPLOY_LIVENESS_PROBE_INIT_DELAY_SECS'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Subscription ID: 6d13156d-cc60-485a-9e9e-54857689f99b\n",
"Resource group: airt\n",
"Workspace name: vicuna-13B\n",
"Registry name: HuggingFace\n",
"Model to deploy: cognitivecomputations-Wizard-Vicuna-30B-Uncensored\n",
"Instance type: Standard_ND40rs_v2\n",
"Instance count: 1\n",
"Request timeout in millis: 90000\n",
"Liveness probe initial delay in secs: 1800\n"
]
}
],
"source": [
"print(f\"Subscription ID: {subscription_id}\")\n",
"print(f\"Resource group: {resource_group}\")\n",
@@ -103,7 +122,8 @@
"print(f\"Model to deploy: {model_to_deploy}\")\n",
"print(f\"Instance type: {instance_type}\")\n",
"print(f\"Instance count: {instance_count}\")\n",
"print(f\"Request timeout in millis: {request_timeout_ms}\")"
"print(f\"Request timeout in millis: {request_timeout_ms}\")\n",
"print(f\"Liveness probe initial delay in secs: {liveness_probe_initial_delay}\")"
]
},
{
@@ -117,9 +137,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"DefaultAzureCredential failed to retrieve a token from the included credentials.\n",
"Attempted credentials:\n",
"\tEnvironmentCredential: EnvironmentCredential authentication unavailable. Environment variables are not fully configured.\n",
"Visit https://aka.ms/azsdk/python/identity/environmentcredential/troubleshoot to troubleshoot this issue.\n",
"\tManagedIdentityCredential: ManagedIdentityCredential authentication unavailable, no response from the IMDS endpoint.\n",
"\tSharedTokenCacheCredential: SharedTokenCacheCredential authentication unavailable. No accounts were found in the cache.\n",
"\tAzureCliCredential: ERROR: AADSTS70043: The refresh token has expired or is invalid due to sign-in frequency checks by conditional access. The token was issued on 2024-02-12T04:32:40.1696820Z and the maximum allowed lifetime for this request is 43200. Trace ID: 21019099-629c-44c9-89d2-f9409bd04300 Correlation ID: 1e0b879f-6576-4ce5-bf7e-99e0185e4223 Timestamp: 2024-02-21 04:43:13Z\n",
"Interactive authentication is needed. Please run:\n",
"az login --scope https://management.azure.com/.default\n",
"\n",
"\tAzurePowerShellCredential: Az.Account module >= 2.2.0 is not installed\n",
"\tAzureDeveloperCliCredential: Azure Developer CLI could not be found. Please visit https://aka.ms/azure-dev for installation instructions and then,once installed, authenticate to your Azure account using 'azd auth login'.\n",
"To mitigate this issue, please refer to the troubleshooting guidelines here at https://aka.ms/azsdk/python/identity/defaultazurecredential/troubleshoot.\n"
]
}
],
"source": [
"from azure.ai.ml import MLClient\n",
"from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n",
@@ -143,7 +183,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -172,9 +212,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model found in the Azure ML workspace model registry.\n",
"\n",
"\n",
"Using model name: cognitivecomputations-Wizard-Vicuna-30B-Uncensored, version: 1, id: /subscriptions/6d13156d-cc60-485a-9e9e-54857689f99b/resourceGroups/airt/providers/Microsoft.MachineLearningServices/workspaces/vicuna-13B/models/cognitivecomputations-Wizard-Vicuna-30B-Uncensored/versions/1 for inferencing\n"
]
}
],
"source": [
"# Check if the Hugging Face model exists in the Azure ML workspace model registry\n",
"model = None\n",
@@ -202,12 +253,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Using the first 32 characters because Azure ML endpoint names must be between 3 and 32 characters in length.\n",
"endpoint_name = endpoint_name[:32]"
"endpoint_name = endpoint_name[:30]"
]
},
{
@@ -223,14 +274,15 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from azure.ai.ml.entities import (\n",
" ManagedOnlineEndpoint,\n",
" ManagedOnlineDeployment,\n",
" OnlineRequestSettings,\n",
" ProbeSettings\n",
")\n",
"\n",
"# create an online endpoint\n",
@@ -256,24 +308,59 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Check: endpoint cognitivecomputations-Wizard-V exists\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"............................................................................................................................................................................................................"
]
},
{
"data": {
"text/plain": [
"ManagedOnlineEndpoint({'public_network_access': 'Enabled', 'provisioning_state': 'Succeeded', 'scoring_uri': 'https://cognitivecomputations-wizard-v.southcentralus.inference.ml.azure.com/score', 'openapi_uri': 'https://cognitivecomputations-wizard-v.southcentralus.inference.ml.azure.com/swagger.json', 'name': 'cognitivecomputations-wizard-v', 'description': 'Online endpoint for cognitivecomputations-Wizard-Vicuna-30B-Uncensored', 'tags': {}, 'properties': {'azureml.onlineendpointid': '/subscriptions/6d13156d-cc60-485a-9e9e-54857689f99b/resourcegroups/airt/providers/microsoft.machinelearningservices/workspaces/vicuna-13b/onlineendpoints/cognitivecomputations-wizard-v', 'AzureAsyncOperationUri': 'https://management.azure.com/subscriptions/6d13156d-cc60-485a-9e9e-54857689f99b/providers/Microsoft.MachineLearningServices/locations/southcentralus/mfeOperationsStatus/oe:ee33e0d3-c89f-496e-a241-eb07cc772890:19cc660f-7182-4790-930a-235ca1d77c3b?api-version=2022-02-01-preview'}, 'print_as_yaml': True, 'id': '/subscriptions/6d13156d-cc60-485a-9e9e-54857689f99b/resourceGroups/airt/providers/Microsoft.MachineLearningServices/workspaces/vicuna-13B/onlineEndpoints/cognitivecomputations-wizard-v', 'Resource__source_path': None, 'base_path': 'c:\\\\Users\\\\rdheekonda\\\\Desktop\\\\airedteam\\\\projects\\\\PyRIT\\\\doc\\\\deployment', 'creation_context': None, 'serialize': <msrest.serialization.Serializer object at 0x000001481317AC20>, 'auth_mode': 'key', 'location': 'southcentralus', 'identity': <azure.ai.ml.entities._credentials.IdentityConfiguration object at 0x000001481317BD90>, 'traffic': {}, 'mirror_traffic': {}, 'kind': 'Managed'})"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create a deployment\n",
"# Create probe settings\n",
"liveness_probe = ProbeSettings(initial_delay=liveness_probe_initial_delay)\n",
"deployment = ManagedOnlineDeployment(\n",
" name=f\"{endpoint_name}\",\n",
" endpoint_name=endpoint_name,\n",
" model=model.id,\n",
" instance_type=instance_type,\n",
" instance_count=instance_count,\n",
" request_settings=OnlineRequestSettings(\n",
" request_timeout_ms=60000,\n",
" request_timeout_ms=request_timeout_ms\n",
" ),\n",
" liveness_probe=liveness_probe\n",
")\n",
"workspace_ml_client.online_deployments.begin_create_or_update(deployment).wait()\n",
"workspace_ml_client.begin_create_or_update(endpoint).result()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
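A note between the notebook and its paired `.pct.py` script: `ProbeSettings` accepts more knobs than the `initial_delay` this PR wires up. A minimal sketch of the full set, assuming the azure-ai-ml SDK v2; the other values follow Azure ML's documented defaults and are shown here only for illustration:

```python
from azure.ai.ml.entities import ProbeSettings

# Only initial_delay is configured by this PR; the remaining parameters are
# shown with Azure ML's documented defaults for reference.
liveness_probe = ProbeSettings(
    initial_delay=1800,    # seconds before the first probe runs
    period=10,             # seconds between successive probes
    timeout=2,             # seconds before an individual probe times out
    failure_threshold=30,  # consecutive failures before the container restarts
    success_threshold=1,   # consecutive successes for the probe to pass
)
```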
19 changes: 10 additions & 9 deletions doc/deployment/deploy_hf_model_aml.pct.py
@@ -55,6 +55,8 @@
# 10. **AZURE_ML_MODEL_DEPLOY_REQUEST_TIMEOUT_MS**
# - Set the AZURE ML inference endpoint request timeout, recommended value is 60000 (in millis).
#
# 11. **AZURE_ML_MODEL_DEPLOY_LIVENESS_PROBE_INIT_DELAY_SECS**
# - Configure the liveness probe initial delay value for the Azure ML container hosting your model. The default `initial_delay` value for the liveness probe, as established by Azure ML managed compute, is 600 seconds. Consider raising this value for the deployment of larger models.
#

# %%
@@ -73,7 +75,8 @@
model_version = os.getenv("AZURE_ML_MODEL_VERSION_TO_DEPLOY")
instance_type = os.getenv("AZURE_ML_MODEL_DEPLOY_INSTANCE_SIZE")
instance_count = int(os.getenv("AZURE_ML_MODEL_DEPLOY_INSTANCE_COUNT"))
request_timeout_ms = os.getenv("AZURE_ML_MODEL_DEPLOY_REQUEST_TIMEOUT_MS")
request_timeout_ms = int(os.getenv("AZURE_ML_MODEL_DEPLOY_REQUEST_TIMEOUT_MS"))
liveness_probe_initial_delay = int(os.getenv("AZURE_ML_MODEL_DEPLOY_LIVENESS_PROBE_INIT_DELAY_SECS"))

# %%
print(f"Subscription ID: {subscription_id}")
@@ -84,6 +87,7 @@
print(f"Instance type: {instance_type}")
print(f"Instance count: {instance_count}")
print(f"Request timeout in millis: {request_timeout_ms}")
print(f"Liveness probe initial delay in secs: {liveness_probe_initial_delay}")

# %% [markdown]
# ### Configure Credentials
@@ -167,11 +171,7 @@ def check_model_version_exists(client, model_name, version) -> bool:
# Authentication mode: The authentication method for the endpoint. Choose between key-based authentication and Azure Machine Learning token-based authentication. A key doesn't expire, but a token does expire.

# %%
from azure.ai.ml.entities import (
ManagedOnlineEndpoint,
ManagedOnlineDeployment,
OnlineRequestSettings,
)
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment, OnlineRequestSettings, ProbeSettings

# create an online endpoint
endpoint = ManagedOnlineEndpoint(
@@ -188,15 +188,16 @@ def check_model_version_exists(client, model_name, version) -> bool:

# %%
# create a deployment
# Create probe settings
liveness_probe = ProbeSettings(initial_delay=liveness_probe_initial_delay)
deployment = ManagedOnlineDeployment(
name=f"{endpoint_name}",
endpoint_name=endpoint_name,
model=model.id,
instance_type=instance_type,
instance_count=instance_count,
request_settings=OnlineRequestSettings(
request_timeout_ms=60000,
),
request_settings=OnlineRequestSettings(request_timeout_ms=request_timeout_ms),
liveness_probe=liveness_probe,
)
workspace_ml_client.online_deployments.begin_create_or_update(deployment).wait()
workspace_ml_client.begin_create_or_update(endpoint).result()
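One follow-up worth noting, not shown in this diff: the endpoint output in the notebook above reports `'traffic': {}`, so no scoring traffic is routed to the new deployment yet. A hedged sketch of routing traffic and confirming endpoint state, assuming `workspace_ml_client`, `endpoint`, and `endpoint_name` from the script remain in scope:

```python
# Route all scoring traffic to the new deployment, then confirm endpoint state.
# Assumes workspace_ml_client, endpoint, and endpoint_name from the script above;
# in these scripts the deployment name matches the endpoint name.
endpoint.traffic = {endpoint_name: 100}
workspace_ml_client.begin_create_or_update(endpoint).result()

refreshed = workspace_ml_client.online_endpoints.get(name=endpoint_name)
print(f"Provisioning state: {refreshed.provisioning_state}")
print(f"Scoring URI: {refreshed.scoring_uri}")
```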
2 changes: 1 addition & 1 deletion doc/deployment/score_aml_endpoint.ipynb
@@ -73,7 +73,7 @@
"The JSON body can be acquired by the following method: Access the Hugging Face model within the Azure ML model catalog by going to the workspace, then to the studio, selecting 'Model Catalog', and using the search bar to find the model ID. Open the model to view the sample input schema as shown in the image below. \n",
"<br> <img src=\"./../../assets/aml_model_endpoint_schema.png\" alt=\"aml_model_endpoint_schema.png\" height=\"400\"/> <br>\n",
"\n",
"In addition, we have compiled the details of the request and response for the Hugging Face models hosted on the Azure Machine Learning (Azure ML) endpoint. Please review the [provided link](./HF%20AML%20Model%20Endpoint%20Guide.md) to access the JSON request body and response for the Azure ML endpoint. Additionally, you can deduce the schema from the response if a bad request was sent to the inference endpoint."
"In addition, we have compiled the details of the request and response for the Hugging Face models hosted on the Azure Machine Learning (Azure ML) endpoint. Please review the [provided link](./hf_aml_model_endpoint_guide.md) to access the JSON request body and response for the Azure ML endpoint. Additionally, you can deduce the schema from the response if a bad request was sent to the inference endpoint."
]
},
{
2 changes: 1 addition & 1 deletion doc/deployment/score_aml_endpoint.pct.py
@@ -53,7 +53,7 @@
# The JSON body can be acquired by the following method: Access the Hugging Face model within the Azure ML model catalog by going to the workspace, then to the studio, selecting 'Model Catalog', and using the search bar to find the model ID. Open the model to view the sample input schema as shown in the image below.
# <br> <img src="./../../assets/aml_model_endpoint_schema.png" alt="aml_model_endpoint_schema.png" height="400"/> <br>
#
# In addition, we have compiled the details of the request and response for the Hugging Face models hosted on the Azure Machine Learning (Azure ML) endpoint. Please review the [provided link](./HF_AzureML_Model_Endpoint_Guide.md) to access the JSON request body and response for the Azure ML endpoint. Additionally, you can deduce the schema from the response if a bad request was sent to the inference endpoint.
# In addition, we have compiled the details of the request and response for the Hugging Face models hosted on the Azure Machine Learning (Azure ML) endpoint. Please review the [provided link](./hf_aml_model_endpoint_guide.md) to access the JSON request body and response for the Azure ML endpoint. Additionally, you can deduce the schema from the response if a bad request was sent to the inference endpoint.

# %%
import requests
40 changes: 40 additions & 0 deletions doc/deployment/troubleshooting_guide_hf_azureml.md
@@ -0,0 +1,40 @@
# Troubleshooting Guide for HF Azure ML Models

When deploying Hugging Face (HF) models on Azure Machine Learning (Azure ML), you might encounter various issues. This guide aims to help you troubleshoot some common problems.

## 1. ResourceNotReady Error During Azure ML Model Deployment

### Symptom:
You've deployed your model on Azure ML, but the deployment fails, and you encounter a `ResourceNotReady` error.

### Potential Cause:
This error typically occurs when the container initialization takes longer than expected. Azure ML has liveness probes that check the health of the deployment. If the container doesn't initialize within the expected timeframe, the liveness probe fails, leading to a `ResourceNotReady` error.

### Solution:

#### Step 1: Check Deployment Logs
1. Navigate to the Azure ML studio.
2. Go to the **Endpoints** section.
3. Select the endpoint you created.
4. Click on the **Logs** tab.
5. Choose **Online Deployment Log**.

Look for a message similar to:

> "You may have hit a ResourceNotReady error for liveness probe. This happens when container initialization is taking too long."

For reference, see the example log message in the image below.

![Azure ML Deployment ResourceNotReady Error](../../assets/aml_deployment_resource_not_ready_error.png)
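The same logs can also be pulled programmatically rather than through the studio — a minimal sketch, assuming the azure-ai-ml SDK v2 and that, as in this repo's deployment scripts, the deployment name matches the endpoint name:

```python
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Placeholders -- substitute the values from your .env file.
subscription_id = "<subscription-id>"
resource_group = "<resource-group>"
workspace_name = "<workspace-name>"
deployment_name = "<deployment-name>"  # same as the endpoint name in these scripts

ml_client = MLClient(DefaultAzureCredential(), subscription_id, resource_group, workspace_name)

# Tail the most recent container logs for the online deployment.
logs = ml_client.online_deployments.get_logs(
    name=deployment_name,
    endpoint_name=deployment_name,
    lines=100,
)
print(logs)
```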


#### Step 2: Adjust Environment Variable
1. Locate the `.env` file in your project directory.
2. Set the environment variable `AZURE_ML_MODEL_DEPLOY_LIVENESS_PROBE_INIT_DELAY_SECS`, which controls the liveness probe's initial delay, to a value greater than the default. For instance, since the default is 600 seconds, you might raise it to 1800 seconds, as shown in the snippet below.
3. Save the changes to your `.env` file.
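
For example, the relevant line in your `.env` file would then read:

```
AZURE_ML_MODEL_DEPLOY_LIVENESS_PROBE_INIT_DELAY_SECS=1800
```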

#### Step 3: Redeploy
Redeploy your model by running the deployment script again. This will apply the new settings.

### Additional Resources:
For more detailed troubleshooting steps and explanations, refer to the official Azure ML documentation on troubleshooting online endpoints: [Troubleshoot online endpoints](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-troubleshoot-online-endpoints?view=azureml-api-2&tabs=cli#error-resourcenotready).