From 62dde2cd617de2553a3cfac1c9c58e83d9ca3238 Mon Sep 17 00:00:00 2001
From: Tanmay Bansal
Date: Tue, 11 Jul 2023 11:31:08 +0530
Subject: [PATCH 1/2] Updated asr inference sample score, online and batch endpoint notebooks

---
 .../automatic-speech-recognition/asr-batch-endpoint.ipynb | 5 ++---
 .../automatic-speech-recognition/asr-online-endpoint.ipynb | 5 ++---
 .../sample-request/sample_score.json | 4 ++--
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
index 0583327e5d..5ace35b0c0 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
@@ -19,7 +19,6 @@
 "We will use the [Librispeech ASR](https://huggingface.co/datasets/librispeech_asr/viewer/clean/test) dataset. \\\n",
 "You can use also use custom audio files stored on the cloud and verify inference.\n",
 "- Most common audio formats (m4a, wav, flac, wma, mp3, etc.) are supported.\n",
- "- The whisper model can process only 30 seconds of data at a time, so if the file you upload is longer than 30 seconds, only the first 30 seconds will be transcribed. This can be circumvented by splitting the file into 30 second chunks.\n",
 "\n",
 "### Outline\n",
 "* Set up pre-requisites.\n",
@@ -138,7 +137,7 @@
 "compute_cluster = AmlCompute(\n",
 "    name=compute_name,\n",
 "    description=\"An AML compute cluster\",\n",
- "    size=\"Standard_DS4_V2\",\n",
+ "    size=\"Standard_DS5_V2\",\n",
 "    min_instances=0,\n",
 "    max_instances=3,\n",
 "    idle_time_before_scale_down=120,\n",
@@ -164,7 +163,7 @@
 "outputs": [],
 "source": [
 "model_name = \"openai-whisper-large\"\n",
- "model_version = \"4\"\n",
+ "model_version = \"8\"\n",
 "foundation_model = registry_ml_client.models.get(model_name, model_version)\n",
 "print(\n",
 "    f\"Using model name: {foundation_model.name}, version: {foundation_model.version}, id: {foundation_model.id} for inferencing.\"\n",
diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb
index ec3be73411..a1ac2c5f02 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb
@@ -19,7 +19,6 @@
 "We will use custom audio files that have been uploaded to the cloud. \\\n",
 "You can replace the links with any audio file stored on the cloud and verify inference.\n",
 "- Most common audio formats (m4a, wav, flac, wma, mp3, etc.) are supported.\n",
- "- The whisper model can process only 30 seconds of data at a time, so if the file you upload is longer than 30 seconds, only the first 30 seconds will be transcribed. This can be circumvented by splitting the file into 30 second chunks.\n",
 "\n",
 "### Outline\n",
 "* Set up pre-requisites.\n",
@@ -149,10 +148,10 @@
 "    name=\"demo\",\n",
 "    endpoint_name=online_endpoint_name,\n",
 "    model=foundation_model.id,\n",
- "    instance_type=\"Standard_DS4_v2\",\n",
+ "    instance_type=\"Standard_DS5_v2\",\n",
 "    instance_count=1,\n",
 "    request_settings=OnlineRequestSettings(\n",
- "        request_timeout_ms=60000,\n",
+ "        request_timeout_ms=90000,\n",
 "    ),\n",
 ")\n",
 "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n",
diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json
index d93320aca4..c58cdb559b 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json
@@ -1,6 +1,6 @@
 {
     "inputs": {
-        "audio": ["https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", "https://audiovisionfiles.blob.core.windows.net/audio/audio.m4a"],
-        "language": ["en", "en"]
+        "audio": ["https://datasets-server.huggingface.co/assets/librispeech_asr/--/all/train.clean.100/84/audio/audio.mp3"],
+        "language": ["en"]
     }
 }
\ No newline at end of file

From a5019bc50e343e3c814cf93e274f50f6caa07367 Mon Sep 17 00:00:00 2001
From: Tanmay Bansal
Date: Mon, 17 Jul 2023 09:52:10 +0530
Subject: [PATCH 2/2] Updated openai whisper model from 8 to 10 in the batch deployment notebook

---
 .../automatic-speech-recognition/asr-batch-endpoint.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
index 5ace35b0c0..4d1aced21d 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
@@ -163,7 +163,7 @@
 "outputs": [],
 "source": [
 "model_name = \"openai-whisper-large\"\n",
- "model_version = \"8\"\n",
+ "model_version = \"10\"\n",
 "foundation_model = registry_ml_client.models.get(model_name, model_version)\n",
 "print(\n",
 "    f\"Using model name: {foundation_model.name}, version: {foundation_model.version}, id: {foundation_model.id} for inferencing.\"\n",
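
For a quick sanity check of the updated sample_score.json against the online endpoint, a minimal sketch along the lines below should work once the "demo" deployment from asr-online-endpoint.ipynb is live. This is not part of either patch: the subscription, resource group, workspace, and endpoint names are placeholders, workspace_ml_client mirrors the client the notebooks construct, and the raised request_timeout_ms (90000 ms) is what gives longer clips time to transcribe.

import json

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Placeholders: point the client at the workspace that hosts the ASR endpoint.
workspace_ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)

# Placeholder: the name chosen when the online endpoint was created in the notebook.
online_endpoint_name = "<ASR_ENDPOINT_NAME>"

# Post the updated sample request (one mp3 URL, one language code) to the
# "demo" deployment; the response is expected to be a JSON string with the transcription.
response = workspace_ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    deployment_name="demo",
    request_file="sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json",
)
print(json.loads(response))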