Azure · praveenmathamsetty · Jul 24, 2023 · Jul 11, 2023 · Jul 14, 2023 · Jul 17, 2023
diff --git a/.../foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb b/.../foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
@@ -19,7 +19,6 @@
     "We will use the [Librispeech ASR](https://huggingface.co/datasets/librispeech_asr/viewer/clean/test) dataset. \\\n",
     "You can also use custom audio files stored on the cloud and verify inference.\n",
     "- Most common audio formats (m4a, wav, flac, wma, mp3, etc.) are supported.\n",
-    "- The whisper model can process only 30 seconds of data at a time, so if the file you upload is longer than 30 seconds, only the first 30 seconds will be transcribed. This can be circumvented by splitting the file into 30 second chunks.\n",
     "\n",
     "### Outline\n",
     "* Set up pre-requisites.\n",
@@ -138,7 +137,7 @@
     "compute_cluster = AmlCompute(\n",
     "    name=compute_name,\n",
     "    description=\"An AML compute cluster\",\n",
-    "    size=\"Standard_DS4_V2\",\n",
+    "    size=\"Standard_DS5_V2\",\n",
     "    min_instances=0,\n",
     "    max_instances=3,\n",
     "    idle_time_before_scale_down=120,\n",
@@ -164,7 +163,7 @@
    "outputs": [],
    "source": [
     "model_name = \"openai-whisper-large\"\n",
-    "model_version = \"4\"\n",
+    "model_version = \"10\"\n",
     "foundation_model = registry_ml_client.models.get(model_name, model_version)\n",
     "print(\n",
     "    f\"Using model name: {foundation_model.name}, version: {foundation_model.version}, id: {foundation_model.id} for inferencing.\"\n",

diff --git a/...foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb b/...foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb
@@ -19,7 +19,6 @@
     "We will use custom audio files that have been uploaded to the cloud. \\\n",
     "You can replace the links with any audio file stored on the cloud and verify inference.\n",
     "- Most common audio formats (m4a, wav, flac, wma, mp3, etc.) are supported.\n",
-    "- The whisper model can process only 30 seconds of data at a time, so if the file you upload is longer than 30 seconds, only the first 30 seconds will be transcribed. This can be circumvented by splitting the file into 30 second chunks.\n",
     "\n",
     "### Outline\n",
     "* Set up pre-requisites.\n",
@@ -149,10 +148,10 @@
     "    name=\"demo\",\n",
     "    endpoint_name=online_endpoint_name,\n",
     "    model=foundation_model.id,\n",
-    "    instance_type=\"Standard_DS4_v2\",\n",
+    "    instance_type=\"Standard_DS5_v2\",\n",
     "    instance_count=1,\n",
     "    request_settings=OnlineRequestSettings(\n",
-    "        request_timeout_ms=60000,\n",
+    "        request_timeout_ms=90000,\n",
     "    ),\n",
     ")\n",
     "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n",

diff --git a/...ion-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json b/...ion-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json
@@ -1,6 +1,6 @@
 {
     "inputs": {
-        "audio": ["https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", "https://audiovisionfiles.blob.core.windows.net/audio/audio.m4a"],
-        "language": ["en", "en"]
+        "audio": ["https://datasets-server.huggingface.co/assets/librispeech_asr/--/all/train.clean.100/84/audio/audio.mp3"],
+        "language": ["en"]
     }
 }