From 62dde2cd617de2553a3cfac1c9c58e83d9ca3238 Mon Sep 17 00:00:00 2001
From: Tanmay Bansal
Date: Tue, 11 Jul 2023 11:31:08 +0530
Subject: [PATCH 1/2] Updated asr inference sample score, online and batch endpoint notebooks

---
 .../automatic-speech-recognition/asr-batch-endpoint.ipynb | 5 ++---
 .../automatic-speech-recognition/asr-online-endpoint.ipynb | 5 ++---
 .../sample-request/sample_score.json | 4 ++--
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
index 0583327e5d..5ace35b0c0 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
@@ -19,7 +19,6 @@
 "We will use the [Librispeech ASR](https://huggingface.co/datasets/librispeech_asr/viewer/clean/test) dataset. \\\n",
 "You can use also use custom audio files stored on the cloud and verify inference.\n",
 "- Most common audio formats (m4a, wav, flac, wma, mp3, etc.) are supported.\n",
- "- The whisper model can process only 30 seconds of data at a time, so if the file you upload is longer than 30 seconds, only the first 30 seconds will be transcribed. This can be circumvented by splitting the file into 30 second chunks.\n",
 "\n",
 "### Outline\n",
 "* Set up pre-requisites.\n",
@@ -138,7 +137,7 @@
 "compute_cluster = AmlCompute(\n",
 "    name=compute_name,\n",
 "    description=\"An AML compute cluster\",\n",
- "    size=\"Standard_DS4_V2\",\n",
+ "    size=\"Standard_DS5_V2\",\n",
 "    min_instances=0,\n",
 "    max_instances=3,\n",
 "    idle_time_before_scale_down=120,\n",
@@ -164,7 +163,7 @@
 "outputs": [],
 "source": [
 "model_name = \"openai-whisper-large\"\n",
- "model_version = \"4\"\n",
+ "model_version = \"8\"\n",
 "foundation_model = registry_ml_client.models.get(model_name, model_version)\n",
 "print(\n",
 "    f\"Using model name: {foundation_model.name}, version: {foundation_model.version}, id: {foundation_model.id} for inferencing.\"\n",
diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb
index ec3be73411..a1ac2c5f02 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb
@@ -19,7 +19,6 @@
 "We will use custom audio files that have been uploaded to the cloud. \\\n",
 "You can replace the links with any audio file stored on the cloud and verify inference.\n",
 "- Most common audio formats (m4a, wav, flac, wma, mp3, etc.) are supported.\n",
- "- The whisper model can process only 30 seconds of data at a time, so if the file you upload is longer than 30 seconds, only the first 30 seconds will be transcribed. This can be circumvented by splitting the file into 30 second chunks.\n",
 "\n",
 "### Outline\n",
 "* Set up pre-requisites.\n",
@@ -149,10 +148,10 @@
 "    name=\"demo\",\n",
 "    endpoint_name=online_endpoint_name,\n",
 "    model=foundation_model.id,\n",
- "    instance_type=\"Standard_DS4_v2\",\n",
+ "    instance_type=\"Standard_DS5_v2\",\n",
 "    instance_count=1,\n",
 "    request_settings=OnlineRequestSettings(\n",
- "        request_timeout_ms=60000,\n",
+ "        request_timeout_ms=90000,\n",
 "    ),\n",
 ")\n",
 "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n",
diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json
index d93320aca4..c58cdb559b 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json
@@ -1,6 +1,6 @@
 {
     "inputs": {
-        "audio": ["https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", "https://audiovisionfiles.blob.core.windows.net/audio/audio.m4a"],
-        "language": ["en", "en"]
+        "audio": ["https://datasets-server.huggingface.co/assets/librispeech_asr/--/all/train.clean.100/84/audio/audio.mp3"],
+        "language": ["en"]
     }
 }
\ No newline at end of file

From a5019bc50e343e3c814cf93e274f50f6caa07367 Mon Sep 17 00:00:00 2001
From: Tanmay Bansal
Date: Mon, 17 Jul 2023 09:52:10 +0530
Subject: [PATCH 2/2] Updated openai whisper model from 8 to 10 in the batch deployment notebook

---
 .../automatic-speech-recognition/asr-batch-endpoint.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
index 5ace35b0c0..4d1aced21d 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
@@ -163,7 +163,7 @@
 "outputs": [],
 "source": [
 "model_name = \"openai-whisper-large\"\n",
- "model_version = \"8\"\n",
+ "model_version = \"10\"\n",
 "foundation_model = registry_ml_client.models.get(model_name, model_version)\n",
 "print(\n",
 "    f\"Using model name: {foundation_model.name}, version: {foundation_model.version}, id: {foundation_model.id} for inferencing.\"\n",
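
For a quick sanity check of the updated sample_score.json against the online endpoint, a minimal sketch along the lines below should work once the "demo" deployment from asr-online-endpoint.ipynb is live. This is not part of either patch: the subscription, resource group, workspace, and endpoint names are placeholders, workspace_ml_client mirrors the client the notebooks construct, and the raised request_timeout_ms (90000 ms) is what gives longer clips time to transcribe.

import json

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Placeholders: point the client at the workspace that hosts the ASR endpoint.
workspace_ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)

# Placeholder: the name chosen when the online endpoint was created in the notebook.
online_endpoint_name = "<ASR_ENDPOINT_NAME>"

# Post the updated sample request (one mp3 URL, one language code) to the
# "demo" deployment; the response is expected to be a JSON string with the transcription.
response = workspace_ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    deployment_name="demo",
    request_file="sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json",
)
print(json.loads(response))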