diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
index 0583327e5d..4d1aced21d 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-batch-endpoint.ipynb
@@ -19,7 +19,6 @@
     "We will use the [Librispeech ASR](https://huggingface.co/datasets/librispeech_asr/viewer/clean/test) dataset. \\\n",
     "You can use also use custom audio files stored on the cloud and verify inference.\n",
     "- Most common audio formats (m4a, wav, flac, wma, mp3, etc.) are supported.\n",
-    "- The whisper model can process only 30 seconds of data at a time, so if the file you upload is longer than 30 seconds, only the first 30 seconds will be transcribed. This can be circumvented by splitting the file into 30 second chunks.\n",
     "\n",
     "### Outline\n",
     "* Set up pre-requisites.\n",
@@ -138,7 +137,7 @@
     "compute_cluster = AmlCompute(\n",
     "    name=compute_name,\n",
     "    description=\"An AML compute cluster\",\n",
-    "    size=\"Standard_DS4_V2\",\n",
+    "    size=\"Standard_DS5_V2\",\n",
     "    min_instances=0,\n",
     "    max_instances=3,\n",
     "    idle_time_before_scale_down=120,\n",
@@ -164,7 +163,7 @@
    "outputs": [],
    "source": [
     "model_name = \"openai-whisper-large\"\n",
-    "model_version = \"4\"\n",
+    "model_version = \"10\"\n",
     "foundation_model = registry_ml_client.models.get(model_name, model_version)\n",
     "print(\n",
     "    f\"Using model name: {foundation_model.name}, version: {foundation_model.version}, id: {foundation_model.id} for inferencing.\"\n",
diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb
index ec3be73411..a1ac2c5f02 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/asr-online-endpoint.ipynb
@@ -19,7 +19,6 @@
     "We will use custom audio files that have been uploaded to the cloud. \\\n",
     "You can replace the links with any audio file stored on the cloud and verify inference.\n",
     "- Most common audio formats (m4a, wav, flac, wma, mp3, etc.) are supported.\n",
-    "- The whisper model can process only 30 seconds of data at a time, so if the file you upload is longer than 30 seconds, only the first 30 seconds will be transcribed. This can be circumvented by splitting the file into 30 second chunks.\n",
     "\n",
     "### Outline\n",
     "* Set up pre-requisites.\n",
@@ -149,10 +148,10 @@
     "    name=\"demo\",\n",
     "    endpoint_name=online_endpoint_name,\n",
     "    model=foundation_model.id,\n",
-    "    instance_type=\"Standard_DS4_v2\",\n",
+    "    instance_type=\"Standard_DS5_v2\",\n",
     "    instance_count=1,\n",
     "    request_settings=OnlineRequestSettings(\n",
-    "        request_timeout_ms=60000,\n",
+    "        request_timeout_ms=90000,\n",
     "    ),\n",
     ")\n",
     "workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()\n",
diff --git a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json
index d93320aca4..c58cdb559b 100644
--- a/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json
+++ b/sdk/python/foundation-models/system/inference/automatic-speech-recognition/sample-request/sample_score.json
@@ -1,6 +1,6 @@
 {
     "inputs": {
-        "audio": ["https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", "https://audiovisionfiles.blob.core.windows.net/audio/audio.m4a"],
-        "language": ["en", "en"]
+        "audio": ["https://datasets-server.huggingface.co/assets/librispeech_asr/--/all/train.clean.100/84/audio/audio.mp3"],
+        "language": ["en"]
     }
 }
\ No newline at end of file
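
Below is a minimal Python sketch, not part of the patch above, of how the updated sample-request/sample_score.json could be sent to the online endpoint deployed by asr-online-endpoint.ipynb. The subscription, resource group, workspace, and endpoint names are placeholders; only the "demo" deployment name and the request file path come from the diff.

    # Assumes the azure-ai-ml SDK v2 is installed and the endpoint from the notebook is deployed.
    from azure.ai.ml import MLClient
    from azure.identity import DefaultAzureCredential

    # Placeholder workspace details -- substitute your own values.
    workspace_ml_client = MLClient(
        credential=DefaultAzureCredential(),
        subscription_id="<SUBSCRIPTION_ID>",
        resource_group_name="<RESOURCE_GROUP>",
        workspace_name="<WORKSPACE_NAME>",
    )

    # Invoke the "demo" deployment with the updated sample request payload.
    response = workspace_ml_client.online_endpoints.invoke(
        endpoint_name="<ONLINE_ENDPOINT_NAME>",  # placeholder; the notebook creates its own endpoint name
        deployment_name="demo",
        request_file="./sample-request/sample_score.json",
    )
    print(response)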