
Commit e9fc4c2
Add audio sample inference code for ONNX whisper model (#440)
* Update run_whisper.py
mengniwang95 authored Oct 12, 2023
1 parent dd38290 commit e9fc4c2
Showing 4 changed files with 121 additions and 18 deletions.
@@ -26,6 +26,7 @@ bash run_tuning.sh --config=openai/whisper-large \
--approach=static # or dynamic
```

## 2. Benchmark
- To get model accuracy

```
@@ -53,6 +54,36 @@ numactl -m 0 -C 0-3 bash run_benchmark.sh --config=whisper-large-with-past \
- If users don't set `dataset_location`, the dataset will be downloaded automatically or the cached copy will be used.
- The `numactl` command is used to bind the run to specific CPU cores.

## 3. Audio inference
- To run audio sample inference with FP32 or INT8 (static or dynamic) models:

```
bash run_audio_inference.sh --config=openai/whisper-large \ # model_name_or_path
                            --audio_path=/path/to/audio \ # optional, supports .wav, .mp3 and other ffmpeg-supported formats
                            --input_model=whisper-large-with-past-static/ # folder containing the ONNX model
```
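
`run_audio_inference.sh` forwards these options to `run_whisper.py` (adding `--benchmark` and `--audio_test`), so the same run can also be launched directly; the paths below are placeholders:

```
python run_whisper.py --model_name_or_path openai/whisper-large \
                      --input_model whisper-large-with-past-static/ \
                      --audio_path /path/to/audio.wav \
                      --benchmark --audio_test
```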

- To run audio sample inference with INT4 models

Upgrade onnxruntime to 1.16.0 first (e.g. `pip install onnxruntime==1.16.0`), then:

```
bash run_audio_inference.sh --config=openai/whisper-tiny \ # model_name_or_path
                            --audio_path=/path/to/audio \ # optional, supports .wav, .mp3 and other ffmpeg-supported formats
                            --input_model=whisper-tiny-onnx-int4/ # folder containing the ONNX model
```

Available INT4 models on the Hugging Face Hub:
[whisper-tiny](https://huggingface.co/Intel/whisper-tiny-onnx-int4),
[whisper-base](https://huggingface.co/Intel/whisper-base-onnx-int4),
[whisper-small](https://huggingface.co/Intel/whisper-small-onnx-int4),
[whisper-medium](https://huggingface.co/Intel/whisper-medium-onnx-int4),
[whisper-large](https://huggingface.co/Intel/whisper-large-onnx-int4),
[whisper-large-v2](https://huggingface.co/Intel/whisper-large-v2-onnx-int4).
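
Any of these repositories can also be downloaded ahead of time (for example with `git` and `git-lfs`) and the local folder passed through `--input_model`; a minimal sketch using whisper-tiny:

```
# git-lfs is required to fetch the ONNX weight files
git lfs install
git clone https://huggingface.co/Intel/whisper-tiny-onnx-int4
bash run_audio_inference.sh --config=openai/whisper-tiny \
                            --input_model=whisper-tiny-onnx-int4/
```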

**Notes**:
- If users don't set `audio_path`, the sample.wav file in the intel_extension_for_transformers/neural_chat/assets/audio folder is used for the test.

# Validated model list

|Topology|Pretrained model|PostTrainingDynamic|PostTrainingStatic
@@ -9,3 +9,4 @@ evaluate
neural-compressor
librosa
soundfile
pydub
@@ -0,0 +1,44 @@
#!/bin/bash
set -x

function main {

  init_params "$@"
  run_audio_inference

}

# init params
function init_params {
  # default audio sample used when --audio_path is not given
  audio_path=../../../../../intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
  script="run_whisper.py"
  for var in "$@"
  do
    case $var in
      --config=*)
          config=$(echo "$var" | cut -f2 -d=)
      ;;
      --audio_path=*)
          audio_path=$(echo "$var" | cut -f2 -d=)
      ;;
      --input_model=*)
          input_model=$(echo "$var" | cut -f2 -d=)
      ;;
    esac
  done

}


# run_audio_inference
function run_audio_inference {

    python -u "${script}" \
        --model_name_or_path "${config}" \
        --input_model "${input_model}" \
        --audio_path "${audio_path}" \
        --benchmark \
        --audio_test
}

main "$@"
@@ -36,6 +36,10 @@
                    help='cores per instance during benchmark')
parser.add_argument('--max_new_tokens', default=16, type=int,
                    help='the maximum number of tokens to generate')
parser.add_argument('--audio_path', type=str,
                    help='path of the audio file to transcribe')
parser.add_argument('--audio_test', dest='audio_test', action='store_true',
                    help='run inference on an audio sample')

args = parser.parse_args()

@@ -137,7 +141,19 @@ def __iter__(self):
break
ort_inputs['input_ids'] = input_ids[:, -1:].detach().numpy()
yield ort_inputs, 0


def audiosegment_to_librosawav(audiosegment):
    # https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentget_array_of_samples
    # This way is faster than librosa.load or the HuggingFace Dataset wrapper.
    channel_sounds = audiosegment.split_to_mono()[:1]  # only select the first channel
    samples = [s.get_array_of_samples() for s in channel_sounds]

    # Scale the integer samples to float32 in [-1, 1] and flatten to a 1-D waveform
    fp_arr = np.array(samples).T.astype(np.float32)
    fp_arr /= np.iinfo(samples[0].typecode).max
    fp_arr = fp_arr.reshape(-1)

    return fp_arr

if __name__ == "__main__":
    if args.tune:
        if os.path.exists(args.output_model):
@@ -181,21 +197,32 @@ def __iter__(self):
os.path.join(args.input_model, 'decoder_with_past_model.onnx'),
session_options=sess_options)
model = ORTModelForSpeechSeq2Seq(sessions[0], sessions[1], config, args.input_model, sessions[2])

processor = WhisperProcessor.from_pretrained(args.model_name_or_path)

if args.audio_test:
    # Transcribe a single audio file; Whisper expects 16 kHz mono input
    from pydub import AudioSegment

    waveform = AudioSegment.from_file(args.audio_path).set_frame_rate(16000)
    waveform = audiosegment_to_librosawav(waveform)
    input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    print(transcription)

else:
    # Measure latency and throughput on the LibriSpeech test-clean split
    librispeech_test_clean = load_dataset("librispeech_asr", "clean", split="test", cache_dir=args.cache_dir)
    total_time = 0
    for idx, batch in enumerate(librispeech_test_clean):
        if idx > args.iters:
            break
        audio = batch["audio"]
        input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
        tic = time.time()
        predicted_ids = model.generate(input_features, max_new_tokens=args.max_new_tokens)
        toc = time.time()
        if idx >= args.warmup:
            total_time += (toc - tic)
    latency = total_time / (args.iters - args.warmup)
    print('Latency: %.3f ms' % (latency * 1000))
    print('Throughput: %.3f images/sec' % (args.batch_size / latency))
    print('Batch size = %d' % args.batch_size)
