
Commit e9fc4c2
Add audio sample inference code for ONNX whisper model (#440)
* Update run_whisper.py
mengniwang95 authored Oct 12, 2023
1 parent dd38290 commit e9fc4c2
Showing 4 changed files with 121 additions and 18 deletions.
@@ -26,6 +26,7 @@ bash run_tuning.sh --config=openai/whisper-large \
--approach=static # or dynamic
```

## 2. Benchmark
- To get model accuracy

```
@@ -53,6 +54,36 @@ numactl -m 0 -C 0-3 bash run_benchmark.sh --config=whisper-large-with-past \
- If users don't set `dataset_location`, the dataset will be downloaded automatically or the cached copy will be used.
- The `numactl` command is used to bind the run to specific CPU cores.

## 3. Audio inference
- To run audio sample inference with FP32 or INT8 (static or dynamic) models:

```
bash run_audio_inference.sh --config=openai/whisper-large \ # model_name_or_path
                            --audio_path=/path/to/audio \ # optional, supports .wav, .mp3 and other ffmpeg-supported formats
                            --input_model=whisper-large-with-past-static/ # folder containing the ONNX model
```
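
`run_audio_inference.sh` forwards these options to `run_whisper.py` (adding `--benchmark` and `--audio_test`), so the same run can also be launched directly; the paths below are placeholders:

```
python run_whisper.py --model_name_or_path openai/whisper-large \
                      --input_model whisper-large-with-past-static/ \
                      --audio_path /path/to/audio.wav \
                      --benchmark --audio_test
```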

- To run audio sample inference with INT4 models

Upgrade onnxruntime to 1.16.0 first (e.g. `pip install onnxruntime==1.16.0`), then:

```
bash run_audio_inference.sh --config=openai/whisper-tiny \ # model_name_or_path
                            --audio_path=/path/to/audio \ # optional, supports .wav, .mp3 and other ffmpeg-supported formats
                            --input_model=whisper-tiny-onnx-int4/ # folder containing the ONNX model
```

Available INT4 models on the Hugging Face Hub:
[whisper-tiny](https://huggingface.co/Intel/whisper-tiny-onnx-int4),
[whisper-base](https://huggingface.co/Intel/whisper-base-onnx-int4),
[whisper-small](https://huggingface.co/Intel/whisper-small-onnx-int4),
[whisper-medium](https://huggingface.co/Intel/whisper-medium-onnx-int4),
[whisper-large](https://huggingface.co/Intel/whisper-large-onnx-int4),
[whisper-large-v2](https://huggingface.co/Intel/whisper-large-v2-onnx-int4).
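
Any of these repositories can also be downloaded ahead of time (for example with `git` and `git-lfs`) and the local folder passed through `--input_model`; a minimal sketch using whisper-tiny:

```
# git-lfs is required to fetch the ONNX weight files
git lfs install
git clone https://huggingface.co/Intel/whisper-tiny-onnx-int4
bash run_audio_inference.sh --config=openai/whisper-tiny \
                            --input_model=whisper-tiny-onnx-int4/
```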

**Notes**:
- If users don't set `audio_path`, the sample.wav file in the intel_extension_for_transformers/neural_chat/assets/audio folder is used for the test.

# Validated model list

|Topology|Pretrained model|PostTrainingDynamic|PostTrainingStatic
@@ -9,3 +9,4 @@ evaluate
neural-compressor
librosa
soundfile
pydub
@@ -0,0 +1,44 @@
#!/bin/bash
set -x

function main {

  init_params "$@"
  run_audio_inference

}

# init params
function init_params {
  # default audio sample used when --audio_path is not given
  audio_path=../../../../../intel_extension_for_transformers/neural_chat/assets/audio/sample.wav
  script="run_whisper.py"
  for var in "$@"
  do
    case $var in
      --config=*)
          config=$(echo "$var" | cut -f2 -d=)
      ;;
      --audio_path=*)
          audio_path=$(echo "$var" | cut -f2 -d=)
      ;;
      --input_model=*)
          input_model=$(echo "$var" | cut -f2 -d=)
      ;;
    esac
  done

}


# run_audio_inference
function run_audio_inference {

    python -u "${script}" \
        --model_name_or_path "${config}" \
        --input_model "${input_model}" \
        --audio_path "${audio_path}" \
        --benchmark \
        --audio_test
}

main "$@"
@@ -36,6 +36,10 @@
                    help='cores per instance during benchmark')
parser.add_argument('--max_new_tokens', default=16, type=int,
                    help='the maximum number of tokens to generate')
parser.add_argument('--audio_path', type=str,
                    help='path of the audio file to transcribe')
parser.add_argument('--audio_test', dest='audio_test', action='store_true',
                    help='run inference on an audio sample')

args = parser.parse_args()

@@ -137,7 +141,19 @@ def __iter__(self):
break
ort_inputs['input_ids'] = input_ids[:, -1:].detach().numpy()
yield ort_inputs, 0


def audiosegment_to_librosawav(audiosegment):
    # https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentget_array_of_samples
    # This way is faster than librosa.load or the HuggingFace Dataset wrapper.
    channel_sounds = audiosegment.split_to_mono()[:1]  # only select the first channel
    samples = [s.get_array_of_samples() for s in channel_sounds]

    # Scale the integer samples to float32 in [-1, 1] and flatten to a 1-D waveform
    fp_arr = np.array(samples).T.astype(np.float32)
    fp_arr /= np.iinfo(samples[0].typecode).max
    fp_arr = fp_arr.reshape(-1)

    return fp_arr

if __name__ == "__main__":
    if args.tune:
        if os.path.exists(args.output_model):
@@ -181,21 +197,32 @@ def __iter__(self):
os.path.join(args.input_model, 'decoder_with_past_model.onnx'),
session_options=sess_options)
model = ORTModelForSpeechSeq2Seq(sessions[0], sessions[1], config, args.input_model, sessions[2])

processor = WhisperProcessor.from_pretrained(args.model_name_or_path)

if args.audio_test:
    # Transcribe a single audio file; Whisper expects 16 kHz mono input
    from pydub import AudioSegment

    waveform = AudioSegment.from_file(args.audio_path).set_frame_rate(16000)
    waveform = audiosegment_to_librosawav(waveform)
    input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    print(transcription)

else:
    # Measure latency and throughput on the LibriSpeech test-clean split
    librispeech_test_clean = load_dataset("librispeech_asr", "clean", split="test", cache_dir=args.cache_dir)
    total_time = 0
    for idx, batch in enumerate(librispeech_test_clean):
        if idx > args.iters:
            break
        audio = batch["audio"]
        input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
        tic = time.time()
        predicted_ids = model.generate(input_features, max_new_tokens=args.max_new_tokens)
        toc = time.time()
        if idx >= args.warmup:
            total_time += (toc - tic)
    latency = total_time / (args.iters - args.warmup)
    print('Latency: %.3f ms' % (latency * 1000))
    print('Throughput: %.3f images/sec' % (args.batch_size / latency))
    print('Batch size = %d' % args.batch_size)
