From 1c6f4c7a065bbdfbcc258d6120d65542ff6e4c59 Mon Sep 17 00:00:00 2001
From: kurianbenoy
Date: Wed, 21 Feb 2024 20:08:20 +0530
Subject: [PATCH] fix: bugs in test whisperx code

---
 _experiments/test_whisperx.py | 38 ++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/_experiments/test_whisperx.py b/_experiments/test_whisperx.py
index 0ec25d1..9230825 100644
--- a/_experiments/test_whisperx.py
+++ b/_experiments/test_whisperx.py
@@ -1,39 +1,45 @@
 import whisperx
-import gc
 
-device = "cuda"
 audio_file = "Bishop Thomas Tharayil speaks about Dr Shashi Tharoor at Lourdes Forane Church Thiruvananthapuram.mp4"
-batch_size = 16 # reduce if low on GPU mem
-compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
-# 1. Transcribe with original whisper (batched)
-model = whisperx.load_model("large-v2", device, compute_type=compute_type
+device = "cuda"
+batch_size = 16  # reduce if low on GPU mem
+compute_type = "float16"  # change to "int8" if low on GPU mem (may reduce accuracy)
+
+# 1. Transcribe with original whisper (batched)
+model = whisperx.load_model("large-v2", device, compute_type=compute_type)
 
 # save model to local path (optional)
 # model_dir = "/path/"
 # model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)
 
-audio = whisperx.load_audio(audio_file)
+audio = whisperx.load_audio(
+    "Bishop Thomas Tharayil speaks about Dr Shashi Tharoor at Lourdes Forane Church Thiruvananthapuram.mp4"
+)  # noqa
 result = model.transcribe(audio, batch_size=batch_size)
-print(result["segments"]) # before alignment
+print(result["segments"])  # before alignment
 
 # delete model if low on GPU resources
 # import gc; gc.collect(); torch.cuda.empty_cache(); del model
 
 # 2. Align whisper output
-model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+model_a, metadata = whisperx.load_align_model(
+    language_code=result["language"], device=device
+)
+result = whisperx.align(
+    result["segments"], model_a, metadata, audio, device, return_char_alignments=False
+)
 
-print(result["segments"]) # after alignment
+print(result["segments"])  # after alignment
 
 # delete model if low on GPU resources
 # import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
 
 # 3. Assign speaker labels
-#diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
+# diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
 
 # add min/max number of speakers if known
-#diarize_segments = diarize_model(audio)
+# diarize_segments = diarize_model(audio)
 # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
 
-#result = whisperx.assign_word_speakers(diarize_segments, result)
-#print(diarize_segments)
-#print(result["segments"]) # segments are now assigned speaker IDs
+# result = whisperx.assign_word_speakers(diarize_segments, result)
+# print(diarize_segments)
+# print(result["segments"])  # segments are now assigned speaker IDs
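
Note: section 3 (speaker diarization) stays commented out in the patched script. A minimal sketch of enabling it, reusing audio, device, and result from the script above; it assumes YOUR_HF_TOKEN is replaced with a valid Hugging Face access token, and the speaker counts below are illustrative, not part of this patch:

# 3. Assign speaker labels (sketch; not part of this patch)
diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
# min/max speaker counts are illustrative; pass them only if known
diarize_segments = diarize_model(audio, min_speakers=1, max_speakers=3)
result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"])  # segments are now assigned speaker IDs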