From 1c6f4c7a065bbdfbcc258d6120d65542ff6e4c59 Mon Sep 17 00:00:00 2001
From: kurianbenoy
Date: Wed, 21 Feb 2024 20:08:20 +0530
Subject: [PATCH] fix: bugs in test whisperx code

---
 _experiments/test_whisperx.py | 38 ++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/_experiments/test_whisperx.py b/_experiments/test_whisperx.py
index 0ec25d1..9230825 100644
--- a/_experiments/test_whisperx.py
+++ b/_experiments/test_whisperx.py
@@ -1,39 +1,45 @@
 import whisperx
-import gc
 
-device = "cuda"
 audio_file = "Bishop Thomas Tharayil speaks about Dr Shashi Tharoor at Lourdes Forane Church Thiruvananthapuram.mp4"
-batch_size = 16 # reduce if low on GPU mem
-compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)
-# 1. Transcribe with original whisper (batched)
-model = whisperx.load_model("large-v2", device, compute_type=compute_type
+device = "cuda"
+batch_size = 16  # reduce if low on GPU mem
+compute_type = "float16"  # change to "int8" if low on GPU mem (may reduce accuracy)
+
+# 1. Transcribe with original whisper (batched)
+model = whisperx.load_model("large-v2", device, compute_type=compute_type)
 
 # save model to local path (optional)
 # model_dir = "/path/"
 # model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)
 
-audio = whisperx.load_audio(audio_file)
+audio = whisperx.load_audio(
+    "Bishop Thomas Tharayil speaks about Dr Shashi Tharoor at Lourdes Forane Church Thiruvananthapuram.mp4"
+)  # noqa
 result = model.transcribe(audio, batch_size=batch_size)
-print(result["segments"]) # before alignment
+print(result["segments"])  # before alignment
 
 # delete model if low on GPU resources
 # import gc; gc.collect(); torch.cuda.empty_cache(); del model
 
 # 2. Align whisper output
-model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
+model_a, metadata = whisperx.load_align_model(
+    language_code=result["language"], device=device
+)
+result = whisperx.align(
+    result["segments"], model_a, metadata, audio, device, return_char_alignments=False
+)
 
-print(result["segments"]) # after alignment
+print(result["segments"])  # after alignment
 
 # delete model if low on GPU resources
 # import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
 
 # 3. Assign speaker labels
-#diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
+# diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
 
 # add min/max number of speakers if known
-#diarize_segments = diarize_model(audio)
+# diarize_segments = diarize_model(audio)
 # diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)
 
-#result = whisperx.assign_word_speakers(diarize_segments, result)
-#print(diarize_segments)
-#print(result["segments"]) # segments are now assigned speaker IDs
+# result = whisperx.assign_word_speakers(diarize_segments, result)
+# print(diarize_segments)
+# print(result["segments"])  # segments are now assigned speaker IDs
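
Note: section 3 (speaker diarization) stays commented out in the patched script. A minimal sketch of enabling it, reusing audio, device, and result from the script above; it assumes YOUR_HF_TOKEN is replaced with a valid Hugging Face access token, and the speaker counts below are illustrative, not part of this patch:

# 3. Assign speaker labels (sketch; not part of this patch)
diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
# min/max speaker counts are illustrative; pass them only if known
diarize_segments = diarize_model(audio, min_speakers=1, max_speakers=3)
result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"])  # segments are now assigned speaker IDs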