From 08e397aa6262fc6aa2bd98d993b791a2452f0a17 Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar <titu1994@gmail.com>
Date: Mon, 5 Dec 2022 19:55:50 -0800
Subject: [PATCH 1/7] Fix log calculation

Signed-off-by: smajumdar <titu1994@gmail.com>
---
 nemo/collections/asr/parts/utils/streaming_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py
index 784cf24207a0..16574fde226e 100644
--- a/nemo/collections/asr/parts/utils/streaming_utils.py
+++ b/nemo/collections/asr/parts/utils/streaming_utils.py
@@ -362,7 +362,7 @@ def __init__(self, asr_model, chunk_size, buffer_size):
         '''
 
         self.NORM_CONSTANT = 1e-5
-        if asr_model.cfg.preprocessor.log:
+        if 'log' in asr_model.preprocessor and asr_model.preprocessor.log:
             self.ZERO_LEVEL_SPEC_DB_VAL = -16.635  # Log-Melspectrogram value for zero signal
         else:
             self.ZERO_LEVEL_SPEC_DB_VAL = 0.0

From 8def69740e84ae9f74dba656d8d1ecaddd63fb49 Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar <titu1994@gmail.com>
Date: Mon, 5 Dec 2022 19:57:56 -0800
Subject: [PATCH 2/7] Fix log calculation

Signed-off-by: smajumdar <titu1994@gmail.com>
---
 nemo/collections/asr/parts/utils/streaming_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py
index 16574fde226e..4fa1b0e77da9 100644
--- a/nemo/collections/asr/parts/utils/streaming_utils.py
+++ b/nemo/collections/asr/parts/utils/streaming_utils.py
@@ -576,7 +576,7 @@ def __init__(self, asr_model, frame_len=1.6, batch_size=4, total_buffer=4.0):
           frame_overlap: duration of overlaps before and after current frame, seconds
           offset: number of symbols to drop for smooth streaming
         '''
-        if asr_model.cfg.preprocessor.log:
+        if 'log' in asr_model.preprocessor and asr_model.preprocessor.log:
             self.ZERO_LEVEL_SPEC_DB_VAL = -16.635  # Log-Melspectrogram value for zero signal
         else:
             self.ZERO_LEVEL_SPEC_DB_VAL = 0.0

From af9d5c304e0bef43e9674127791a701a4d3f086d Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar <titu1994@gmail.com>
Date: Mon, 5 Dec 2022 19:59:40 -0800
Subject: [PATCH 3/7] Fix log check

Signed-off-by: smajumdar <titu1994@gmail.com>
---
 nemo/collections/asr/parts/utils/streaming_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py
index 4fa1b0e77da9..9540c8bb06c6 100644
--- a/nemo/collections/asr/parts/utils/streaming_utils.py
+++ b/nemo/collections/asr/parts/utils/streaming_utils.py
@@ -362,7 +362,7 @@ def __init__(self, asr_model, chunk_size, buffer_size):
         '''
 
         self.NORM_CONSTANT = 1e-5
-        if 'log' in asr_model.preprocessor and asr_model.preprocessor.log:
+        if hasattr(asr_model.preprocessor, 'log') and asr_model.preprocessor.log:
             self.ZERO_LEVEL_SPEC_DB_VAL = -16.635  # Log-Melspectrogram value for zero signal
         else:
             self.ZERO_LEVEL_SPEC_DB_VAL = 0.0
@@ -576,7 +576,7 @@ def __init__(self, asr_model, frame_len=1.6, batch_size=4, total_buffer=4.0):
           frame_overlap: duration of overlaps before and after current frame, seconds
           offset: number of symbols to drop for smooth streaming
         '''
-        if 'log' in asr_model.preprocessor and asr_model.preprocessor.log:
+        if hasattr(asr_model.preprocessor, 'log') and asr_model.preprocessor.log:
             self.ZERO_LEVEL_SPEC_DB_VAL = -16.635  # Log-Melspectrogram value for zero signal
         else:
             self.ZERO_LEVEL_SPEC_DB_VAL = 0.0

From 41a284c775831bcac76d468a84afb0cb6a6c84bd Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar <titu1994@gmail.com>
Date: Mon, 5 Dec 2022 20:47:35 -0800
Subject: [PATCH 4/7] Deepcopy the hypothesis to prevent inplace corrections

Signed-off-by: smajumdar <titu1994@gmail.com>
---
 nemo/collections/asr/metrics/rnnt_wer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo/collections/asr/metrics/rnnt_wer.py b/nemo/collections/asr/metrics/rnnt_wer.py
index 27ec9e43b897..476b5a43c663 100644
--- a/nemo/collections/asr/metrics/rnnt_wer.py
+++ b/nemo/collections/asr/metrics/rnnt_wer.py
@@ -445,7 +445,7 @@ def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hyp
                 # keep the original predictions, wrap with the number of repetitions per token and alignments
                 # this is done so that `rnnt_decoder_predictions_tensor()` can process this hypothesis
                 # in order to compute exact time stamps.
-                alignments = hypotheses_list[ind].alignments
+                alignments = copy.deepcopy(hypotheses_list[ind].alignments)
                 token_repetitions = [1] * len(alignments)  # preserve number of repetitions per token
                 hypothesis = (prediction, alignments, token_repetitions)
             else:

From 745b8fb2ebe11c34018d7f32e0e0d38c21c4ba5a Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar <titu1994@gmail.com>
Date: Mon, 5 Dec 2022 21:06:04 -0800
Subject: [PATCH 5/7] Track per-sample timestamps during buffered decoding and add debug printing

Signed-off-by: smajumdar <titu1994@gmail.com>
---
 nemo/collections/asr/parts/utils/streaming_utils.py  | 10 ++++++++++
 nemo/collections/asr/parts/utils/transcribe_utils.py |  8 ++++++++
 2 files changed, 18 insertions(+)

diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py
index 9540c8bb06c6..2852b75fd9f4 100644
--- a/nemo/collections/asr/parts/utils/streaming_utils.py
+++ b/nemo/collections/asr/parts/utils/streaming_utils.py
@@ -963,6 +963,7 @@ def __init__(
 
         self.all_alignments = [[] for _ in range(self.batch_size)]
         self.all_preds = [[] for _ in range(self.batch_size)]
+        self.all_timestamps = [[] for _ in range(self.batch_size)]
         self.previous_hypotheses = None
         self.batch_index_map = {
             idx: idx for idx in range(self.batch_size)
@@ -990,6 +991,7 @@ def reset(self):
 
         self.all_alignments = [[] for _ in range(self.batch_size)]
         self.all_preds = [[] for _ in range(self.batch_size)]
+        self.all_timestamps = [[] for _ in range(self.batch_size)]
         self.previous_hypotheses = None
         self.batch_index_map = {idx: idx for idx in range(self.batch_size)}
 
@@ -1110,6 +1112,14 @@ def _get_batch_preds(self):
             if not has_signal_ended:
                 self.all_preds[global_index_key].append(pred.cpu().numpy())
 
+        timestamps = [hyp.timestep for hyp in best_hyp]
+        for idx, timestep in enumerate(timestamps):
+            global_index_key = new_batch_keys[idx]  # get index of this sample in the global batch
+
+            has_signal_ended = self.frame_bufferer.signal_end[global_index_key]
+            if not has_signal_ended:
+                self.all_timestamps[global_index_key].append(timestep)
+
         if self.stateful_decoding:
             # State resetting is being done on sub-batch only, global index information is not being updated
             reset_states = self.asr_model.decoder.initialize_state(encoded)
diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py
index 9da3388ef036..77ed489979fb 100644
--- a/nemo/collections/asr/parts/utils/transcribe_utils.py
+++ b/nemo/collections/asr/parts/utils/transcribe_utils.py
@@ -86,8 +86,16 @@ def get_buffered_pred_feat_rnnt(
                 audio_files = [sample for sample in batch]
                 asr.read_audio_file(audio_files, delay, model_stride_in_secs)
                 hyp_list = asr.transcribe(tokens_per_chunk, delay)
+
+                if hasattr(asr, 'all_timestamps'):
+                    for idx, timestamps in enumerate(asr.all_timestamps):
+                        for t_idx, timestamp in enumerate(timestamps):
+                            print("IDX", idx, "Timestamp idx", t_idx, timestamp['word'])
+
+
                 hyps.extend(hyp_list)
 
+
                 batch.clear()
                 asr.sample_offset += len(batch)
 

From 726adb68a349d387462fd5a4d6078b799d5e8398 Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar <titu1994@gmail.com>
Date: Mon, 5 Dec 2022 21:10:56 -0800
Subject: [PATCH 6/7] Revert debug timestamp printing in transcribe_utils.py

Signed-off-by: smajumdar <titu1994@gmail.com>
---
 nemo/collections/asr/parts/utils/transcribe_utils.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py
index 77ed489979fb..9da3388ef036 100644
--- a/nemo/collections/asr/parts/utils/transcribe_utils.py
+++ b/nemo/collections/asr/parts/utils/transcribe_utils.py
@@ -86,16 +86,8 @@ def get_buffered_pred_feat_rnnt(
                 audio_files = [sample for sample in batch]
                 asr.read_audio_file(audio_files, delay, model_stride_in_secs)
                 hyp_list = asr.transcribe(tokens_per_chunk, delay)
-
-                if hasattr(asr, 'all_timestamps'):
-                    for idx, timestamps in enumerate(asr.all_timestamps):
-                        for t_idx, timestamp in enumerate(timestamps):
-                            print("IDX", idx, "Timestamp idx", t_idx, timestamp['word'])
-
-
                 hyps.extend(hyp_list)
 
-
                 batch.clear()
                 asr.sample_offset += len(batch)
 

From 7845d8bd28111f985da06d69bb4b227f2d8efeff Mon Sep 17 00:00:00 2001
From: Somshubra Majumdar <titu1994@gmail.com>
Date: Tue, 6 Dec 2022 09:06:41 -0800
Subject: [PATCH 7/7] Add link to HF space to readme

Signed-off-by: smajumdar <titu1994@gmail.com>
---
 README.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.rst b/README.rst
index 43bb89139045..bf089101c198 100644
--- a/README.rst
+++ b/README.rst
@@ -61,6 +61,7 @@ Key Features
 ------------
 
 * Speech processing
+    * `HuggingFace Space for Audio Transcription (File, Microphone and YouTube) <https://huggingface.co/spaces/smajumdar/nemo_multilingual_language_id>`_
     * `Automatic Speech Recognition (ASR) <https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/intro.html>`_
         * Supported models: Jasper, QuartzNet, CitriNet, Conformer-CTC, Conformer-Transducer, Squeezeformer-CTC, Squeezeformer-Transducer, ContextNet, LSTM-Transducer (RNNT), LSTM-CTC, ...
         * Supports CTC and Transducer/RNNT losses/decoders