From 08e397aa6262fc6aa2bd98d993b791a2452f0a17 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Mon, 5 Dec 2022 19:55:50 -0800 Subject: [PATCH 1/7] Fix log calculation Signed-off-by: smajumdar --- nemo/collections/asr/parts/utils/streaming_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index 784cf24207a0..16574fde226e 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -362,7 +362,7 @@ def __init__(self, asr_model, chunk_size, buffer_size): ''' self.NORM_CONSTANT = 1e-5 - if asr_model.cfg.preprocessor.log: + if 'log' in asr_model.preprocessor and asr_model.preprocessor.log: self.ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal else: self.ZERO_LEVEL_SPEC_DB_VAL = 0.0 From 8def69740e84ae9f74dba656d8d1ecaddd63fb49 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Mon, 5 Dec 2022 19:57:56 -0800 Subject: [PATCH 2/7] Fix log calculation Signed-off-by: smajumdar --- nemo/collections/asr/parts/utils/streaming_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index 16574fde226e..4fa1b0e77da9 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -576,7 +576,7 @@ def __init__(self, asr_model, frame_len=1.6, batch_size=4, total_buffer=4.0): frame_overlap: duration of overlaps before and after current frame, seconds offset: number of symbols to drop for smooth streaming ''' - if asr_model.cfg.preprocessor.log: + if 'log' in asr_model.preprocessor and asr_model.preprocessor.log: self.ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal else: self.ZERO_LEVEL_SPEC_DB_VAL = 0.0 From 
af9d5c304e0bef43e9674127791a701a4d3f086d Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Mon, 5 Dec 2022 19:59:40 -0800 Subject: [PATCH 3/7] Fix log check Signed-off-by: smajumdar --- nemo/collections/asr/parts/utils/streaming_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index 4fa1b0e77da9..9540c8bb06c6 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -362,7 +362,7 @@ def __init__(self, asr_model, chunk_size, buffer_size): ''' self.NORM_CONSTANT = 1e-5 - if 'log' in asr_model.preprocessor and asr_model.preprocessor.log: + if hasattr(asr_model.preprocessor, 'log') and asr_model.preprocessor.log: self.ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal else: self.ZERO_LEVEL_SPEC_DB_VAL = 0.0 @@ -576,7 +576,7 @@ def __init__(self, asr_model, frame_len=1.6, batch_size=4, total_buffer=4.0): frame_overlap: duration of overlaps before and after current frame, seconds offset: number of symbols to drop for smooth streaming ''' - if 'log' in asr_model.preprocessor and asr_model.preprocessor.log: + if hasattr(asr_model.preprocessor, 'log') and asr_model.preprocessor.log: self.ZERO_LEVEL_SPEC_DB_VAL = -16.635 # Log-Melspectrogram value for zero signal else: self.ZERO_LEVEL_SPEC_DB_VAL = 0.0 From 41a284c775831bcac76d468a84afb0cb6a6c84bd Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Mon, 5 Dec 2022 20:47:35 -0800 Subject: [PATCH 4/7] Deepcopy the hypothesis to prevent inplace corrections Signed-off-by: smajumdar --- nemo/collections/asr/metrics/rnnt_wer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/asr/metrics/rnnt_wer.py b/nemo/collections/asr/metrics/rnnt_wer.py index 27ec9e43b897..476b5a43c663 100644 --- a/nemo/collections/asr/metrics/rnnt_wer.py +++ 
b/nemo/collections/asr/metrics/rnnt_wer.py @@ -445,7 +445,7 @@ def decode_hypothesis(self, hypotheses_list: List[Hypothesis]) -> List[Union[Hyp # keep the original predictions, wrap with the number of repetitions per token and alignments # this is done so that `rnnt_decoder_predictions_tensor()` can process this hypothesis # in order to compute exact time stamps. - alignments = hypotheses_list[ind].alignments + alignments = copy.deepcopy(hypotheses_list[ind].alignments) token_repetitions = [1] * len(alignments) # preserve number of repetitions per token hypothesis = (prediction, alignments, token_repetitions) else: From 745b8fb2ebe11c34018d7f32e0e0d38c21c4ba5a Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Mon, 5 Dec 2022 21:06:04 -0800 Subject: [PATCH 5/7] Deepcopy the hypothesis to prevent inplace corrections Signed-off-by: smajumdar --- nemo/collections/asr/parts/utils/streaming_utils.py | 10 ++++++++++ nemo/collections/asr/parts/utils/transcribe_utils.py | 8 ++++++++ 2 files changed, 18 insertions(+) diff --git a/nemo/collections/asr/parts/utils/streaming_utils.py b/nemo/collections/asr/parts/utils/streaming_utils.py index 9540c8bb06c6..2852b75fd9f4 100644 --- a/nemo/collections/asr/parts/utils/streaming_utils.py +++ b/nemo/collections/asr/parts/utils/streaming_utils.py @@ -963,6 +963,7 @@ def __init__( self.all_alignments = [[] for _ in range(self.batch_size)] self.all_preds = [[] for _ in range(self.batch_size)] + self.all_timestamps = [[] for _ in range(self.batch_size)] self.previous_hypotheses = None self.batch_index_map = { idx: idx for idx in range(self.batch_size) @@ -990,6 +991,7 @@ def reset(self): self.all_alignments = [[] for _ in range(self.batch_size)] self.all_preds = [[] for _ in range(self.batch_size)] + self.all_timestamps = [[] for _ in range(self.batch_size)] self.previous_hypotheses = None self.batch_index_map = {idx: idx for idx in range(self.batch_size)} @@ -1110,6 +1112,14 @@ def _get_batch_preds(self): if not has_signal_ended: 
self.all_preds[global_index_key].append(pred.cpu().numpy()) + timestamps = [hyp.timestep for hyp in best_hyp] + for idx, timestep in enumerate(timestamps): + global_index_key = new_batch_keys[idx] # get index of this sample in the global batch + + has_signal_ended = self.frame_bufferer.signal_end[global_index_key] + if not has_signal_ended: + self.all_timestamps[global_index_key].append(timestep) + if self.stateful_decoding: # State resetting is being done on sub-batch only, global index information is not being updated reset_states = self.asr_model.decoder.initialize_state(encoded) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 9da3388ef036..77ed489979fb 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -86,8 +86,16 @@ def get_buffered_pred_feat_rnnt( audio_files = [sample for sample in batch] asr.read_audio_file(audio_files, delay, model_stride_in_secs) hyp_list = asr.transcribe(tokens_per_chunk, delay) + + if hasattr(asr, 'all_timestamps'): + for idx, timestamps in enumerate(asr.all_timestamps): + for t_idx, timestamp in enumerate(timestamps): + print("IDX", idx, "Timestamp idx", t_idx, timestamp['word']) + + hyps.extend(hyp_list) + batch.clear() asr.sample_offset += len(batch) From 726adb68a349d387462fd5a4d6078b799d5e8398 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Mon, 5 Dec 2022 21:10:56 -0800 Subject: [PATCH 6/7] Revert changes Signed-off-by: smajumdar --- nemo/collections/asr/parts/utils/transcribe_utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/nemo/collections/asr/parts/utils/transcribe_utils.py b/nemo/collections/asr/parts/utils/transcribe_utils.py index 77ed489979fb..9da3388ef036 100644 --- a/nemo/collections/asr/parts/utils/transcribe_utils.py +++ b/nemo/collections/asr/parts/utils/transcribe_utils.py @@ -86,16 +86,8 @@ def get_buffered_pred_feat_rnnt( audio_files = 
[sample for sample in batch] asr.read_audio_file(audio_files, delay, model_stride_in_secs) hyp_list = asr.transcribe(tokens_per_chunk, delay) - - if hasattr(asr, 'all_timestamps'): - for idx, timestamps in enumerate(asr.all_timestamps): - for t_idx, timestamp in enumerate(timestamps): - print("IDX", idx, "Timestamp idx", t_idx, timestamp['word']) - - hyps.extend(hyp_list) - batch.clear() asr.sample_offset += len(batch) From 7845d8bd28111f985da06d69bb4b227f2d8efeff Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Tue, 6 Dec 2022 09:06:41 -0800 Subject: [PATCH 7/7] Add link to HF space to readme Signed-off-by: smajumdar --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 43bb89139045..bf089101c198 100644 --- a/README.rst +++ b/README.rst @@ -61,6 +61,7 @@ Key Features ------------ * Speech processing + * `HuggingFace Space for Audio Transcription (File, Microphone and YouTube) `_ * `Automatic Speech Recognition (ASR) `_ * Supported models: Jasper, QuartzNet, CitriNet, Conformer-CTC, Conformer-Transducer, Squeezeformer-CTC, Squeezeformer-Transducer, ContextNet, LSTM-Transducer (RNNT), LSTM-CTC, ... * Supports CTC and Transducer/RNNT losses/decoders