TitaNet Batch Verify Speaker (#9337)
* add batch_inference for verify_speakers method

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>

* remove unused package

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>

* change batch inference logic

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>

* fixup

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>

* requested changes

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>

* add verify_speakers_batch to docs

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>

* handle None durations in manifest

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>

* change logging text

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: monica-sekoyan <monica-sekoyan@users.noreply.github.com>

* check duration presence

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>

* add channel_selector to dataset configs

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>

---------

Signed-off-by: msekoyan@nvidia.com <msekoyan@nvidia.com>
Signed-off-by: monica-sekoyan <monica-sekoyan@users.noreply.github.com>
Co-authored-by: monica-sekoyan <monica-sekoyan@users.noreply.github.com>
Co-authored-by: Nithin Rao <nithinrao.koluguri@gmail.com>
Signed-off-by: Tugrul Konuk <ertkonuk@gmail.com>
3 people authored and ertkonuk committed Jul 19, 2024
1 parent 60204db commit 06949f8
Showing 9 changed files with 181 additions and 56 deletions.
docs/source/asr/speaker_recognition/api.rst (2 changes: 1 addition & 1 deletion)
@@ -6,6 +6,6 @@ Model Classes
 -------------
 .. autoclass:: nemo.collections.asr.models.label_models.EncDecSpeakerLabelModel
     :show-inheritance:
-    :members: setup_finetune_model, get_embedding, verify_speakers
+    :members: setup_finetune_model, get_embedding, verify_speakers, verify_speakers_batch
 
 
docs/source/asr/speaker_recognition/results.rst (8 changes: 7 additions & 1 deletion)
@@ -91,14 +91,20 @@ Speaker Verification Inference
 
 Speaker verification is the task of deciding whether two utterances come from the same speaker.
 
-We provide a helper function to verify the audio files and return True if two provided audio files are from the same speaker, False otherwise.
+We provide helper functions to verify audio files (individually or in a batch), returning True if a given pair of audio files is from the same speaker and False otherwise.
 
 The audio files should be 16 kHz, mono-channel WAV files.
 
 .. code-block:: python
 
     speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large")
     decision = speaker_model.verify_speakers('path/to/one/audio_file', 'path/to/other/audio_file')
+    decisions = speaker_model.verify_speakers_batch([
+        ('/path/to/audio_0_0', '/path/to/audio_0_1'),
+        ('/path/to/audio_1_0', '/path/to/audio_1_1'),
+        ('/path/to/audio_2_0', '/path/to/audio_2_1'),
+        ('/path/to/audio_3_0', '/path/to/audio_3_1'),
+    ], batch_size=4, device='cuda')
 
 NGC Pretrained Checkpoints
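For reference, a minimal sketch of consuming the batch results. It assumes ``verify_speakers_batch`` returns one boolean decision per input pair, in input order (not itself confirmed by this diff); the audio paths are placeholders.

.. code-block:: python

    from nemo.collections.asr.models import EncDecSpeakerLabelModel

    speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name="titanet_large")

    pairs = [
        ('/path/to/audio_0_0', '/path/to/audio_0_1'),
        ('/path/to/audio_1_0', '/path/to/audio_1_1'),
    ]
    # Assumption: one boolean per pair, in the order the pairs were given.
    decisions = speaker_model.verify_speakers_batch(pairs, batch_size=2, device='cuda')
    for (enroll, test), same in zip(pairs, decisions):
        print(f"{enroll} vs {test}: {'same' if same else 'different'} speaker")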
nemo/collections/asr/data/audio_to_label.py (81 changes: 56 additions & 25 deletions)
@@ -118,12 +118,12 @@ def _speech_collate_fn(batch, pad_id):
 
 def _fixed_seq_collate_fn(self, batch):
     """collate batch of audio sig, audio len, tokens, tokens len
-        Args:
-            batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
-                LongTensor): A tuple of tuples of signal, signal lengths,
-                encoded tokens, and encoded tokens length. This collate func
-                assumes the signals are 1d torch tensors (i.e. mono audio).
-        """
+    Args:
+        batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
+            LongTensor): A tuple of tuples of signal, signal lengths,
+            encoded tokens, and encoded tokens length. This collate func
+            assumes the signals are 1d torch tensors (i.e. mono audio).
+    """
     _, audio_lengths, _, tokens_lengths = zip(*batch)
 
     has_audio = audio_lengths[0] is not None
@@ -232,19 +232,23 @@ class _AudioLabelDataset(Dataset):
             Defaults to None.
         trim (bool): Whether to trim silence from the beginning and end of the audio signal using librosa.effects.trim().
             Defaults to False.
+        channel_selector (Union[str, int, List[int]]): string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
+            of integers denoting a subset of channels. The channel selector uses zero-based indexing.
+            If set to `None`, the original signal will be used.
     """
 
     @property
     def output_types(self) -> Optional[Dict[str, NeuralType]]:
-        """Returns definitions of module output ports.
-        """
+        """Returns definitions of module output ports."""
+
         output_types = {
             'audio_signal': NeuralType(
                 ('B', 'T'),
-                AudioSignal(freq=self._sample_rate)
-                if self is not None and hasattr(self, '_sample_rate')
-                else AudioSignal(),
+                (
+                    AudioSignal(freq=self._sample_rate)
+                    if self is not None and hasattr(self, '_sample_rate')
+                    else AudioSignal()
+                ),
             ),
             'a_sig_length': NeuralType(tuple('B'), LengthsType()),
         }
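To make the new ``channel_selector`` docstring concrete, an illustrative NumPy sketch (hypothetical, not part of this commit) of what each selector form does to a multi-channel signal:

.. code-block:: python

    import numpy as np

    # 1 s of stereo audio at 16 kHz, shape (time, channels)
    signal = np.random.randn(16000, 2)

    mixed = signal.mean(axis=1)   # channel_selector='average': downmix all channels
    first = signal[:, 0]          # channel_selector=0: keep one channel (zero-based)
    subset = signal[:, [0, 1]]    # channel_selector=[0, 1]: keep a subset of channels
    # channel_selector=None: the original signal is used unchanged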
@@ -259,7 +263,10 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]:
         else:
 
             output_types.update(
-                {'label': NeuralType(tuple('B'), LabelsType()), 'label_length': NeuralType(tuple('B'), LengthsType()),}
+                {
+                    'label': NeuralType(tuple('B'), LabelsType()),
+                    'label_length': NeuralType(tuple('B'), LengthsType()),
+                }
             )
 
         return output_types
@@ -273,6 +280,7 @@ def __init__(
         min_duration: Optional[float] = 0.1,
         max_duration: Optional[float] = None,
         trim: bool = False,
+        channel_selector: Union[str, int, List[int]] = None,
         is_regression_task: bool = False,
         cal_labels_occurrence: Optional[bool] = False,
     ):
@@ -290,6 +298,7 @@ def __init__(
 
         self.featurizer = featurizer
         self.trim = trim
+        self.channel_selector = channel_selector
         self.is_regression_task = is_regression_task
 
         if not is_regression_task:
@@ -325,7 +334,13 @@ def __getitem__(self, index):
         if offset is None:
             offset = 0
 
-        features = self.featurizer.process(sample.audio_file, offset=offset, duration=sample.duration, trim=self.trim)
+        features = self.featurizer.process(
+            sample.audio_file,
+            offset=offset,
+            duration=sample.duration,
+            trim=self.trim,
+            channel_selector=self.channel_selector,
+        )
         f, fl = features, torch.tensor(features.shape[0]).long()
 
         if not self.is_regression_task:
@@ -392,6 +407,9 @@ class AudioToSpeechLabelDataset(_AudioLabelDataset):
         trim (bool): Whether to trim silence from the beginning and end
             of the audio signal using librosa.effects.trim().
             Defaults to False.
+        channel_selector (Union[str, int, List[int]]): string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
+            of integers denoting a subset of channels. The channel selector uses zero-based indexing.
+            If set to `None`, the original signal will be used.
         window_length_in_sec (float): length of window/slice (in seconds)
             Use this for speaker recognition and VAD tasks.
         shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch
@@ -413,6 +431,7 @@ def __init__(
         min_duration: Optional[float] = 0.1,
         max_duration: Optional[float] = None,
         trim: bool = False,
+        channel_selector: Optional[Union[str, int, List[int]]] = None,
         window_length_in_sec: Optional[float] = 8,
         shift_length_in_sec: Optional[float] = 1,
         normalize_audio: bool = False,
@@ -433,6 +452,7 @@ def __init__(
             min_duration=min_duration,
             max_duration=max_duration,
             trim=trim,
+            channel_selector=channel_selector,
             is_regression_task=is_regression_task,
             cal_labels_occurrence=cal_labels_occurrence,
         )
@@ -631,8 +651,7 @@ def _internal_generator(self):
         return TarredAudioFilter(self.collection, self.file_occurence)
 
     def _build_sample(self, tup):
-        """Builds the training sample by combining the data from the WebDataset with the manifest info.
-        """
+        """Builds the training sample by combining the data from the WebDataset with the manifest info."""
         audio_bytes, audio_filename = tup
         # Grab manifest entry from self.collection
         file_id, _ = os.path.splitext(os.path.basename(audio_filename))
@@ -647,7 +666,10 @@ def _build_sample(self, tup):
         # Convert audio bytes to IO stream for processing (for SoundFile to read)
         audio_filestream = io.BytesIO(audio_bytes)
         features = self.featurizer.process(
-            audio_filestream, offset=offset, duration=manifest_entry.duration, trim=self.trim,
+            audio_filestream,
+            offset=offset,
+            duration=manifest_entry.duration,
+            trim=self.trim,
         )
 
         audio_filestream.close()
@@ -879,9 +901,12 @@ class AudioToMultiLabelDataset(Dataset):
             All training files which have a duration more than max_duration
             are dropped. Note: Duration is read from the manifest JSON.
             Defaults to None.
-        trim (bool): Whether to trim silence from the beginning and end
+        trim_silence (bool): Whether to trim silence from the beginning and end
             of the audio signal using librosa.effects.trim().
             Defaults to False.
+        channel_selector (Union[str, int, List[int]]): string denoting the downmix mode, an integer denoting the channel to be selected, or an iterable
+            of integers denoting a subset of channels. The channel selector uses zero-based indexing.
+            If set to `None`, the original signal will be used.
         window_length_in_sec (float): length of window/slice (in seconds)
             Use this for speaker recognition and VAD tasks.
         shift_length_in_sec (float): amount of shift of window for generating the frame for VAD task in a batch
@@ -898,15 +923,16 @@ class AudioToMultiLabelDataset(Dataset):
 
     @property
     def output_types(self) -> Optional[Dict[str, NeuralType]]:
-        """Returns definitions of module output ports.
-        """
+        """Returns definitions of module output ports."""
+
         output_types = {
             'audio_signal': NeuralType(
                 ('B', 'T'),
-                AudioSignal(freq=self._sample_rate)
-                if self is not None and hasattr(self, '_sample_rate')
-                else AudioSignal(),
+                (
+                    AudioSignal(freq=self._sample_rate)
+                    if self is not None and hasattr(self, '_sample_rate')
+                    else AudioSignal()
+                ),
             ),
             'a_sig_length': NeuralType(tuple('B'), LengthsType()),
         }
@@ -920,7 +946,10 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]:
             )
         else:
             output_types.update(
-                {'label': NeuralType(('B', 'T'), LabelsType()), 'label_length': NeuralType(tuple('B'), LengthsType()),}
+                {
+                    'label': NeuralType(('B', 'T'), LabelsType()),
+                    'label_length': NeuralType(tuple('B'), LengthsType()),
+                }
             )
 
         return output_types
@@ -936,6 +965,7 @@ def __init__(
         min_duration: Optional[float] = 0.1,
         max_duration: Optional[float] = None,
         trim_silence: bool = False,
+        channel_selector: Optional[Union[str, int, List[int]]] = None,
         is_regression_task: bool = False,
         cal_labels_occurrence: Optional[bool] = False,
         delimiter: Optional[str] = None,
@@ -959,6 +989,7 @@ def __init__(
 
         self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor)
         self.trim = trim_silence
+        self.channel_selector = channel_selector
         self.is_regression_task = is_regression_task
         self.id2occurrence = {}
         self.labels_occurrence = None
@@ -1016,6 +1047,7 @@ def __getitem__(self, index):
             offset=offset,
             duration=sample.duration,
             trim=self.trim,
+            channel_selector=self.channel_selector,
             normalize_db=self.normalize_audio_db,
         )
 
@@ -1245,8 +1277,7 @@ def _internal_generator(self):
         return TarredAudioFilter(self.collection, self.file_occurence)
 
     def _build_sample(self, tup):
-        """Builds the training sample by combining the data from the WebDataset with the manifest info.
-        """
+        """Builds the training sample by combining the data from the WebDataset with the manifest info."""
         audio_bytes, audio_filename = tup
         # Grab manifest entry from self.collection
         file_id, _ = os.path.splitext(os.path.basename(audio_filename))
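Putting the new plumbing together, a minimal sketch of building a dataset that downmixes multi-channel audio; the manifest path and label set are hypothetical, and the keyword arguments follow the signatures shown in this diff.

.. code-block:: python

    from nemo.collections.asr.data.audio_to_label import AudioToSpeechLabelDataset
    from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer

    featurizer = WaveformFeaturizer(sample_rate=16000)

    dataset = AudioToSpeechLabelDataset(
        manifest_filepath='/path/to/manifest.json',  # hypothetical path
        labels=['spk_a', 'spk_b'],                   # hypothetical label set
        featurizer=featurizer,
        channel_selector='average',  # downmix each file to mono before featurization
    )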
nemo/collections/asr/models/clustering_diarizer.py (7 changes: 0 additions & 7 deletions)
@@ -392,13 +392,6 @@ def _extract_embeddings(self, manifest_file: str, scale_idx: int, num_scales: int):
         pkl.dump(self.embeddings, open(self._embeddings_file, 'wb'))
         logging.info("Saved embedding files to {}".format(embedding_dir))
 
-    def path2audio_files_to_manifest(self, paths2audio_files, manifest_filepath):
-        with open(manifest_filepath, 'w', encoding='utf-8') as fp:
-            for audio_file in paths2audio_files:
-                audio_file = audio_file.strip()
-                entry = {'audio_filepath': audio_file, 'offset': 0.0, 'duration': None, 'text': '-', 'label': 'infer'}
-                fp.write(json.dumps(entry) + '\n')
-
     def diarize(self, paths2audio_files: List[str] = None, batch_size: int = 0):
         """
         Diarize files provided through paths2audio_files or manifest file
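The method removed above wrote a one-entry-per-file inference manifest; this excerpt does not show where that functionality now lives. For reference, a standalone sketch equivalent to the removed body (the function name here is hypothetical):

.. code-block:: python

    import json

    def write_infer_manifest(paths2audio_files, manifest_filepath):
        # Mirrors the removed method: one JSON line per audio file, label fixed to 'infer'.
        with open(manifest_filepath, 'w', encoding='utf-8') as fp:
            for audio_file in paths2audio_files:
                entry = {
                    'audio_filepath': audio_file.strip(),
                    'offset': 0.0,
                    'duration': None,  # duration left unset, to be derived downstream
                    'text': '-',
                    'label': 'infer',
                }
                fp.write(json.dumps(entry) + '\n')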
nemo/collections/asr/models/configs/classification_models_config.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 
 from omegaconf import MISSING
 
@@ -46,6 +46,7 @@ class EncDecClassificationDatasetConfig(nemo.core.classes.dataset.DatasetConfig):
     max_duration: Optional[float] = None
     min_duration: Optional[float] = None
     cal_labels_occurrence: Optional[bool] = False
+    channel_selector: Optional[Union[str, int, List[int]]] = None
 
     # VAD Optional
     vad_stream: Optional[bool] = None
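A short sketch of setting the new config field; the manifest value is a placeholder, and the import path follows the config module named above.

.. code-block:: python

    from nemo.collections.asr.models.configs.classification_models_config import (
        EncDecClassificationDatasetConfig,
    )

    # Keep only the first channel (zero-based) of each training file.
    train_ds = EncDecClassificationDatasetConfig(
        manifest_filepath='/path/to/train_manifest.json',  # hypothetical path
        channel_selector=0,
    )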