From 7a2d05943c7f022d545e0bde6166592f69908a5e Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Wed, 2 Nov 2022 11:14:57 -0400
Subject: [PATCH 1/4] remove zero duration segments for indexing

---
 lhotse/cut/base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
index 8a9b0d093..49d027315 100644
--- a/lhotse/cut/base.py
+++ b/lhotse/cut/base.py
@@ -456,6 +456,8 @@ def trim_to_supervisions(
         from .set import CutSet
 
         cuts = []
+        # Remove zero-duration supervisions from the cut, since these cannot be indexed.
+        self.supervisions = [s for s in self.supervisions if s.duration > 0]
         supervisions_index = self.index_supervisions(index_mixed_tracks=True)
         for segment in self.supervisions:
             if min_duration is None:
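
Note (a standalone sketch, not part of the patch): the "cannot be indexed"
comment refers to the supervision index that `index_supervisions` builds with
the `intervaltree` package, which rejects empty ("null") intervals. A minimal
illustration of that failure mode:

    from intervaltree import Interval, IntervalTree

    tree = IntervalTree()
    tree.add(Interval(0.0, 2.5))  # a normal supervision span indexes fine
    try:
        tree.add(Interval(1.0, 1.0))  # a zero-duration span is a "null" interval
    except ValueError as exc:
        print(f"Rejected by the index: {exc}")
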
From 0c537220e114b5856c1a0662106974f903503ee7 Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Wed, 9 Nov 2022 10:57:39 -0500
Subject: [PATCH 2/4] add diarization workflow with speechbrain

---
 lhotse/bin/modes/workflows.py | 49 +++++++++++++++++++++++++++++++++++
 lhotse/cut/base.py            |  6 ++---
 lhotse/recipes/ami.py         |  6 ++---
 lhotse/workflows/__init__.py  |  1 +
 lhotse/workflows/whisper.py   |  5 ++--
 5 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/lhotse/bin/modes/workflows.py b/lhotse/bin/modes/workflows.py
index 614a3c3d6..bb78a79e1 100644
--- a/lhotse/bin/modes/workflows.py
+++ b/lhotse/bin/modes/workflows.py
@@ -164,3 +164,52 @@ def align_with_torchaudio(
         desc="Aligning",
     ):
         writer.write(cut, flush=True)
+
+
+@workflows.command()
+@click.argument(
+    "in_cuts", type=click.Path(exists=True, dir_okay=False, allow_dash=True)
+)
+@click.argument("out_cuts", type=click.Path(allow_dash=True))
+@click.option(
+    "-d", "--device", default="cpu", help="Device on which to run the inference."
+)
+@click.option(
+    "--num-speakers",
+    type=int,
+    default=None,
+    help="Number of clusters to use for speaker diarization. Will use threshold if not provided.",
+)
+@click.option(
+    "--threshold",
+    type=float,
+    default=None,
+    help="Threshold for speaker diarization. Will use num-speakers if not provided.",
+)
+def diarize_segments_with_speechbrain(
+    in_cuts: str,
+    out_cuts: str,
+    device: str = "cpu",
+    num_speakers: Optional[int] = None,
+    threshold: Optional[float] = None,
+):
+    """
+    This workflow uses SpeechBrain's pretrained speaker embedding model to compute speaker
+    embeddings for each cut in the CutSet. The cuts belonging to the same recording are then
+    clustered using agglomerative hierarchical clustering, and the resulting cluster indices
+    are used to create new cuts with speaker labels.
+
+    Please refer to https://huggingface.co/speechbrain/spkrec-xvect-voxceleb for more details
+    about the speaker embedding extractor.
+    """
+    from lhotse.workflows import diarize_segments_with_speechbrain
+
+    assert exactly_one_not_null(
+        num_speakers, threshold
+    ), "Exactly one of --num-speakers and --threshold must be provided."
+
+    cuts = load_manifest_lazy_or_eager(in_cuts)
+    cuts_with_spk_id = diarize_segments_with_speechbrain(
+        cuts, device=device, num_speakers=num_speakers, threshold=threshold
+    )
+    cuts_with_spk_id.to_file(out_cuts)

diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
index c2001a958..49e38b935 100644
--- a/lhotse/cut/base.py
+++ b/lhotse/cut/base.py
@@ -491,9 +491,9 @@ def trim_to_supervisions(
 
             assert (
                 len(set(to_hashable(s.channel) for s in trimmed.supervisions)) == 1
             ), (
-                "Trimmed cut has supervisions with different channels. Either set "
-                "`ignore_channel=True` to keep original channels or `keep_overlapping=False` "
-                "to retain only 1 supervision per trimmed cut."
+                f"Trimmed cut has supervisions with different channels. Either set "
+                f"`ignore_channel=True` to keep original channels or `keep_overlapping=False` "
+                f"to retain only 1 supervision per trimmed cut. Offending cut: {trimmed}"
             )
             trimmed.channel = trimmed.supervisions[0].channel

diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py
index 9c24a1cce..a50ebab84 100644
--- a/lhotse/recipes/ami.py
+++ b/lhotse/recipes/ami.py
@@ -644,6 +644,9 @@ def prepare_ami(
                 lambda x: x.recording_id in dataset_parts[part]
             )
 
+            audio_part, supervision_part = fix_manifests(audio_part, supervision_part)
+            validate_recordings_and_supervisions(audio_part, supervision_part)
+
             # Write to output directory if a path is provided
             if output_dir is not None:
                 audio_part.to_file(output_dir / f"ami-{mic}_recordings_{part}.jsonl.gz")
@@ -651,9 +654,6 @@ def prepare_ami(
                 supervision_part.to_file(
                     output_dir / f"ami-{mic}_supervisions_{part}.jsonl.gz"
                 )
 
-            audio_part, supervision_part = fix_manifests(audio_part, supervision_part)
-            validate_recordings_and_supervisions(audio_part, supervision_part)
-
             # Combine all manifests into one dictionary
             manifests[part] = {"recordings": audio_part, "supervisions": supervision_part}

diff --git a/lhotse/workflows/__init__.py b/lhotse/workflows/__init__.py
index 3e2ee0f22..540e21b83 100644
--- a/lhotse/workflows/__init__.py
+++ b/lhotse/workflows/__init__.py
@@ -1,2 +1,3 @@
+from .diarization import diarize_segments_with_speechbrain
 from .forced_alignment import align_with_torchaudio
 from .whisper import annotate_with_whisper

diff --git a/lhotse/workflows/whisper.py b/lhotse/workflows/whisper.py
index fee0e883a..77e1b2015 100644
--- a/lhotse/workflows/whisper.py
+++ b/lhotse/workflows/whisper.py
@@ -2,6 +2,7 @@
 from typing import Any, Generator, List, Optional, Union
 
 import torch
+from tqdm import tqdm
 
 from lhotse import (
     CutSet,
@@ -61,7 +62,7 @@ def _annotate_recordings(
 
     model = whisper.load_model(model_name, device=device)
 
-    for recording in recordings:
+    for recording in tqdm(recordings):
         if recording.num_channels > 1:
             logging.warning(
                 f"Skipping recording '{recording.id}'. It has {recording.num_channels} channels, "
@@ -102,7 +103,7 @@ def _annotate_cuts(cuts: CutSet, language: str, model_name: str, device: str):
 
     model = whisper.load_model(model_name, device=device)
 
-    for cut in cuts:
+    for cut in tqdm(cuts):
         if cut.num_channels > 1:
             logging.warning(
                 f"Skipping cut '{cut.id}'. It has {cut.num_channels} channels, "
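
Note (a sketch, not part of the patch): the workflow added here is also usable
directly from Python; the manifest paths below are hypothetical:

    from lhotse import CutSet
    from lhotse.workflows import diarize_segments_with_speechbrain

    cuts = CutSet.from_file("data/cuts.jsonl.gz")
    # Cluster into a known number of speakers per recording; alternatively,
    # pass threshold=... (and omit num_speakers) to infer the speaker count.
    cuts_with_spk = diarize_segments_with_speechbrain(
        cuts, device="cuda", num_speakers=4
    )
    cuts_with_spk.to_file("data/cuts_diarized.jsonl.gz")

On the CLI side, Click derives the command name from the function name, so it
should become available as `lhotse workflows diarize-segments-with-speechbrain`.
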
From 33783d3b2e76c4a290cb453a87b9e37dfc324416 Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Mon, 17 Apr 2023 14:29:37 -0400
Subject: [PATCH 3/4] add missing file

---
 lhotse/workflows/diarization.py | 117 ++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 lhotse/workflows/diarization.py

diff --git a/lhotse/workflows/diarization.py b/lhotse/workflows/diarization.py
new file mode 100644
index 000000000..c149efa20
--- /dev/null
+++ b/lhotse/workflows/diarization.py
@@ -0,0 +1,117 @@
+import logging
+import shutil
+import tempfile
+from typing import Optional
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from lhotse import CutSet, Recording
+from lhotse.utils import fastcopy, is_module_available
+
+logging.basicConfig(
+    format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
+    datefmt="%Y-%m-%d:%H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+def diarize_segments_with_speechbrain(
+    cuts: CutSet,
+    device: str = "cpu",
+    num_speakers: Optional[int] = None,
+    threshold: Optional[float] = 0.5,
+) -> CutSet:
+    """
+    This workflow uses SpeechBrain's pretrained speaker embedding model to compute speaker
+    embeddings for each cut in the CutSet. The cuts belonging to the same recording are then
+    clustered using agglomerative hierarchical clustering, and the resulting cluster indices
+    are used to create new cuts with speaker labels.
+
+    Please refer to https://huggingface.co/speechbrain/spkrec-xvect-voxceleb for more details
+    about the speaker embedding extractor.
+
+    :param cuts: a ``CutSet`` object.
+    :param device: Where to run the inference (cpu, cuda, etc.).
+    :param num_speakers: Number of speakers to cluster the cuts into. If not specified, we
+        will use the threshold parameter to determine the number of speakers.
+    :param threshold: The distance threshold for agglomerative clustering.
+    :return: a new ``CutSet`` with speaker labels.
+    """
+    assert is_module_available("speechbrain"), (
+        "This function expects SpeechBrain to be installed. "
+        "You can install it via 'pip install speechbrain'."
+    )
+    assert is_module_available("sklearn"), (
+        "This function expects scikit-learn to be installed. "
+        "You can install it via 'pip install scikit-learn'."
+    )
+
+    from sklearn.cluster import AgglomerativeClustering
+    from speechbrain.pretrained import EncoderClassifier
+
+    # The two stopping criteria are mutually exclusive in scikit-learn, so the
+    # threshold is only used when num_speakers is not given.
+    threshold = None if num_speakers is not None else threshold
+    dirpath = tempfile.mkdtemp()
+
+    recordings, _, _ = cuts.decompose(dirpath, verbose=True)
+    recordings = recordings.to_eager()
+    recording_ids = frozenset(recordings.ids)
+
+    logging.info("Saving cut recordings temporarily to disk...")
+    cuts_ = []
+    for cut in tqdm(cuts):
+        save_path = f"{dirpath}/{cut.recording_id}.wav"
+        _ = cut.save_audio(save_path)
+        cuts_.append(fastcopy(cut, recording=Recording.from_file(save_path)))
+
+    cuts_ = CutSet.from_cuts(cuts_).trim_to_supervisions(keep_overlapping=False)
+
+    # Load the pretrained speaker embedding model
+    model = EncoderClassifier.from_hparams(
+        source="speechbrain/spkrec-xvect-voxceleb",
+        savedir="pretrained_models/spkrec-xvect-voxceleb",
+        run_opts={"device": device},
+    )
+
+    out_cuts = []
+
+    for recording_id in tqdm(recording_ids, total=len(recording_ids)):
+        logging.info(f"Processing recording {recording_id}...")
+        embeddings = []
+        reco_cuts = cuts_.filter(lambda c: c.recording_id == recording_id)
+        num_cuts = len(frozenset(reco_cuts.ids))
+        if num_cuts == 0:
+            continue
+        for cut in tqdm(reco_cuts, total=num_cuts):
+            audio = torch.from_numpy(cut.load_audio())
+            embedding = model.encode_batch(audio).cpu().numpy()
+            embeddings.append(embedding.squeeze())
+
+        embeddings = np.vstack(embeddings)
+        clusterer = AgglomerativeClustering(
+            n_clusters=num_speakers,
+            affinity="euclidean",
+            linkage="ward",
+            distance_threshold=threshold,
+        )
+        clusterer.fit(embeddings)
+
+        # Assign the cluster indices to the cuts
+        for cut, cluster_idx in zip(reco_cuts, clusterer.labels_):
+            sup = fastcopy(cut.supervisions[0], speaker=f"spk{cluster_idx}")
+            out_cuts.append(
+                fastcopy(
+                    cut,
+                    recording=recordings[cut.recording_id],
+                    supervisions=[sup],
+                )
+            )
+
+    # Remove the temporary directory
+    shutil.rmtree(dirpath)
+
+    return CutSet.from_cuts(out_cuts)
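
Note (a sketch, not part of the patch): the two clustering modes above are
mutually exclusive in scikit-learn, which is why the function nulls out
`threshold` when `num_speakers` is given. Illustrated below with random
stand-ins for the 512-dimensional x-vectors:

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    embeddings = np.random.randn(10, 512)  # 10 segments from one recording

    # Known speaker count: cut the dendrogram into exactly n clusters.
    labels = AgglomerativeClustering(n_clusters=4, linkage="ward").fit_predict(embeddings)

    # Unknown speaker count: merge until the linkage distance exceeds the
    # threshold (n_clusters must be None in this mode). The value here is
    # arbitrary; real x-vector distances need a tuned threshold.
    labels = AgglomerativeClustering(
        n_clusters=None, distance_threshold=30.0, linkage="ward"
    ).fit_predict(embeddings)
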
" + "You can install it via 'pip install scikit-learn' " + ) + + from sklearn.cluster import AgglomerativeClustering + from speechbrain.pretrained import EncoderClassifier + + threshold = None if num_speakers is not None else threshold + dirpath = tempfile.mkdtemp() + + recordings, _, _ = cuts.decompose(dirpath, verbose=True) + recordings = recordings.to_eager() + recording_ids = frozenset(recordings.ids) + + logging.info("Saving cut recordings temporarily to disk...") + cuts_ = [] + for cut in tqdm(cuts): + save_path = f"{dirpath}/{cut.recording_id}.wav" + _ = cut.save_audio(save_path) + cuts_.append(fastcopy(cut, recording=Recording.from_file(save_path))) + + cuts_ = CutSet.from_cuts(cuts_).trim_to_supervisions(keep_overlapping=False) + + # Load the pretrained model + model = EncoderClassifier.from_hparams( + source="speechbrain/spkrec-xvect-voxceleb", + savedir="pretrained_models/spkrec-xvect-voxceleb", + run_opts={"device": device}, + ) + + out_cuts = [] + + for recording_id in tqdm(recording_ids, total=len(recording_ids)): + logging.info(f"Processing recording {recording_id}...") + embeddings = [] + reco_cuts = cuts_.filter(lambda c: c.recording_id == recording_id) + num_cuts = len(frozenset(reco_cuts.ids)) + if num_cuts == 0: + continue + for cut in tqdm(reco_cuts, total=num_cuts): + audio = torch.from_numpy(cut.load_audio()) + embedding = model.encode_batch(audio).cpu().numpy() + embeddings.append(embedding.squeeze()) + + embeddings = np.vstack(embeddings) + clusterer = AgglomerativeClustering( + n_clusters=num_speakers, + affinity="euclidean", + linkage="ward", + distance_threshold=threshold, + ) + clusterer.fit(embeddings) + + # Assign the cluster indices to the cuts + for cut, cluster_idx in zip(reco_cuts, clusterer.labels_): + sup = fastcopy(cut.supervisions[0], speaker=f"spk{cluster_idx}") + out_cuts.append( + fastcopy( + cut, + recording=recordings[cut.recording_id], + supervisions=[sup], + ) + ) + + # Remove the temporary directory + shutil.rmtree(dirpath) + + return CutSet.from_cuts(out_cuts) From 6c9ce1aa6dfcd7a6581a44581d8cc0570fac860c Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 17 Apr 2023 14:37:55 -0400 Subject: [PATCH 4/4] remove unwanted change --- lhotse/cut/base.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py index 738e0aaf7..4df14a017 100644 --- a/lhotse/cut/base.py +++ b/lhotse/cut/base.py @@ -457,8 +457,6 @@ def trim_to_supervisions( from .set import CutSet cuts = [] - # Remove zero-duration supervisions from the cut, since these cannot be indexed. - self.supervisions = [s for s in self.supervisions if s.duration > 0] supervisions_index = self.index_supervisions(index_mixed_tracks=True) for segment in self.supervisions: if min_duration is None: @@ -491,9 +489,9 @@ def trim_to_supervisions( assert ( len(set(to_hashable(s.channel) for s in trimmed.supervisions)) == 1 ), ( - f"Trimmed cut has supervisions with different channels. Either set " - f"`ignore_channel=True` to keep original channels or `keep_overlapping=False` " - f"to retain only 1 supervision per trimmed cut. Offending cut: {trimmed}" + "Trimmed cut has supervisions with different channels. Either set " + "`ignore_channel=True` to keep original channels or `keep_overlapping=False` " + "to retain only 1 supervision per trimmed cut." ) trimmed.channel = trimmed.supervisions[0].channel