From 7a2d05943c7f022d545e0bde6166592f69908a5e Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Wed, 2 Nov 2022 11:14:57 -0400
Subject: [PATCH 1/4] remove zero duration segments for indexing

---
 lhotse/cut/base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
index 8a9b0d093..49d027315 100644
--- a/lhotse/cut/base.py
+++ b/lhotse/cut/base.py
@@ -456,6 +456,8 @@ def trim_to_supervisions(
         from .set import CutSet
 
         cuts = []
+        # Remove zero-duration supervisions from the cut, since these cannot be indexed.
+        self.supervisions = [s for s in self.supervisions if s.duration > 0]
         supervisions_index = self.index_supervisions(index_mixed_tracks=True)
         for segment in self.supervisions:
             if min_duration is None:
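
Note (a standalone sketch, not part of the patch): the "cannot be indexed"
comment refers to the supervision index that `index_supervisions` builds with
the `intervaltree` package, which rejects empty ("null") intervals. A minimal
illustration of that failure mode:

    from intervaltree import Interval, IntervalTree

    tree = IntervalTree()
    tree.add(Interval(0.0, 2.5))  # a normal supervision span indexes fine
    try:
        tree.add(Interval(1.0, 1.0))  # a zero-duration span is a "null" interval
    except ValueError as exc:
        print(f"Rejected by the index: {exc}")
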
From 0c537220e114b5856c1a0662106974f903503ee7 Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Wed, 9 Nov 2022 10:57:39 -0500
Subject: [PATCH 2/4] add diarization workflow with speechbrain

---
 lhotse/bin/modes/workflows.py | 49 +++++++++++++++++++++++++++++++++++
 lhotse/cut/base.py            |  6 ++---
 lhotse/recipes/ami.py         |  6 ++---
 lhotse/workflows/__init__.py  |  1 +
 lhotse/workflows/whisper.py   |  5 ++--
 5 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/lhotse/bin/modes/workflows.py b/lhotse/bin/modes/workflows.py
index 614a3c3d6..bb78a79e1 100644
--- a/lhotse/bin/modes/workflows.py
+++ b/lhotse/bin/modes/workflows.py
@@ -164,3 +164,52 @@ def align_with_torchaudio(
         desc="Aligning",
     ):
         writer.write(cut, flush=True)
+
+
+@workflows.command()
+@click.argument(
+    "in_cuts", type=click.Path(exists=True, dir_okay=False, allow_dash=True)
+)
+@click.argument("out_cuts", type=click.Path(allow_dash=True))
+@click.option(
+    "-d", "--device", default="cpu", help="Device on which to run the inference."
+)
+@click.option(
+    "--num-speakers",
+    type=int,
+    default=None,
+    help="Number of clusters to use for speaker diarization. Will use threshold if not provided.",
+)
+@click.option(
+    "--threshold",
+    type=float,
+    default=None,
+    help="Threshold for speaker diarization. Will use num-speakers if not provided.",
+)
+def diarize_segments_with_speechbrain(
+    in_cuts: str,
+    out_cuts: str,
+    device: str = "cpu",
+    num_speakers: Optional[int] = None,
+    threshold: Optional[float] = None,
+):
+    """
+    This workflow uses SpeechBrain's pretrained speaker embedding model to compute speaker
+    embeddings for each cut in the CutSet. The cuts belonging to the same recording are then
+    clustered using agglomerative hierarchical clustering, and the resulting cluster indices
+    are used to create new cuts with speaker labels.
+
+    Please refer to https://huggingface.co/speechbrain/spkrec-xvect-voxceleb for more details
+    about the speaker embedding extractor.
+    """
+    from lhotse.workflows import diarize_segments_with_speechbrain
+
+    assert exactly_one_not_null(
+        num_speakers, threshold
+    ), "Exactly one of --num-speakers and --threshold must be provided."
+
+    cuts = load_manifest_lazy_or_eager(in_cuts)
+    cuts_with_spk_id = diarize_segments_with_speechbrain(
+        cuts, device=device, num_speakers=num_speakers, threshold=threshold
+    )
+    cuts_with_spk_id.to_file(out_cuts)

diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py
index c2001a958..49e38b935 100644
--- a/lhotse/cut/base.py
+++ b/lhotse/cut/base.py
@@ -491,9 +491,9 @@ def trim_to_supervisions(
 
             assert (
                 len(set(to_hashable(s.channel) for s in trimmed.supervisions)) == 1
             ), (
-                "Trimmed cut has supervisions with different channels. Either set "
-                "`ignore_channel=True` to keep original channels or `keep_overlapping=False` "
-                "to retain only 1 supervision per trimmed cut."
+                f"Trimmed cut has supervisions with different channels. Either set "
+                f"`ignore_channel=True` to keep original channels or `keep_overlapping=False` "
+                f"to retain only 1 supervision per trimmed cut. Offending cut: {trimmed}"
             )
             trimmed.channel = trimmed.supervisions[0].channel

diff --git a/lhotse/recipes/ami.py b/lhotse/recipes/ami.py
index 9c24a1cce..a50ebab84 100644
--- a/lhotse/recipes/ami.py
+++ b/lhotse/recipes/ami.py
@@ -644,6 +644,9 @@ def prepare_ami(
                 lambda x: x.recording_id in dataset_parts[part]
             )
 
+            audio_part, supervision_part = fix_manifests(audio_part, supervision_part)
+            validate_recordings_and_supervisions(audio_part, supervision_part)
+
             # Write to output directory if a path is provided
             if output_dir is not None:
                 audio_part.to_file(output_dir / f"ami-{mic}_recordings_{part}.jsonl.gz")
@@ -651,9 +654,6 @@ def prepare_ami(
                 supervision_part.to_file(
                     output_dir / f"ami-{mic}_supervisions_{part}.jsonl.gz"
                 )
 
-            audio_part, supervision_part = fix_manifests(audio_part, supervision_part)
-            validate_recordings_and_supervisions(audio_part, supervision_part)
-
             # Combine all manifests into one dictionary
             manifests[part] = {"recordings": audio_part, "supervisions": supervision_part}

diff --git a/lhotse/workflows/__init__.py b/lhotse/workflows/__init__.py
index 3e2ee0f22..540e21b83 100644
--- a/lhotse/workflows/__init__.py
+++ b/lhotse/workflows/__init__.py
@@ -1,2 +1,3 @@
+from .diarization import diarize_segments_with_speechbrain
 from .forced_alignment import align_with_torchaudio
 from .whisper import annotate_with_whisper

diff --git a/lhotse/workflows/whisper.py b/lhotse/workflows/whisper.py
index fee0e883a..77e1b2015 100644
--- a/lhotse/workflows/whisper.py
+++ b/lhotse/workflows/whisper.py
@@ -2,6 +2,7 @@
 from typing import Any, Generator, List, Optional, Union
 
 import torch
+from tqdm import tqdm
 
 from lhotse import (
     CutSet,
@@ -61,7 +62,7 @@ def _annotate_recordings(
 
     model = whisper.load_model(model_name, device=device)
 
-    for recording in recordings:
+    for recording in tqdm(recordings):
         if recording.num_channels > 1:
             logging.warning(
                 f"Skipping recording '{recording.id}'. It has {recording.num_channels} channels, "
@@ -102,7 +103,7 @@ def _annotate_cuts(cuts: CutSet, language: str, model_name: str, device: str):
 
     model = whisper.load_model(model_name, device=device)
 
-    for cut in cuts:
+    for cut in tqdm(cuts):
         if cut.num_channels > 1:
             logging.warning(
                 f"Skipping cut '{cut.id}'. It has {cut.num_channels} channels, "
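
Note (a sketch, not part of the patch): the workflow added here is also usable
directly from Python; the manifest paths below are hypothetical:

    from lhotse import CutSet
    from lhotse.workflows import diarize_segments_with_speechbrain

    cuts = CutSet.from_file("data/cuts.jsonl.gz")
    # Cluster into a known number of speakers per recording; alternatively,
    # pass threshold=... (and omit num_speakers) to infer the speaker count.
    cuts_with_spk = diarize_segments_with_speechbrain(
        cuts, device="cuda", num_speakers=4
    )
    cuts_with_spk.to_file("data/cuts_diarized.jsonl.gz")

On the CLI side, Click derives the command name from the function name, so it
should become available as `lhotse workflows diarize-segments-with-speechbrain`.
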
From 33783d3b2e76c4a290cb453a87b9e37dfc324416 Mon Sep 17 00:00:00 2001
From: Desh Raj
Date: Mon, 17 Apr 2023 14:29:37 -0400
Subject: [PATCH 3/4] add missing file

---
 lhotse/workflows/diarization.py | 117 ++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 lhotse/workflows/diarization.py

diff --git a/lhotse/workflows/diarization.py b/lhotse/workflows/diarization.py
new file mode 100644
index 000000000..c149efa20
--- /dev/null
+++ b/lhotse/workflows/diarization.py
@@ -0,0 +1,117 @@
+import logging
+import shutil
+import tempfile
+from typing import Optional
+
+import numpy as np
+import torch
+from tqdm import tqdm
+
+from lhotse import CutSet, Recording
+from lhotse.utils import fastcopy, is_module_available
+
+logging.basicConfig(
+    format="%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
+    datefmt="%Y-%m-%d:%H:%M:%S",
+    level=logging.INFO,
+)
+logger = logging.getLogger(__name__)
+
+
+def diarize_segments_with_speechbrain(
+    cuts: CutSet,
+    device: str = "cpu",
+    num_speakers: Optional[int] = None,
+    threshold: Optional[float] = 0.5,
+) -> CutSet:
+    """
+    This workflow uses SpeechBrain's pretrained speaker embedding model to compute speaker
+    embeddings for each cut in the CutSet. The cuts belonging to the same recording are then
+    clustered using agglomerative hierarchical clustering, and the resulting cluster indices
+    are used to create new cuts with speaker labels.
+
+    Please refer to https://huggingface.co/speechbrain/spkrec-xvect-voxceleb for more details
+    about the speaker embedding extractor.
+
+    :param cuts: a ``CutSet`` object.
+    :param device: Where to run the inference (cpu, cuda, etc.).
+    :param num_speakers: Number of speakers to cluster the cuts into. If not specified, we
+        will use the threshold parameter to determine the number of speakers.
+    :param threshold: The distance threshold for agglomerative clustering.
+    :return: a new ``CutSet`` with speaker labels.
+    """
+    assert is_module_available("speechbrain"), (
+        "This function expects SpeechBrain to be installed. "
+        "You can install it via 'pip install speechbrain'."
+    )
+    assert is_module_available("sklearn"), (
+        "This function expects scikit-learn to be installed. "
+        "You can install it via 'pip install scikit-learn'."
+    )
+
+    from sklearn.cluster import AgglomerativeClustering
+    from speechbrain.pretrained import EncoderClassifier
+
+    # The two stopping criteria are mutually exclusive in scikit-learn, so the
+    # threshold is only used when num_speakers is not given.
+    threshold = None if num_speakers is not None else threshold
+    dirpath = tempfile.mkdtemp()
+
+    recordings, _, _ = cuts.decompose(dirpath, verbose=True)
+    recordings = recordings.to_eager()
+    recording_ids = frozenset(recordings.ids)
+
+    logging.info("Saving cut recordings temporarily to disk...")
+    cuts_ = []
+    for cut in tqdm(cuts):
+        save_path = f"{dirpath}/{cut.recording_id}.wav"
+        _ = cut.save_audio(save_path)
+        cuts_.append(fastcopy(cut, recording=Recording.from_file(save_path)))
+
+    cuts_ = CutSet.from_cuts(cuts_).trim_to_supervisions(keep_overlapping=False)
+
+    # Load the pretrained speaker embedding model
+    model = EncoderClassifier.from_hparams(
+        source="speechbrain/spkrec-xvect-voxceleb",
+        savedir="pretrained_models/spkrec-xvect-voxceleb",
+        run_opts={"device": device},
+    )
+
+    out_cuts = []
+
+    for recording_id in tqdm(recording_ids, total=len(recording_ids)):
+        logging.info(f"Processing recording {recording_id}...")
+        embeddings = []
+        reco_cuts = cuts_.filter(lambda c: c.recording_id == recording_id)
+        num_cuts = len(frozenset(reco_cuts.ids))
+        if num_cuts == 0:
+            continue
+        for cut in tqdm(reco_cuts, total=num_cuts):
+            audio = torch.from_numpy(cut.load_audio())
+            embedding = model.encode_batch(audio).cpu().numpy()
+            embeddings.append(embedding.squeeze())
+
+        embeddings = np.vstack(embeddings)
+        clusterer = AgglomerativeClustering(
+            n_clusters=num_speakers,
+            affinity="euclidean",
+            linkage="ward",
+            distance_threshold=threshold,
+        )
+        clusterer.fit(embeddings)
+
+        # Assign the cluster indices to the cuts
+        for cut, cluster_idx in zip(reco_cuts, clusterer.labels_):
+            sup = fastcopy(cut.supervisions[0], speaker=f"spk{cluster_idx}")
+            out_cuts.append(
+                fastcopy(
+                    cut,
+                    recording=recordings[cut.recording_id],
+                    supervisions=[sup],
+                )
+            )
+
+    # Remove the temporary directory
+    shutil.rmtree(dirpath)
+
+    return CutSet.from_cuts(out_cuts)
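
Note (a sketch, not part of the patch): the two clustering modes above are
mutually exclusive in scikit-learn, which is why the function nulls out
`threshold` when `num_speakers` is given. Illustrated below with random
stand-ins for the 512-dimensional x-vectors:

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering

    embeddings = np.random.randn(10, 512)  # 10 segments from one recording

    # Known speaker count: cut the dendrogram into exactly n clusters.
    labels = AgglomerativeClustering(n_clusters=4, linkage="ward").fit_predict(embeddings)

    # Unknown speaker count: merge until the linkage distance exceeds the
    # threshold (n_clusters must be None in this mode). The value here is
    # arbitrary; real x-vector distances need a tuned threshold.
    labels = AgglomerativeClustering(
        n_clusters=None, distance_threshold=30.0, linkage="ward"
    ).fit_predict(embeddings)
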
" + "You can install it via 'pip install scikit-learn' " + ) + + from sklearn.cluster import AgglomerativeClustering + from speechbrain.pretrained import EncoderClassifier + + threshold = None if num_speakers is not None else threshold + dirpath = tempfile.mkdtemp() + + recordings, _, _ = cuts.decompose(dirpath, verbose=True) + recordings = recordings.to_eager() + recording_ids = frozenset(recordings.ids) + + logging.info("Saving cut recordings temporarily to disk...") + cuts_ = [] + for cut in tqdm(cuts): + save_path = f"{dirpath}/{cut.recording_id}.wav" + _ = cut.save_audio(save_path) + cuts_.append(fastcopy(cut, recording=Recording.from_file(save_path))) + + cuts_ = CutSet.from_cuts(cuts_).trim_to_supervisions(keep_overlapping=False) + + # Load the pretrained model + model = EncoderClassifier.from_hparams( + source="speechbrain/spkrec-xvect-voxceleb", + savedir="pretrained_models/spkrec-xvect-voxceleb", + run_opts={"device": device}, + ) + + out_cuts = [] + + for recording_id in tqdm(recording_ids, total=len(recording_ids)): + logging.info(f"Processing recording {recording_id}...") + embeddings = [] + reco_cuts = cuts_.filter(lambda c: c.recording_id == recording_id) + num_cuts = len(frozenset(reco_cuts.ids)) + if num_cuts == 0: + continue + for cut in tqdm(reco_cuts, total=num_cuts): + audio = torch.from_numpy(cut.load_audio()) + embedding = model.encode_batch(audio).cpu().numpy() + embeddings.append(embedding.squeeze()) + + embeddings = np.vstack(embeddings) + clusterer = AgglomerativeClustering( + n_clusters=num_speakers, + affinity="euclidean", + linkage="ward", + distance_threshold=threshold, + ) + clusterer.fit(embeddings) + + # Assign the cluster indices to the cuts + for cut, cluster_idx in zip(reco_cuts, clusterer.labels_): + sup = fastcopy(cut.supervisions[0], speaker=f"spk{cluster_idx}") + out_cuts.append( + fastcopy( + cut, + recording=recordings[cut.recording_id], + supervisions=[sup], + ) + ) + + # Remove the temporary directory + shutil.rmtree(dirpath) + + return CutSet.from_cuts(out_cuts) From 6c9ce1aa6dfcd7a6581a44581d8cc0570fac860c Mon Sep 17 00:00:00 2001 From: Desh Raj Date: Mon, 17 Apr 2023 14:37:55 -0400 Subject: [PATCH 4/4] remove unwanted change --- lhotse/cut/base.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/lhotse/cut/base.py b/lhotse/cut/base.py index 738e0aaf7..4df14a017 100644 --- a/lhotse/cut/base.py +++ b/lhotse/cut/base.py @@ -457,8 +457,6 @@ def trim_to_supervisions( from .set import CutSet cuts = [] - # Remove zero-duration supervisions from the cut, since these cannot be indexed. - self.supervisions = [s for s in self.supervisions if s.duration > 0] supervisions_index = self.index_supervisions(index_mixed_tracks=True) for segment in self.supervisions: if min_duration is None: @@ -491,9 +489,9 @@ def trim_to_supervisions( assert ( len(set(to_hashable(s.channel) for s in trimmed.supervisions)) == 1 ), ( - f"Trimmed cut has supervisions with different channels. Either set " - f"`ignore_channel=True` to keep original channels or `keep_overlapping=False` " - f"to retain only 1 supervision per trimmed cut. Offending cut: {trimmed}" + "Trimmed cut has supervisions with different channels. Either set " + "`ignore_channel=True` to keep original channels or `keep_overlapping=False` " + "to retain only 1 supervision per trimmed cut." ) trimmed.channel = trimmed.supervisions[0].channel