NVIDIA · tango4j · Jan 8, 2024 · Dec 9, 2023 · Dec 9, 2023 · Dec 11, 2023
diff --git a/docs/source/asr/speaker_diarization/datasets.rst b/docs/source/asr/speaker_diarization/datasets.rst
@@ -205,14 +205,14 @@ The following are descriptions about each field in an input manifest JSON file.
 
 ``ctm_filepath`` (Optional):
 
-  CTM file is used for the evaluation of word-level diarization results and word-timestamp alignment. CTM file follows the following convention: ``<uniq-id> <speaker ID> <word start time> <word end time> <word> <confidence>`` Since confidence is not required for evaluating diarization results, it can have any value. Note that the ``<speaker_id>`` should be exactly matched with speaker IDs in RTTM. 
+  The CTM file is used for the evaluation of word-level diarization results and word-timestamp alignment. Each line of the CTM file follows this convention: ``<session name> <channel ID> <start time> <duration> <word> <confidence> <type of token> <speaker>``. Note that ``<speaker>`` should exactly match the speaker IDs in the RTTM file. Since confidence is not required for evaluating diarization results, we assign ``<confidence>`` the value ``NA``. If the token is a word, we assign ``<type of token>`` the value ``lex``.
 
   Example lines of CTM file:
 
 .. code-block:: bash
   
-   TS3012d.Mix-Headset MTD046ID 12.879 0.32 okay 0
-   TS3012d.Mix-Headset MTD046ID 13.203 0.24 yeah 0
+   TS3012d.Mix-Headset 1 12.879 0.32 okay NA lex MTD046ID
+   TS3012d.Mix-Headset 1 13.203 0.24 yeah NA lex MTD046ID
 
 
 Evaluation on Benchmark Datasets

diff --git a/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py b/examples/speaker_tasks/diarization/clustering_diarizer/offline_diar_with_asr_infer.py
@@ -63,8 +63,6 @@ def main(cfg):
 
     # If RTTM is provided and DER evaluation
     if diar_score is not None:
-        metric, mapping_dict, _ = diar_score
-
         # Get session-level diarization error rate and speaker counting error
         der_results = OfflineDiarWithASR.gather_eval_results(
             diar_score=diar_score,

diff --git a/nemo/collections/asr/parts/utils/data_simulation_utils.py b/nemo/collections/asr/parts/utils/data_simulation_utils.py
@@ -25,7 +25,13 @@
 
 from nemo.collections.asr.parts.preprocessing.perturb import AudioAugmentor
 from nemo.collections.asr.parts.preprocessing.segment import AudioSegment
-from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_ctm, write_manifest, write_text
+from nemo.collections.asr.parts.utils.manifest_utils import (
+    get_ctm_line,
+    read_manifest,
+    write_ctm,
+    write_manifest,
+    write_text,
+)
 from nemo.collections.asr.parts.utils.speaker_utils import labels_to_rttmfile
 from nemo.utils import logging
 
@@ -774,7 +780,16 @@
                 prev_align = 0 if i == 0 else alignments[i - 1]
                 align1 = round(float(prev_align + start), self._params.data_simulator.outputs.output_precision)
                 align2 = round(float(alignments[i] - prev_align), self._params.data_simulator.outputs.output_precision)
-                text = f"{session_name} {speaker_id} {align1} {align2} {word} 0\n"
+                text = get_ctm_line(
+                    source=session_name,
+                    channel=1,
+                    beg_time=align1,
+                    duration=align2,
+                    token=word,
+                    conf=None,  # no confidence available; get_ctm_line writes its NA token
+                    type_of_token='lex',  # keyword must match get_ctm_line's parameter name
+                    speaker=speaker_id,
+                )
                 arr.append((align1, text))
         return arr
 

diff --git a/nemo/collections/asr/parts/utils/manifest_utils.py b/nemo/collections/asr/parts/utils/manifest_utils.py
@@ -33,6 +33,70 @@
 from nemo.utils.data_utils import DataStoreObject
 
 
+def get_ctm_line(
+    source: str,
+    channel: int,
+    beg_time: float,
+    duration: float,
+    token: str,
+    conf: float,
+    type_of_token: str,
+    speaker: str,
+    NA_token: str = 'NA',
+    UNK: str = 'unknown',
+    default_channel: str = '1',
+    output_precision: int = 3,
+) -> str:
+    """
+    Build one line in Conversation Time Mark (CTM) format, as specified in the `Rich Transcription Meeting Eval Plan: RT09`.
+
+    CTM Format:
+        <SOURCE><SP><CHANNEL><SP><BEG-TIME><SP><DURATION><SP><TOKEN><SP><CONF><SP><TYPE><SP><SPEAKER><NEWLINE>
+
+    Reference:
+        https://web.archive.org/web/20170119114252/http://www.itl.nist.gov/iad/mig/tests/rt/2009/docs/rt09-meeting-eval-plan-v2.pdf
+
+    Args:
+        source (str): <SOURCE> is the name of the source file, session name or utterance ID.
+        channel (int): <CHANNEL> is the channel number; None selects `default_channel`.
+        beg_time (float): <BEG-TIME> is the begin time of the token; converted to float and rounded.
+        duration (float): <DURATION> is the duration of the token; converted to float and rounded.
+        token (str): <TOKEN> is the token or word of the current entry.
+        conf (float): <CONF> is a float between 0 (no confidence) and 1 (certainty), or None when not computed.
+        type_of_token (str): <TYPE> is "lex", "frag", "fp", "un-lex", "for-lex", "non-lex", "misc" or "noscore", or None.
+        speaker (str): <SPEAKER> is a string identifier for the speaker who uttered the token, or None.
+        NA_token (str, optional): Written in place of a `conf` or `speaker` that is None. Defaults to 'NA'.
+        UNK (str, optional): Written in place of a `type_of_token` that is None. Defaults to 'unknown'.
+        default_channel (str, optional): Written in place of a `channel` that is None. Defaults to '1'.
+        output_precision (int, optional): Number of decimals kept for `beg_time` and `duration`. Defaults to 3.
+
+    Returns:
+        str: A newline-terminated line in CTM format filled with the given information.
+
+    Raises:
+        ValueError: If `beg_time` or `duration` cannot be parsed as a float, `conf` is not a float within [0, 1],
+            `type_of_token` is not a legal CTM token type, or `speaker` is not a string.
+    """
+    VALID_TOKEN_TYPES = ["lex", "frag", "fp", "un-lex", "for-lex", "non-lex", "misc", "noscore"]
+    beg_time = round(float(beg_time), output_precision)  # round float inputs too, not only converted ones
+    duration = round(float(duration), output_precision)
+    if conf is not None and not isinstance(conf, float):
+        raise ValueError(f"`conf` must be a float, but got {type(conf)}")
+    if conf is not None and not (0 <= conf <= 1):
+        raise ValueError(f"`conf` must be between 0 and 1, but got {conf}")
+    if type_of_token is not None and not isinstance(type_of_token, str):
+        raise ValueError(f"`type` must be a string, but got {type(type_of_token)}")
+    if type_of_token is not None and type_of_token not in VALID_TOKEN_TYPES:
+        raise ValueError(f"`type` must be one of {VALID_TOKEN_TYPES}, but got {type_of_token}")
+    if speaker is not None and not isinstance(speaker, str):
+        raise ValueError(f"`speaker` must be a string, but got {type(speaker)}")
+    channel = default_channel if channel is None else channel
+    conf = NA_token if conf is None else conf
+    speaker = NA_token if speaker is None else speaker
+    type_of_token = UNK if type_of_token is None else type_of_token
+    return f"{source} {channel} {beg_time} {duration} {token} {conf} {type_of_token} {speaker}\n"
+
+
 def rreplace(s: str, old: str, new: str) -> str:
     """
     Replace end of string.

diff --git a/scripts/speaker_tasks/create_alignment_manifest.py b/scripts/speaker_tasks/create_alignment_manifest.py
@@ -16,12 +16,41 @@
 import os
 import shutil
 from pathlib import Path
+from typing import List
 
-from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_ctm, write_manifest
+from nemo.collections.asr.parts.utils.manifest_utils import get_ctm_line, read_manifest, write_ctm, write_manifest
 from nemo.utils import logging
 
 
-def get_unaligned_files(unaligned_path):
+def get_seg_info_from_ctm_line(
+    ctm_list: List[str],
+    output_precision: int,
+    speaker_index: int = 7,
+    beg_time_index: int = 2,
+    duration_index: int = 3,
+):
+    """
+    Get time stamp information and the speaker label from the fields of one CTM line.
+    The default indices follow the CTM format of the `Rich Transcription Meeting Eval Plan: RT09` document.
+
+    Args:
+        ctm_list (list): List containing CTM items. e.g.: ['sw02001-A', '1', '0.000', '0.200', 'hello', '0.98', 'lex', 'speaker3']
+        output_precision (int): Number of decimals kept for the returned start and end times.
+        speaker_index (int): Index of the speaker field in `ctm_list`. Defaults to 7.
+        beg_time_index (int): Index of the begin-time field in `ctm_list`. Defaults to 2.
+        duration_index (int): Index of the duration field in `ctm_list`. Defaults to 3.
+
+    Returns:
+        start (float): Start time of the segment.
+        end (float): End time of the segment (begin time plus duration).
+        speaker_id (str): Speaker ID of the segment, stripped because the last field of a split line keeps its line break.
+    """
+    start = float(ctm_list[beg_time_index])
+    end = start + float(ctm_list[duration_index])
+    return round(start, output_precision), round(end, output_precision), ctm_list[speaker_index].strip()
+
+
+def get_unaligned_files(unaligned_path: str) -> List[str]:
     """
     Get files without alignments in order to filter them out (as they cannot be used for data simulation).
     In the unaligned file, each line contains the file name and the reason for the unalignment, if necessary to specify.
@@ -71,7 +100,17 @@ def create_new_ctm_entry(session_name, speaker_id, wordlist, alignments, output_
             # note that using the current alignments the first word is always empty, so there is no error from indexing the array with i-1
             align1 = float(round(alignments[i - 1], output_precision))
             align2 = float(round(alignments[i] - alignments[i - 1], output_precision,))
-            text = f"{session_name} {speaker_id} {align1} {align2} {word} 0\n"
+            text = get_ctm_line(
+                source=session_name,
+                channel=speaker_id,
+                beg_time=align1,
+                duration=align2,
+                token=word,
+                conf=None,  # an int such as 0 is rejected by get_ctm_line; None is written as NA
+                type_of_token='lex',
+                speaker=speaker_id,
+                output_precision=output_precision,
+            )
             arr.append((align1, text))
     return arr
 
@@ -206,11 +245,7 @@ def create_manifest_with_alignments(
         prev_end = 0
         for i in range(len(lines)):
             ctm = lines[i].split(' ')
-            speaker_id = ctm[1]
-            start = float(ctm[2])
-            end = float(ctm[2]) + float(ctm[3])
-            start = round(start, output_precision)
-            end = round(end, output_precision)
+            start, end, speaker_id = get_seg_info_from_ctm_line(ctm_list=ctm, output_precision=output_precision)
             interval = start - prev_end
 
             if (i == 0 and interval > 0) or (i > 0 and interval > silence_dur_threshold):
@@ -231,13 +266,16 @@ def create_manifest_with_alignments(
             end_times.append(f['duration'])
 
         # build target manifest entry
-        target_manifest.append({})
-        target_manifest[tgt_i]['audio_filepath'] = f['audio_filepath']
-        target_manifest[tgt_i]['duration'] = f['duration']
-        target_manifest[tgt_i]['text'] = f['text']
-        target_manifest[tgt_i]['words'] = words
-        target_manifest[tgt_i]['alignments'] = end_times
-        target_manifest[tgt_i]['speaker_id'] = speaker_id
+        target_manifest.append(
+            {
+                'audio_filepath': f['audio_filepath'],
+                'duration': f['duration'],
+                'text': f['text'],
+                'words': words,
+                'alignments': end_times,
+                'speaker_id': speaker_id,
+            }
+        )
 
         src_i += 1
         tgt_i += 1

diff --git a/tests/collections/asr/utils/test_data_simul_utils.py b/tests/collections/asr/utils/test_data_simul_utils.py
@@ -29,6 +29,7 @@
     normalize_audio,
     read_noise_manifest,
 )
+from nemo.collections.asr.parts.utils.manifest_utils import get_ctm_line
 
 
 @pytest.fixture()
@@ -129,6 +130,131 @@
     return words, alignments, speaker_id
 
 
+class TestGetCtmLine:
+    """Unit tests for the CTM line formatter `get_ctm_line`."""
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize("conf", [0, 1])
+    def test_wrong_type_conf_values(self, conf):
+        # Integer confidence values are rejected even when they lie in [0, 1]:
+        # `conf` must be a float (or None)
+        with pytest.raises(ValueError):
+            get_ctm_line(
+                source="test_source",
+                channel=1,
+                beg_time=0.123,
+                duration=0.456,
+                token="word",
+                conf=conf,
+                type_of_token="lex",
+                speaker="speaker1",
+            )
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize("conf", [0.0, 0.5, 1.0, 0.001, 0.999])
+    def test_valid_conf_values(self, conf):
+        # Test with valid confidence values
+        result = get_ctm_line(
+            source="test_source",
+            channel=1,
+            beg_time=0.123,
+            duration=0.456,
+            token="word",
+            conf=conf,
+            type_of_token="lex",
+            speaker="speaker1",
+        )
+        expected = f"test_source 1 0.123 0.456 word {conf} lex speaker1\n"
+        assert result == expected, f"Failed on valid conf value {conf}"
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize("conf", [-0.1, 1.1, 2, -1, 100, -100])
+    def test_invalid_conf_ranges(self, conf):
+        # Test with invalid confidence values
+        with pytest.raises(ValueError):
+            get_ctm_line(
+                source="test_source",
+                channel=1,
+                beg_time=0.123,
+                duration=0.456,
+                token="word",
+                conf=conf,
+                type_of_token="lex",
+                speaker="speaker1",
+            )
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize("beg_time, duration", [(0.123, 0.456), (1, 2.0), (2.0, 3)])
+    def test_valid_input(self, beg_time, duration):
+        # Test with completely valid inputs; integer times are accepted and written as floats
+        result = get_ctm_line(
+            source="test_source",
+            channel=1,
+            beg_time=beg_time,
+            duration=duration,
+            token="word",
+            conf=0.789,
+            type_of_token="lex",
+            speaker="speaker1",
+        )
+        expected = f"test_source 1 {float(beg_time)} {float(duration)} word 0.789 lex speaker1\n"
+        assert result == expected, "Failed on valid input"
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize(
+        "beg_time, duration",
+        [
+            ("not a float", 1.0),
+            (1.0, "not a float"),
+        ],
+    )
+    def test_invalid_types_for_time_duration(self, beg_time, duration):
+        # Test with values for beg_time and duration that cannot be converted to float
+        with pytest.raises(ValueError):
+            get_ctm_line(
+                source="test_source",
+                channel=1,
+                beg_time=beg_time,
+                duration=duration,
+                token="word",
+                conf=0.5,
+                type_of_token="lex",
+                speaker="speaker1",
+            )
+
+    @pytest.mark.unit
+    @pytest.mark.parametrize("conf", [-0.1, 1.1, "not a float"])
+    def test_invalid_conf_values(self, conf):
+        # Test with invalid values for conf
+        with pytest.raises(ValueError):
+            get_ctm_line(
+                source="test_source",
+                channel=1,
+                beg_time=0.123,
+                duration=0.456,
+                token="word",
+                conf=conf,
+                type_of_token="lex",
+                speaker="speaker1",
+            )
+
+    @pytest.mark.unit
+    def test_default_values(self):
+        # Test with None for every parameter that has a placeholder value
+        result = get_ctm_line(
+            source="test_source",
+            channel=None,
+            beg_time=0.123,
+            duration=0.456,
+            token="word",
+            conf=None,
+            type_of_token=None,
+            speaker=None,
+        )
+        expected = "test_source 1 0.123 0.456 word NA unknown NA\n"
+        assert result == expected, "Failed on default values"
+
+
 class TestDataSimulatorUtils:
     # TODO: add tests for all util functions
     @pytest.mark.parametrize("max_audio_read_sec", [2.5, 3.5, 4.5])
@@ -253,11 +379,11 @@
         )
         assert ctm_list[0] == (
             alignments[1],
-            f"{session_name} {speaker_id} {alignments[1]} {alignments[1]-alignments[0]} {words[1]} 0\n",
+            f"{session_name} 1 {alignments[1]} {alignments[1]-alignments[0]} {words[1]} NA lex {speaker_id}\n",
         )
         assert ctm_list[1] == (
             alignments[2],
-            f"{session_name} {speaker_id} {alignments[2]} {alignments[2]-alignments[1]} {words[2]} 0\n",
+            f"{session_name} 1 {alignments[2]} {alignments[2]-alignments[1]} {words[2]} NA lex {speaker_id}\n",
         )