Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fixes for parallel mp3 to wav conversion, PC notebook, update Readme for TN requirements #5047

Merged
merged 7 commits into from
Sep 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,7 @@ pipeline {
steps {
  // Install ctc_segmentation requirements and the sox format plugins
  // (libsox-fmt-all) needed for mp3/flac/ogg support via pysox.
  sh 'cd tools/ctc_segmentation && \
  pip install -r requirements.txt && \
  apt-get update && apt-get install libsox-fmt-all -y'
}
}

Expand Down
8 changes: 8 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,14 @@ Megatron GPT training requires NVIDIA Apex to be installed.
git checkout nm_v1.11.0
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./

NeMo Text Processing
~~~~~~~~~~~~~~~~~~~~
NeMo Text Processing, specifically (Inverse) Text Normalization, requires `Pynini <https://pypi.org/project/pynini/>`_ to be installed.

.. code-block:: bash

bash NeMo/nemo_text_processing/install_pynini.sh

Docker containers:
~~~~~~~~~~~~~~~~~~
To build a nemo container with Dockerfile from a branch, please run
Expand Down
4 changes: 2 additions & 2 deletions nemo_text_processing/text_normalization/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def normalize_list(

try:
normalized_texts = Parallel(n_jobs=n_jobs)(
delayed(self.__process_batch)(texts[i : i + batch], verbose, punct_pre_process, punct_post_process)
delayed(self.process_batch)(texts[i : i + batch], verbose, punct_pre_process, punct_post_process)
for i in range(0, len(texts), batch)
)
except BaseException as e:
Expand All @@ -165,7 +165,7 @@ def normalize_list(
normalized_texts = list(itertools.chain(*normalized_texts))
return normalized_texts

def __process_batch(self, batch, verbose, punct_pre_process, punct_post_process):
def process_batch(self, batch, verbose, punct_pre_process, punct_post_process):
"""
Normalizes batch of text sequences
Args:
Expand Down
6 changes: 6 additions & 0 deletions tools/ctc_segmentation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,9 @@ abstract="Recent end-to-end Automatic Speech Recognition (ASR) systems demonstra
isbn="978-3-030-60276-5"
}
```
## Requirements
The tool requires:
- packages listed in requirements.txt
- NeMo ASR
- see pysox’s documentation (https://pysox.readthedocs.io/en/latest/) if you want support for mp3, flac and ogg files
7 changes: 6 additions & 1 deletion tools/ctc_segmentation/run_filter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ CER_THRESHOLD=30
WER_THRESHOLD=75
CER_EDGE_THRESHOLD=60
LEN_DIFF_RATIO_THRESHOLD=0.3
MIN_DURATION=1 # in seconds
MAX_DURATION=20 # in seconds

for ARG in "$@"
do
Expand Down Expand Up @@ -46,6 +48,7 @@ python ${SCRIPTS_DIR}/../../../examples/asr/transcribe_speech.py \
$ARG_MODEL=$MODEL_NAME_OR_PATH \
dataset_manifest=$MANIFEST \
output_filename=${OUT_MANIFEST} \
batch_size=${BATCH_SIZE} \
num_workers=0 || exit

echo "--- Calculating metrics and filtering out samples based on thresholds ---"
Expand All @@ -60,4 +63,6 @@ python ${SCRIPTS_DIR}/get_metrics_and_filter.py \
--max_cer=${CER_THRESHOLD} \
--max_wer=${WER_THRESHOLD} \
--max_len_diff_ratio=${LEN_DIFF_RATIO_THRESHOLD} \
--max_edge_cer=${CER_EDGE_THRESHOLD} \
--min_duration=${MIN_DURATION} \
--max_duration=${MAX_DURATION}
4 changes: 3 additions & 1 deletion tools/ctc_segmentation/run_segmentation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ADDITIONAL_SPLIT_SYMBOLS=":|;"
USE_NEMO_NORMALIZATION='True'
NUM_JOBS=-2 # The maximum number of concurrently running jobs, `-2` - all CPUs but one are used
SAMPLE_RATE=16000 # Target sample rate (default for ASR data - 16000 Hz)
MAX_DURATION=20 # Maximum audio segment duration, in seconds. Samples that are longer will be dropped.

for ARG in "$@"
do
Expand Down Expand Up @@ -103,4 +104,5 @@ python $SCRIPTS_DIR/cut_audio.py \
--alignment=$OUTPUT_DIR/verified_segments \
--threshold=$MIN_SCORE \
--offset=$OFFSET \
--sample_rate=$SAMPLE_RATE \
--max_duration=$MAX_DURATION || exit
10 changes: 10 additions & 0 deletions tools/ctc_segmentation/scripts/cut_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
default=0.05,
)
parser.add_argument("--sample_rate", type=int, help="Sample rate, Hz", default=16000)
parser.add_argument(
"--max_duration",
type=int,
help="Maximum audio duration (seconds). Samples that are longer will be dropped",
default=60,
)


def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
Expand All @@ -60,6 +66,7 @@ def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
segments = []
ref_text_processed = []
ref_text_no_preprocessing = []
ref_text_normalized = []
with open(alignment_file, "r") as f:
for line in f:
line = line.split("|")
Expand All @@ -69,6 +76,7 @@ def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
continue
ref_text_processed.append(line[1].strip())
ref_text_no_preprocessing.append(line[2].strip())
ref_text_normalized.append(line[3].strip())
line = line[0].split()
segments.append((float(line[0]) + args.offset / 1000, float(line[1]) + args.offset / 1000, float(line[2])))

Expand All @@ -86,6 +94,7 @@ def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
if duration > 0:
text_processed = ref_text_processed[i].strip()
text_no_preprocessing = ref_text_no_preprocessing[i].strip()
text_normalized = ref_text_normalized[i].strip()
if score >= args.threshold:
high_score_dur += duration
audio_filepath = os.path.join(clips_dir, f"{base_name}_{i:04}.wav")
Expand All @@ -98,6 +107,7 @@ def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
"duration": duration,
"text": text_processed,
"text_no_preprocessing": text_no_preprocessing,
"text_normalized": text_normalized,
"score": round(score, 2),
"start_abs": float(np.mean(np.abs(segment[:num_samples]))),
"end_abs": float(np.mean(np.abs(segment[-num_samples:]))),
Expand Down
13 changes: 12 additions & 1 deletion tools/ctc_segmentation/scripts/get_metrics_and_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
)
parser.add_argument("--max_edge_cer", type=int, help="Threshold edge CER value, %", default=60)
parser.add_argument("--max_duration", type=int, help="Max duration of a segment, seconds", default=-1)
parser.add_argument("--min_duration", type=int, help="Min duration of a segment, seconds", default=1)
parser.add_argument(
"--num_jobs",
default=-2,
Expand Down Expand Up @@ -108,7 +109,15 @@ def get_metrics(manifest, manifest_out):


def _apply_filters(
manifest, manifest_out, max_cer, max_wer, max_edge_cer, max_len_diff_ratio, max_dur=-1, original_duration=0
manifest,
manifest_out,
max_cer,
max_wer,
max_edge_cer,
max_len_diff_ratio,
max_dur=-1,
min_dur=1,
original_duration=0,
):
""" Filters out samples that do not satisfy specified threshold values and saves remaining samples to manifest_out"""
remaining_duration = 0
Expand All @@ -128,6 +137,7 @@ def _apply_filters(
and item["end_CER"] <= max_edge_cer
and item["start_CER"] <= max_edge_cer
and (max_dur == -1 or (max_dur > -1 and duration < max_dur))
and duration > min_dur
):
remaining_duration += duration
f_out.write(json.dumps(item) + "\n")
Expand Down Expand Up @@ -180,6 +190,7 @@ def filter(manifest):
max_edge_cer=args.max_edge_cer,
max_len_diff_ratio=args.max_len_diff_ratio,
max_dur=args.max_duration,
min_dur=args.min_duration,
original_duration=original_duration,
)

Expand Down
73 changes: 45 additions & 28 deletions tools/ctc_segmentation/scripts/prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,14 @@
import argparse
import os
import re
from pathlib import Path
from typing import List
from glob import glob
from typing import List, Optional

import regex
from joblib import Parallel, delayed
from normalization_helpers import LATIN_TO_RU, RU_ABBREVIATIONS
from num2words import num2words
from pydub import AudioSegment
from pydub.utils import mediainfo
from sox import Transformer
from tqdm import tqdm

from nemo.collections.asr.models import ASRModel
Expand All @@ -42,6 +41,7 @@
parser.add_argument("--output_dir", type=str, required=True, help="Path to output directory")
parser.add_argument("--audio_dir", type=str, help="Path to folder with .mp3 or .wav audio files")
parser.add_argument("--sample_rate", type=int, default=16000, help="Sampling rate used during ASR model training, Hz")
parser.add_argument("--bit_depth", type=int, default=16, help="Bit depth to use for processed audio files")
parser.add_argument("--n_jobs", default=-2, type=int, help="The maximum number of concurrently running jobs")
parser.add_argument(
"--language",
Expand All @@ -65,32 +65,37 @@
default="",
help="Additional symbols to use for \
sentence split if eos sentence split resulted in sequence longer than --max_length. "
"Use '|' as a separator between symbols, for example: ';|:' ",
"Use '|' as a separator between symbols, for example: ';|:'. Use '\s' to split by space.",
)
parser.add_argument(
"--use_nemo_normalization",
action="store_true",
help="Set to True to use NeMo Normalization tool to convert numbers from written to spoken format.",
)
parser.add_argument(
"--batch_size", type=int, default=100, help="Batch size for NeMo Normalization tool.",
)


def process_audio(
    in_file: str, wav_file: str = None, cut_prefix: int = 0, sample_rate: int = 16000, bit_depth: int = 16
):
    """Process audio file: .mp3 to .wav conversion and cut a few seconds from the beginning of the audio

    Args:
        in_file: path to the .mp3 or .wav file for processing
        wav_file: path to the output .wav file
        cut_prefix: number of seconds to cut from the beginning of the audio file
        sample_rate: target sampling rate, Hz
        bit_depth: target bit depth of the output .wav file
    """
    try:
        # Fail fast with a clear message instead of a cryptic sox error on a missing path.
        if not os.path.exists(in_file):
            raise ValueError(f'{in_file} not found')
        tfm = Transformer()
        # Convert to mono at the ASR model's expected sample rate and bit depth.
        tfm.convert(samplerate=sample_rate, n_channels=1, bitdepth=bit_depth)
        # Drop the first cut_prefix seconds of the recording.
        tfm.trim(cut_prefix)
        tfm.build(input_filepath=in_file, output_filepath=wav_file)
    except Exception as e:
        # Best-effort batch processing: report and skip files sox cannot handle
        # so one bad file does not abort the whole parallel conversion.
        print(f'{in_file} skipped - {e}')

Expand All @@ -100,11 +105,13 @@ def split_text(
out_file: str,
vocabulary: List[str],
language="en",
remove_brackets=True,
do_lower_case=True,
max_length=100,
additional_split_symbols=None,
use_nemo_normalization=False,
remove_brackets: bool = True,
do_lower_case: bool = True,
max_length: bool = 100,
additional_split_symbols: bool = None,
use_nemo_normalization: bool = False,
n_jobs: Optional[int] = 1,
batch_size: Optional[int] = 1.0,
):
"""
Breaks down the in_file roughly into sentences. Each sentence will be on a separate line.
Expand All @@ -124,6 +131,10 @@ def split_text(
use_nemo_normalization: Set to True to use NeMo normalization tool to convert numbers from written to spoken
format. Normalization using num2words will be applied afterwards to make sure there are no numbers present
in the text, otherwise they will be replaced with a space and that could deteriorate segmentation results.
n_jobs (if use_nemo_normalization=True): the maximum number of concurrently running jobs. If -1 all CPUs are used. If 1 is given,
no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1,
(n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
batch_size (if use_nemo_normalization=True): Number of examples for each process
"""
print(f"Splitting text in {in_file} into sentences.")
with open(in_file, "r") as f:
Expand All @@ -140,7 +151,6 @@ def split_text(
)
# remove extra space
transcript = re.sub(r" +", " ", transcript)
transcript = re.sub(r"(\.+)", ". ", transcript)

if remove_brackets:
transcript = re.sub(r'(\[.*?\])', ' ', transcript)
Expand Down Expand Up @@ -175,7 +185,7 @@ def split_text(
sentences = [s.strip() for s in sentences if s.strip()]

# Read and split transcript by utterance (roughly, sentences)
split_pattern = f"(?<!\w\.\w.)(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]\.)(?<![A-Z{upper_case_unicode}]\.)(?<=\.|\?|\!|\.”|\?”\!”)\s"
split_pattern = f"(?<!\w\.\w.)(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]+\.)(?<![A-Z{upper_case_unicode}]\.)(?<=\.|\?|\!|\.”|\?”\!”)\s(?![0-9]+[a-z]*\.)"

new_sentences = []
for sent in sentences:
Expand Down Expand Up @@ -215,12 +225,15 @@ def _split(sentences, delimiter):
for sent in sentences:
split_sent = [sent]
for delimiter in split_on_symbols:
split_sent = _split(split_sent, delimiter + " ")
if len(delimiter) == 0:
continue
split_sent = _split(split_sent, delimiter + " " if delimiter != " " else delimiter)
another_sent_split.extend(split_sent)

sentences = [s.strip() for s in another_sent_split if s.strip()]
return sentences

additional_split_symbols = additional_split_symbols.replace("/s", " ")
sentences = additional_split(sentences, additional_split_symbols)

vocabulary_symbols = []
Expand Down Expand Up @@ -267,7 +280,9 @@ def _split(sentences, delimiter):

print("Using NeMo normalization tool...")
normalizer = Normalizer(input_case="cased", cache_dir=os.path.join(os.path.dirname(out_file), "en_grammars"))
sentences_norm = normalizer.normalize_list(sentences, verbose=False, punct_post_process=True)
sentences_norm = normalizer.normalize_list(
sentences, verbose=False, punct_post_process=True, n_jobs=n_jobs, batch_size=batch_size
)
if len(sentences_norm) != len(sentences):
raise ValueError("Normalization failed, number of sentences does not match.")
else:
Expand Down Expand Up @@ -338,9 +353,9 @@ def _split(sentences, delimiter):
vocabulary = asr_model.cfg.decoder.vocabulary

if os.path.isdir(args.in_text):
text_files = Path(args.in_text).glob(("*.txt"))
text_files = glob(f"{args.in_text}/*.txt")
else:
text_files.append(Path(args.in_text))
text_files.append(args.in_text)
for text in text_files:
base_name = os.path.basename(text)[:-4]
out_text_file = os.path.join(args.output_dir, base_name + ".txt")
Expand All @@ -353,23 +368,25 @@ def _split(sentences, delimiter):
max_length=args.max_length,
additional_split_symbols=args.additional_split_symbols,
use_nemo_normalization=args.use_nemo_normalization,
n_jobs=args.n_jobs,
batch_size=args.batch_size,
)
print(f"Processed text saved at {args.output_dir}")

if args.audio_dir:
if not os.path.exists(args.audio_dir):
raise ValueError(f"{args.audio_dir} not found. '--audio_dir' should contain .mp3 or .wav files.")

audio_paths = list(Path(args.audio_dir).glob("*"))
audio_paths = glob(f"{args.audio_dir}/*")

normalized_lines = Parallel(n_jobs=args.n_jobs)(
Parallel(n_jobs=args.n_jobs)(
delayed(process_audio)(
audio_paths[i],
os.path.join(args.output_dir, os.path.splitext(audio_paths[i].name)[0] + ".wav"),
os.path.join(args.output_dir, os.path.splitext(os.path.basename(audio_paths[i]))[0] + ".wav"),
args.cut_prefix,
args.sample_rate,
args.bit_depth,
)
for i in tqdm(range(len(audio_paths)))
)

print("Data preparation is complete.")
Loading