Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug fixes for parallel mp3 to wav conversion, PC notebook, update Readme for TN requirements #5047

Merged
merged 7 commits into from
Sep 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,7 @@ pipeline {
steps {
  // Install ctc_segmentation requirements and the sox format plugins
  // (libsox-fmt-all) needed for mp3/flac/ogg support via pysox.
  sh 'cd tools/ctc_segmentation && \
  pip install -r requirements.txt && \
  apt-get update && apt-get install libsox-fmt-all -y'
}
}

Expand Down
8 changes: 8 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,14 @@ Megatron GPT training requires NVIDIA Apex to be installed.
git checkout nm_v1.11.0
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./

NeMo Text Processing
~~~~~~~~~~~~~~~~~~~~
NeMo Text Processing, specifically (Inverse) Text Normalization, requires `Pynini <https://pypi.org/project/pynini/>`_ to be installed.

.. code-block:: bash

bash NeMo/nemo_text_processing/install_pynini.sh

Docker containers:
~~~~~~~~~~~~~~~~~~
To build a nemo container with Dockerfile from a branch, please run
Expand Down
4 changes: 2 additions & 2 deletions nemo_text_processing/text_normalization/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def normalize_list(

try:
normalized_texts = Parallel(n_jobs=n_jobs)(
delayed(self.__process_batch)(texts[i : i + batch], verbose, punct_pre_process, punct_post_process)
delayed(self.process_batch)(texts[i : i + batch], verbose, punct_pre_process, punct_post_process)
for i in range(0, len(texts), batch)
)
except BaseException as e:
Expand All @@ -165,7 +165,7 @@ def normalize_list(
normalized_texts = list(itertools.chain(*normalized_texts))
return normalized_texts

def __process_batch(self, batch, verbose, punct_pre_process, punct_post_process):
def process_batch(self, batch, verbose, punct_pre_process, punct_post_process):
"""
Normalizes batch of text sequences
Args:
Expand Down
6 changes: 6 additions & 0 deletions tools/ctc_segmentation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,9 @@ abstract="Recent end-to-end Automatic Speech Recognition (ASR) systems demonstra
isbn="978-3-030-60276-5"
}
```
## Requirements
The tool requires:
- packages listed in requirements.txt
- NeMo ASR
- see pysox’s documentation (https://pysox.readthedocs.io/en/latest/) if you want support for mp3, flac and ogg files
7 changes: 6 additions & 1 deletion tools/ctc_segmentation/run_filter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ CER_THRESHOLD=30
WER_THRESHOLD=75
CER_EDGE_THRESHOLD=60
LEN_DIFF_RATIO_THRESHOLD=0.3
MIN_DURATION=1 # in seconds
MAX_DURATION=20 # in seconds

for ARG in "$@"
do
Expand Down Expand Up @@ -46,6 +48,7 @@ python ${SCRIPTS_DIR}/../../../examples/asr/transcribe_speech.py \
$ARG_MODEL=$MODEL_NAME_OR_PATH \
dataset_manifest=$MANIFEST \
output_filename=${OUT_MANIFEST} \
batch_size=${BATCH_SIZE} \
num_workers=0 || exit

echo "--- Calculating metrics and filtering out samples based on thresholds ---"
Expand All @@ -60,4 +63,6 @@ python ${SCRIPTS_DIR}/get_metrics_and_filter.py \
--max_cer=${CER_THRESHOLD} \
--max_wer=${WER_THRESHOLD} \
--max_len_diff_ratio=${LEN_DIFF_RATIO_THRESHOLD} \
--max_edge_cer=${CER_EDGE_THRESHOLD} \
--min_duration=${MIN_DURATION} \
--max_duration=${MAX_DURATION}
4 changes: 3 additions & 1 deletion tools/ctc_segmentation/run_segmentation.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ADDITIONAL_SPLIT_SYMBOLS=":|;"
USE_NEMO_NORMALIZATION='True'
NUM_JOBS=-2 # The maximum number of concurrently running jobs, `-2` - all CPUs but one are used
SAMPLE_RATE=16000 # Target sample rate (default for ASR data - 16000 Hz)
MAX_DURATION=20 # Maximum audio segment duration, in seconds. Samples that are longer will be dropped.

for ARG in "$@"
do
Expand Down Expand Up @@ -103,4 +104,5 @@ python $SCRIPTS_DIR/cut_audio.py \
--alignment=$OUTPUT_DIR/verified_segments \
--threshold=$MIN_SCORE \
--offset=$OFFSET \
--sample_rate=$SAMPLE_RATE \
--max_duration=$MAX_DURATION || exit
10 changes: 10 additions & 0 deletions tools/ctc_segmentation/scripts/cut_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
default=0.05,
)
parser.add_argument("--sample_rate", type=int, help="Sample rate, Hz", default=16000)
parser.add_argument(
"--max_duration",
type=int,
help="Maximum audio duration (seconds). Samples that are longer will be dropped",
default=60,
)


def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
Expand All @@ -60,6 +66,7 @@ def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
segments = []
ref_text_processed = []
ref_text_no_preprocessing = []
ref_text_normalized = []
with open(alignment_file, "r") as f:
for line in f:
line = line.split("|")
Expand All @@ -69,6 +76,7 @@ def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
continue
ref_text_processed.append(line[1].strip())
ref_text_no_preprocessing.append(line[2].strip())
ref_text_normalized.append(line[3].strip())
line = line[0].split()
segments.append((float(line[0]) + args.offset / 1000, float(line[1]) + args.offset / 1000, float(line[2])))

Expand All @@ -86,6 +94,7 @@ def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
if duration > 0:
text_processed = ref_text_processed[i].strip()
text_no_preprocessing = ref_text_no_preprocessing[i].strip()
text_normalized = ref_text_normalized[i].strip()
if score >= args.threshold:
high_score_dur += duration
audio_filepath = os.path.join(clips_dir, f"{base_name}_{i:04}.wav")
Expand All @@ -98,6 +107,7 @@ def process_alignment(alignment_file: str, manifest: str, clips_dir: str, args):
"duration": duration,
"text": text_processed,
"text_no_preprocessing": text_no_preprocessing,
"text_normalized": text_normalized,
"score": round(score, 2),
"start_abs": float(np.mean(np.abs(segment[:num_samples]))),
"end_abs": float(np.mean(np.abs(segment[-num_samples:]))),
Expand Down
13 changes: 12 additions & 1 deletion tools/ctc_segmentation/scripts/get_metrics_and_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
)
parser.add_argument("--max_edge_cer", type=int, help="Threshold edge CER value, %", default=60)
parser.add_argument("--max_duration", type=int, help="Max duration of a segment, seconds", default=-1)
parser.add_argument("--min_duration", type=int, help="Min duration of a segment, seconds", default=1)
parser.add_argument(
"--num_jobs",
default=-2,
Expand Down Expand Up @@ -108,7 +109,15 @@ def get_metrics(manifest, manifest_out):


def _apply_filters(
manifest, manifest_out, max_cer, max_wer, max_edge_cer, max_len_diff_ratio, max_dur=-1, original_duration=0
manifest,
manifest_out,
max_cer,
max_wer,
max_edge_cer,
max_len_diff_ratio,
max_dur=-1,
min_dur=1,
original_duration=0,
):
""" Filters out samples that do not satisfy specified threshold values and saves remaining samples to manifest_out"""
remaining_duration = 0
Expand All @@ -128,6 +137,7 @@ def _apply_filters(
and item["end_CER"] <= max_edge_cer
and item["start_CER"] <= max_edge_cer
and (max_dur == -1 or (max_dur > -1 and duration < max_dur))
and duration > min_dur
):
remaining_duration += duration
f_out.write(json.dumps(item) + "\n")
Expand Down Expand Up @@ -180,6 +190,7 @@ def filter(manifest):
max_edge_cer=args.max_edge_cer,
max_len_diff_ratio=args.max_len_diff_ratio,
max_dur=args.max_duration,
min_dur=args.min_duration,
original_duration=original_duration,
)

Expand Down
73 changes: 45 additions & 28 deletions tools/ctc_segmentation/scripts/prepare_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,14 @@
import argparse
import os
import re
from pathlib import Path
from typing import List
from glob import glob
from typing import List, Optional

import regex
from joblib import Parallel, delayed
from normalization_helpers import LATIN_TO_RU, RU_ABBREVIATIONS
from num2words import num2words
from pydub import AudioSegment
from pydub.utils import mediainfo
from sox import Transformer
from tqdm import tqdm

from nemo.collections.asr.models import ASRModel
Expand All @@ -42,6 +41,7 @@
parser.add_argument("--output_dir", type=str, required=True, help="Path to output directory")
parser.add_argument("--audio_dir", type=str, help="Path to folder with .mp3 or .wav audio files")
parser.add_argument("--sample_rate", type=int, default=16000, help="Sampling rate used during ASR model training, Hz")
parser.add_argument("--bit_depth", type=int, default=16, help="Bit depth to use for processed audio files")
parser.add_argument("--n_jobs", default=-2, type=int, help="The maximum number of concurrently running jobs")
parser.add_argument(
"--language",
Expand All @@ -65,32 +65,37 @@
default="",
help="Additional symbols to use for \
sentence split if eos sentence split resulted in sequence longer than --max_length. "
"Use '|' as a separator between symbols, for example: ';|:' ",
"Use '|' as a separator between symbols, for example: ';|:'. Use '\s' to split by space.",
)
parser.add_argument(
"--use_nemo_normalization",
action="store_true",
help="Set to True to use NeMo Normalization tool to convert numbers from written to spoken format.",
)
parser.add_argument(
"--batch_size", type=int, default=100, help="Batch size for NeMo Normalization tool.",
)


def process_audio(
    in_file: str, wav_file: str = None, cut_prefix: int = 0, sample_rate: int = 16000, bit_depth: int = 16
):
    """Process audio file: .mp3 to .wav conversion and cut a few seconds from the beginning of the audio

    Args:
        in_file: path to the .mp3 or .wav file for processing
        wav_file: path to the output .wav file
        cut_prefix: number of seconds to cut from the beginning of the audio file
        sample_rate: target sampling rate, Hz
        bit_depth: target bit depth of the output .wav file
    """
    try:
        # Fail fast with a clear message instead of a cryptic sox error on a missing path.
        if not os.path.exists(in_file):
            raise ValueError(f'{in_file} not found')
        tfm = Transformer()
        # Convert to mono at the ASR model's expected sample rate and bit depth.
        tfm.convert(samplerate=sample_rate, n_channels=1, bitdepth=bit_depth)
        # Drop the first cut_prefix seconds of the recording.
        tfm.trim(cut_prefix)
        tfm.build(input_filepath=in_file, output_filepath=wav_file)
    except Exception as e:
        # Best-effort batch processing: report and skip files sox cannot handle
        # so one bad file does not abort the whole parallel conversion.
        print(f'{in_file} skipped - {e}')

Expand All @@ -100,11 +105,13 @@ def split_text(
out_file: str,
vocabulary: List[str],
language="en",
remove_brackets=True,
do_lower_case=True,
max_length=100,
additional_split_symbols=None,
use_nemo_normalization=False,
remove_brackets: bool = True,
do_lower_case: bool = True,
max_length: bool = 100,
additional_split_symbols: bool = None,
use_nemo_normalization: bool = False,
n_jobs: Optional[int] = 1,
batch_size: Optional[int] = 1.0,
):
"""
Breaks down the in_file roughly into sentences. Each sentence will be on a separate line.
Expand All @@ -124,6 +131,10 @@ def split_text(
use_nemo_normalization: Set to True to use NeMo normalization tool to convert numbers from written to spoken
format. Normalization using num2words will be applied afterwards to make sure there are no numbers present
in the text, otherwise they will be replaced with a space and that could deteriorate segmentation results.
n_jobs (if use_nemo_normalization=True): the maximum number of concurrently running jobs. If -1 all CPUs are used. If 1 is given,
no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1,
(n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used.
batch_size (if use_nemo_normalization=True): Number of examples for each process
"""
print(f"Splitting text in {in_file} into sentences.")
with open(in_file, "r") as f:
Expand All @@ -140,7 +151,6 @@ def split_text(
)
# remove extra space
transcript = re.sub(r" +", " ", transcript)
transcript = re.sub(r"(\.+)", ". ", transcript)

if remove_brackets:
transcript = re.sub(r'(\[.*?\])', ' ', transcript)
Expand Down Expand Up @@ -175,7 +185,7 @@ def split_text(
sentences = [s.strip() for s in sentences if s.strip()]

# Read and split transcript by utterance (roughly, sentences)
split_pattern = f"(?<!\w\.\w.)(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]\.)(?<![A-Z{upper_case_unicode}]\.)(?<=\.|\?|\!|\.”|\?”\!”)\s"
split_pattern = f"(?<!\w\.\w.)(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]+\.)(?<![A-Z{upper_case_unicode}]\.)(?<=\.|\?|\!|\.”|\?”\!”)\s(?![0-9]+[a-z]*\.)"

new_sentences = []
for sent in sentences:
Expand Down Expand Up @@ -215,12 +225,15 @@ def _split(sentences, delimiter):
for sent in sentences:
split_sent = [sent]
for delimiter in split_on_symbols:
split_sent = _split(split_sent, delimiter + " ")
if len(delimiter) == 0:
continue
split_sent = _split(split_sent, delimiter + " " if delimiter != " " else delimiter)
another_sent_split.extend(split_sent)

sentences = [s.strip() for s in another_sent_split if s.strip()]
return sentences

additional_split_symbols = additional_split_symbols.replace("/s", " ")
sentences = additional_split(sentences, additional_split_symbols)

vocabulary_symbols = []
Expand Down Expand Up @@ -267,7 +280,9 @@ def _split(sentences, delimiter):

print("Using NeMo normalization tool...")
normalizer = Normalizer(input_case="cased", cache_dir=os.path.join(os.path.dirname(out_file), "en_grammars"))
sentences_norm = normalizer.normalize_list(sentences, verbose=False, punct_post_process=True)
sentences_norm = normalizer.normalize_list(
sentences, verbose=False, punct_post_process=True, n_jobs=n_jobs, batch_size=batch_size
)
if len(sentences_norm) != len(sentences):
raise ValueError("Normalization failed, number of sentences does not match.")
else:
Expand Down Expand Up @@ -338,9 +353,9 @@ def _split(sentences, delimiter):
vocabulary = asr_model.cfg.decoder.vocabulary

if os.path.isdir(args.in_text):
text_files = Path(args.in_text).glob(("*.txt"))
text_files = glob(f"{args.in_text}/*.txt")
else:
text_files.append(Path(args.in_text))
text_files.append(args.in_text)
for text in text_files:
base_name = os.path.basename(text)[:-4]
out_text_file = os.path.join(args.output_dir, base_name + ".txt")
Expand All @@ -353,23 +368,25 @@ def _split(sentences, delimiter):
max_length=args.max_length,
additional_split_symbols=args.additional_split_symbols,
use_nemo_normalization=args.use_nemo_normalization,
n_jobs=args.n_jobs,
batch_size=args.batch_size,
)
print(f"Processed text saved at {args.output_dir}")

if args.audio_dir:
if not os.path.exists(args.audio_dir):
raise ValueError(f"{args.audio_dir} not found. '--audio_dir' should contain .mp3 or .wav files.")

audio_paths = list(Path(args.audio_dir).glob("*"))
audio_paths = glob(f"{args.audio_dir}/*")

normalized_lines = Parallel(n_jobs=args.n_jobs)(
Parallel(n_jobs=args.n_jobs)(
delayed(process_audio)(
audio_paths[i],
os.path.join(args.output_dir, os.path.splitext(audio_paths[i].name)[0] + ".wav"),
os.path.join(args.output_dir, os.path.splitext(os.path.basename(audio_paths[i]))[0] + ".wav"),
args.cut_prefix,
args.sample_rate,
args.bit_depth,
)
for i in tqdm(range(len(audio_paths)))
)

print("Data preparation is complete.")
Loading