From 0e95b67fff0e1b2b39e231cd8c55de7e979de528 Mon Sep 17 00:00:00 2001 From: Nithin Rao Date: Wed, 11 May 2022 10:28:55 -0700 Subject: [PATCH] Multiprocess improvements (#4127) * initial commit Signed-off-by: nithinraok * start fix Signed-off-by: nithinraok * improve multiprocessing speed while creating speaker dataset Signed-off-by: nithinraok * updated scp to filelist Signed-off-by: nithinraok --- scripts/speaker_tasks/filelist_to_manifest.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/scripts/speaker_tasks/filelist_to_manifest.py b/scripts/speaker_tasks/filelist_to_manifest.py index 3a6c27d39377..bbff045c265d 100644 --- a/scripts/speaker_tasks/filelist_to_manifest.py +++ b/scripts/speaker_tasks/filelist_to_manifest.py @@ -30,21 +30,21 @@ This scipt converts a filelist file where each line contains to a manifest json file. Optionally post processes the manifest file to create dev and train split for speaker embedding -training, also optionally segment an audio file in to segments of random DURATIONS and create those +training, also optionally chunk an audio file in to segments of random DURATIONS and create those wav files in CWD. -While creating segments, if audio is not sampled at 16kHz, it resamples to 16kHz and write the wav file. +While creating chunks, if audio is not sampled at 16Khz, it resamples to 16Khz and write the wav file. Args: --filelist: path to file containing list of audio files ---manifest(optional): if you already have manifest file, but would like to process it for creating - segments and splitting then use manifest ignoring filelist +--manifest(optional): if you already have manifest file, but would like to process it for creating chunks and splitting then use manifest ignoring filelist --id: index of speaker label in filename present in filelist file that is separated by '/' --out: output manifest file name --split: if you would want to split the manifest file for training purposes - you may not need this for test set. output file names is _.json, defaults to False ---create_segments: if you would want to segment each manifest line to segments of [1,2,3,4] sec or less - you may not need this for test set, defaults to False ---min_spkrs_count: min number of samples per speaker to consider and ignore otherwise, defaults to 0 (all speakers) + you may not need this for test set. output file names is _.json + Defaults to False +--create_chunks:if you would want to chunk each manifest line to chunks of 4 sec or less + you may not need this for test set, Defaults to False +--min_spkrs_count: min number of samples per speaker to consider and ignore otherwise """ DURATIONS = sorted([1, 2, 3, 4], reverse=True) @@ -60,7 +60,7 @@ def filter_manifest_line(manifest_line): dur = manifest_line['duration'] label = manifest_line['label'] endname = os.path.splitext(audio_path.split(label, 1)[-1])[0] - to_path = os.path.join(CWD, 'segments', label) + to_path = os.path.join(CWD, 'chunks', label) to_path = os.path.join(to_path, endname[1:]) os.makedirs(os.path.dirname(to_path), exist_ok=True) @@ -87,8 +87,8 @@ def filter_manifest_line(manifest_line): c_start = int(float(start * sr)) c_end = c_start + int(float(temp_dur * sr)) - segment = signal[c_start:c_end] - sf.write(to_file, segment, sr) + chunk = signal[c_start:c_end] + sf.write(to_file, chunk, sr) meta = manifest_line.copy() meta['audio_filepath'] = to_file @@ -172,7 +172,7 @@ def get_labels(lines): return labels -def main(filelist, manifest, id, out, split=False, create_segments=False, min_count=10): +def main(filelist, manifest, id, out, split=False, create_chunks=False, min_count=10): if os.path.exists(out): os.remove(out) if filelist: @@ -185,8 +185,8 @@ def main(filelist, manifest, id, out, split=False, create_segments=False, min_co lines = process_map(get_duration, lines, chunksize=100) - if create_segments: - print(f"creating and writing segments to {CWD}") + if create_chunks: + print(f"creating and writing chunks to {CWD}") lines = process_map(filter_manifest_line, lines, chunksize=100) temp = [] for line in lines: @@ -232,8 +232,8 @@ def main(filelist, manifest, id, out, split=False, create_segments=False, min_co action='store_true', ) parser.add_argument( - "--create_segments", - help="bool if you would want to segment each manifest line to segments of 4 sec or less", + "--create_chunks", + help="bool if you would want to chunk each manifest line to chunks of 4 sec or less", required=False, action='store_true', ) @@ -247,5 +247,5 @@ def main(filelist, manifest, id, out, split=False, create_segments=False, min_co args = parser.parse_args() main( - args.filelist, args.manifest, args.id, args.out, args.split, args.create_segments, args.min_spkrs_count, + args.filelist, args.manifest, args.id, args.out, args.split, args.create_chunks, args.min_spkrs_count, )