Multiprocess improvements (#4127)

* initial commit
  Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>
* start fix
  Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>
* improve multiprocessing speed while creating speaker dataset
  Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>
* updated scp to filelist
  Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>
NVIDIA · Jun 3, 2022 · 0e95b67 · 0e95b67
1 parent b4699ac
commit 0e95b67
Showing 1 changed file with 17 additions and 17 deletions.
diff --git a/scripts/speaker_tasks/filelist_to_manifest.py b/scripts/speaker_tasks/filelist_to_manifest.py
@@ -30,21 +30,21 @@
 This script converts a filelist file where each line contains  
 <absolute path of wav file> to a manifest json file. 
 Optionally post processes the manifest file to create dev and train split for speaker embedding 
-training, also optionally segment an audio file in to segments of random DURATIONS and create those
+training, also optionally chunk an audio file into segments of random DURATIONS and create those
 wav files in CWD. 
 
-While creating segments, if audio is not sampled at 16kHz, it resamples to 16kHz and write the wav file.
+While creating chunks, if audio is not sampled at 16kHz, it is resampled to 16kHz before the wav file is written.
 Args: 
 --filelist: path to file containing list of audio files
---manifest(optional): if you already have manifest file, but would like to process it for creating 
-    segments and splitting then use manifest ignoring filelist
+--manifest(optional): if you already have a manifest file but would like to process it for creating chunks and splitting, use manifest; filelist is then ignored
 --id: index of speaker label in filename present in filelist file that is separated by '/'
 --out: output manifest file name
 --split: if you would want to split the  manifest file for training purposes
-    you may not need this for test set. output file names is <out>_<train/dev>.json, defaults to False
---create_segments: if you would want to segment each manifest line to segments of [1,2,3,4] sec or less
-    you may not need this for test set, defaults to False
---min_spkrs_count: min number of samples per speaker to consider and ignore otherwise, defaults to 0 (all speakers)
+        you may not need this for test set. Output file names are <out>_<train/dev>.json
+        Defaults to False
+--create_chunks: if you would want to chunk each manifest line to chunks of 4 sec or less
+        you may not need this for test set. Defaults to False
+--min_spkrs_count: minimum number of samples a speaker must have to be considered; speakers with fewer samples are ignored
 """
 
 DURATIONS = sorted([1, 2, 3, 4], reverse=True)
@@ -60,7 +60,7 @@ def filter_manifest_line(manifest_line):
     dur = manifest_line['duration']
     label = manifest_line['label']
     endname = os.path.splitext(audio_path.split(label, 1)[-1])[0]
-    to_path = os.path.join(CWD, 'segments', label)
+    to_path = os.path.join(CWD, 'chunks', label)
     to_path = os.path.join(to_path, endname[1:])
     os.makedirs(os.path.dirname(to_path), exist_ok=True)
 
@@ -87,8 +87,8 @@ def filter_manifest_line(manifest_line):
 
                 c_start = int(float(start * sr))
                 c_end = c_start + int(float(temp_dur * sr))
-                segment = signal[c_start:c_end]
-                sf.write(to_file, segment, sr)
+                chunk = signal[c_start:c_end]
+                sf.write(to_file, chunk, sr)
 
                 meta = manifest_line.copy()
                 meta['audio_filepath'] = to_file
@@ -172,7 +172,7 @@ def get_labels(lines):
     return labels
 
 
-def main(filelist, manifest, id, out, split=False, create_segments=False, min_count=10):
+def main(filelist, manifest, id, out, split=False, create_chunks=False, min_count=10):
     if os.path.exists(out):
         os.remove(out)
     if filelist:
@@ -185,8 +185,8 @@ def main(filelist, manifest, id, out, split=False, create_segments=False, min_co
 
     lines = process_map(get_duration, lines, chunksize=100)
 
-    if create_segments:
-        print(f"creating and writing segments to {CWD}")
+    if create_chunks:
+        print(f"creating and writing chunks to {CWD}")
         lines = process_map(filter_manifest_line, lines, chunksize=100)
         temp = []
         for line in lines:
@@ -232,8 +232,8 @@ def main(filelist, manifest, id, out, split=False, create_segments=False, min_co
         action='store_true',
     )
     parser.add_argument(
-        "--create_segments",
-        help="bool if you would want to segment each manifest line to segments of 4 sec or less",
+        "--create_chunks",
+        help="bool if you would want to chunk each manifest line to chunks of 4 sec or less",
         required=False,
         action='store_true',
     )
@@ -247,5 +247,5 @@ def main(filelist, manifest, id, out, split=False, create_segments=False, min_co
     args = parser.parse_args()
 
     main(
-        args.filelist, args.manifest, args.id, args.out, args.split, args.create_segments, args.min_spkrs_count,
+        args.filelist, args.manifest, args.id, args.out, args.split, args.create_chunks, args.min_spkrs_count,
     )