Skip to content

Commit

Permalink
Multiprocess improvements (#4127)
Browse files Browse the repository at this point in the history
* initial commit

Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>

* start fix

Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>

* improve multiprocessing speed while creating speaker dataset

Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>

* updated scp to filelist

Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>
  • Loading branch information
nithinraok authored and ericharper committed Jun 3, 2022
1 parent b4699ac commit 0e95b67
Showing 1 changed file with 17 additions and 17 deletions.
34 changes: 17 additions & 17 deletions scripts/speaker_tasks/filelist_to_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,21 @@
This script converts a filelist file where each line contains
<absolute path of wav file> to a manifest json file.
Optionally post processes the manifest file to create dev and train split for speaker embedding
training, also optionally segment an audio file into segments of random DURATIONS and create those
training, also optionally chunk an audio file into segments of random DURATIONS and create those
wav files in CWD.
While creating segments, if audio is not sampled at 16kHz, it resamples to 16kHz and writes the wav file.
While creating chunks, if audio is not sampled at 16kHz, it resamples to 16kHz and writes the wav file.
Args:
--filelist: path to file containing list of audio files
--manifest(optional): if you already have a manifest file, but would like to process it for creating
segments and splitting, then use manifest, ignoring filelist
--manifest(optional): if you already have a manifest file, but would like to process it for creating chunks and splitting, then use manifest, ignoring filelist
--id: index of speaker label in filename present in filelist file that is separated by '/'
--out: output manifest file name
--split: if you would want to split the manifest file for training purposes
you may not need this for the test set. Output file names are <out>_<train/dev>.json, defaults to False
--create_segments: if you would want to segment each manifest line to segments of [1,2,3,4] sec or less
you may not need this for test set, defaults to False
--min_spkrs_count: min number of samples per speaker to consider and ignore otherwise, defaults to 0 (all speakers)
you may not need this for the test set. Output file names are <out>_<train/dev>.json
Defaults to False
--create_chunks: if you would want to chunk each manifest line into chunks of 4 sec or less
you may not need this for the test set. Defaults to False
--min_spkrs_count: min number of samples per speaker to consider and ignore otherwise
"""

DURATIONS = sorted([1, 2, 3, 4], reverse=True)
Expand All @@ -60,7 +60,7 @@ def filter_manifest_line(manifest_line):
dur = manifest_line['duration']
label = manifest_line['label']
endname = os.path.splitext(audio_path.split(label, 1)[-1])[0]
to_path = os.path.join(CWD, 'segments', label)
to_path = os.path.join(CWD, 'chunks', label)
to_path = os.path.join(to_path, endname[1:])
os.makedirs(os.path.dirname(to_path), exist_ok=True)

Expand All @@ -87,8 +87,8 @@ def filter_manifest_line(manifest_line):

c_start = int(float(start * sr))
c_end = c_start + int(float(temp_dur * sr))
segment = signal[c_start:c_end]
sf.write(to_file, segment, sr)
chunk = signal[c_start:c_end]
sf.write(to_file, chunk, sr)

meta = manifest_line.copy()
meta['audio_filepath'] = to_file
Expand Down Expand Up @@ -172,7 +172,7 @@ def get_labels(lines):
return labels


def main(filelist, manifest, id, out, split=False, create_segments=False, min_count=10):
def main(filelist, manifest, id, out, split=False, create_chunks=False, min_count=10):
if os.path.exists(out):
os.remove(out)
if filelist:
Expand All @@ -185,8 +185,8 @@ def main(filelist, manifest, id, out, split=False, create_segments=False, min_co

lines = process_map(get_duration, lines, chunksize=100)

if create_segments:
print(f"creating and writing segments to {CWD}")
if create_chunks:
print(f"creating and writing chunks to {CWD}")
lines = process_map(filter_manifest_line, lines, chunksize=100)
temp = []
for line in lines:
Expand Down Expand Up @@ -232,8 +232,8 @@ def main(filelist, manifest, id, out, split=False, create_segments=False, min_co
action='store_true',
)
parser.add_argument(
"--create_segments",
help="bool if you would want to segment each manifest line to segments of 4 sec or less",
"--create_chunks",
help="bool if you would want to chunk each manifest line to chunks of 4 sec or less",
required=False,
action='store_true',
)
Expand All @@ -247,5 +247,5 @@ def main(filelist, manifest, id, out, split=False, create_segments=False, min_co
args = parser.parse_args()

main(
args.filelist, args.manifest, args.id, args.out, args.split, args.create_segments, args.min_spkrs_count,
args.filelist, args.manifest, args.id, args.out, args.split, args.create_chunks, args.min_spkrs_count,
)

0 comments on commit 0e95b67

Please sign in to comment.