Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code-Switching dataset creation - upgrading to aggregate tokenizer manifest format #6448

Merged
merged 7 commits into from
Apr 21, 2023
Merged
4 changes: 3 additions & 1 deletion scripts/speech_recognition/code_switching/README.md
bmwshop marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ Follow the 2 steps listed below in order -

2. Create the synthetic audio data and the corresponding manifest file using `code_switching_audio_data_creation.py` Its usage is as follows:

`python code_switching_audio_data_creation.py --manifest_path <absolute path to intermediate CS manifest generated in step 1> --audio_save_folder_path <absolute path to directory where you want to save the synthesized audios> --manifest_save_path <absolute path to save the created manifest> --audio_normalized_amplitude <scaled normalized amplitude desired> --cs_data_sampling_rate <desired sampling rate for generated audios> --sample_beginning_pause_msec <pause to be added to the beginning of the generated sample in milli seconds> --sample_joining_pause_msec <pause to be added between segments while joining, in milli seconds> --sample_end_pause_msec <pause to be added to the end of the generated sample in milli seconds> --workers <number of worker processes>`
`python code_switching_audio_data_creation.py --manifest_path <absolute path to intermediate CS manifest generated in step 1> --audio_save_folder_path <absolute path to directory where you want to save the synthesized audios> --manifest_save_path <absolute path to save the created manifest> --audio_normalized_amplitude <scaled normalized amplitude desired> --cs_data_sampling_rate <desired sampling rate for generated audios> --sample_beginning_pause_msec <pause to be added to the beginning of the generated sample in milli seconds> --sample_joining_pause_msec <pause to be added between segments while joining, in milli seconds> --sample_end_pause_msec <pause to be added to the end of the generated sample in milli seconds> --is_lid_manifest <boolean to create manifest in the multi-sample lid format for the text field, true by default> --workers <number of worker processes>`

Example of the multi-sample LID format: ```[{"str": "esta muestra ", "lang": "es"}, {"str": "was generated synthetically", "lang": "en"}]```

Estimated runtime for generating a 10,000 hour corpus is ~40 hrs with a single worker
bmwshop marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@
parser.add_argument(
"--sample_end_pause_msec", default=20, type=int, help='Pause to be added at the end of the sample (msec)'
)
parser.add_argument(
"--is_lid_manifest",
default=True,
type=bool,
help='If true, generate manifest in the multi-sample lid format, else the standard manifest format',
)
parser.add_argument("--workers", default=1, type=int, help='Number of worker processes')

args = parser.parse_args()
Expand Down Expand Up @@ -116,6 +122,7 @@ def create_cs_data(
pause_join_msec: int,
pause_end_msec: int,
cs_data_sampling_rate: int,
is_lid_manifest: bool,
):

"""
Expand All @@ -128,6 +135,7 @@ def create_cs_data(
pause_join_msec: Pause to be added between different phrases of the sample (msec)
pause_end_msec: Pause to be added at the end of the sample (msec)
cs_data_sampling_rate: Desired sampling rate of the generated samples
is_lid_manifest: If true, generate manifest in the multi-sample lid format, else the standard manifest format

Returns:

Expand All @@ -144,8 +152,12 @@ def create_cs_data(
staring_pause = np.zeros(int(pause_beg_msec * fs / 1000))
combined_audio += list(staring_pause)

text_entry_list = []
for index in range(len(data['lang_ids'])):

phrase_entry = {}
# dictionary to store the phrase information which will be added to the complete sentence

data_sample, fs_sample = librosa.load(data['paths'][index], sr=fs)
# Alternative- fs_sample, data_sample = wavfile.read(data['paths'][index])

Expand All @@ -170,7 +182,12 @@ def create_cs_data(

combined_audio += list(data_sample_norm)

# adding small pause between gemgments
phrase_entry['str'] = data['texts'][index]
phrase_entry['lang'] = data['lang_ids'][index]

text_entry_list.append(phrase_entry)

    # adding small pause between segments
if index != (len(data['lang_ids']) - 1):
pause = np.zeros(int(pause_join_msec * fs / 1000))
combined_audio += list(pause)
Expand All @@ -192,7 +209,10 @@ def create_cs_data(
metadata_json = {}
metadata_json['audio_filepath'] = audio_file_path
metadata_json['duration'] = float(len(combined_audio) / fs)
metadata_json['text'] = ' '.join(data['texts'])
if is_lid_manifest:
metadata_json['text'] = text_entry_list
else:
metadata_json['text'] = ' '.join(data['texts'])

metadata_json['language_ids'] = data['lang_ids']
metadata_json['original_texts'] = data['texts']
Expand All @@ -213,6 +233,7 @@ def main():
pause_join_msec = args.sample_joining_pause_msec
pause_end_msec = args.sample_end_pause_msec
cs_data_sampling_rate = args.cs_data_sampling_rate
is_lid_manifest = args.is_lid_manifest
num_process = args.workers

# Sanity Checks
Expand Down Expand Up @@ -249,6 +270,7 @@ def main():
pause_join_msec,
pause_end_msec,
cs_data_sampling_rate,
is_lid_manifest,
)
for idx, split_manifest in enumerate(data_split)
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

# Checks -
# (Recommendation) Please normalize the text for each language (avoid numbers, special characters, punctuation)
# Please ensure that the audio_fielpaths are absolute locations
# Please ensure that the audio_filepaths are absolute locations


parser = argparse.ArgumentParser(description='Create synthetic code-switching data manifest from monolingual data')
Expand Down