Skip to content

Commit

Permalink
[Add] automatically resolving relative audio path (#4277)
Browse files Browse the repository at this point in the history
* update

Signed-off-by: stevehuang52 <heh@nvidia.com>

* update

Signed-off-by: stevehuang52 <heh@nvidia.com>

* update

Signed-off-by: stevehuang52 <heh@nvidia.com>

* fix relative manifest

Signed-off-by: stevehuang52 <heh@nvidia.com>

* update

Signed-off-by: stevehuang52 <heh@nvidia.com>

* debug

Signed-off-by: stevehuang52 <heh@nvidia.com>

* fix bug by moving modifications to __parse_item()

Signed-off-by: stevehuang52 <heh@nvidia.com>

* fix style

Signed-off-by: stevehuang52 <heh@nvidia.com>

* update

Signed-off-by: stevehuang52 <heh@nvidia.com>

Co-authored-by: Somshubra Majumdar <titu1994@gmail.com>
  • Loading branch information
stevehuang52 and titu1994 authored May 31, 2022
1 parent e5d048e commit f6936ce
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 3 deletions.
7 changes: 6 additions & 1 deletion examples/asr/transcribe_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import json
import os
from dataclasses import dataclass, is_dataclass
from pathlib import Path
from typing import Optional

import pytorch_lightning as pl
Expand Down Expand Up @@ -158,6 +159,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
logging.error(f"The input dataset_manifest {cfg.dataset_manifest} is empty. Exiting!")
return None

manifest_dir = Path(cfg.dataset_manifest).parent
with open(cfg.dataset_manifest, 'r') as f:
has_two_fields = []
for line in f:
Expand All @@ -166,7 +168,10 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
has_two_fields.append(True)
else:
has_two_fields.append(False)
filepaths.append(item['audio_filepath'])
audio_file = Path(item['audio_filepath'])
if not audio_file.is_file() and not audio_file.is_absolute():
audio_file = manifest_dir / audio_file
filepaths.append(str(audio_file.absolute()))
partial_audio = all(has_two_fields)

logging.info(f"\nTranscribing {len(filepaths)} files...\n")
Expand Down
16 changes: 15 additions & 1 deletion nemo/collections/common/parts/preprocessing/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import json
from os.path import expanduser
from pathlib import Path
from typing import Any, Callable, Dict, Iterator, List, Optional, Union


Expand Down Expand Up @@ -87,7 +88,20 @@ def __parse_item(line: str, manifest_file: str) -> Dict[str, Any]:
raise ValueError(
f"Manifest file {manifest_file} has invalid json line structure: {line} without proper audio file key."
)
item['audio_file'] = expanduser(item['audio_file'])

# If the audio path is relative and the dataset is not tarred,
# prepend the manifest's parent directory to the audio path.
# This assumes "audio_file" starts with a directory, such as "wavs/xxxxx.wav".
# For a tarred dataset, "audio_file" looks like "_home_data_tarred_wavs_xxxx.wav"
# (no directory component), so it is left unchanged.
manifest_dir = Path(manifest_file).parent
audio_file = Path(item['audio_file'])
if not audio_file.is_file() and not audio_file.is_absolute() and audio_file.parent != Path("."):
# assume the wavs/ dir and manifest are under the same parent dir
audio_file = manifest_dir / audio_file
item['audio_file'] = str(audio_file.absolute())
else:
item['audio_file'] = expanduser(item['audio_file'])

# Duration.
if 'duration' not in item:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import json
import os
import pickle
from pathlib import Path

import editdistance
import kenlm_utils
Expand Down Expand Up @@ -230,12 +231,16 @@ def main():
)

target_transcripts = []
manifest_dir = Path(args.input_manifest).parent
with open(args.input_manifest, 'r') as manifest_file:
audio_file_paths = []
for line in tqdm(manifest_file, desc=f"Reading Manifest {args.input_manifest} ...", ncols=120):
data = json.loads(line)
audio_file = Path(data['audio_filepath'])
if not audio_file.is_file() and not audio_file.is_absolute():
audio_file = manifest_dir / audio_file
target_transcripts.append(data['text'])
audio_file_paths.append(data['audio_filepath'])
audio_file_paths.append(str(audio_file.absolute()))

if args.probs_cache_file and os.path.exists(args.probs_cache_file):
logging.info(f"Found a pickle file of probabilities at '{args.probs_cache_file}'.")
Expand Down

0 comments on commit f6936ce

Please sign in to comment.