[Add] automatically resolving relative audio path (#4277)

* update Signed-off-by: stevehuang52 <heh@nvidia.com>
* update Signed-off-by: stevehuang52 <heh@nvidia.com>
* update Signed-off-by: stevehuang52 <heh@nvidia.com>
* fix relative manifest Signed-off-by: stevehuang52 <heh@nvidia.com>
* update Signed-off-by: stevehuang52 <heh@nvidia.com>
* debug Signed-off-by: stevehuang52 <heh@nvidia.com>
* fix bug by moving modifications to __parse_item() Signed-off-by: stevehuang52 <heh@nvidia.com>
* fix style Signed-off-by: stevehuang52 <heh@nvidia.com>
* update Signed-off-by: stevehuang52 <heh@nvidia.com>

Co-authored-by: Somshubra Majumdar <titu1994@gmail.com>
NVIDIA · May 31, 2022 · f6936ce · f6936ce
1 parent e5d048e
commit f6936ce
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 3 deletions.
diff --git a/examples/asr/transcribe_speech.py b/examples/asr/transcribe_speech.py
@@ -17,6 +17,7 @@
 import json
 import os
 from dataclasses import dataclass, is_dataclass
+from pathlib import Path
 from typing import Optional
 
 import pytorch_lightning as pl
@@ -158,6 +159,7 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
             logging.error(f"The input dataset_manifest {cfg.dataset_manifest} is empty. Exiting!")
             return None
 
+        manifest_dir = Path(cfg.dataset_manifest).parent
         with open(cfg.dataset_manifest, 'r') as f:
             has_two_fields = []
             for line in f:
@@ -166,7 +168,10 @@ def main(cfg: TranscriptionConfig) -> TranscriptionConfig:
                     has_two_fields.append(True)
                 else:
                     has_two_fields.append(False)
-                filepaths.append(item['audio_filepath'])
+                audio_file = Path(item['audio_filepath'])
+                if not audio_file.is_file() and not audio_file.is_absolute():
+                    audio_file = manifest_dir / audio_file
+                filepaths.append(str(audio_file.absolute()))
         partial_audio = all(has_two_fields)
 
     logging.info(f"\nTranscribing {len(filepaths)} files...\n")

diff --git a/nemo/collections/common/parts/preprocessing/manifest.py b/nemo/collections/common/parts/preprocessing/manifest.py
@@ -14,6 +14,7 @@
 
 import json
 from os.path import expanduser
+from pathlib import Path
 from typing import Any, Callable, Dict, Iterator, List, Optional, Union
 
 
@@ -87,7 +88,20 @@ def __parse_item(line: str, manifest_file: str) -> Dict[str, Any]:
         raise ValueError(
             f"Manifest file {manifest_file} has invalid json line structure: {line} without proper audio file key."
         )
-    item['audio_file'] = expanduser(item['audio_file'])
+
+    # If the audio path is relative, and not using tarred dataset,
+    # attach the parent directory of manifest to the audio path.
+    # Assume "audio_file" starts with a dir, such as "wavs/xxxxx.wav".
+    # If using a tarred dataset, the "audio_file" is like "_home_data_tarred_wavs_xxxx.wav",
+    # so we will just ignore it.
+    manifest_dir = Path(manifest_file).parent
+    audio_file = Path(item['audio_file'])
+    if not audio_file.is_file() and not audio_file.is_absolute() and audio_file.parent != Path("."):
+        # assume the wavs/ dir and manifest are under the same parent dir
+        audio_file = manifest_dir / audio_file
+        item['audio_file'] = str(audio_file.absolute())
+    else:
+        item['audio_file'] = expanduser(item['audio_file'])
 
     # Duration.
     if 'duration' not in item:

diff --git a/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py b/scripts/asr_language_modeling/ngram_lm/eval_beamsearch_ngram.py
@@ -41,6 +41,7 @@
 import json
 import os
 import pickle
+from pathlib import Path
 
 import editdistance
 import kenlm_utils
@@ -230,12 +231,16 @@ def main():
         )
 
     target_transcripts = []
+    manifest_dir = Path(args.input_manifest).parent
     with open(args.input_manifest, 'r') as manifest_file:
         audio_file_paths = []
         for line in tqdm(manifest_file, desc=f"Reading Manifest {args.input_manifest} ...", ncols=120):
             data = json.loads(line)
+            audio_file = Path(data['audio_filepath'])
+            if not audio_file.is_file() and not audio_file.is_absolute():
+                audio_file = manifest_dir / audio_file
             target_transcripts.append(data['text'])
-            audio_file_paths.append(data['audio_filepath'])
+            audio_file_paths.append(str(audio_file.absolute()))
 
     if args.probs_cache_file and os.path.exists(args.probs_cache_file):
         logging.info(f"Found a pickle file of probabilities at '{args.probs_cache_file}'.")