1
+ from __future__ import annotations
1
2
from collections import Counter
2
3
import copy
3
4
import os
@@ -134,6 +135,10 @@ def files_from_dir(annot_dir, annot_format):
134
135
return annot_files
135
136
136
137
138
class AudioFilenameNotFound(ValueError):
    """Error raised by ``audio_stem_from_path``
    when no valid audio filename can be found in a path.

    Subclasses ``ValueError`` because ``audio_stem_from_path``
    previously raised a plain ``ValueError`` for this failure,
    so callers that catch ``ValueError`` keep working,
    while new code can catch this more specific type.
    """
140
+
141
+
137
142
def audio_stem_from_path (path ):
138
143
"""Find the name of an audio file within a filename
139
144
by removing extensions until finding an audio extension,
@@ -175,42 +180,90 @@ def audio_stem_from_path(path):
175
180
new_stem , ext = os .path .splitext (stem )
176
181
ext = ext .replace ("." , "" ).lower ()
177
182
if new_stem == stem :
178
- raise ValueError (f"unable to compute stem of { path } " )
183
+ raise AudioFilenameNotFound (
184
+ f"Unable to find a valid audio filename in path:\n { path } .\n "
185
+ f"Valid audio file extensions are:\n { constants .VALID_AUDIO_FORMATS } "
186
+ )
179
187
else :
180
188
stem = new_stem
181
189
return stem
182
190
183
191
184
- def map_annotated_to_annot (source_files , annot_list ):
185
- """map source files, i.e. audio or spectrogram files, to annotations
192
+ def map_annotated_to_annot (annotated_files : list ,
193
+ annot_list : crowsetta .Annotation ) -> dict :
194
+ """Map annotated files,
195
+ i.e. audio or spectrogram files,
196
+ to their corresponding annotations.
186
197
187
- returns a ``dict`` where each key is a path to a source audio file
188
- or array file containing a spectrogram, and the corresponding value
189
- is the annotation for the source file
198
+ Returns a ``dict`` where each key
199
+ is a path to an annotated file,
200
+ and the value for each key
201
+ is a ``crowsetta.Annotation``.
190
202
191
203
Parameters
192
204
----------
193
- source_files : list
194
- of audio or spectrogram files. The names of the files must begin with the
195
- audio_path attribute of the corresponding annotations. E.g., if an audio file is
196
- 'bird0-2016-05-04-133027.wav', then there must be an annotation whose
197
- file attribute equals that filename. Spectrogram files should include
198
- the audio file name, e.g. 'bird0-2016-05-04-133027.wav.mat' or
199
- 'bird0-2016-05-04-133027.spect.npz' would match an annotation with the
200
- audio_path attribute 'bird0-2016-05-04-133027.wav'.
205
+ annotated_files : list
206
+ Of paths to audio or spectrogram files.
201
207
annot_list : list
202
- of Annotations corresponding to files in source_files
208
+ of Annotations corresponding to files in annotated_files
209
+
210
+ Notes
211
+ -----
212
+ The filenames of the ``annotated_files`` must
213
+ begin with the filename of the ``audio_path``
214
+ attribute of the corresponding
215
+ ``crowsetta.Annotation`` instances.
216
+ E.g., if `annotated_files` includes
217
+ an audio file named
218
+ 'bird0-2016-05-04-133027.wav',
219
+ then it will be mapped to an ``Annotation``
220
+ with an `audio_path` attribute
221
+ whose filename matches it.
222
+ Spectrogram files should also include
223
+ the audio file name,
224
+ e.g. 'bird0-2016-05-04-133027.wav.mat'
225
+ or 'bird0-2016-05-04-133027.spect.npz'
226
+ would match an ``Annotation`` with the
227
+ ``audio_path`` attribute '/some/path/bird0-2016-05-04-133027.wav'.
228
+
229
+ For more detail, please see
230
+ the page on file naming conventions in the
231
+ reference section of the documentation:
232
+ https://vak.readthedocs.io/en/latest/reference/filenames.html
203
233
"""
204
- if type (source_files ) == np .ndarray : # e.g., vak DataFrame['spect_path'].values
205
- source_files = source_files .tolist ()
234
+ if type (annotated_files ) == np .ndarray : # e.g., vak DataFrame['spect_path'].values
235
+ annotated_files = annotated_files .tolist ()
206
236
207
237
# to pair annotated files with annotations, make a dict that maps each file to its annotation
208
- source_annot_map = {}
238
+ annotated_annot_map = {}
209
239
210
240
# ----> make a dict with audio stems as keys,
211
241
# so we can look up annotations by stemming source files and using as keys.
212
242
# First check that we don't have duplicate keys that would cause this to fail silently
213
- keys = [audio_stem_from_path (annot .audio_path ) for annot in annot_list ]
243
+ keys = []
244
+ for annot in annot_list :
245
+ try :
246
+ key = audio_stem_from_path (annot .audio_path )
247
+ except AudioFilenameNotFound as e :
248
+ # Do this as a loop with a super verbose error
249
+ # instead of e.g. a single-line list comprehension
250
+ # so we can help users troubleshoot,
251
+ # see https://github.com/vocalpy/vak/issues/525
252
+ raise ValueError (
253
+ "The ``audio_path`` attribute of a ``crowsetta.Annotation`` was "
254
+ "not recognized as a valid audio filename.\n "
255
+ f"The ``audio_path`` attribute was:\n { annot .audio_path } \n "
256
+ f"The annotation was loaded from this path:\n { annot .annot_path } \n "
257
+ "For some annotation formats, audio filenames are inferred from annotation filenames.\n "
258
+ "Please check that your annotation files are named "
259
+ "according to the conventions:\n "
260
+ "https://vak.readthedocs.io/en/latest/reference/filenames.html\n "
261
+ "It may also be helpful to read the page on converting custom formats "
262
+ "to annotations that ``vak`` can work with:\n "
263
+ "https://vak.readthedocs.io/en/latest/howto/howto_user_annot.html"
264
+ ) from e
265
+ keys .append (key )
266
+
214
267
keys_set = set (keys )
215
268
if len (keys_set ) < len (keys ):
216
269
duplicates = [item for item , count in Counter (keys ).items () if count > 1 ]
@@ -225,33 +278,25 @@ def map_annotated_to_annot(source_files, annot_list):
225
278
# Make a copy from which we remove source files after mapping them to annotation,
226
279
# to validate that function worked,
227
280
# by making sure there are no items left in this copy after the loop
228
- source_files_copy = copy .deepcopy (source_files )
229
- for source_file in list (
230
- source_files
281
+ annotated_files_copy = copy .deepcopy (annotated_files )
282
+ for annotated_file in list (
283
+ annotated_files
231
284
): # list() to copy, so we can pop off items while iterating
232
285
# remove stem so we can find .spect files that match with audio files,
233
286
# e.g. find 'llb3_0003_2018_04_23_14_18_54.mat' that should match
234
287
# with 'llb3_0003_2018_04_23_14_18_54.wav'
235
- source_file_stem = audio_stem_from_path (source_file )
236
-
237
- try :
238
- annot = audio_stem_annot_map [source_file_stem ]
239
- except KeyError :
240
- raise ValueError (
241
- f"could not find annotation for source file: { source_file } .\n "
242
- f"No annotation had an audio file whose stem matched the source file stem: { source_file_stem } "
243
- )
244
-
245
- source_annot_map [source_file ] = annot
246
- source_files_copy .remove (source_file )
288
+ annotated_file_stem = audio_stem_from_path (annotated_file )
289
+ annot = audio_stem_annot_map [annotated_file_stem ]
290
+ annotated_annot_map [annotated_file ] = annot
291
+ annotated_files_copy .remove (annotated_file )
247
292
248
- if len (source_files_copy ) > 0 :
293
+ if len (annotated_files_copy ) > 0 :
249
294
raise ValueError (
250
295
"could not map the following source files to annotations: "
251
- f"{ source_files_copy } "
296
+ f"{ annotated_files_copy } "
252
297
)
253
298
254
- return source_annot_map
299
+ return annotated_annot_map
255
300
256
301
257
302
def has_unlabeled (annot : crowsetta .Annotation ,
0 commit comments