Skip to content

Commit 40d27f5

Browse files
authoredMar 8, 2023
Merge pull request #4633 from JOJ0/refactor_id_extraction
Refactor metadata source ID extraction utilities
2 parents 8bbaefb + c6746ed commit 40d27f5

File tree

8 files changed

+161
-50
lines changed

8 files changed

+161
-50
lines changed
 

‎beets/plugins.py

+10-5
Original file line numberDiff line numberDiff line change
@@ -705,22 +705,27 @@ def get_artist(artists, id_key='id', name_key='name', join_key=None):
705705

706706
return artist_string, artist_id
707707

708-
def _get_id(self, url_type, id_):
708+
@staticmethod
709+
def _get_id(url_type, id_, id_regex):
709710
"""Parse an ID from its URL if necessary.
710711
711712
:param url_type: Type of URL. Either 'album' or 'track'.
712713
:type url_type: str
713714
:param id_: Album/track ID or URL.
714715
:type id_: str
716+
:param id_regex: A dictionary containing a regular expression
717+
extracting an ID from a URL (if it's not an ID already) in
718+
'pattern' and the number of the match group in 'match_group'.
719+
:type id_regex: dict
715720
:return: Album/track ID.
716721
:rtype: str
717722
"""
718-
self._log.debug(
719-
"Searching {} for {} '{}'", self.data_source, url_type, id_
723+
log.debug(
724+
"Extracting {} ID from '{}'", url_type, id_
720725
)
721-
match = re.search(self.id_regex['pattern'].format(url_type), str(id_))
726+
match = re.search(id_regex['pattern'].format(url_type), str(id_))
722727
if match:
723-
id_ = match.group(self.id_regex['match_group'])
728+
id_ = match.group(id_regex['match_group'])
724729
if id_:
725730
return id_
726731
return None

‎beets/util/id_extractors.py

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# This file is part of beets.
2+
# Copyright 2016, Adrian Sampson.
3+
#
4+
# Permission is hereby granted, free of charge, to any person obtaining
5+
# a copy of this software and associated documentation files (the
6+
# "Software"), to deal in the Software without restriction, including
7+
# without limitation the rights to use, copy, modify, merge, publish,
8+
# distribute, sublicense, and/or sell copies of the Software, and to
9+
# permit persons to whom the Software is furnished to do so, subject to
10+
# the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be
13+
# included in all copies or substantial portions of the Software.
14+
15+
"""Helpers around the extraction of album/track ID's from metadata sources."""
16+
17+
import re
18+
19+
# Spotify IDs consist of 22 alphanumeric characters
20+
# (zero-left-padded base62 representation of randomly generated UUID4)
21+
spotify_id_regex = {
22+
'pattern': r'(^|open\.spotify\.com/{}/)([0-9A-Za-z]{{22}})',
23+
'match_group': 2,
24+
}
25+
26+
deezer_id_regex = {
27+
'pattern': r'(^|deezer\.com/)([a-z]*/)?({}/)?(\d+)',
28+
'match_group': 4,
29+
}
30+
31+
beatport_id_regex = {
32+
'pattern': r'(^|beatport\.com/release/.+/)(\d+)$',
33+
'match_group': 2,
34+
}
35+
36+
# A note on Bandcamp: There is no such thing as a Bandcamp album or artist ID,
37+
# the URL can be used as the identifier. The Bandcamp metadata source plugin
38+
# works that way - https://github.com/unrblt/beets-bandcamp. Bandcamp album
39+
# URLs usually look like: https://nameofartist.bandcamp.com/album/nameofalbum
40+
41+
42+
def extract_discogs_id_regex(album_id):
    """Return the Discogs release ID contained in ``album_id``, or None.

    Discogs IDs are simple integers. In order to avoid confusion with
    other metadata plugins, only very specific formats of the input are
    recognised:

    - plain integer, optionally wrapped in brackets and prefixed by an
      'r', as this is how Discogs displays the release ID on its webpage
    - legacy URL format: discogs.com/<name of release>/release/<id>
    - legacy URL short format: discogs.com/release/<id>
    - current URL format: discogs.com/release/<id>-<name of release>

    See #291, #4080 and #4085 for the discussions leading up to these
    patterns. Regex has been tested here https://regex101.com/r/TOu7kw/1

    :param album_id: Release ID or release URL. ``None`` is accepted and
        yields ``None``; any other non-string value is converted with
        ``str()`` before matching.
    :type album_id: str or None
    :return: The release ID, or None if no pattern matches.
    :rtype: int or None
    """
    # A missing value (e.g. an absent 'uri' key looked up with ``.get``)
    # must not blow up inside ``re.search``.
    if album_id is None:
        return None

    # Coerce so that an ID handed over as an integer is matched as well
    # instead of raising a TypeError.
    candidate = str(album_id)

    # Order matters: the strict "bare ID" form is tried first, then the
    # current URL layout, then the legacy one.
    patterns = (
        r'^\[?r?(?P<id>\d+)\]?$',
        r'discogs\.com/release/(?P<id>\d+)-?',
        r'discogs\.com/[^/]+/release/(?P<id>\d+)',
    )
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return int(match.group('id'))

    return None

‎beetsplug/beatport.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from beets.autotag.hooks import AlbumInfo, TrackInfo
2929
from beets.plugins import BeetsPlugin, MetadataSourcePlugin, get_distance
3030
import confuse
31+
from beets.util.id_extractors import beatport_id_regex
3132

3233

3334
AUTH_ERRORS = (TokenRequestDenied, TokenMissing, VerifierMissing)
@@ -267,6 +268,7 @@ def __init__(self, data):
267268

268269
class BeatportPlugin(BeetsPlugin):
269270
data_source = 'Beatport'
271+
id_regex = beatport_id_regex
270272

271273
def __init__(self):
272274
super().__init__()
@@ -380,11 +382,13 @@ def album_for_id(self, release_id):
380382
or None if the query is not a valid ID or release is not found.
381383
"""
382384
self._log.debug('Searching for release {0}', release_id)
383-
match = re.search(r'(^|beatport\.com/release/.+/)(\d+)$', release_id)
384-
if not match:
385+
386+
release_id = self._get_id('album', release_id, self.id_regex)
387+
if release_id is None:
385388
self._log.debug('Not a valid Beatport release ID.')
386389
return None
387-
release = self.client.get_release(match.group(2))
390+
391+
release = self.client.get_release(release_id)
388392
if release:
389393
return self._get_album_info(release)
390394
return None

‎beetsplug/deezer.py

+4-6
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from beets import ui
2424
from beets.autotag import AlbumInfo, TrackInfo
2525
from beets.plugins import MetadataSourcePlugin, BeetsPlugin
26+
from beets.util.id_extractors import deezer_id_regex
2627

2728

2829
class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin):
@@ -34,10 +35,7 @@ class DeezerPlugin(MetadataSourcePlugin, BeetsPlugin):
3435
album_url = 'https://api.deezer.com/album/'
3536
track_url = 'https://api.deezer.com/track/'
3637

37-
id_regex = {
38-
'pattern': r'(^|deezer\.com/)([a-z]*/)?({}/)?(\d+)',
39-
'match_group': 4,
40-
}
38+
id_regex = deezer_id_regex
4139

4240
def __init__(self):
4341
super().__init__()
@@ -51,7 +49,7 @@ def album_for_id(self, album_id):
5149
:return: AlbumInfo object for album.
5250
:rtype: beets.autotag.hooks.AlbumInfo or None
5351
"""
54-
deezer_id = self._get_id('album', album_id)
52+
deezer_id = self._get_id('album', album_id, self.id_regex)
5553
if deezer_id is None:
5654
return None
5755

@@ -154,7 +152,7 @@ def track_for_id(self, track_id=None, track_data=None):
154152
:rtype: beets.autotag.hooks.TrackInfo or None
155153
"""
156154
if track_data is None:
157-
deezer_id = self._get_id('track', track_id)
155+
deezer_id = self._get_id('track', track_id, self.id_regex)
158156
if deezer_id is None:
159157
return None
160158
track_data = requests.get(self.track_url + deezer_id).json()

‎beetsplug/discogs.py

+3-27
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
import beets.ui
2020
from beets import config
21+
from beets.util.id_extractors import extract_discogs_id_regex
2122
from beets.autotag.hooks import AlbumInfo, TrackInfo
2223
from beets.plugins import MetadataSourcePlugin, BeetsPlugin, get_distance
2324
import confuse
@@ -218,31 +219,6 @@ def item_candidates(self, item, artist, title):
218219
# first 10 results, don't overwhelm with options
219220
return candidates[:10]
220221

221-
@staticmethod
222-
def extract_release_id_regex(album_id):
223-
"""Returns the Discogs_id or None."""
224-
# Discogs-IDs are simple integers. In order to avoid confusion with
225-
# other metadata plugins, we only look for very specific formats of the
226-
# input string:
227-
# - plain integer, optionally wrapped in brackets and prefixed by an
228-
# 'r', as this is how discogs displays the release ID on its webpage.
229-
# - legacy url format: discogs.com/<name of release>/release/<id>
230-
# - current url format: discogs.com/release/<id>-<name of release>
231-
# See #291, #4080 and #4085 for the discussions leading up to these
232-
# patterns.
233-
# Regex has been tested here https://regex101.com/r/wyLdB4/2
234-
235-
for pattern in [
236-
r'^\[?r?(?P<id>\d+)\]?$',
237-
r'discogs\.com/release/(?P<id>\d+)-',
238-
r'discogs\.com/[^/]+/release/(?P<id>\d+)',
239-
]:
240-
match = re.search(pattern, album_id)
241-
if match:
242-
return int(match.group('id'))
243-
244-
return None
245-
246222
def album_for_id(self, album_id):
247223
"""Fetches an album by its Discogs ID and returns an AlbumInfo object
248224
or None if the album is not found.
@@ -252,7 +228,7 @@ def album_for_id(self, album_id):
252228

253229
self._log.debug('Searching for release {0}', album_id)
254230

255-
discogs_id = self.extract_release_id_regex(album_id)
231+
discogs_id = extract_discogs_id_regex(album_id)
256232

257233
if not discogs_id:
258234
return None
@@ -365,7 +341,7 @@ def get_album_info(self, result):
365341
else:
366342
genre = base_genre
367343

368-
discogs_albumid = self.extract_release_id_regex(result.data.get('uri'))
344+
discogs_albumid = extract_discogs_id_regex(result.data.get('uri'))
369345

370346
# Extract information for the optional AlbumInfo fields that are
371347
# contained on nested discogs fields.

‎beetsplug/spotify.py

+4-8
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from beets.dbcore import types
3333
from beets.library import DateType
3434
from beets.plugins import BeetsPlugin, MetadataSourcePlugin
35+
from beets.util.id_extractors import spotify_id_regex
3536

3637
DEFAULT_WAITING_TIME = 5
3738

@@ -69,12 +70,7 @@ class SpotifyPlugin(MetadataSourcePlugin, BeetsPlugin):
6970
track_url = 'https://api.spotify.com/v1/tracks/'
7071
audio_features_url = 'https://api.spotify.com/v1/audio-features/'
7172

72-
# Spotify IDs consist of 22 alphanumeric characters
73-
# (zero-left-padded base62 representation of randomly generated UUID4)
74-
id_regex = {
75-
'pattern': r'(^|open\.spotify\.com/{}/)([0-9A-Za-z]{{22}})',
76-
'match_group': 2,
77-
}
73+
id_regex = spotify_id_regex
7874

7975
spotify_audio_features = {
8076
'acousticness': 'spotify_acousticness',
@@ -216,7 +212,7 @@ def album_for_id(self, album_id):
216212
:return: AlbumInfo object for album
217213
:rtype: beets.autotag.hooks.AlbumInfo or None
218214
"""
219-
spotify_id = self._get_id('album', album_id)
215+
spotify_id = self._get_id('album', album_id, self.id_regex)
220216
if spotify_id is None:
221217
return None
222218

@@ -330,7 +326,7 @@ def track_for_id(self, track_id=None, track_data=None):
330326
:rtype: beets.autotag.hooks.TrackInfo or None
331327
"""
332328
if track_data is None:
333-
spotify_id = self._get_id('track', track_id)
329+
spotify_id = self._get_id('track', track_id, self.id_regex)
334330
if spotify_id is None:
335331
return None
336332
track_data = self._handle_response(

‎test/test_discogs.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from test.helper import capture_log
2222

2323
from beets import config
24+
from beets.util.id_extractors import extract_discogs_id_regex
2425

2526
from beetsplug.discogs import DiscogsPlugin
2627

@@ -371,7 +372,7 @@ def test_album_for_id(self):
371372
('005b84a0-ecd6-39f1-b2f6-6eb48756b268', ''),
372373
]
373374
for test_pattern, expected in test_patterns:
374-
match = DiscogsPlugin.extract_release_id_regex(test_pattern)
375+
match = extract_discogs_id_regex(test_pattern)
375376
if not match:
376377
match = ''
377378
self.assertEqual(match, expected)

‎test/test_plugins.py

+66
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
from beets.dbcore import types
2727
from mediafile import MediaFile
2828
from beets.util import displayable_path, bytestring_path, syspath
29+
from beets.plugins import MetadataSourcePlugin
30+
from beets.util.id_extractors import spotify_id_regex, deezer_id_regex, \
31+
beatport_id_regex
2932

3033
from test.test_importer import ImportHelper, AutotagStub
3134
from test.test_ui_importer import TerminalImportSessionSetup
@@ -558,6 +561,69 @@ def foo(self, session, task):
558561
require=ANY)
559562

560563

564+
class ParseSpotifyIDTest(unittest.TestCase):
565+
def test_parse_id_correct(self):
566+
id_string = "39WqpoPgZxygo6YQjehLJJ"
567+
out = MetadataSourcePlugin._get_id(
568+
"album", id_string, spotify_id_regex)
569+
self.assertEqual(out, id_string)
570+
571+
def test_parse_id_non_id_returns_none(self):
572+
id_string = "blah blah"
573+
out = MetadataSourcePlugin._get_id(
574+
"album", id_string, spotify_id_regex)
575+
self.assertEqual(out, None)
576+
577+
def test_parse_id_url_finds_id(self):
578+
id_string = "39WqpoPgZxygo6YQjehLJJ"
579+
id_url = "https://open.spotify.com/album/%s" % id_string
580+
out = MetadataSourcePlugin._get_id(
581+
"album", id_url, spotify_id_regex)
582+
self.assertEqual(out, id_string)
583+
584+
585+
class ParseDeezerIDTest(unittest.TestCase):
586+
def test_parse_id_correct(self):
587+
id_string = "176356382"
588+
out = MetadataSourcePlugin._get_id(
589+
"album", id_string, deezer_id_regex)
590+
self.assertEqual(out, id_string)
591+
592+
def test_parse_id_non_id_returns_none(self):
593+
id_string = "blah blah"
594+
out = MetadataSourcePlugin._get_id(
595+
"album", id_string, deezer_id_regex)
596+
self.assertEqual(out, None)
597+
598+
def test_parse_id_url_finds_id(self):
599+
id_string = "176356382"
600+
id_url = "https://www.deezer.com/album/%s" % id_string
601+
out = MetadataSourcePlugin._get_id(
602+
"album", id_url, deezer_id_regex)
603+
self.assertEqual(out, id_string)
604+
605+
606+
class ParseBeatportIDTest(unittest.TestCase):
607+
def test_parse_id_correct(self):
608+
id_string = "3089651"
609+
out = MetadataSourcePlugin._get_id(
610+
"album", id_string, beatport_id_regex)
611+
self.assertEqual(out, id_string)
612+
613+
def test_parse_id_non_id_returns_none(self):
614+
id_string = "blah blah"
615+
out = MetadataSourcePlugin._get_id(
616+
"album", id_string, beatport_id_regex)
617+
self.assertEqual(out, None)
618+
619+
def test_parse_id_url_finds_id(self):
620+
id_string = "3089651"
621+
id_url = "https://www.beatport.com/release/album-name/%s" % id_string
622+
out = MetadataSourcePlugin._get_id(
623+
"album", id_url, beatport_id_regex)
624+
self.assertEqual(out, id_string)
625+
626+
561627
def suite():
562628
return unittest.TestLoader().loadTestsFromName(__name__)
563629

0 commit comments

Comments
 (0)
Please sign in to comment.