-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_mining.py
142 lines (104 loc) · 4.54 KB
/
data_mining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# std imports
import os
import re
import csv
import json
import logging
import requests
# API imports
import wptools
import wikipedia
import pylast
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
# Last.fm API key handed to pylast when the network client is created.
# NOTE(review): this key is committed in plain text; consider loading it from
# the environment or from an untracked file instead.
last_fm_api_key = "194ebdf5b49fa996adb5ffb9bfcab1db"

# File-system locations used by this script.
# NOTE(review): passwd_path is not referenced by any code visible in this
# file; confirm whether it is still needed.
passwd_path = "hidden/passwdData.csv"
albums_info_path = "album_info/albums.csv"
dest_path = "album_info/album_info.json"
album_covers = "album_info/album_covers/"

# Spotify credentials. SpotifyClientCredentials() is created without arguments
# further down, so the keys are taken from these environment variables.
# Assigning None to os.environ raises TypeError at import time, therefore the
# variables are only written when a key has actually been pasted below;
# otherwise whatever is already exported in the environment is used.
spotify_client_id = None  # Here the spotify client ID key should be pasted
spotify_client_secret = None  # Here the spotify client secret key should be pasted
if spotify_client_id is not None:
    os.environ['SPOTIPY_CLIENT_ID'] = spotify_client_id
if spotify_client_secret is not None:
    os.environ['SPOTIPY_CLIENT_SECRET'] = spotify_client_secret

# Spotify audio-feature keys that get_features() averages over an album.
features_list = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                 'liveness', 'valence', 'tempo']
def read_album_info_from_csv(path):
    """Read the first two columns of a semicolon-delimited CSV file.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A list of ``(first_field, second_field)`` tuples, one per usable row,
        in file order. The caller in this file unpacks each pair as
        ``(artist, title)``.

    Rows with fewer than two fields are skipped instead of raising
    ``IndexError``; non-empty short rows are reported with a warning.
    """
    result = []
    with open(path, "r", newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=";")
        for line_number, row in enumerate(csv_reader, start=1):
            if len(row) < 2:
                # csv.reader yields [] for blank lines (e.g. a trailing
                # newline at the end of the file), so indexing would fail.
                if row:
                    logging.warning(f"Skipping malformed row {line_number} in {path}: {row}")
                continue
            result.append((row[0], row[1]))
    return result
def get_features(album, artist, spotify):
    """Save an album's cover and return its duration-weighted audio features.

    The first Spotify search hit for ``album``/``artist`` is used. Its cover
    is written to ``album_covers`` as ``"<artist> - <album>.png"`` (with
    ``/``, ``?`` and ``:`` replaced by spaces), and every key in
    ``features_list`` is averaged over the album's tracks, weighted by each
    track's ``duration_ms``.

    Args:
        album: Album title used in the Spotify search query.
        artist: Artist name used in the Spotify search query.
        spotify: Client object providing ``search``, ``album_tracks`` and
            ``audio_features`` (a ``spotipy.Spotify`` instance in this file).

    Returns:
        A dict mapping each name in ``features_list`` to its weighted average,
        or ``{}`` when the album is not found or no track features with a
        positive total duration are available.
    """
    album_info = spotify.search(q="album:{} artist:{}".format(album, artist),
                                type="album")['albums']['items']
    if not album_info:
        logging.warning(f"Album {artist} - {album} hasn't been found on spotify")
        return {}
    best_match = album_info[0]

    # --- cover download ---------------------------------------------------
    images = best_match.get('images') or []
    if images:
        # Keep using the third image when it exists; fall back to the last
        # one instead of raising IndexError when fewer are listed.
        image = images[2] if len(images) > 2 else images[-1]
        cover = requests.get(image['url'])
        if cover.ok:
            file_name = "{} - {}.png".format(artist, album).replace('/', ' ') \
                .replace('?', ' ').replace(':', ' ')
            # The target directory may not exist yet on a fresh checkout.
            os.makedirs(album_covers, exist_ok=True)
            with open(album_covers + file_name, "wb") as file:
                file.write(cover.content)
        else:
            # Do not store an HTTP error body under a .png name.
            logging.warning(f"Cover of album {artist} - {album} couldn't be downloaded "
                            f"(HTTP {cover.status_code})")
    else:
        logging.warning(f"Album {artist} - {album} has no cover on spotify")

    # --- duration-weighted feature averages --------------------------------
    tracks = spotify.album_tracks(best_match['uri'])['items']
    track_ids = [track['id'] for track in tracks]
    # One batched request instead of one request per track.
    tracks_features = (spotify.audio_features(track_ids) or []) if track_ids else []

    album_features = {feature: 0 for feature in features_list}
    album_duration = 0
    for track_features in tracks_features:
        if not track_features:
            # Spotify reports None for tracks without an audio analysis.
            continue
        track_duration = track_features['duration_ms']
        for feature in features_list:
            album_features[feature] += track_features[feature] * track_duration
        album_duration += track_duration

    if album_duration == 0:
        # Nothing to average; avoid dividing by zero.
        logging.warning(f"Album {artist} - {album} has no audio features on spotify")
        return {}

    return {feature: total / album_duration for feature, total in album_features.items()}
def get_tags(artist, title, network):
    """Fetch the top last.fm tags of an album together with their weights.

    Args:
        artist: Artist name; every " and " is rewritten to " & " before the
            lookup.
        title: Album title; rewritten the same way as ``artist``.
        network: Object providing ``get_album(title=..., artist=...)`` (a
            ``pylast.LastFMNetwork`` in this file).

    Returns:
        A dict mapping tag name to its integer weight for at most ten tags,
        or ``{}`` when the lookup raises ``pylast.WSError``.
    """
    logging.info(f'Artist: {artist:30},Title: {title:30}')
    # last.fm uses "&" more often than "and", so search with "&".
    # str.replace leaves the text untouched when " and " does not occur.
    artist, title = artist.replace(" and ", " & "), title.replace(" and ", " & ")
    try:
        top_tags = network.get_album(title=title, artist=artist).get_top_tags(limit=10)
        if not top_tags:
            logging.warning(f'Album {artist} - {title} has 0 tags on last.fm')
        weights = {}
        for top_tag in top_tags:
            weights[top_tag.item.name] = int(top_tag.weight)
        return weights
    except pylast.WSError:
        logging.warning(f"Album {artist} - {title} hasn't been found on last.fm")
        return {}
def format_genres(genres_str):
    """Extract lower-cased genre names from ``[[...]]`` wiki links.

    Args:
        genres_str: Text containing zero or more ``[[...]]`` links. Text
            outside such links is ignored.

    Returns:
        A list with one lower-cased entry per link, in order of appearance.
        For piped links (``[[target|label]]``) the shortest of the
        ``|``-separated parts is kept.
    """
    # Raw string: "\[" is an invalid escape sequence in a normal literal.
    links = re.findall(r'\[\[.*?]]', genres_str)
    # link[2:-2] strips the surrounding "[[" and "]]".
    return [min(link[2:-2].lower().split("|"), key=len) for link in links]
def remove_duplicates(arr):
    """Return the unique items of ``arr`` as a list, keeping first-seen order.

    Args:
        arr: Iterable of hashable items.

    Returns:
        A new list in which every distinct item appears exactly once, in the
        order of its first occurrence. Unlike ``list(set(arr))`` the result is
        the same on every run, which keeps generated output reproducible.
    """
    # dict keys are unique and preserve insertion order.
    return list(dict.fromkeys(arr))
def get_genre(album, artist):
    """Look up an album's genres in the infobox of its Wikipedia page.

    Args:
        album: Album title.
        artist: Artist name, appended in parentheses to the search query.

    Returns:
        The de-duplicated genres from the first of the top five search hits
        whose infobox has a non-empty ``genre`` entry, or ``[]`` when none of
        them has one.
    """
    candidates = wikipedia.search(album + " (" + artist + ")")
    # Only the first five search hits are inspected.
    for page_title in candidates[:5]:
        parsed = wptools.page(page_title, silent=True).get_parse()
        infobox = parsed.data.get('infobox') if parsed else None
        if not infobox:
            continue
        genres_str = infobox.get('genre')
        if genres_str:
            return remove_duplicates(format_genres(genres_str))
    logging.warning(f"Album {artist} - {album} hasn't been found on wikipedia")
    return []
def get_albums_info(api_key, albums_names):
    """Collect tags, audio features and genres for every listed album.

    Args:
        api_key: last.fm API key passed to ``pylast.LastFMNetwork``.
        albums_names: Iterable of ``(artist, title)`` pairs.

    Returns:
        A list with one dict per album, holding the keys ``"title"``,
        ``"artist"``, ``"tags"``, ``"features"`` and ``"genre"``.
    """
    # Set up the API clients once and reuse them for every album.
    lastfm = pylast.LastFMNetwork(api_key=api_key)
    spotify_client = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

    collected = []
    for artist, title in albums_names:
        entry = {"title": title, "artist": artist}
        entry["tags"] = get_tags(artist, title, lastfm)
        entry["features"] = get_features(title, artist, spotify_client)
        entry["genre"] = get_genre(title, artist)
        collected.append(entry)
    return collected
def write_to_json(data, path):
    """Write ``data`` to ``path`` as JSON with 4-space indent and sorted keys.

    Args:
        data: JSON-serialisable object.
        path: Destination file; it is created or overwritten.
    """
    with open(path, 'w') as out:
        serialized = json.dumps(data, sort_keys=True, indent=4)
        out.write(serialized)
def main():
    """Read the album list, gather information for each album and save it as JSON."""
    write_to_json(
        get_albums_info(
            api_key=last_fm_api_key,
            albums_names=read_album_info_from_csv(albums_info_path),
        ),
        dest_path,
    )
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()