-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathhelpers.py
108 lines (91 loc) · 3.37 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import lyricsgenius as genius
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
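# One-time setup: uncomment the two lines below to download the NLTK data
# (stop-word list and WordNet) that lyrics_to_words relies on.
#nltk.download('stopwords')
#nltk.download('wordnet')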
def search_data(query, n, access_token):
    """
    Fetch an artist's songs through the lyricsgenius library and return the
    fields artist, title, album, date and lyric as a pandas dataframe.

    parameters:
    query = artist or band to search
    n = max numbers of songs
    access_token = your access token of the genius api

    returns:
    dataframe with the columns 'artist', 'title', 'album', 'date' and 'lyric',
    one row per song; the dataframe is empty (same columns) when the search
    finds no artist
    """
    api = genius.Genius(access_token)
    artist = api.search_artist(query, max_songs=n, sort='popularity')

    # lyricsgenius returns None instead of an Artist when nothing matches the
    # query; treat that as "no songs" rather than failing on `None.songs`.
    songs = artist.songs if artist is not None else []

    df = pd.DataFrame({
        'artist': [song.artist for song in songs],
        'title': [song.title for song in songs],
        'album': [song.album for song in songs],
        'date': [song.year for song in songs],
        'lyric': [song.lyrics for song in songs],
    })
    return df
def clean_lyrics(df, column):
    """
    Normalise a dataframe column of lyrics: lower-case the text, drop section
    markers (verse, chorus, bridge, intro, ...), brackets, line breaks and
    punctuation other than apostrophes, then trim surrounding whitespace.

    The column is overwritten in place on the dataframe that is passed in,
    and that same dataframe is returned.

    parameters:
    df = dataframe
    column = name of the column to clean

    returns:
    the input dataframe with the cleaned column
    """
    lyrics = df[column].str.lower()

    # `regex` is passed explicitly on every call: the default of
    # Series.str.replace differs between pandas versions, and with the
    # literal default the patterns below would silently match nothing.
    # NOTE: the character class [1|2|3] matches the digits 1-3 and also '|'.
    lyrics = lyrics.str.replace(r"verse |[1|2|3]|chorus|bridge|outro", "", regex=True)
    lyrics = lyrics.str.replace("[", "", regex=False).str.replace("]", "", regex=False)
    lyrics = lyrics.str.replace(r"instrumental|intro|guitar|solo", "", regex=True)

    # Flatten line breaks, then strip everything that is not a word
    # character, an apostrophe or whitespace.
    lyrics = lyrics.str.replace("\n", " ", regex=False)
    lyrics = lyrics.str.replace(r"[^\w\d'\s]+", "", regex=True)
    lyrics = lyrics.str.replace("efil ym fo flah", "", regex=False)

    df[column] = lyrics.str.strip()
    return df
def lyrics_to_words(document):
    """
    Reduce a lyrics text to its meaningful words: english stopwords are
    dropped, punctuation characters are removed and every remaining word is
    lemmatized.

    parameters:
    document: text to split to single words

    returns:
    the processed words joined by single spaces
    """
    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Translation table that deletes every character in string.punctuation.
    drop_punctuation = str.maketrans('', '', string.punctuation)

    # Stopwords are filtered before punctuation is stripped, so a token that
    # still carries punctuation (e.g. "the,") is not treated as a stopword.
    kept_tokens = [token for token in document.lower().split()
                   if token not in english_stops]
    without_punctuation = " ".join(kept_tokens).translate(drop_punctuation)

    return " ".join(map(lemmatizer.lemmatize, without_punctuation.split()))
def create_decades(df):
    """
    Add the columns 'year' and 'decade' to the dataframe, derived from the
    release date of each song, so songs and lyrics can be grouped by decade.

    The input dataframe is modified in place ('date' is converted to text and
    'year' / 'decade' are added); the returned dataframe is a reordered
    selection of its columns.

    parameters:
    df = dataframe with the columns 'artist', 'title', 'album', 'date' and
         'lyric'; the part of 'date' before the first "-" is read as the year

    returns:
    dataframe with the columns artist, title, album, decade, year, date, lyric.
    'year' is 0 when the date is missing or cannot be read as a number.
    'decade' is "70s", "80s", "90s", "00s" or "10s" (every year from 2010 on),
    and missing (None) for any other year.
    """
    # Missing dates become 0; the result of fillna has to be assigned back,
    # otherwise the missing values survive and break the year conversion.
    df['date'] = df['date'].fillna(0).astype("str")

    # Take the text before the first "-" for the whole column at once, and
    # turn anything that is not a number into year 0 instead of raising.
    year_text = df['date'].str.split("-").str[0]
    df['year'] = pd.to_numeric(year_text, errors='coerce').fillna(0).astype("int")

    decades = []
    for year in df['year']:
        if 1970 <= year < 1980:
            label = "70s"
        elif 1980 <= year < 1990:
            label = "80s"
        elif 1990 <= year < 2000:
            label = "90s"
        elif 2000 <= year < 2010:
            label = "00s"
        elif 2010 <= year:
            # NOTE: open-ended on purpose to keep the existing labels stable;
            # songs from 2020 onward are still reported as "10s".
            label = "10s"
        else:
            # Unknown or pre-1970 year: still emit one entry per row so the
            # list always lines up with the dataframe.
            label = None
        decades.append(label)
    df['decade'] = decades

    df = df[['artist', 'title', 'album', 'decade', 'year', 'date', 'lyric']]
    return df