-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdb.py
127 lines (114 loc) · 4.07 KB
/
db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import psycopg2
from psycopg2 import extras
import pandas as pd
from dotenv import dotenv_values
config = dotenv_values(".env")
POSTGRE_PW = config['POSTGRE_PW']
df = pd.read_parquet("spotify_final.parquet")
# Connect to database
try:
conn = psycopg2.connect(
host="192.168.1.225",
dbname="postgres",
user="postgres",
password=POSTGRE_PW,
port=5432
)
print("Connection successful")
except Exception as e:
print(f"Error: {e}")
cur = conn.cursor()
# First, ensure all the dimension tables are defined:
cur.execute("""
CREATE TABLE IF NOT EXISTS artist (
artistID VARCHAR(22) PRIMARY KEY,
name VARCHAR(255),
popularity INT
)
""")
cur.execute("""
CREATE TABLE IF NOT EXISTS album (
albumID VARCHAR(22) PRIMARY KEY,
name VARCHAR(255),
releaseDate TIMESTAMP,
popularity INT
)
""")
cur.execute("""
CREATE TABLE IF NOT EXISTS trackFeatures (
trackID VARCHAR(22) PRIMARY KEY,
danceability FLOAT,
energyLevel FLOAT,
instrumentalness FLOAT,
liveness FLOAT,
loudness FLOAT,
speechiness FLOAT,
tempo FLOAT,
duration_ms FLOAT,
timeSignature FLOAT
)
""")
# Then, define the track table with a foreign key reference:
cur.execute("""
CREATE TABLE IF NOT EXISTS track (
trackID VARCHAR(22) PRIMARY KEY,
artistID VARCHAR(22),
albumID VARCHAR(22),
name VARCHAR(255),
popularity INT,
genres TEXT[],
playlistSources TEXT[],
playlistOccurrences INT,
FOREIGN KEY (artistID) REFERENCES artist(artistID), -- Adding foreign key relationship; maintaining referential integrity within the database
FOREIGN KEY (albumID) REFERENCES album(albumID),
FOREIGN KEY (trackID) REFERENCES trackFeatures(trackID)
)
""")
# Insert data into the dimension tables first
artist_data_to_insert = df[['artist id', 'artist', 'artist popularity']].drop_duplicates().values.tolist()
extras.execute_batch(cur, """
INSERT INTO artist (artistID, name, popularity) VALUES
(%s, %s, %s)
ON CONFLICT (artistID) DO NOTHING
""", artist_data_to_insert)
album_data_to_insert = df[['album id', 'album', 'album release date', 'album popularity']].drop_duplicates().values.tolist()
extras.execute_batch(cur, """
INSERT INTO album (albumID, name, releaseDate, popularity) VALUES
(%s, %s, %s, %s)
ON CONFLICT (albumID) DO NOTHING
""", album_data_to_insert)
track_features_data_to_insert = df[['id', 'danceability', 'energy level', 'instrumentalness',
'liveness', 'loudness', 'speechiness', 'tempo',
'duration_ms', 'time signature']].drop_duplicates().values.tolist()
extras.execute_batch(cur, """
INSERT INTO trackFeatures (trackID, danceability, energyLevel, instrumentalness,
liveness, loudness, speechiness, tempo,
duration_ms, timeSignature) VALUES
(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (trackID) DO NOTHING
""", track_features_data_to_insert)
conn.commit() # Commit after inserting into dimension tables
# Prepare and insert data into the track table
track_data_to_insert = [
(
row['id'],
row['artist id'],
row['album id'],
row['name'],
int(row['popularity']),
row['genres'],
row['playlist sources'],
int(row['playlist occurrences'])
)
for index, row in df.iterrows()
]
extras.execute_batch(cur, """
INSERT INTO track (trackID, artistID, albumID, name, popularity, genres,
playlistSources, playlistOccurrences) VALUES
(%s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (trackID) DO NOTHING
""", track_data_to_insert)
conn.commit() # Commit after inserting into track
# Clean up
cur.close()
conn.close()