# train.py
# python 3
# OP
import pandas as pd, numpy as np
# ML
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
##########################################################
# ---------------- TODO ----------------
# 1) predict on test data
# 2) tune the hyper-parameters
# 3) modify the train function so that users can rate songs randomly
# ---------------- TODO ----------------
##########################################################
# -----------------------------
# ML
def main(train_path='data/train.csv', test_size=0.33, random_state=42):
    """Grid-search a random forest on song features and score a hold-out split.

    Reads the training CSV, uses eleven audio-feature columns as predictors
    and the ``ratings`` column as the target, holds out ``test_size`` of the
    rows, tunes a ``RandomForestClassifier`` over ``max_depth`` and
    ``min_samples_split`` with 5-fold stratified cross-validation, and
    predicts ratings for the held-out rows.

    Args:
        train_path: Path of the training CSV. It must contain the feature
            columns listed below plus ``id`` and ``ratings``.
        test_size: Fraction of rows held out for the final prediction.
        random_state: Seed shared by the train/test split, the CV folds and
            the forest, so repeated runs give the same result.

    Returns:
        pandas.DataFrame: the held-out rows (features + ``predict_rate``)
        merged on the row index with the full training frame. Columns
        present on both sides of the merge (the features) appear twice,
        with pandas' default ``_x`` / ``_y`` suffixes.
    """
    feature_columns = [
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'key',
        'liveness',
        'loudness',
        'mode',
        'speechiness',
        'tempo',
        'valence',
    ]

    # load data
    # NOTE: data/test.csv is intentionally not read here; nothing in this
    # function uses it yet, and loading it only made the run fail when the
    # file was absent.
    df_train = pd.read_csv(train_path)

    # data preprocess
    X = df_train[feature_columns]
    y = df_train['ratings']

    # train-test split (the held-out targets are not needed separately:
    # the true ratings come back through the merge with df_train below)
    X_train, X_test, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # ---------------- model 1) : RF ----------------
    # config
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    # hyper-parameter grid; still to be refined
    rf_params = {
        'max_depth': range(1, 11),
        'min_samples_split': range(5, 20),
    }

    # model train; the forest is seeded like the split and the folds so the
    # selected model does not change from run to run
    rf_grid = GridSearchCV(
        RandomForestClassifier(random_state=random_state),
        rf_params,
        cv=skf,
        n_jobs=-1,
        verbose=True,
    )
    rf_grid.fit(X_train, y_train)
    # report the winning configuration and its mean CV score
    print(rf_grid.best_estimator_, rf_grid.best_score_)

    # predict on the held-out split; assign() builds a new frame rather than
    # writing into X_test, which is derived from X and would trigger
    # pandas' SettingWithCopyWarning
    scored = X_test.assign(predict_rate=rf_grid.predict(X_test))

    # merge back to the original frame to recover 'id' and the true 'ratings'
    prediction_ = pd.merge(scored, df_train, left_index=True, right_index=True)

    # print final predict result
    print(prediction_[['id', 'predict_rate', 'ratings']])
    return prediction_
# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()