-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpredictor.py
90 lines (78 loc) · 4.02 KB
/
predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import logging
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import datetime
import matplotlib.pyplot as plt
from constants import Metrics
class Predictor:
def __init__(self, use_data_after_date: str = None):
self.accuracy = None
self.use_data_after_date = use_data_after_date
self.predictions = []
self.metrics = None
def parse_bookings(self):
df_rolling = pd.read_csv('bookings.csv', parse_dates=['date', 'start_time', 'end_time'], index_col=[0])
df_rolling = df_rolling.drop(['start_time', 'end_time'], axis=1)
df_rolling[df_rolling > 0] = 1
df_rolling = df_rolling.reset_index()
df_rolling['days_since_booked'] = (df_rolling.date - np.roll(df_rolling.date, shift=1))
df_rolling.days_since_booked = df_rolling.days_since_booked.apply(lambda x: x.days)
df_rolling.rename(columns={'total_time_hours': 'booked'}, inplace=True)
# drop first row
df_rolling = df_rolling.drop(df_rolling.index[0])
df_rolling_filled_dates = df_rolling.set_index('date').resample('D').mean()
# increment by 1 each day until the next booking
for _, row in df_rolling_filled_dates.iterrows():
if row.booked == 1:
counter = 1
else:
row.days_since_booked = counter
counter += 1
df_rolling_filled_dates.replace(np.nan, 0, inplace=True)
self.data = df_rolling_filled_dates
# Hack to fix large gap in data when I was away on exchange for 4 months
if self.use_data_after_date:
self.data = self.data.loc[self.use_data_after_date:]
def fit_model(self):
# Beacuse of uneven distribution of labels, we adjust weights inversely proportional to class frequencies
X, y = self.data.days_since_booked.values.reshape(-1, 1), self.data.booked.values.ravel()
# split
N = len(X)
self.X_train, self.X_test, self.y_train, self.y_test = X[:int(N*0.8)], X[int(N*0.8):], y[:int(N*0.8)], y[int(N*0.8):]
self.model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=1/y.mean())
self.model.fit(self.X_train, self.y_train)
self.generate_metrics()
def generate_metrics(self):
self.metrics = {
Metrics.F1_SCORE: f1_score(self.y_test, self.model.predict(self.X_test)),
Metrics.ACCURACY: self.model.score(self.X_test, self.y_test)
}
def predict(self, threshold, look_forward_days=30) -> None:
# get date from the last row of df_rolling_filled_dates
latest_booking_date = self.data.index[-1].date()
if latest_booking_date > datetime.date.today():
logging.warning('Laundry slot already booked in the future. No need to book laundry slot.')
return
dates_forward = np.array([latest_booking_date + datetime.timedelta(days=i) for i in range(look_forward_days)])
vals = np.arange(0, look_forward_days).reshape(-1, 1)
predictions = self.model.predict_proba(vals)[:, 1]
self.make_figure(dates_forward, predictions)
self.make_predictions_dict(dates_forward, predictions, threshold)
def make_predictions_dict(self, dates_forward, preds, threshold) -> None:
print(dates_forward[preds >= threshold])
if len(preds[preds >= threshold]) == 0:
logging.info('No predictions above threshold')
return
self.predictions = pd.DataFrame(
{"date": dates_forward[preds >= threshold], "prediction": preds[preds >= threshold]},
columns=['date', 'prediction'])
def make_figure(self, dates_forward, preds):
# TODO make this figure prettier
plt.figure(figsize=(12, 10))
plt.scatter(dates_forward, preds, label='Predicted probability')
# plt.scatter(vals[y_test==1], y_test[y_test==1], label='actual')
plt.legend()
plt.tight_layout()
plt.savefig('current_predictions.png', dpi=150, bbox_inches="tight")