-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_tools.py
90 lines (66 loc) · 2.81 KB
/
data_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import numpy as np
# Data preprocessing
def timeseries_to_XY(samples: list, buffer: int) -> tuple:
    """
    Re-frame a time series as supervised-learning (X, Y) pairs.

    Before machine learning can be used, a forecasting problem must be turned
    from a single sequence into pairs of input and output sequences: every
    target sample ``samples[i + buffer]`` is associated with the window of the
    ``buffer`` samples that precede it, ``samples[i:i + buffer]``.

    Args:
        samples: The time series, oldest sample first.
        buffer: Number of previous (lag) samples in each input window.
            Must not be negative.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays.
        ``x`` has shape ``(len(samples) - buffer, buffer, 1)`` and ``y`` has
        shape ``(len(samples) - buffer,)``.
        If there are not enough samples to build a single pair
        (``len(samples) <= buffer``), ``x`` holds one window made of all the
        available samples, shape ``(1, len(samples), 1)``, and ``y`` is empty.

    Raises:
        ValueError: If ``buffer`` is negative.
    """
    if buffer < 0:
        # A negative lag would make the indexing below wrap around the end of
        # the series and build meaningless (and ragged) windows.
        raise ValueError("buffer must be non-negative, got {}".format(buffer))

    n_pairs = len(samples) - buffer
    if n_pairs <= 0:
        # Not enough samples for the lag: use all the ones we have as a
        # single window; there is no target to pair it with.
        x = [samples]
        y = []
    else:
        # X is [len(samples) - buffer] x buffer: one window per target.
        x = [samples[i:i + buffer] for i in range(n_pairs)]
        # Y is [len(samples) - buffer] x 1: targets start after the first
        # `buffer` samples.
        y = [samples[i + buffer] for i in range(n_pairs)]

    x, y = np.array(x), np.array(y)
    # Reshape X to be compatible with model.predict(): the LSTM needs data
    # with the format [samples, time steps, features].
    x = np.reshape(x, (x.shape[0], x.shape[1], 1))
    return x, y
#TODO check if timeseries is stationary, otherwise detrend
def timeseries_diff(timeseries, interval=1, order=1):
    """
    Remove a possible trend from a time series by differencing.

    One differencing pass replaces the series with
    ``timeseries[i] - timeseries[i - interval]`` for every ``i`` from
    ``interval`` up to the end, so each pass shortens the series by
    ``interval`` elements. A single pass removes a linear trend; use
    ``order=2``, ``order=3``, ... to repeat the operation when a quadratic,
    cubic, etc. trend has to be removed.
    For time series with a seasonal component, the lag (``interval``) may be
    expected to be the period (width) of the seasonality.
    See also ``pandas.Series.diff()``.

    Args:
        timeseries: Indexable sequence of numeric samples, oldest first.
        interval: Lag between the two samples that are subtracted.
            Expected to be a positive integer; it is not validated here.
        order: Number of times the differencing pass is applied.
            ``0`` returns a plain-list copy of the input.

    Returns:
        A new list with the differenced values; the input is not modified.

    Raises:
        ValueError: If ``order`` is negative.
    """
    if order < 0:
        raise ValueError("order must be non-negative, got {}".format(order))

    current = timeseries
    for _ in range(order):
        # Each pass differences the output of the previous one.
        current = [
            current[i] - current[i - interval]
            for i in range(interval, len(current))
        ]
    return list(current)
# def timeseries_inv_diff (timeseries_diff):
# inverted = list()
# for i in range(len(timeseries_diff)):
# value = yhat[i] + history[-interval]
# # value = inverse_difference(series, differenced[i], len(series)-i)
# inverted.append(value)
# from pandas import read_csv
# from pandas import datetime
# from sklearn.linear_model import LinearRegression
# from matplotlib import pyplot
# import numpy
# def parser(x):
# return datetime.strptime('190'+x, '%Y-%m')
# series = read_csv('shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
# # fit linear model
# X = [i for i in range(0, len(series))]
# X = numpy.reshape(X, (len(X), 1))
# y = series.values
# model = LinearRegression()
# model.fit(X, y)
# # calculate trend
# trend = model.predict(X)
# # plot trend
# pyplot.plot(y)
# pyplot.plot(trend)
# pyplot.show()
# # detrend
# detrended = [y[i]-trend[i] for i in range(0, len(series))]
# # plot detrended
# pyplot.plot(detrended)
# pyplot.show()