-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AutoMLBenchmark TimeSeries Prototype. #6
Changes from all commits
fdac87d
acae465
b5723cf
55c63e9
0f38986
f932669
16a165b
758b92d
04872e7
888a1cb
e15de3e
866492f
9252835
18cc6af
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ venv/ | |
.idea/ | ||
*.iml | ||
*.swp | ||
launch.json | ||
|
||
# tmp files | ||
.ipynb_checkpoints/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,7 +26,7 @@ | |
log = logging.getLogger(__name__) | ||
|
||
|
||
def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dtype=None): | ||
def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dtype=None, timestamp_column=None): | ||
""" | ||
read csv file to DataFrame. | ||
|
||
|
@@ -37,13 +37,19 @@ def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dty | |
:param header: if the columns header should be read. | ||
:param as_data_frame: if the result should be returned as a data frame (default) or a numpy array. | ||
:param dtype: data type for columns. | ||
:param timestamp_column: column name for timestamp, to ensure dates are correctly parsed by pandas. | ||
:return: a DataFrame | ||
""" | ||
if dtype is not None and timestamp_column is not None and timestamp_column in dtype: | ||
dtype = dtype.copy() # to avoid outer context manipulation | ||
del dtype[timestamp_column] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid outer context manipulation, instead copy dtype to a new object and then delete timestamp_column. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added copy() |
||
|
||
df = pd.read_csv(path, | ||
nrows=nrows, | ||
header=0 if header else None, | ||
index_col=0 if index else None, | ||
dtype=dtype) | ||
dtype=dtype, | ||
parse_dates=[timestamp_column] if timestamp_column is not None else None) | ||
return df if as_data_frame else df.values | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -228,12 +228,16 @@ def load_predictions(predictions_file): | |
try: | ||
df = read_csv(predictions_file, dtype=object) | ||
log.debug("Predictions preview:\n %s\n", df.head(10).to_string()) | ||
if rconfig().test_mode: | ||
TaskResult.validate_predictions(df) | ||
if df.shape[1] > 2: | ||
return ClassificationResult(df) | ||
if 'y_past_period_error' in df.columns: | ||
return TimeSeriesResult(df) | ||
else: | ||
return RegressionResult(df) | ||
if rconfig().test_mode: | ||
TaskResult.validate_predictions(df) | ||
|
||
if df.shape[1] > 2: | ||
return ClassificationResult(df) | ||
else: | ||
return RegressionResult(df) | ||
except Exception as e: | ||
return ErrorResult(ResultError(e)) | ||
else: | ||
|
@@ -255,7 +259,8 @@ def save_predictions(dataset: Dataset, output_file: str, | |
predictions: Union[A, DF, S] = None, truth: Union[A, DF, S] = None, | ||
probabilities: Union[A, DF] = None, probabilities_labels: Union[list, A] = None, | ||
target_is_encoded: bool = False, | ||
preview: bool = True): | ||
preview: bool = True, | ||
quantiles: Union[A, DF] = None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add quantiles to docstring There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
""" Save class probabilities and predicted labels to file in csv format. | ||
|
||
:param dataset: | ||
|
@@ -266,6 +271,7 @@ def save_predictions(dataset: Dataset, output_file: str, | |
:param probabilities_labels: | ||
:param target_is_encoded: | ||
:param preview: | ||
:param quantiles: | ||
:return: None | ||
""" | ||
log.debug("Saving predictions to `%s`.", output_file) | ||
|
@@ -308,6 +314,24 @@ def save_predictions(dataset: Dataset, output_file: str, | |
|
||
df = df.assign(predictions=preds) | ||
df = df.assign(truth=truth) | ||
|
||
if dataset.type == DatasetType.timeseries: | ||
if quantiles is not None: | ||
quantiles = quantiles.reset_index(drop=True) | ||
df = pd.concat([df, quantiles], axis=1) | ||
|
||
period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works. | ||
|
||
# we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)| | ||
# 1. retrieve item_ids for each sequence/item | ||
item_ids, inverse_item_ids = np.unique(dataset.test.X[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True) | ||
# 2. capture sequences in a list | ||
y_past = [dataset.test.y.squeeze().to_numpy()[inverse_item_ids == i][:-dataset.prediction_length] for i in range(len(item_ids))] | ||
# 3. calculate period error per sequence | ||
y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past] | ||
# 4. repeat period error for each sequence, to save one for each element | ||
y_past_period_error_rep = np.repeat(y_past_period_error, dataset.prediction_length) | ||
df = df.assign(y_past_period_error=y_past_period_error_rep) | ||
if preview: | ||
log.info("Predictions preview:\n %s\n", df.head(20).to_string()) | ||
backup_file(output_file) | ||
|
@@ -656,6 +680,71 @@ def r2(self): | |
"""R^2""" | ||
return float(r2_score(self.truth, self.predictions)) | ||
|
||
class TimeSeriesResult(RegressionResult): | ||
|
||
def __init__(self, predictions_df, info=None): | ||
super().__init__(predictions_df, info) | ||
self.truth = self.df['truth'].values if self.df is not None else None #.iloc[:, 1].values if self.df is not None else None | ||
self.predictions = self.df['predictions'].values if self.df is not None else None #.iloc[:, -2].values if self.df is not None else None | ||
self.y_past_period_error = self.df['y_past_period_error'].values | ||
self.quantiles = self.df.iloc[:, 2:-1].values | ||
self.quantiles_probs = np.array([float(q) for q in self.df.columns[2:-1]]) | ||
self.truth = self.truth.astype(float, copy=False) | ||
self.predictions = self.predictions.astype(float, copy=False) | ||
self.quantiles = self.quantiles.astype(float, copy=False) | ||
self.y_past_period_error = self.y_past_period_error.astype(float, copy=False) | ||
|
||
self.target = Feature(0, 'target', 'real', is_target=True) | ||
self.type = DatasetType.timeseries | ||
|
||
@metric(higher_is_better=False) | ||
def mase(self): | ||
"""Mean Absolute Scaled Error""" | ||
return float(np.nanmean(np.abs(self.truth/self.y_past_period_error - self.predictions/self.y_past_period_error))) | ||
|
||
@metric(higher_is_better=False) | ||
def smape(self): | ||
"""Symmetric Mean Absolute Percentage Error""" | ||
num = np.abs(self.truth - self.predictions) | ||
denom = (np.abs(self.truth) + np.abs(self.predictions)) / 2 | ||
# If the denominator is 0, we set it to float('inf') such that any division yields 0 (this | ||
# might not be fully mathematically correct, but at least we don't get NaNs) | ||
denom[denom == 0] = math.inf | ||
return np.mean(num / denom) | ||
|
||
@metric(higher_is_better=False) | ||
def mape(self): | ||
"""Mean Absolute Percentage Error""" | ||
num = np.abs(self.truth - self.predictions) | ||
denom = np.abs(self.truth) | ||
# If the denominator is 0, we set it to float('inf') such that any division yields 0 (this | ||
# might not be fully mathematically correct, but at least we don't get NaNs) | ||
denom[denom == 0] = math.inf | ||
return np.mean(num / denom) | ||
|
||
@metric(higher_is_better=False) | ||
def nrmse(self): | ||
"""Normalized Root Mean Square Error""" | ||
return self.rmse() / np.mean(np.abs(self.truth)) | ||
|
||
@metric(higher_is_better=False) | ||
def wape(self): | ||
"""Weighted Average Percentage Error""" | ||
return np.sum(np.abs(self.truth - self.predictions)) / np.sum(np.abs(self.truth)) | ||
|
||
@metric(higher_is_better=False) | ||
def ncrps(self): | ||
"""Normalized Continuous Ranked Probability Score""" | ||
quantile_losses = 2 * np.sum( | ||
np.abs( | ||
(self.quantiles - self.truth[:, None]) | ||
* ((self.quantiles >= self.truth[:, None]) - self.quantiles_probs[None, :]) | ||
), | ||
axis=0, | ||
) | ||
denom = np.sum(np.abs(self.truth))  # scalar normalizer (the shape comment [num_time_series, num_quantiles] applies to quantile_losses above) | ||
weighted_losses = quantile_losses.sum(0) / denom # shape [num_quantiles] | ||
return weighted_losses.mean() | ||
|
||
_encode_predictions_and_truth_ = False | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
from amlb.benchmark import TaskConfig | ||
from amlb.data import Dataset | ||
from amlb.data import Dataset, DatasetType | ||
from amlb.utils import call_script_in_same_dir | ||
|
||
|
||
|
@@ -10,6 +10,15 @@ def setup(*args, **kwargs): | |
def run(dataset: Dataset, config: TaskConfig): | ||
from frameworks.shared.caller import run_in_venv | ||
|
||
if hasattr(dataset, 'timestamp_column') is False: | ||
dataset.timestamp_column = None | ||
Comment on lines
+13
to
+14
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This edits outer context of dataset. No need to do this. |
||
if hasattr(dataset, 'id_column') is False: | ||
dataset.id_column = None | ||
if hasattr(dataset, 'prediction_length') is False: | ||
raise AttributeError("Unspecified `prediction_length`.") | ||
if dataset.type is not DatasetType.timeseries: | ||
raise ValueError("AutoGluonTS only supports timeseries.") | ||
|
||
data = dict( | ||
# train=dict(path=dataset.train.data_path('parquet')), | ||
# test=dict(path=dataset.test.data_path('parquet')), | ||
|
@@ -19,9 +28,11 @@ def run(dataset: Dataset, config: TaskConfig): | |
name=dataset.target.name, | ||
classes=dataset.target.values | ||
), | ||
problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType | ||
problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType | ||
timestamp_column=dataset.timestamp_column, | ||
id_column=dataset.id_column, | ||
prediction_length=dataset.prediction_length | ||
) | ||
|
||
return run_in_venv(__file__, "exec.py", | ||
input_data=data, dataset=dataset, config=config) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VS Code creates it for debugging