-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AutoMLBenchmark TimeSeries Prototype. #6
Changes from 5 commits
fdac87d
acae465
b5723cf
55c63e9
0f38986
f932669
16a165b
758b92d
04872e7
888a1cb
e15de3e
866492f
9252835
18cc6af
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ venv/ | |
.idea/ | ||
*.iml | ||
*.swp | ||
launch.json | ||
|
||
# tmp files | ||
.ipynb_checkpoints/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,7 +26,7 @@ | |
log = logging.getLogger(__name__) | ||
|
||
|
||
def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dtype=None): | ||
def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dtype=None, timestamp_column=None): | ||
""" | ||
read csv file to DataFrame. | ||
|
||
|
@@ -39,11 +39,15 @@ def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dty | |
:param dtype: data type for columns. | ||
:return: a DataFrame | ||
""" | ||
if dtype is not None and timestamp_column is not None and timestamp_column in dtype: | ||
del dtype[timestamp_column] | ||
Reviewer: Avoid manipulating the outer context; instead, copy dtype to a new object and then delete timestamp_column from the copy. Reply: Added copy(). |
||
|
||
df = pd.read_csv(path, | ||
nrows=nrows, | ||
header=0 if header else None, | ||
index_col=0 if index else None, | ||
dtype=dtype) | ||
dtype=dtype, | ||
parse_dates=[timestamp_column] if timestamp_column is not None else None) | ||
return df if as_data_frame else df.values | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -228,12 +228,16 @@ def load_predictions(predictions_file): | |
try: | ||
df = read_csv(predictions_file, dtype=object) | ||
log.debug("Predictions preview:\n %s\n", df.head(10).to_string()) | ||
if rconfig().test_mode: | ||
TaskResult.validate_predictions(df) | ||
if df.shape[1] > 2: | ||
return ClassificationResult(df) | ||
if 'y_past_period_error' in df.columns: | ||
return TimeSeriesResult(df) | ||
else: | ||
return RegressionResult(df) | ||
if rconfig().test_mode: | ||
TaskResult.validate_predictions(df) | ||
|
||
if df.shape[1] > 2: | ||
return ClassificationResult(df) | ||
else: | ||
return RegressionResult(df) | ||
except Exception as e: | ||
return ErrorResult(ResultError(e)) | ||
else: | ||
|
@@ -255,7 +259,8 @@ def save_predictions(dataset: Dataset, output_file: str, | |
predictions: Union[A, DF, S] = None, truth: Union[A, DF, S] = None, | ||
probabilities: Union[A, DF] = None, probabilities_labels: Union[list, A] = None, | ||
target_is_encoded: bool = False, | ||
preview: bool = True): | ||
preview: bool = True, | ||
quantiles: Union[A, DF] = None): | ||
Reviewer: Add quantiles to the docstring. Reply: Done. |
||
""" Save class probabilities and predicted labels to file in csv format. | ||
|
||
:param dataset: | ||
|
@@ -308,6 +313,16 @@ def save_predictions(dataset: Dataset, output_file: str, | |
|
||
df = df.assign(predictions=preds) | ||
df = df.assign(truth=truth) | ||
if quantiles is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be inside the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we raise exception if quantiles != None and problem type is not timeseries? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. currently this would make sense. In theory quantiles could be evaluated also for any regression task, but this is currently not done. |
||
quantiles.reset_index(drop=True, inplace=True) | ||
Reviewer: It is bold to do an in-place operation that alters the outer context; let's be safe and avoid in-place operations. (I know that it is probably OK here, but trust me when I say that the nastiest bugs are those involving outer-context manipulation caused by in-place operations.) Reply: Done. |
||
df = pd.concat([df, quantiles], axis=1) | ||
if dataset.type == DatasetType.timeseries: | ||
period_length = 1 # this period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this should be fine. | ||
Reviewer: If this is a TODO-style comment, then mark it as TODO so it isn't forgotten. |
||
item_ids, inverse_item_ids = np.unique(dataset.test.X[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True) | ||
y_past = [dataset.test.y.squeeze().to_numpy()[inverse_item_ids == i][:-dataset.prediction_length] for i in range(len(item_ids))] | ||
y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past] | ||
y_past_period_error_rep = np.repeat(y_past_period_error, dataset.prediction_length) | ||
Reviewer: Add inline comments to explain this; it is a lot to take in when unfamiliar with the code. |
||
df = df.assign(y_past_period_error=y_past_period_error_rep) | ||
if preview: | ||
log.info("Predictions preview:\n %s\n", df.head(20).to_string()) | ||
backup_file(output_file) | ||
|
@@ -656,6 +671,91 @@ def r2(self): | |
"""R^2""" | ||
return float(r2_score(self.truth, self.predictions)) | ||
|
||
class TimeSeriesResult(Result):
    """Scores point and quantile forecasts stored in a predictions frame.

    The frame is read as follows:

    - ``truth`` and ``predictions`` columns hold the target and the point forecast,
    - ``y_past_period_error`` holds the per-row scaling term used by :meth:`mase`,
    - every column from position 2 up to (but excluding) the last one is treated
      as a quantile forecast, and its column name must parse as a float giving
      the quantile level (e.g. ``"0.1"``).
    """

    def __init__(self, predictions_df, info=None):
        """
        :param predictions_df: predictions frame handed to ``Result``; may be None.
        :param info: forwarded unchanged to ``Result``.
        """
        super().__init__(predictions_df, info)
        if self.df is None:
            # No predictions available: expose every series as None instead of
            # failing on the column lookups below.
            self.truth = None
            self.predictions = None
            self.y_past_period_error = None
            self.quantiles = None
            self.quantiles_probs = None
        else:
            # Values may have been loaded as strings/objects, hence the explicit
            # float conversion of every series.
            self.truth = self.df['truth'].values.astype(float, copy=False)
            self.predictions = self.df['predictions'].values.astype(float, copy=False)
            self.y_past_period_error = self.df['y_past_period_error'].values.astype(float, copy=False)
            # Quantile forecasts sit between the first two columns and the last one.
            self.quantiles = self.df.iloc[:, 2:-1].values.astype(float, copy=False)
            self.quantiles_probs = np.array([float(q) for q in self.df.columns[2:-1]])

        self.target = Feature(0, 'target', 'real', is_target=True)
        self.type = DatasetType.timeseries

    @metric(higher_is_better=False)
    def mae(self):
        """Mean Absolute Error"""
        return float(mean_absolute_error(self.truth, self.predictions))

    @metric(higher_is_better=False)
    def mse(self):
        """Mean Squared Error"""
        return float(mean_squared_error(self.truth, self.predictions))

    @metric(higher_is_better=False)
    def msle(self):
        """Mean Squared Logarithmic Error"""
        return float(mean_squared_log_error(self.truth, self.predictions))

    @metric(higher_is_better=False)
    def rmse(self):
        """Root Mean Square Error"""
        return math.sqrt(self.mse())

    @metric(higher_is_better=False)
    def rmsle(self):
        """Root Mean Square Logarithmic Error"""
        return math.sqrt(self.msle())

    @metric(higher_is_better=True)
    def r2(self):
        """R^2"""
        return float(r2_score(self.truth, self.predictions))

    @metric(higher_is_better=False)
    def mase(self):
        """Mean Absolute Scaled Error"""
        # Both terms are scaled before subtracting so that rows with a zero
        # scaling term become NaN (inf - inf) and are ignored by nanmean.
        scaled_truth = self.truth / self.y_past_period_error
        scaled_predictions = self.predictions / self.y_past_period_error
        return float(np.nanmean(np.abs(scaled_truth - scaled_predictions)))

    @metric(higher_is_better=False)
    def smape(self):
        """Symmetric Mean Absolute Percentage Error"""
        num = np.abs(self.truth - self.predictions)
        denom = (np.abs(self.truth) + np.abs(self.predictions)) / 2
        # If the denominator is 0, we set it to float('inf') such that any division yields 0 (this
        # might not be fully mathematically correct, but at least we don't get NaNs)
        denom[denom == 0] = math.inf
        return float(np.mean(num / denom))

    @metric(higher_is_better=False)
    def nrmse(self):
        """Normalized Root Mean Square Error"""
        return float(self.rmse() / np.mean(np.abs(self.truth)))

    @metric(higher_is_better=False)
    def nd(self):
        """Normalized Deviation: sum of absolute errors divided by sum of absolute truth"""
        return float(np.sum(np.abs(self.truth - self.predictions)) / np.sum(np.abs(self.truth)))

    @metric(higher_is_better=False)
    def ncrps(self):
        """Normalized Continuous Ranked Probability Score"""
        truth = self.truth[:, None]
        # Pinball (quantile) loss of every quantile forecast, shape [num_rows, num_quantiles].
        pinball = np.abs(
            (self.quantiles - truth)
            * ((self.quantiles >= truth) - self.quantiles_probs[None, :])
        )
        quantile_losses = 2 * np.sum(pinball, axis=0)  # shape [num_quantiles]
        denom = np.sum(np.abs(self.truth))  # scalar
        # Keep the quantile axis here: summing it away before the mean would
        # make the score grow with the number of quantiles.
        weighted_losses = quantile_losses / denom  # shape [num_quantiles]
        return float(weighted_losses.mean())
|
||
_encode_predictions_and_truth_ = False | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,9 +19,11 @@ def run(dataset: Dataset, config: TaskConfig): | |
name=dataset.target.name, | ||
classes=dataset.target.values | ||
), | ||
problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType | ||
problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType | ||
timestamp_column=dataset.timestamp_column if dataset.timestamp_column is not None else None, | ||
id_column=dataset.id_column if dataset.id_column is not None else None, | ||
prediction_length=dataset.prediction_length if dataset.prediction_length is not None else None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we instead check if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently these if/else don't actually do anything |
||
) | ||
|
||
return run_in_venv(__file__, "exec.py", | ||
input_data=data, dataset=dataset, config=config) | ||
|
Reviewer: Why?
Reply: Visual Studio Code creates it for debugging.