Skip to content

Commit

Permalink
Autogluon timeseries, addressed comments by sebhrusen (#7)
Browse files Browse the repository at this point in the history
* fixed loading test & train, changed pred.-l. 5->30

* ignore launch.json of vscode

* ensuring timestamp parsing

* pass config, save pred, add results

* remove unused code

* add readability, remove slice from timer

* ensure autogluonts has required info

* add comments for readability

* setting defaults for timeseries task

* remove outer context manipulation

* corrected spelling error for quantiles

* adding mape, correct available metrics

* beautify config options

* fixed config for public access

* no outer context manipulation, add dataset subdir

* add more datasets

* include error raising for too large pred. length.

* merging AutoGluonTS framework folder into AutoGluon

* renaming ts.yaml to timeseries.yaml, plus ext.

* removing presets, correct latest config for AGTS

* move dataset timeseries ext to datasets/file.py

* don't bypass test mode

* move quantiles and y_past_period_error to opt_cols

* remove whitespaces

* deleting merge artifacts

* delete merge artifacts

* renaming prediction_length to forecast_range_in_steps

* use public dataset, reduced range to maximum

* fix format string works

* fix key error bug, remove magic time limit
  • Loading branch information
limpbot authored Oct 6, 2022
1 parent 3f53533 commit 53b816a
Show file tree
Hide file tree
Showing 16 changed files with 178 additions and 183 deletions.
17 changes: 3 additions & 14 deletions amlb/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,20 +489,9 @@ def load_data(self):
# TODO
raise NotImplementedError("OpenML datasets without task_id are not supported yet.")
elif hasattr(self._task_def, 'dataset'):
if self._task_def.dataset['type'] == 'timeseries' and self._task_def.dataset['timestamp_column'] is None:
log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.")
self._task_def.dataset['timestamp_column'] = "timestamp"
self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=self._task_def.dataset, fold=self.fold, timestamp_column=self._task_def.dataset['timestamp_column'])
if self._dataset.type == DatasetType.timeseries:
if self._task_def.dataset['id_column'] is None:
log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.")
self._task_def.dataset['id_column'] = "item_id"
if self._task_def.dataset['prediction_length'] is None:
log.warning("Warning: For timeseries task setting undefined prediction length to `1`.")
self._task_def.dataset['prediction_length'] = "1"
self._dataset.timestamp_column=self._task_def.dataset['timestamp_column']
self._dataset.id_column=self._task_def.dataset['id_column']
self._dataset.prediction_length=self._task_def.dataset['prediction_length']
dataset_name_and_config = copy(self._task_def.dataset)
dataset_name_and_config.name = self._task_def.name
self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=dataset_name_and_config, fold=self.fold)
else:
raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].")

Expand Down
57 changes: 49 additions & 8 deletions amlb/datasets/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,10 @@ def __init__(self, cache_dir=None):
self._cache_dir = cache_dir if cache_dir else tempfile.mkdtemp(prefix='amlb_cache')

@profile(logger=log)
def load(self, dataset, fold=0, timestamp_column=None):
def load(self, dataset, fold=0):
dataset = dataset if isinstance(dataset, ns) else ns(path=dataset)
log.debug("Loading dataset %s", dataset)
paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold)
paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold, name=dataset['name'] if 'name' in dataset else None)
assert fold < len(paths['train']), f"No training dataset available for fold {fold} among dataset files {paths['train']}"
# seed = rget().seed(fold)
# if len(paths['test']) == 0:
Expand All @@ -51,21 +51,27 @@ def load(self, dataset, fold=0, timestamp_column=None):
if ext == '.arff':
return ArffDataset(train_path, test_path, target=target, features=features, type=type_)
elif ext == '.csv':
return CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=timestamp_column)
if DatasetType[dataset['type']] == DatasetType.timeseries and dataset['timestamp_column'] is None:
log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.")
dataset['timestamp_column'] = "timestamp"
csv_dataset = CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=dataset['timestamp_column'] if 'timestamp_column' in dataset else None)
if csv_dataset.type == DatasetType.timeseries:
csv_dataset = self.extend_dataset_with_timeseries_config(csv_dataset, dataset)
return csv_dataset
else:
raise ValueError(f"Unsupported file type: {ext}")

def _extract_train_test_paths(self, dataset, fold=None):
def _extract_train_test_paths(self, dataset, fold=None, name=None):
if isinstance(dataset, (tuple, list)):
assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]."
return self._extract_train_test_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0],
test=[p for i, p in enumerate(dataset) if i % 2 == 1]),
fold=fold)
fold=fold, name=name)
elif isinstance(dataset, ns):
return dict(train=[self._extract_train_test_paths(p)['train'][0]
return dict(train=[self._extract_train_test_paths(p, name=name)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(dataset.train))],
test=[self._extract_train_test_paths(p)['train'][0]
test=[self._extract_train_test_paths(p, name=name)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(dataset.test))])
else:
Expand Down Expand Up @@ -116,7 +122,10 @@ def _extract_train_test_paths(self, dataset, fold=None):
assert len(paths) > 0, f"No dataset file found in {dataset}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..."
return paths
elif is_valid_url(dataset):
cached_file = os.path.join(self._cache_dir, os.path.basename(dataset))
if name is None:
cached_file = os.path.join(self._cache_dir, os.path.basename(dataset))
else:
cached_file = os.path.join(self._cache_dir, name, os.path.basename(dataset))
if not os.path.exists(cached_file): # don't download if previously done
handler = get_file_handler(dataset)
assert handler.exists(dataset), f"Invalid path/url: {dataset}"
Expand All @@ -129,6 +138,38 @@ def __repr__(self):
return repr_def(self)


def extend_dataset_with_timeseries_config(self, dataset, dataset_config):
    """Attach the timeseries settings to ``dataset`` and validate the forecast range.

    Missing ``id_column`` / ``forecast_range_in_steps`` entries (value ``None``)
    are defaulted to ``"item_id"`` / ``"1"`` and written back into
    ``dataset_config``, each with a warning.

    :param dataset: object exposing ``train.X`` and ``test.X`` as pandas DataFrames
        that contain the id column; it receives the attributes ``timestamp_column``,
        ``id_column`` and ``forecast_range_in_steps`` (an ``int``).
    :param dataset_config: item-accessible configuration providing the keys
        ``timestamp_column``, ``id_column`` and ``forecast_range_in_steps``.
    :return: the same ``dataset`` object, extended in place.
    :raises ValueError: if train and test share no sequence id, if the test/train
        length difference is not the same for every sequence, or if the forecast
        range exceeds either that difference or the shortest sequence length minus 1.
    """
    if dataset_config['id_column'] is None:
        log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.")
        dataset_config['id_column'] = "item_id"
    if dataset_config['forecast_range_in_steps'] is None:
        log.warning("Warning: For timeseries task setting undefined forecast_range_in_steps to `1`.")
        dataset_config['forecast_range_in_steps'] = "1"

    dataset.timestamp_column = dataset_config['timestamp_column']
    dataset.id_column = dataset_config['id_column']
    dataset.forecast_range_in_steps = int(dataset_config['forecast_range_in_steps'])

    # `size()` yields one row count per sequence no matter how many feature
    # columns X has; `count()` would return one column per feature and break
    # the scalar conversions below as soon as there is more than one.
    train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).size()
    test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).size()

    # Ids present in only one split produce NaN after index alignment: drop them.
    length_diffs = (test_seqs_lengths - train_seqs_lengths).dropna()
    if length_diffs.empty:
        raise ValueError("Error: Train and test set do not share any sequence id.")

    forecast_range_in_steps_mean_diff_train_test = int(length_diffs.mean())
    forecast_range_in_steps_max_min_train_test = int(min(test_seqs_lengths.min(), train_seqs_lengths.min())) - 1

    if dataset.forecast_range_in_steps != forecast_range_in_steps_mean_diff_train_test:
        msg = f"Warning: Forecast range {dataset.forecast_range_in_steps}, does not equal mean difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
        log.warning(msg)
    # Compare distinct values rather than testing `var() == 0`: the sample variance
    # of a single sequence is NaN, which wrongly rejected one-sequence datasets.
    if length_diffs.nunique() > 1:
        msg = "Error: Not all sequences of train and test set have same sequence length difference."
        raise ValueError(msg)
    if dataset.forecast_range_in_steps > forecast_range_in_steps_mean_diff_train_test:
        msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
        raise ValueError(msg)
    if dataset.forecast_range_in_steps > forecast_range_in_steps_max_min_train_test:
        msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than minimum sequence length - 1, {forecast_range_in_steps_max_min_train_test}."
        raise ValueError(msg)
    return dataset



class FileDataset(Dataset):

def __init__(self, train: Datasplit, test: Datasplit,
Expand Down
33 changes: 10 additions & 23 deletions amlb/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,12 +228,13 @@ def load_predictions(predictions_file):
try:
df = read_csv(predictions_file, dtype=object)
log.debug("Predictions preview:\n %s\n", df.head(10).to_string())

if rconfig().test_mode:
TaskResult.validate_predictions(df)

if 'y_past_period_error' in df.columns:
return TimeSeriesResult(df)
else:
if rconfig().test_mode:
TaskResult.validate_predictions(df)

if df.shape[1] > 2:
return ClassificationResult(df)
else:
Expand All @@ -258,9 +259,9 @@ def load_metadata(metadata_file):
def save_predictions(dataset: Dataset, output_file: str,
predictions: Union[A, DF, S] = None, truth: Union[A, DF, S] = None,
probabilities: Union[A, DF] = None, probabilities_labels: Union[list, A] = None,
optional_columns: Union[A, DF] = None,
target_is_encoded: bool = False,
preview: bool = True,
quantiles: Union[A, DF] = None):
preview: bool = True):
""" Save class probabilities and predicted labels to file in csv format.
:param dataset:
Expand All @@ -269,9 +270,9 @@ def save_predictions(dataset: Dataset, output_file: str,
:param predictions:
:param truth:
:param probabilities_labels:
:param optional_columns:
:param target_is_encoded:
:param preview:
:param quantiles:
:return: None
"""
log.debug("Saving predictions to `%s`.", output_file)
Expand Down Expand Up @@ -315,23 +316,9 @@ def save_predictions(dataset: Dataset, output_file: str,
df = df.assign(predictions=preds)
df = df.assign(truth=truth)

if dataset.type == DatasetType.timeseries:
if quantiles is not None:
quantiles = quantiles.reset_index(drop=True)
df = pd.concat([df, quantiles], axis=1)

period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works.

# we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)|
# 1. retrieve item_ids for each sequence/item
item_ids, inverse_item_ids = np.unique(dataset.test.X[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True)
# 2. capture sequences in a list
y_past = [dataset.test.y.squeeze().to_numpy()[inverse_item_ids == i][:-dataset.prediction_length] for i in range(len(item_ids))]
# 3. calculate period error per sequence
y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past]
# 4. repeat period error for each sequence, to save one for each element
y_past_period_error_rep = np.repeat(y_past_period_error, dataset.prediction_length)
df = df.assign(y_past_period_error=y_past_period_error_rep)
if optional_columns is not None:
df = pd.concat([df, optional_columns], axis=1)

if preview:
log.info("Predictions preview:\n %s\n", df.head(20).to_string())
backup_file(output_file)
Expand Down
16 changes: 16 additions & 0 deletions frameworks/AutoGluon/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# AutoGluon

To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...```

To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluon:latest ...```


# AutoGluonTS

AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems.

## Run Steps

To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts timeseries ...```

To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest timeseries ...```
58 changes: 43 additions & 15 deletions frameworks/AutoGluon/__init__.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,53 @@
from amlb.benchmark import TaskConfig
from amlb.data import Dataset

from amlb.utils import call_script_in_same_dir
from amlb.benchmark import TaskConfig
from amlb.data import Dataset, DatasetType
from copy import deepcopy


def setup(*args, **kwargs):
    """Invoke `setup.sh` through `call_script_in_same_dir`.

    This module's own path (`__file__`) is passed as the location reference and
    all positional and keyword arguments are forwarded unchanged.
    """
    call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs)


def run(dataset: Dataset, config: TaskConfig):
from frameworks.shared.caller import run_in_venv

data = dict(
train=dict(path=dataset.train.data_path('parquet')),
test=dict(path=dataset.test.data_path('parquet')),
target=dict(
name=dataset.target.name,
classes=dataset.target.values
),
problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType
)

return run_in_venv(__file__, "exec.py",
input_data=data, dataset=dataset, config=config)
if dataset.type is not DatasetType.timeseries:

data = dict(
train=dict(path=dataset.train.data_path('parquet')),
test=dict(path=dataset.test.data_path('parquet')),
target=dict(
name=dataset.target.name,
classes=dataset.target.values
),
problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType
)
exec_file = "exec.py"

else:
dataset = deepcopy(dataset)
if not hasattr(dataset, 'timestamp_column'):
dataset.timestamp_column = None
if not hasattr(dataset, 'id_column'):
dataset.id_column = None
if not hasattr(dataset, 'forecast_range_in_steps'):
raise AttributeError("Unspecified `forecast_range_in_steps`.")

data = dict(
# train=dict(path=dataset.train.data_path('parquet')),
# test=dict(path=dataset.test.data_path('parquet')),
train=dict(path=dataset.train.path),
test=dict(path=dataset.test.path),
target=dict(
name=dataset.target.name,
classes=dataset.target.values
),
problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType
timestamp_column=dataset.timestamp_column,
id_column=dataset.id_column,
forecast_range_in_steps=dataset.forecast_range_in_steps
)
exec_file = "exec_ts.py"

return run_in_venv(__file__, exec_file,
input_data=data, dataset=dataset, config=config)
23 changes: 20 additions & 3 deletions frameworks/AutoGluonTS/exec.py → frameworks/AutoGluon/exec_ts.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import warnings
import sys
import tempfile
import numpy as np
warnings.simplefilter("ignore")

if sys.platform == 'darwin':
Expand All @@ -27,7 +28,7 @@ def run(dataset, config):

timestamp_column = dataset.timestamp_column
id_column = dataset.id_column
prediction_length = dataset.prediction_length
prediction_length = dataset.forecast_range_in_steps

eval_metric = get_eval_metric(config)
label = dataset.target.name
Expand Down Expand Up @@ -76,6 +77,23 @@ def run(dataset, config):
save_artifacts(predictor=predictor, leaderboard=leaderboard, config=config)
shutil.rmtree(predictor.path, ignore_errors=True)

quantiles = predictions.drop(columns=['mean']).reset_index(drop=True)
period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works.

# we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)|
# 1. retrieve item_ids for each sequence/item
#dataset..X /. y
item_ids, inverse_item_ids = np.unique(test_data.reset_index()["item_id"].squeeze().to_numpy(), return_index=False, return_inverse=True)
# 2. capture sequences in a list
y_past = [test_data[label].squeeze().to_numpy()[inverse_item_ids == i][:-prediction_length] for i in range(len(item_ids))]
# 3. calculate period error per sequence
y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past]
# 4. repeat period error for each sequence, to save one for each element
y_past_period_error_rep = np.repeat(y_past_period_error, prediction_length)

optional_columns = quantiles
optional_columns = optional_columns.assign(y_past_period_error=y_past_period_error_rep)

return result(output_file=config.output_predictions_file,
predictions=predictions_only,
truth=truth_only,
Expand All @@ -85,8 +103,7 @@ def run(dataset, config):
models_count=num_models_trained,
training_duration=training.duration,
predict_duration=predict.duration,
quantiles=predictions.drop(columns=['mean']))

optional_columns=optional_columns)

def load_data(train_path, test_path, timestamp_column, id_column):

Expand Down
9 changes: 8 additions & 1 deletion frameworks/AutoGluon/setup.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env bash

HERE=$(dirname "$0")
VERSION=${1:-"stable"}
REPO=${2:-"https://github.com/awslabs/autogluon.git"}
Expand Down Expand Up @@ -36,4 +37,10 @@ else
PIP install -e tabular/[skex]
fi

PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
# Record the version of the module that was actually requested: the two
# version imports were swapped, so the timeseries setup logged the tabular
# version and the tabular setup tried to import autogluon.timeseries.
if [[ ${MODULE} == "timeseries" ]]; then
    PY -c "from autogluon.timeseries.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
    # TODO: GPU version install
    PIP install "mxnet<2.0"
else
    PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
fi
16 changes: 0 additions & 16 deletions frameworks/AutoGluonTS/README.md

This file was deleted.

Loading

0 comments on commit 53b816a

Please sign in to comment.