Autogluon timeseries, addressed comments by sebhrusen #7

Merged
merged 32 commits on Oct 6, 2022
Commits
fdac87d
fixed loading test & train, changed pred.-l. 5->30
limpbot Sep 14, 2022
acae465
ignore launch.json of vscode
limpbot Sep 14, 2022
b5723cf
ensuring timestamp parsing
limpbot Sep 16, 2022
55c63e9
pass config, save pred, add results
limpbot Sep 16, 2022
0f38986
remove unused code
limpbot Sep 16, 2022
f932669
add readability, remove slice from timer
limpbot Sep 20, 2022
16a165b
ensure autogluonts has required info
limpbot Sep 20, 2022
758b92d
add comments for readability
limpbot Sep 20, 2022
04872e7
setting defaults for timeseries task
limpbot Sep 20, 2022
888a1cb
remove outer context manipulation
limpbot Sep 20, 2022
e15de3e
corrected spelling error for quantiles
limpbot Sep 20, 2022
866492f
adding mape, correct available metrics
limpbot Sep 21, 2022
9252835
beautify config options
limpbot Sep 21, 2022
18cc6af
fixed config for public access
limpbot Sep 21, 2022
3e8945a
no outer context manipulation, add dataset subdir
limpbot Sep 23, 2022
4ca2118
add more datasets
limpbot Sep 23, 2022
f7f21fc
include error raising for too large pred. length.
limpbot Sep 26, 2022
fb429c6
mergin AutoGluonTS framework folder into AutoGluon
limpbot Oct 5, 2022
23d057a
renaming ts.yaml to timeseries.yaml, plus ext.
limpbot Oct 5, 2022
1396d20
removing presets, correct latest config for AGTS
limpbot Oct 5, 2022
8332960
move dataset timeseries ext to datasets/file.py
limpbot Oct 5, 2022
d41f632
dont bypass test mode
limpbot Oct 5, 2022
3935e9e
move quantiles and y_past_period_error to opt_cols
limpbot Oct 5, 2022
1f7c574
remove whitespaces
limpbot Oct 5, 2022
537d9c7
merge innxima into ours
limpbot Oct 6, 2022
79e54c9
deleting merge artifacts
limpbot Oct 6, 2022
6a25170
delete merge artifacts
limpbot Oct 6, 2022
5862ace
Merge pull request #2 from limpbot/Innixma-autogluon_timeseries
limpbot Oct 6, 2022
928c2cf
renaming prediction_length to forecast_range_in_steps
limpbot Oct 6, 2022
47d311c
use public dataset, reduced range to maximum
limpbot Oct 6, 2022
b244e9c
fix format string works
limpbot Oct 6, 2022
3074f42
fix key error bug, remove magic time limit
limpbot Oct 6, 2022
17 changes: 3 additions & 14 deletions amlb/benchmark.py
@@ -489,20 +489,9 @@ def load_data(self):
# TODO
raise NotImplementedError("OpenML datasets without task_id are not supported yet.")
elif hasattr(self._task_def, 'dataset'):
if self._task_def.dataset['type'] == 'timeseries' and self._task_def.dataset['timestamp_column'] is None:
log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.")
self._task_def.dataset['timestamp_column'] = "timestamp"
self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=self._task_def.dataset, fold=self.fold, timestamp_column=self._task_def.dataset['timestamp_column'])
if self._dataset.type == DatasetType.timeseries:
if self._task_def.dataset['id_column'] is None:
log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.")
self._task_def.dataset['id_column'] = "item_id"
if self._task_def.dataset['prediction_length'] is None:
log.warning("Warning: For timeseries task setting undefined prediction length to `1`.")
self._task_def.dataset['prediction_length'] = "1"
self._dataset.timestamp_column=self._task_def.dataset['timestamp_column']
self._dataset.id_column=self._task_def.dataset['id_column']
self._dataset.prediction_length=self._task_def.dataset['prediction_length']
dataset_name_and_config = copy(self._task_def.dataset)
dataset_name_and_config.name = self._task_def.name
self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=dataset_name_and_config, fold=self.fold)
else:
raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].")

57 changes: 49 additions & 8 deletions amlb/datasets/file.py
@@ -30,10 +30,10 @@ def __init__(self, cache_dir=None):
self._cache_dir = cache_dir if cache_dir else tempfile.mkdtemp(prefix='amlb_cache')

@profile(logger=log)
def load(self, dataset, fold=0, timestamp_column=None):
def load(self, dataset, fold=0):
dataset = dataset if isinstance(dataset, ns) else ns(path=dataset)
log.debug("Loading dataset %s", dataset)
paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold)
paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold, name=dataset['name'] if 'name' in dataset else None)
assert fold < len(paths['train']), f"No training dataset available for fold {fold} among dataset files {paths['train']}"
# seed = rget().seed(fold)
# if len(paths['test']) == 0:
@@ -51,21 +51,27 @@ def load(self, dataset, fold=0, timestamp_column=None):
if ext == '.arff':
return ArffDataset(train_path, test_path, target=target, features=features, type=type_)
elif ext == '.csv':
return CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=timestamp_column)
if DatasetType[dataset['type']] == DatasetType.timeseries and dataset['timestamp_column'] is None:
log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.")
dataset['timestamp_column'] = "timestamp"
csv_dataset = CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=dataset['timestamp_column'] if 'timestamp_column' in dataset else None)
if csv_dataset.type == DatasetType.timeseries:
csv_dataset = self.extend_dataset_with_timeseries_config(csv_dataset, dataset)
return csv_dataset
else:
raise ValueError(f"Unsupported file type: {ext}")

def _extract_train_test_paths(self, dataset, fold=None):
def _extract_train_test_paths(self, dataset, fold=None, name=None):
if isinstance(dataset, (tuple, list)):
assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]."
return self._extract_train_test_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0],
test=[p for i, p in enumerate(dataset) if i % 2 == 1]),
fold=fold)
fold=fold, name=name)
elif isinstance(dataset, ns):
return dict(train=[self._extract_train_test_paths(p)['train'][0]
return dict(train=[self._extract_train_test_paths(p, name=name)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(dataset.train))],
test=[self._extract_train_test_paths(p)['train'][0]
test=[self._extract_train_test_paths(p, name=name)['train'][0]
if i == fold else None
for i, p in enumerate(as_list(dataset.test))])
else:
@@ -116,7 +122,10 @@ def _extract_train_test_paths(self, dataset, fold=None):
assert len(paths) > 0, f"No dataset file found in {dataset}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..."
return paths
elif is_valid_url(dataset):
cached_file = os.path.join(self._cache_dir, os.path.basename(dataset))
if name is None:
cached_file = os.path.join(self._cache_dir, os.path.basename(dataset))
else:
cached_file = os.path.join(self._cache_dir, name, os.path.basename(dataset))
if not os.path.exists(cached_file): # don't download if previously done
handler = get_file_handler(dataset)
assert handler.exists(dataset), f"Invalid path/url: {dataset}"
@@ -129,6 +138,38 @@ def __repr__(self):
return repr_def(self)


def extend_dataset_with_timeseries_config(self, dataset, dataset_config):
Owner comment: we are manipulating the outer context of `dataset_config`; maybe this is OK, but it is worth noting that it is happening.

Owner comment: same with manipulating the outer context of `dataset`. Consider adding documentation stating that this is intended, or otherwise do a deep copy on the objects.
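A minimal sketch of the deep-copy alternative the comment suggests, assuming both objects are deep-copyable (illustrative only, not part of this PR):

```python
from copy import deepcopy

def extend_dataset_with_timeseries_config(dataset, dataset_config):
    # Hypothetical variant of the method below: operate on copies so the
    # caller's `dataset` and `dataset_config` are never mutated.
    dataset = deepcopy(dataset)
    dataset_config = deepcopy(dataset_config)
    if dataset_config['id_column'] is None:
        dataset_config['id_column'] = "item_id"
    # ... remaining defaulting and validation as in the PR ...
    return dataset
```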

if dataset_config['id_column'] is None:
log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.")
Owner comment: refer to it in the warning with the correct key, 'id_column'.

dataset_config['id_column'] = "item_id"
if dataset_config['forecast_range_in_steps'] is None:
log.warning("Warning: For timeseries task setting undefined forecast_range_in_steps to `1`.")
dataset_config['forecast_range_in_steps'] = "1"

dataset.timestamp_column=dataset_config['timestamp_column']
dataset.id_column=dataset_config['id_column']
dataset.forecast_range_in_steps=int(dataset_config['forecast_range_in_steps'])

train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).count()
test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).count()
forecast_range_in_steps_mean_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean())
forecast_range_in_steps_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1
if not dataset.forecast_range_in_steps == forecast_range_in_steps_mean_diff_train_test:
msg = f"Warning: Forecast range {dataset.forecast_range_in_steps}, does not equal mean difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
log.warning(msg)
if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.:
msg = f"Error: Not all sequences of train and test set have same sequence length difference."
raise ValueError(msg)
if dataset.forecast_range_in_steps > forecast_range_in_steps_mean_diff_train_test:
msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
raise ValueError(msg)
if dataset.forecast_range_in_steps > forecast_range_in_steps_max_min_train_test:
msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than minimum sequence length + 1, {forecast_range_in_steps_max_min_train_test}."
raise ValueError(msg)
return dataset
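
For intuition, a small worked example of the consistency checks above, using hypothetical per-item sequence lengths (not taken from any of the PR's datasets):

```python
import pandas as pd

# Hypothetical row counts per item: every test sequence is 5 steps longer than its train sequence.
train_seqs_lengths = pd.Series({"item_a": 20, "item_b": 35})
test_seqs_lengths = pd.Series({"item_a": 25, "item_b": 40})

mean_diff = int((test_seqs_lengths - train_seqs_lengths).mean())                  # 5
max_range = min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min())) - 1  # 19

# forecast_range_in_steps == 5 passes all checks; a value above mean_diff or max_range
# raises, and a non-zero variance of the per-item difference (e.g. 5 vs 6) also raises.
assert mean_diff == 5 and max_range == 19
```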



class FileDataset(Dataset):

def __init__(self, train: Datasplit, test: Datasplit,
33 changes: 10 additions & 23 deletions amlb/results.py
@@ -228,12 +228,13 @@ def load_predictions(predictions_file):
try:
df = read_csv(predictions_file, dtype=object)
log.debug("Predictions preview:\n %s\n", df.head(10).to_string())

if rconfig().test_mode:
TaskResult.validate_predictions(df)

if 'y_past_period_error' in df.columns:
return TimeSeriesResult(df)
else:
if rconfig().test_mode:
TaskResult.validate_predictions(df)

if df.shape[1] > 2:
return ClassificationResult(df)
else:
@@ -258,9 +259,9 @@ def load_metadata(metadata_file):
def save_predictions(dataset: Dataset, output_file: str,
predictions: Union[A, DF, S] = None, truth: Union[A, DF, S] = None,
probabilities: Union[A, DF] = None, probabilities_labels: Union[list, A] = None,
optional_columns: Union[A, DF] = None,
target_is_encoded: bool = False,
preview: bool = True,
quantiles: Union[A, DF] = None):
preview: bool = True):
""" Save class probabilities and predicted labels to file in csv format.

:param dataset:
@@ -269,9 +270,9 @@ def save_predictions(dataset: Dataset, output_file: str,
:param predictions:
:param truth:
:param probabilities_labels:
:param optional_columns:
:param target_is_encoded:
:param preview:
:param quantiles:
:return: None
"""
log.debug("Saving predictions to `%s`.", output_file)
@@ -315,23 +316,9 @@ def save_predictions(dataset: Dataset, output_file: str,
df = df.assign(predictions=preds)
df = df.assign(truth=truth)

if dataset.type == DatasetType.timeseries:
if quantiles is not None:
quantiles = quantiles.reset_index(drop=True)
df = pd.concat([df, quantiles], axis=1)

period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works.

# we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)|
# 1. retrieve item_ids for each sequence/item
item_ids, inverse_item_ids = np.unique(dataset.test.X[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True)
# 2. capture sequences in a list
y_past = [dataset.test.y.squeeze().to_numpy()[inverse_item_ids == i][:-dataset.prediction_length] for i in range(len(item_ids))]
# 3. calculate period error per sequence
y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past]
# 4. repeat period error for each sequence, to save one for each element
y_past_period_error_rep = np.repeat(y_past_period_error, dataset.prediction_length)
df = df.assign(y_past_period_error=y_past_period_error_rep)
if optional_columns is not None:
df = pd.concat([df, optional_columns], axis=1)

if preview:
log.info("Predictions preview:\n %s\n", df.head(20).to_string())
backup_file(output_file)
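With the quantile handling removed from `save_predictions`, frameworks now pass any extra columns through `optional_columns`. A minimal sketch, assuming quantile columns named after their levels as in the `exec_ts.py` change further down:

```python
import pandas as pd

# Hypothetical forecast output for a 2-step horizon of a single item.
quantiles = pd.DataFrame({"0.1": [9.5, 9.7], "0.9": [12.1, 12.4]})
optional_columns = quantiles.assign(y_past_period_error=[1.3, 1.3])

# save_predictions(..., optional_columns=optional_columns) then simply does:
# df = pd.concat([df, optional_columns], axis=1)
```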
16 changes: 16 additions & 0 deletions frameworks/AutoGluon/README.md
@@ -0,0 +1,16 @@
# AutoGluon

To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...```

To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluon:latest ...```


# AutoGluonTS

AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems.

## Run Steps

To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts timeseries ...```

To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest timeseries ...```
58 changes: 43 additions & 15 deletions frameworks/AutoGluon/__init__.py
@@ -1,25 +1,53 @@
from amlb.benchmark import TaskConfig
from amlb.data import Dataset

from amlb.utils import call_script_in_same_dir
from amlb.benchmark import TaskConfig
from amlb.data import Dataset, DatasetType
from copy import deepcopy


def setup(*args, **kwargs):
call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs)


def run(dataset: Dataset, config: TaskConfig):
from frameworks.shared.caller import run_in_venv

data = dict(
train=dict(path=dataset.train.data_path('parquet')),
test=dict(path=dataset.test.data_path('parquet')),
target=dict(
name=dataset.target.name,
classes=dataset.target.values
),
problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType
)

return run_in_venv(__file__, "exec.py",
input_data=data, dataset=dataset, config=config)
if dataset.type is not DatasetType.timeseries:

data = dict(
train=dict(path=dataset.train.data_path('parquet')),
test=dict(path=dataset.test.data_path('parquet')),
target=dict(
name=dataset.target.name,
classes=dataset.target.values
),
problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType
)
exec_file = "exec.py"

else:
dataset = deepcopy(dataset)
if not hasattr(dataset, 'timestamp_column'):
dataset.timestamp_column = None
if not hasattr(dataset, 'id_column'):
dataset.id_column = None
if not hasattr(dataset, 'forecast_range_in_steps'):
raise AttributeError("Unspecified `forecast_range_in_steps`.")

data = dict(
# train=dict(path=dataset.train.data_path('parquet')),
# test=dict(path=dataset.test.data_path('parquet')),
train=dict(path=dataset.train.path),
test=dict(path=dataset.test.path),
target=dict(
name=dataset.target.name,
classes=dataset.target.values
),
problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType
timestamp_column=dataset.timestamp_column,
id_column=dataset.id_column,
forecast_range_in_steps=dataset.forecast_range_in_steps
)
exec_file = "exec_ts.py"
Comment on lines +14 to +50
Owner comment: the if/else could be better broken up into dedicated functions for each modality, to avoid an overly long function with a chain of if/elif/.../else branches in the future.
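A sketch of the refactor this comment suggests, reusing the imports already present in the module; the helper names are hypothetical, and the deepcopy/attribute-defaulting of the original timeseries branch is omitted for brevity:

```python
def _timeseries_input(dataset):
    # Payload builder dedicated to the timeseries modality.
    return dict(
        train=dict(path=dataset.train.path),
        test=dict(path=dataset.test.path),
        target=dict(name=dataset.target.name, classes=dataset.target.values),
        problem_type=dataset.type.name,
        timestamp_column=dataset.timestamp_column,
        id_column=dataset.id_column,
        forecast_range_in_steps=dataset.forecast_range_in_steps,
    )

def _tabular_input(dataset):
    # Payload builder for all other modalities.
    return dict(
        train=dict(path=dataset.train.data_path('parquet')),
        test=dict(path=dataset.test.data_path('parquet')),
        target=dict(name=dataset.target.name, classes=dataset.target.values),
        problem_type=dataset.type.name,
    )

def run(dataset: Dataset, config: TaskConfig):
    from frameworks.shared.caller import run_in_venv
    if dataset.type is DatasetType.timeseries:
        data, exec_file = _timeseries_input(dataset), "exec_ts.py"
    else:
        data, exec_file = _tabular_input(dataset), "exec.py"
    return run_in_venv(__file__, exec_file, input_data=data, dataset=dataset, config=config)
```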


return run_in_venv(__file__, exec_file,
input_data=data, dataset=dataset, config=config)
23 changes: 20 additions & 3 deletions frameworks/AutoGluonTS/exec.py → frameworks/AutoGluon/exec_ts.py
@@ -4,6 +4,7 @@
import warnings
import sys
import tempfile
import numpy as np
warnings.simplefilter("ignore")

if sys.platform == 'darwin':
@@ -27,7 +28,7 @@ def run(dataset, config):

timestamp_column = dataset.timestamp_column
id_column = dataset.id_column
prediction_length = dataset.prediction_length
prediction_length = dataset.forecast_range_in_steps

eval_metric = get_eval_metric(config)
label = dataset.target.name
@@ -76,6 +77,23 @@ def run(dataset, config):
save_artifacts(predictor=predictor, leaderboard=leaderboard, config=config)
shutil.rmtree(predictor.path, ignore_errors=True)

quantiles = predictions.drop(columns=['mean']).reset_index(drop=True)
period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works.

# we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)|
# 1. retrieve item_ids for each sequence/item
item_ids, inverse_item_ids = np.unique(test_data.reset_index()["item_id"].squeeze().to_numpy(), return_index=False, return_inverse=True)
# 2. capture sequences in a list
y_past = [test_data[label].squeeze().to_numpy()[inverse_item_ids == i][:-prediction_length] for i in range(len(item_ids))]
# 3. calculate period error per sequence
y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past]
# 4. repeat period error for each sequence, to save one for each element
y_past_period_error_rep = np.repeat(y_past_period_error, prediction_length)

optional_columns = quantiles
optional_columns = optional_columns.assign(y_past_period_error=y_past_period_error_rep)

return result(output_file=config.output_predictions_file,
predictions=predictions_only,
truth=truth_only,
@@ -85,8 +103,7 @@ def run(dataset, config):
models_count=num_models_trained,
training_duration=training.duration,
predict_duration=predict.duration,
quantiles=predictions.drop(columns=['mean']))

optional_columns=optional_columns)

def load_data(train_path, test_path, timestamp_column, id_column):

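The `y_past_period_error` computed above is the in-sample mean absolute error of a seasonal-naive forecast with period T, i.e. (1/N) Σ |x(t_i) − x(t_i − T)|. A standalone sketch on a toy series (values are illustrative):

```python
import numpy as np

period_length = 1                            # T
y_past = np.array([10.0, 12.0, 11.0, 13.0])  # hypothetical history of one item

# mean absolute difference between each value and the value one period earlier
y_past_period_error = np.abs(y_past[period_length:] - y_past[:-period_length]).mean()
print(y_past_period_error)                   # (2 + 1 + 2) / 3 ≈ 1.67
```

The per-item value is then repeated for every forecast step of that item, matching the `np.repeat` call in the diff.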
9 changes: 8 additions & 1 deletion frameworks/AutoGluon/setup.sh
@@ -1,4 +1,5 @@
#!/usr/bin/env bash

HERE=$(dirname "$0")
VERSION=${1:-"stable"}
REPO=${2:-"https://github.com/awslabs/autogluon.git"}
@@ -36,4 +37,10 @@ else
PIP install -e tabular/[skex]
fi

PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
if [[ ${MODULE} == "timeseries" ]]; then
PY -c "from autogluon.timeseries.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
# TODO: GPU version install
PIP install "mxnet<2.0"
else
PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
fi
16 changes: 0 additions & 16 deletions frameworks/AutoGluonTS/README.md

This file was deleted.
