-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Autogluon timeseries, addressed comments by sebhrusen #7
Changes from all commits
fdac87d
acae465
b5723cf
55c63e9
0f38986
f932669
16a165b
758b92d
04872e7
888a1cb
e15de3e
866492f
9252835
18cc6af
3e8945a
4ca2118
f7f21fc
fb429c6
23d057a
1396d20
8332960
d41f632
3935e9e
1f7c574
537d9c7
79e54c9
6a25170
5862ace
928c2cf
47d311c
b244e9c
3074f42
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,10 +30,10 @@ def __init__(self, cache_dir=None): | |
self._cache_dir = cache_dir if cache_dir else tempfile.mkdtemp(prefix='amlb_cache') | ||
|
||
@profile(logger=log) | ||
def load(self, dataset, fold=0, timestamp_column=None): | ||
def load(self, dataset, fold=0): | ||
dataset = dataset if isinstance(dataset, ns) else ns(path=dataset) | ||
log.debug("Loading dataset %s", dataset) | ||
paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold) | ||
paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold, name=dataset['name'] if 'name' in dataset else None) | ||
assert fold < len(paths['train']), f"No training dataset available for fold {fold} among dataset files {paths['train']}" | ||
# seed = rget().seed(fold) | ||
# if len(paths['test']) == 0: | ||
|
@@ -51,21 +51,27 @@ def load(self, dataset, fold=0, timestamp_column=None): | |
if ext == '.arff': | ||
return ArffDataset(train_path, test_path, target=target, features=features, type=type_) | ||
elif ext == '.csv': | ||
return CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=timestamp_column) | ||
if DatasetType[dataset['type']] == DatasetType.timeseries and dataset['timestamp_column'] is None: | ||
log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.") | ||
dataset['timestamp_column'] = "timestamp" | ||
csv_dataset = CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=dataset['timestamp_column'] if 'timestamp_column' in dataset else None) | ||
if csv_dataset.type == DatasetType.timeseries: | ||
csv_dataset = self.extend_dataset_with_timeseries_config(csv_dataset, dataset) | ||
return csv_dataset | ||
else: | ||
raise ValueError(f"Unsupported file type: {ext}") | ||
|
||
def _extract_train_test_paths(self, dataset, fold=None): | ||
def _extract_train_test_paths(self, dataset, fold=None, name=None): | ||
if isinstance(dataset, (tuple, list)): | ||
assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]." | ||
return self._extract_train_test_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0], | ||
test=[p for i, p in enumerate(dataset) if i % 2 == 1]), | ||
fold=fold) | ||
fold=fold, name=name) | ||
elif isinstance(dataset, ns): | ||
return dict(train=[self._extract_train_test_paths(p)['train'][0] | ||
return dict(train=[self._extract_train_test_paths(p, name=name)['train'][0] | ||
if i == fold else None | ||
for i, p in enumerate(as_list(dataset.train))], | ||
test=[self._extract_train_test_paths(p)['train'][0] | ||
test=[self._extract_train_test_paths(p, name=name)['train'][0] | ||
if i == fold else None | ||
for i, p in enumerate(as_list(dataset.test))]) | ||
else: | ||
|
@@ -116,7 +122,10 @@ def _extract_train_test_paths(self, dataset, fold=None): | |
assert len(paths) > 0, f"No dataset file found in {dataset}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..." | ||
return paths | ||
elif is_valid_url(dataset): | ||
cached_file = os.path.join(self._cache_dir, os.path.basename(dataset)) | ||
if name is None: | ||
cached_file = os.path.join(self._cache_dir, os.path.basename(dataset)) | ||
else: | ||
cached_file = os.path.join(self._cache_dir, name, os.path.basename(dataset)) | ||
if not os.path.exists(cached_file): # don't download if previously done | ||
handler = get_file_handler(dataset) | ||
assert handler.exists(dataset), f"Invalid path/url: {dataset}" | ||
|
@@ -129,6 +138,38 @@ def __repr__(self): | |
return repr_def(self) | ||
|
||
|
||
def extend_dataset_with_timeseries_config(self, dataset, dataset_config):
    """Attach the timeseries settings from `dataset_config` to `dataset` and validate them.

    Settings that are absent or `None` are defaulted: `id_column` to ``"item_id"``
    and `forecast_range_in_steps` to ``"1"``.

    Both arguments are modified in place and no copies are made: the defaults are
    written back into `dataset_config`, and `timestamp_column`, `id_column` and
    `forecast_range_in_steps` are set as attributes on `dataset`. Callers that need
    the originals untouched must pass copies.

    :param dataset: object exposing `train.X` and `test.X`, each supporting a
        pandas-style `groupby(...).size()`.
    :param dataset_config: mapping-like object supporting item access and assignment
        for the keys `timestamp_column`, `id_column` and `forecast_range_in_steps`.
    :return: the same `dataset` object that was passed in.
    :raises ValueError: if the per-item length difference between test and train is
        not identical for all items, or if the forecast range exceeds either that
        difference or the shortest sequence length minus one.
    """
    def configured(key):
        # An absent key is treated like an explicit None instead of raising.
        try:
            return dataset_config[key]
        except (KeyError, AttributeError):
            return None

    if configured('id_column') is None:
        log.warning("Warning: For timeseries task setting undefined id_column to `item_id`.")
        dataset_config['id_column'] = "item_id"
    if configured('forecast_range_in_steps') is None:
        log.warning("Warning: For timeseries task setting undefined forecast_range_in_steps to `1`.")
        dataset_config['forecast_range_in_steps'] = "1"

    dataset.timestamp_column = configured('timestamp_column')
    dataset.id_column = dataset_config['id_column']
    dataset.forecast_range_in_steps = int(dataset_config['forecast_range_in_steps'])

    # `size()` gives exactly one length per item id, whatever the number of feature
    # columns; `count()` would give one value per column and break the scalar
    # conversions below.
    train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).size()
    test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).size()
    seqs_lengths_diff = test_seqs_lengths - train_seqs_lengths

    forecast_range_in_steps_mean_diff_train_test = int(seqs_lengths_diff.mean())
    forecast_range_in_steps_max_min_train_test = int(min(test_seqs_lengths.min(), train_seqs_lengths.min())) - 1

    if dataset.forecast_range_in_steps != forecast_range_in_steps_mean_diff_train_test:
        msg = f"Warning: Forecast range {dataset.forecast_range_in_steps}, does not equal mean difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
        log.warning(msg)
    # Counting distinct differences also works for a single item, where the sample
    # variance would be NaN and could never compare equal to zero.
    if seqs_lengths_diff.nunique() > 1:
        msg = "Error: Not all sequences of train and test set have same sequence length difference."
        raise ValueError(msg)
    if dataset.forecast_range_in_steps > forecast_range_in_steps_mean_diff_train_test:
        msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
        raise ValueError(msg)
    if dataset.forecast_range_in_steps > forecast_range_in_steps_max_min_train_test:
        msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than minimum sequence length - 1, {forecast_range_in_steps_max_min_train_test}."
        raise ValueError(msg)
    return dataset
|
||
|
||
|
||
class FileDataset(Dataset): | ||
|
||
def __init__(self, train: Datasplit, test: Datasplit, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# AutoGluon | ||
|
||
To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...``` | ||
|
||
To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluon:latest ...``` | ||
|
||
|
||
# AutoGluonTS | ||
|
||
AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems. | ||
|
||
## Run Steps | ||
|
||
To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts timeseries ...``` | ||
|
||
To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest timeseries ...``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,53 @@ | ||
from amlb.benchmark import TaskConfig | ||
from amlb.data import Dataset | ||
|
||
from amlb.utils import call_script_in_same_dir | ||
from amlb.benchmark import TaskConfig | ||
from amlb.data import Dataset, DatasetType | ||
from copy import deepcopy | ||
|
||
|
||
def setup(*args, **kwargs):
    """Invoke the ``setup.sh`` script through `call_script_in_same_dir`.

    Every positional and keyword argument is forwarded unchanged; nothing is returned.
    """
    setup_script = "setup.sh"
    call_script_in_same_dir(__file__, setup_script, *args, **kwargs)
|
||
|
||
def run(dataset: Dataset, config: TaskConfig):
    """Run the framework on `dataset` through `run_in_venv`.

    Timeseries datasets are handed to ``exec_ts.py``; every other dataset type
    goes to ``exec.py``. Each modality builds its own input in a dedicated helper
    so that further modalities can be added without growing this function.

    :param dataset: the dataset to train and predict on.
    :param config: the task configuration, forwarded unchanged to `run_in_venv`.
    :return: whatever `run_in_venv` returns.
    :raises AttributeError: for a timeseries dataset without `forecast_range_in_steps`.
    """
    from frameworks.shared.caller import run_in_venv

    if dataset.type is not DatasetType.timeseries:
        data = _tabular_input(dataset)
        exec_file = "exec.py"
    else:
        dataset, data = _timeseries_input(dataset)
        exec_file = "exec_ts.py"

    return run_in_venv(__file__, exec_file,
                       input_data=data, dataset=dataset, config=config)


def _common_input(dataset, train_path, test_path):
    """Build the input entries shared by all modalities."""
    return dict(
        train=dict(path=train_path),
        test=dict(path=test_path),
        target=dict(
            name=dataset.target.name,
            classes=dataset.target.values
        ),
        problem_type=dataset.type.name  # AutoGluon problem_type is using same names as amlb.data.DatasetType
    )


def _tabular_input(dataset):
    """Build the input for non-timeseries tasks, pointing at the parquet version of each split."""
    return _common_input(dataset,
                         dataset.train.data_path('parquet'),
                         dataset.test.data_path('parquet'))


def _timeseries_input(dataset):
    """Return a private copy of `dataset` and the input built from it for timeseries tasks."""
    # Checked before copying so an unusable dataset fails without paying for the deepcopy.
    if not hasattr(dataset, 'forecast_range_in_steps'):
        raise AttributeError("Unspecified `forecast_range_in_steps`.")

    # Work on a copy: the defaults set below must not leak into the caller's dataset.
    dataset = deepcopy(dataset)
    for optional in ('timestamp_column', 'id_column'):
        if not hasattr(dataset, optional):
            setattr(dataset, optional, None)

    # Timeseries tasks pass each split's own `path` instead of a parquet export.
    data = _common_input(dataset, dataset.train.path, dataset.test.path)
    data.update(
        timestamp_column=dataset.timestamp_column,
        id_column=dataset.id_column,
        forecast_range_in_steps=dataset.forecast_range_in_steps
    )
    return dataset, data
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are mutating the caller's `dataset_config` here (the outer context). That may be fine, but I want to point out that it is happening.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The same applies to `dataset`: the caller's object is mutated. Consider either documenting that this is intended or working on a deep copy of the objects.