diff --git a/amlb/benchmark.py b/amlb/benchmark.py index 4ed79d6a1..7c54a344c 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -489,20 +489,9 @@ def load_data(self): # TODO raise NotImplementedError("OpenML datasets without task_id are not supported yet.") elif hasattr(self._task_def, 'dataset'): - if self._task_def.dataset['type'] == 'timeseries' and self._task_def.dataset['timestamp_column'] is None: - log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.") - self._task_def.dataset['timestamp_column'] = "timestamp" - self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=self._task_def.dataset, fold=self.fold, timestamp_column=self._task_def.dataset['timestamp_column']) - if self._dataset.type == DatasetType.timeseries: - if self._task_def.dataset['id_column'] is None: - log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.") - self._task_def.dataset['id_column'] = "item_id" - if self._task_def.dataset['prediction_length'] is None: - log.warning("Warning: For timeseries task setting undefined prediction length to `1`.") - self._task_def.dataset['prediction_length'] = "1" - self._dataset.timestamp_column=self._task_def.dataset['timestamp_column'] - self._dataset.id_column=self._task_def.dataset['id_column'] - self._dataset.prediction_length=self._task_def.dataset['prediction_length'] + dataset_name_and_config = copy(self._task_def.dataset) + dataset_name_and_config.name = self._task_def.name + self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=dataset_name_and_config, fold=self.fold) else: raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].") diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index 0bfa9453b..ffcfa8094 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -30,10 +30,10 @@ def __init__(self, cache_dir=None): self._cache_dir = cache_dir if cache_dir else 
tempfile.mkdtemp(prefix='amlb_cache') @profile(logger=log) - def load(self, dataset, fold=0, timestamp_column=None): + def load(self, dataset, fold=0): dataset = dataset if isinstance(dataset, ns) else ns(path=dataset) log.debug("Loading dataset %s", dataset) - paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold) + paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold, name=dataset['name'] if 'name' in dataset else None) assert fold < len(paths['train']), f"No training dataset available for fold {fold} among dataset files {paths['train']}" # seed = rget().seed(fold) # if len(paths['test']) == 0: @@ -51,21 +51,27 @@ def load(self, dataset, fold=0, timestamp_column=None): if ext == '.arff': return ArffDataset(train_path, test_path, target=target, features=features, type=type_) elif ext == '.csv': - return CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=timestamp_column) + if DatasetType[dataset['type']] == DatasetType.timeseries and dataset['timestamp_column'] is None: + log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.") + dataset['timestamp_column'] = "timestamp" + csv_dataset = CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=dataset['timestamp_column'] if 'timestamp_column' in dataset else None) + if csv_dataset.type == DatasetType.timeseries: + csv_dataset = self.extend_dataset_with_timeseries_config(csv_dataset, dataset) + return csv_dataset else: raise ValueError(f"Unsupported file type: {ext}") - def _extract_train_test_paths(self, dataset, fold=None): + def _extract_train_test_paths(self, dataset, fold=None, name=None): if isinstance(dataset, (tuple, list)): assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]." 
return self._extract_train_test_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0], test=[p for i, p in enumerate(dataset) if i % 2 == 1]), - fold=fold) + fold=fold, name=name) elif isinstance(dataset, ns): - return dict(train=[self._extract_train_test_paths(p)['train'][0] + return dict(train=[self._extract_train_test_paths(p, name=name)['train'][0] if i == fold else None for i, p in enumerate(as_list(dataset.train))], - test=[self._extract_train_test_paths(p)['train'][0] + test=[self._extract_train_test_paths(p, name=name)['train'][0] if i == fold else None for i, p in enumerate(as_list(dataset.test))]) else: @@ -116,7 +122,10 @@ def _extract_train_test_paths(self, dataset, fold=None): assert len(paths) > 0, f"No dataset file found in {dataset}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..." return paths elif is_valid_url(dataset): - cached_file = os.path.join(self._cache_dir, os.path.basename(dataset)) + if name is None: + cached_file = os.path.join(self._cache_dir, os.path.basename(dataset)) + else: + cached_file = os.path.join(self._cache_dir, name, os.path.basename(dataset)) if not os.path.exists(cached_file): # don't download if previously done handler = get_file_handler(dataset) assert handler.exists(dataset), f"Invalid path/url: {dataset}" @@ -129,6 +138,38 @@ def __repr__(self): return repr_def(self) + def extend_dataset_with_timeseries_config(self, dataset, dataset_config): + if dataset_config['id_column'] is None: + log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.") + dataset_config['id_column'] = "item_id" + if dataset_config['forecast_range_in_steps'] is None: + log.warning("Warning: For timeseries task setting undefined forecast_range_in_steps to `1`.") + dataset_config['forecast_range_in_steps'] = "1" + + dataset.timestamp_column=dataset_config['timestamp_column'] + dataset.id_column=dataset_config['id_column'] + 
dataset.forecast_range_in_steps=int(dataset_config['forecast_range_in_steps']) + + train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).count() + test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).count() + forecast_range_in_steps_mean_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean()) + forecast_range_in_steps_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1 + if not dataset.forecast_range_in_steps == forecast_range_in_steps_mean_diff_train_test: + msg = f"Warning: Forecast range {dataset.forecast_range_in_steps}, does not equal mean difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}." + log.warning(msg) + if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.: + msg = f"Error: Not all sequences of train and test set have same sequence length difference." + raise ValueError(msg) + if dataset.forecast_range_in_steps > forecast_range_in_steps_mean_diff_train_test: + msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}." + raise ValueError(msg) + if dataset.forecast_range_in_steps > forecast_range_in_steps_max_min_train_test: + msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than minimum sequence length + 1, {forecast_range_in_steps_max_min_train_test}." 
+ raise ValueError(msg) + return dataset + + + class FileDataset(Dataset): def __init__(self, train: Datasplit, test: Datasplit, diff --git a/amlb/results.py b/amlb/results.py index 91228ca4e..3887203f6 100644 --- a/amlb/results.py +++ b/amlb/results.py @@ -228,12 +228,13 @@ def load_predictions(predictions_file): try: df = read_csv(predictions_file, dtype=object) log.debug("Predictions preview:\n %s\n", df.head(10).to_string()) + + if rconfig().test_mode: + TaskResult.validate_predictions(df) + if 'y_past_period_error' in df.columns: return TimeSeriesResult(df) else: - if rconfig().test_mode: - TaskResult.validate_predictions(df) - if df.shape[1] > 2: return ClassificationResult(df) else: @@ -258,9 +259,9 @@ def load_metadata(metadata_file): def save_predictions(dataset: Dataset, output_file: str, predictions: Union[A, DF, S] = None, truth: Union[A, DF, S] = None, probabilities: Union[A, DF] = None, probabilities_labels: Union[list, A] = None, + optional_columns: Union[A, DF] = None, target_is_encoded: bool = False, - preview: bool = True, - quantiles: Union[A, DF] = None): + preview: bool = True): """ Save class probabilities and predicted labels to file in csv format. :param dataset: @@ -269,9 +270,9 @@ def save_predictions(dataset: Dataset, output_file: str, :param predictions: :param truth: :param probabilities_labels: + :param optional_columns: :param target_is_encoded: :param preview: - :param quantiles: :return: None """ log.debug("Saving predictions to `%s`.", output_file) @@ -315,23 +316,9 @@ def save_predictions(dataset: Dataset, output_file: str, df = df.assign(predictions=preds) df = df.assign(truth=truth) - if dataset.type == DatasetType.timeseries: - if quantiles is not None: - quantiles = quantiles.reset_index(drop=True) - df = pd.concat([df, quantiles], axis=1) - - period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works. 
- - # we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)| - # 1. retrieve item_ids for each sequence/item - item_ids, inverse_item_ids = np.unique(dataset.test.X[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True) - # 2. capture sequences in a list - y_past = [dataset.test.y.squeeze().to_numpy()[inverse_item_ids == i][:-dataset.prediction_length] for i in range(len(item_ids))] - # 3. calculate period error per sequence - y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past] - # 4. repeat period error for each sequence, to save one for each element - y_past_period_error_rep = np.repeat(y_past_period_error, dataset.prediction_length) - df = df.assign(y_past_period_error=y_past_period_error_rep) + if optional_columns is not None: + df = pd.concat([df, optional_columns], axis=1) + if preview: log.info("Predictions preview:\n %s\n", df.head(20).to_string()) backup_file(output_file) diff --git a/frameworks/AutoGluon/README.md b/frameworks/AutoGluon/README.md new file mode 100644 index 000000000..51286533e --- /dev/null +++ b/frameworks/AutoGluon/README.md @@ -0,0 +1,16 @@ +# AutoGluon + +To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...``` + +To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluon:latest ...``` + + +# AutoGluonTS + +AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems. 
+ +## Run Steps + +To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts timeseries ...``` + +To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest timeseries ...``` diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py index be2c15147..025f16590 100644 --- a/frameworks/AutoGluon/__init__.py +++ b/frameworks/AutoGluon/__init__.py @@ -1,25 +1,53 @@ -from amlb.benchmark import TaskConfig -from amlb.data import Dataset + from amlb.utils import call_script_in_same_dir +from amlb.benchmark import TaskConfig +from amlb.data import Dataset, DatasetType +from copy import deepcopy def setup(*args, **kwargs): call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs) - def run(dataset: Dataset, config: TaskConfig): from frameworks.shared.caller import run_in_venv - data = dict( - train=dict(path=dataset.train.data_path('parquet')), - test=dict(path=dataset.test.data_path('parquet')), - target=dict( - name=dataset.target.name, - classes=dataset.target.values - ), - problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType - ) - - return run_in_venv(__file__, "exec.py", - input_data=data, dataset=dataset, config=config) + if dataset.type is not DatasetType.timeseries: + data = dict( + train=dict(path=dataset.train.data_path('parquet')), + test=dict(path=dataset.test.data_path('parquet')), + target=dict( + name=dataset.target.name, + classes=dataset.target.values + ), + problem_type=dataset.type.name # AutoGluon problem_type is using same names as amlb.data.DatasetType + ) + exec_file = "exec.py" + + else: + dataset = deepcopy(dataset) + if not hasattr(dataset, 'timestamp_column'): + dataset.timestamp_column = None + if not hasattr(dataset, 'id_column'): + dataset.id_column = None + if not hasattr(dataset, 'forecast_range_in_steps'): + raise AttributeError("Unspecified `forecast_range_in_steps`.") + + data = dict( + # 
train=dict(path=dataset.train.data_path('parquet')), + # test=dict(path=dataset.test.data_path('parquet')), + train=dict(path=dataset.train.path), + test=dict(path=dataset.test.path), + target=dict( + name=dataset.target.name, + classes=dataset.target.values + ), + problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType + timestamp_column=dataset.timestamp_column, + id_column=dataset.id_column, + forecast_range_in_steps=dataset.forecast_range_in_steps + ) + exec_file = "exec_ts.py" + + return run_in_venv(__file__, exec_file, + input_data=data, dataset=dataset, config=config) diff --git a/frameworks/AutoGluonTS/exec.py b/frameworks/AutoGluon/exec_ts.py similarity index 78% rename from frameworks/AutoGluonTS/exec.py rename to frameworks/AutoGluon/exec_ts.py index 87e7f44f3..ab7c4110f 100644 --- a/frameworks/AutoGluonTS/exec.py +++ b/frameworks/AutoGluon/exec_ts.py @@ -4,6 +4,7 @@ import warnings import sys import tempfile +import numpy as np warnings.simplefilter("ignore") if sys.platform == 'darwin': @@ -27,7 +28,7 @@ def run(dataset, config): timestamp_column = dataset.timestamp_column id_column = dataset.id_column - prediction_length = dataset.prediction_length + prediction_length = dataset.forecast_range_in_steps eval_metric = get_eval_metric(config) label = dataset.target.name @@ -76,6 +77,23 @@ def run(dataset, config): save_artifacts(predictor=predictor, leaderboard=leaderboard, config=config) shutil.rmtree(predictor.path, ignore_errors=True) + quantiles = predictions.drop(columns=['mean']).reset_index(drop=True) + period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works. + + # we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)| + # 1. retrieve item_ids for each sequence/item + #dataset..X /. 
y + item_ids, inverse_item_ids = np.unique(test_data.reset_index()["item_id"].squeeze().to_numpy(), return_index=False, return_inverse=True) + # 2. capture sequences in a list + y_past = [test_data[label].squeeze().to_numpy()[inverse_item_ids == i][:-prediction_length] for i in range(len(item_ids))] + # 3. calculate period error per sequence + y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past] + # 4. repeat period error for each sequence, to save one for each element + y_past_period_error_rep = np.repeat(y_past_period_error, prediction_length) + + optional_columns = quantiles + optional_columns = optional_columns.assign(y_past_period_error=y_past_period_error_rep) + return result(output_file=config.output_predictions_file, predictions=predictions_only, truth=truth_only, @@ -85,8 +103,7 @@ def run(dataset, config): models_count=num_models_trained, training_duration=training.duration, predict_duration=predict.duration, - quantiles=predictions.drop(columns=['mean'])) - + optional_columns=optional_columns) def load_data(train_path, test_path, timestamp_column, id_column): diff --git a/frameworks/AutoGluon/setup.sh b/frameworks/AutoGluon/setup.sh index 6ef50ed8c..7cbccbee9 100755 --- a/frameworks/AutoGluon/setup.sh +++ b/frameworks/AutoGluon/setup.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash + HERE=$(dirname "$0") VERSION=${1:-"stable"} REPO=${2:-"https://github.com/awslabs/autogluon.git"} @@ -36,4 +37,10 @@ else PIP install -e tabular/[skex] fi -PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed" +if [[ ${MODULE} == "timeseries" ]]; then + PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed" + # TODO: GPU version install + PIP install "mxnet<2.0" +else + PY -c "from autogluon.timeseries.version import __version__; print(__version__)" >> "${HERE}/.setup/installed" +fi diff --git 
a/frameworks/AutoGluonTS/README.md b/frameworks/AutoGluonTS/README.md deleted file mode 100644 index 938b459c4..000000000 --- a/frameworks/AutoGluonTS/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# AutoGluonTS - -AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems. - -This code is currently a prototype, since time series support is not fully defined in AutoMLBenchmark yet. -Consider the code a proof of concept. - -## Run Steps - -To run AutoGluonTS in AutoMLBenchmark on the covid dataset from the AutoGluon tutorial, do the following: - -1. Create a fresh Python environment -2. Follow automlbenchmark install instructions -3. Run the following command in terminal: ```python3 ../automlbenchmark/runbenchmark.py autogluonts ts test``` - -To run mainline AutoGluonTS instead of v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest ts test``` diff --git a/frameworks/AutoGluonTS/__init__.py b/frameworks/AutoGluonTS/__init__.py deleted file mode 100644 index 70283c3e5..000000000 --- a/frameworks/AutoGluonTS/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -from amlb.benchmark import TaskConfig -from amlb.data import Dataset, DatasetType -from amlb.utils import call_script_in_same_dir - - -def setup(*args, **kwargs): - call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs) - - -def run(dataset: Dataset, config: TaskConfig): - from frameworks.shared.caller import run_in_venv - - if hasattr(dataset, 'timestamp_column') is False: - dataset.timestamp_column = None - if hasattr(dataset, 'id_column') is False: - dataset.id_column = None - if hasattr(dataset, 'prediction_length') is False: - raise AttributeError("Unspecified `prediction_length`.") - if dataset.type is not DatasetType.timeseries: - raise ValueError("AutoGluonTS only supports timeseries.") - - data = dict( - # train=dict(path=dataset.train.data_path('parquet')), - # test=dict(path=dataset.test.data_path('parquet')), - train=dict(path=dataset.train.path), - 
test=dict(path=dataset.test.path), - target=dict( - name=dataset.target.name, - classes=dataset.target.values - ), - problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType - timestamp_column=dataset.timestamp_column, - id_column=dataset.id_column, - prediction_length=dataset.prediction_length - ) - - return run_in_venv(__file__, "exec.py", - input_data=data, dataset=dataset, config=config) diff --git a/frameworks/AutoGluonTS/setup.sh b/frameworks/AutoGluonTS/setup.sh deleted file mode 100755 index d9fc7e8da..000000000 --- a/frameworks/AutoGluonTS/setup.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -HERE=$(dirname "$0") -VERSION=${1:-"stable"} -REPO=${2:-"https://github.com/awslabs/autogluon.git"} -PKG=${3:-"autogluon"} -if [[ "$VERSION" == "latest" ]]; then - VERSION="master" -fi - -# creating local venv -. ${HERE}/../shared/setup.sh ${HERE} true - -PIP install --upgrade pip -PIP install --upgrade setuptools wheel - -if [[ "$VERSION" == "stable" ]]; then - PIP install --no-cache-dir -U "${PKG}" - PIP install --no-cache-dir -U "${PKG}.tabular[skex]" -elif [[ "$VERSION" =~ ^[0-9] ]]; then - PIP install --no-cache-dir -U "${PKG}==${VERSION}" - PIP install --no-cache-dir -U "${PKG}.tabular[skex]==${VERSION}" -else - TARGET_DIR="${HERE}/lib/${PKG}" - rm -Rf ${TARGET_DIR} - git clone --depth 1 --single-branch --branch ${VERSION} --recurse-submodules ${REPO} ${TARGET_DIR} - cd ${TARGET_DIR} - PY_EXEC_NO_ARGS="$(cut -d' ' -f1 <<<"$py_exec")" - PY_EXEC_DIR=$(dirname "$PY_EXEC_NO_ARGS") - env PATH="$PY_EXEC_DIR:$PATH" bash -c ./full_install.sh - PIP install -e tabular/[skex] -fi - -# TODO: GPU version install -PIP install "mxnet<2.0" - -PY -c "from autogluon.timeseries.version import __version__; print(__version__)" >> "${HERE}/.setup/installed" diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py index 70b5a3be0..c596e01c5 100644 --- a/frameworks/shared/callee.py +++ b/frameworks/shared/callee.py @@ 
-17,12 +17,12 @@ class FrameworkError(Exception): def result(output_file=None, predictions=None, truth=None, probabilities=None, probabilities_labels=None, + optional_columns=None, target_is_encoded=False, error_message=None, models_count=None, training_duration=None, predict_duration=None, - quantiles=None, **others): return locals() @@ -70,7 +70,7 @@ def load_data(name, path, **_): wait_retry_secs=10): result = run_fn(ds, config) res = dict(result) - for name in ['predictions', 'truth', 'probabilities', 'quantiles']: + for name in ['predictions', 'truth', 'probabilities', 'optional_columns']: arr = result[name] if arr is not None: path = os.path.join(config.result_dir, '.'.join([name, 'data'])) diff --git a/frameworks/shared/caller.py b/frameworks/shared/caller.py index 68963a820..09654dc32 100644 --- a/frameworks/shared/caller.py +++ b/frameworks/shared/caller.py @@ -149,7 +149,7 @@ def run_in_venv(caller_file, script_file: str, *args, if res.error_message is not None: raise NoResultError(res.error_message) - for name in ['predictions', 'truth', 'probabilities', 'quantiles']: + for name in ['predictions', 'truth', 'probabilities', 'optional_columns']: res[name] = deserialize_data(res[name], config=ser_config) if res[name] is not None else None if callable(process_results): @@ -164,8 +164,8 @@ def run_in_venv(caller_file, script_file: str, *args, else dataset.test.y), probabilities=res.probabilities, probabilities_labels=res.probabilities_labels, - target_is_encoded=res.target_is_encoded, - quantiles=res.quantiles) + optional_columns=res.optional_columns, + target_is_encoded=res.target_is_encoded) return dict( models_count=res.models_count if res.models_count is not None else 1, diff --git a/resources/benchmarks/timeseries.yaml b/resources/benchmarks/timeseries.yaml new file mode 100644 index 000000000..26af06497 --- /dev/null +++ b/resources/benchmarks/timeseries.yaml @@ -0,0 +1,13 @@ +--- + +- name: covid + dataset: + train: 
https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv + test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv + target: ConfirmedCases + type: timeseries + forecast_range_in_steps: 19 + id_column: name + timestamp_column: Date + + folds: 1 diff --git a/resources/benchmarks/ts.yaml b/resources/benchmarks/ts.yaml deleted file mode 100644 index 0a73c81fb..000000000 --- a/resources/benchmarks/ts.yaml +++ /dev/null @@ -1,15 +0,0 @@ ---- - -- name: covid - dataset: - train: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv - # s3://autogluon-ts-bench/data/covid_deaths/csv/train.csv | https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv - test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv - # s3://autogluon-ts-bench/data/covid_deaths/csv/test.csv | https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv - target: ConfirmedCases # target | ConfirmedCases - type: timeseries - prediction_length: 30 - id_column: name # item_id | name - timestamp_column: Date # timestamp | Date - - folds: 1 diff --git a/resources/frameworks.yaml b/resources/frameworks.yaml index eb59c44bf..513c99586 100644 --- a/resources/frameworks.yaml +++ b/resources/frameworks.yaml @@ -86,9 +86,9 @@ autoxgboost: flaml: version: 'stable' description: | - FLAML is a lightweight Python library that finds accurate machine learning models - automatically, efficiently and economically. It frees users from selecting learners - and hyperparameters for each learner. It is fast and cheap. + FLAML is a lightweight Python library that finds accurate machine learning models + automatically, efficiently and economically. It frees users from selecting learners + and hyperparameters for each learner. It is fast and cheap. 
project: https://github.com/microsoft/FLAML refs: [https://arxiv.org/pdf/1911.04706.pdf] @@ -139,12 +139,12 @@ mljarsupervised_compete: description: "MLJAR is using 'Compete' mode to provide the most accurate predictor" params: mode: Compete # set mode for Compete, default mode is Explain - + MLNet: version: 'latest' description: | MLNET.CLI is a automated machine learning tool implemented by ml.net. - + MLPlan: version: 'stable' abstract: true @@ -196,10 +196,12 @@ TPOT: #################################### AutoGluonTS: + extends: AutoGluon version: "stable" description: | AutoGluon-TimeSeries - project: https://auto.gluon.ai + setup_env: + MODULE: timeseries ####################################### ### Non AutoML reference frameworks ### @@ -242,4 +244,3 @@ TunedRandomForest: # _n_jobs: 1 # cf. RandomForest # _tuning: # n_estimators: 500 - diff --git a/resources/frameworks_latest.yaml b/resources/frameworks_latest.yaml index b23bf72b0..becdc4e3e 100644 --- a/resources/frameworks_latest.yaml +++ b/resources/frameworks_latest.yaml @@ -85,10 +85,12 @@ TPOT: #################################### AutoGluonTS: + extends: AutoGluon version: "latest" description: | AutoGluon-TimeSeries - project: https://auto.gluon.ai + setup_env: + MODULE: timeseries ####################################### ### Non AutoML reference frameworks ### @@ -111,4 +113,3 @@ TunedRandomForest: version: 'latest' params: n_estimators: 2000 -