From fdac87d788e219575b68bec8b5d5b3fde115fa72 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 14 Sep 2022 13:49:37 +0000
Subject: [PATCH 01/30] fix loading of test & train data, change prediction_length 5->30

---
 frameworks/AutoGluonTS/exec.py | 49 ++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/frameworks/AutoGluonTS/exec.py b/frameworks/AutoGluonTS/exec.py
index 9b172e129..88c46c0fb 100644
--- a/frameworks/AutoGluonTS/exec.py
+++ b/frameworks/AutoGluonTS/exec.py
@@ -32,7 +32,7 @@ def run(dataset, config):
     # TODO: Need to pass the following info somehow
     timestamp_column = "Date"
     id_column = "name"
-    prediction_length = 5
+    prediction_length = 30
     #################
 
     eval_metric = get_eval_metric(config)
@@ -41,10 +41,10 @@ def run(dataset, config):
 
     training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
 
-    train_data, test_data, test_data_leaderboard = load_data(train_path=dataset.train.path,
-                                                             timestamp_column=timestamp_column,
-                                                             id_column=id_column,
-                                                             prediction_length=prediction_length)
+    train_data, test_data = load_data(train_path=dataset.train.path,
+                                      test_path=dataset.test.path,
+                                      timestamp_column=timestamp_column,
+                                      id_column=id_column)
 
     predictor_path = tempfile.mkdtemp() + os.sep
     with Timer() as training:
@@ -61,16 +61,18 @@ def run(dataset, config):
         )
 
     with Timer() as predict:
-        predictions = predictor.predict(train_data)
+        test_data_past = test_data.copy().slice_by_timestep(slice(None, -prediction_length))
+        predictions = predictor.predict(test_data_past)
     log.info(predictions)
 
     predictions_only = predictions['mean'].values
-    truth_only = test_data[label].values
+    test_data_future = test_data.copy().slice_by_timestep(slice(-prediction_length, None))
+    truth_only = test_data_future[label].values
 
     log.info(predictions_only)
     log.info(truth_only)
 
-    leaderboard = predictor.leaderboard(test_data_leaderboard)
+    leaderboard = predictor.leaderboard(test_data)
 
     with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
         log.info(leaderboard)
@@ -91,18 +93,31 @@ def run(dataset, config):
                   predict_duration=predict.duration)
 
 
-def load_data(train_path, timestamp_column, id_column, prediction_length):
-    df = TabularDataset(train_path)
-    df[timestamp_column] = pd.to_datetime(df[timestamp_column].astype('object'))
-    train_data = TimeSeriesDataFrame.from_data_frame(df, id_column=id_column, timestamp_column=timestamp_column)
+def load_data(train_path, test_path, timestamp_column, id_column):
 
-    test_data_leaderboard = train_data.copy()
-    # the data set with the last prediction_length time steps included, i.e., akin to `a[:-5]`
-    train_data = train_data.slice_by_timestep(slice(None, -prediction_length))
+    """Load the train and test splits as `TimeSeriesDataFrame` objects.
+
+    Each CSV file is read with `timestamp_column` parsed as dates, then
+    converted using `id_column` and `timestamp_column`.
+
+    :param train_path: path of the training CSV file.
+    :param test_path: path of the test CSV file.
+    :param timestamp_column: name of the column parsed as dates.
+    :param id_column: name of the column passed as `id_column`.
+    :return: the `(train_data, test_data)` tuple.
+    """
 
-    test_data = test_data_leaderboard.slice_by_timestep(slice(-prediction_length, None))
+    def _read_split(csv_path):
+        frame = pd.read_csv(csv_path, parse_dates=[timestamp_column])
+        return TimeSeriesDataFrame.from_data_frame(
+            frame,
+            id_column=id_column,
+            timestamp_column=timestamp_column,
+        )
 
-    return train_data, test_data, test_data_leaderboard
+    train_data = _read_split(train_path)
+    test_data = _read_split(test_path)
+    return train_data, test_data
 
 
 def get_eval_metric(config):

From acae465e418f94e7a9f69c53b6354f11f1df1bc6 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 14 Sep 2022 13:51:22 +0000
Subject: [PATCH 02/30] ignore launch.json of vscode

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 4dba33db1..bc9c76adc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ venv/
 .idea/
 *.iml
 *.swp
+launch.json
 
 # tmp files
 .ipynb_checkpoints/

From b5723cfa4f5ede93beadf97d31825fe1902ca674 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Fri, 16 Sep 2022 15:41:08 +0000
Subject: [PATCH 03/30] ensuring timestamp parsing

---
 amlb/datautils.py | 44 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

diff --git a/amlb/datautils.py b/amlb/datautils.py
index f3eeeb2a5..1c489ef8b 100644
--- a/amlb/datautils.py
+++ b/amlb/datautils.py
@@ -26,7 +26,7 @@
 log = logging.getLogger(__name__)
 
 
-def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dtype=None):
+def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dtype=None, timestamp_column=None):
     """
     read csv file to DataFrame.
 
@@ -39,11 +39,15 @@ def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dty
     :param dtype: data type for columns.
     :return: a DataFrame
     """
+    if dtype is not None and timestamp_column is not None and timestamp_column in dtype:
+            del dtype[timestamp_column]
+
     df = pd.read_csv(path,
                      nrows=nrows,
                      header=0 if header else None,
                      index_col=0 if index else None,
-                     dtype=dtype)
+                     dtype=dtype,
+                     parse_dates=[timestamp_column] if timestamp_column is not None else None)
     return df if as_data_frame else df.values
 
 
@@ -344,3 +348,39 @@ def _restore_dtypes(X_np, X_ori):
         return X_np.astype(X_ori.dtype, copy=False)
     else:
         return X_np
+
+
+DEFAULT_SEASONALITIES = {
+    "S": 3600,  # 1 hour
+    "T": 1440,  # 1 day
+    "H": 24,  # 1 day
+    "D": 1,  # 1 day
+    "W": 1,  # 1 week
+    "M": 12,
+    "B": 5,
+    "Q": 4,
+}
+
+
+def norm_freq_str(freq_str: str) -> str:
+    return freq_str.split("-")[0]
+
+def get_seasonality(freq: str, seasonalities=DEFAULT_SEASONALITIES) -> int:
+    """
+    Return the seasonality of a given frequency:
+    >>> get_seasonality("2H")
+    12
+    """
+    offset = pd.tseries.frequencies.to_offset(freq)
+
+    base_seasonality = seasonalities.get(norm_freq_str(offset.name), 1)
+
+    seasonality, remainder = divmod(base_seasonality, offset.n)
+    if not remainder:
+        return seasonality
+
+    log.warning(
+        f"Multiple {offset.n} does not divide base seasonality "
+        f"{base_seasonality}. Falling back to seasonality 1."
+    )
+    return 1

From 55c63e9cb6167d2afe4f34ce2b60c86a2826bf3c Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Fri, 16 Sep 2022 15:44:03 +0000
Subject: [PATCH 04/30] pass config, save pred, add results

---
 amlb/benchmark.py                  |  14 +++-
 amlb/data.py                       |   1 +
 amlb/datasets/file.py              |  20 ++---
 amlb/results.py                    | 114 +++++++++++++++++++++++++++--
 frameworks/AutoGluonTS/__init__.py |   6 +-
 frameworks/AutoGluonTS/exec.py     |  10 ++-
 frameworks/shared/callee.py        |   3 +-
 frameworks/shared/caller.py        |   5 +-
 resources/benchmarks/ts.yaml       |   8 +-
 resources/config.yaml              |   1 +
 10 files changed, 152 insertions(+), 30 deletions(-)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index b9975efdc..43f64e9f9 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -489,7 +489,11 @@ def load_data(self):
             # TODO
             raise NotImplementedError("OpenML datasets without task_id are not supported yet.")
         elif hasattr(self._task_def, 'dataset'):
-            self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=self._task_def.dataset, fold=self.fold)
+            self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=self._task_def.dataset, fold=self.fold, timestamp_column=self._task_def.dataset['timestamp_column'])
+            if self._dataset.type == DatasetType.timeseries:
+                self._dataset.timestamp_column=self._task_def.dataset['timestamp_column']
+                self._dataset.id_column=self._task_def.dataset['id_column']
+                self._dataset.prediction_length=self._task_def.dataset['prediction_length']
         else:
             raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].")
 
@@ -522,7 +526,12 @@ def run(self):
                              predictions_dir=self.benchmark.output_dirs.predictions)
         framework_def = self.benchmark.framework_def
         task_config = copy(self.task_config)
-        task_config.type = 'regression' if self._dataset.type == DatasetType.regression else 'classification'
+        if self._dataset.type == DatasetType.regression:
+            task_config.type = 'regression'
+        elif self._dataset.type == DatasetType.timeseries:
+            task_config.type = 'timeseries'
+        else:
+            task_config.type = 'classification'
         task_config.type_ = self._dataset.type.name
         task_config.framework = self.benchmark.framework_name
         task_config.framework_params = framework_def.params
@@ -552,4 +561,3 @@ def run(self):
         finally:
             self._dataset.release()
         return results.compute_score(result=result, meta_result=meta_result)
-
diff --git a/amlb/data.py b/amlb/data.py
index 4e4cea879..acca17841 100644
--- a/amlb/data.py
+++ b/amlb/data.py
@@ -172,6 +172,7 @@ class DatasetType(Enum):
     binary = 1
     multiclass = 2
     regression = 3
+    timeseries = 4
 
 
 class Dataset(ABC):
diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py
index 6ddca4042..0bfa9453b 100644
--- a/amlb/datasets/file.py
+++ b/amlb/datasets/file.py
@@ -30,7 +30,7 @@ def __init__(self, cache_dir=None):
         self._cache_dir = cache_dir if cache_dir else tempfile.mkdtemp(prefix='amlb_cache')
 
     @profile(logger=log)
-    def load(self, dataset, fold=0):
+    def load(self, dataset, fold=0, timestamp_column=None):
         dataset = dataset if isinstance(dataset, ns) else ns(path=dataset)
         log.debug("Loading dataset %s", dataset)
         paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold)
@@ -51,7 +51,7 @@ def load(self, dataset, fold=0):
         if ext == '.arff':
             return ArffDataset(train_path, test_path, target=target, features=features, type=type_)
         elif ext == '.csv':
-            return CsvDataset(train_path, test_path, target=target, features=features, type=type_)
+            return CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=timestamp_column)
         else:
             raise ValueError(f"Unsupported file type: {ext}")
 
@@ -302,25 +302,26 @@ def release(self, properties=None):
 class CsvDataset(FileDataset):
 
     def __init__(self, train_path, test_path,
-                 target=None, features=None, type=None):
+                 target=None, features=None, type=None, timestamp_column=None):
         # todo: handle auto-split (if test_path is None): requires loading the training set, split, save
         super().__init__(None, None,
                          target=target, features=features, type=type)
-        self._train = CsvDatasplit(self, train_path)
-        self._test = CsvDatasplit(self, test_path)
+        self._train = CsvDatasplit(self, train_path, timestamp_column=timestamp_column)
+        self._test = CsvDatasplit(self, test_path, timestamp_column=timestamp_column)
         self._dtypes = None
 
 
 class CsvDatasplit(FileDatasplit):
 
-    def __init__(self, dataset, path):
+    def __init__(self, dataset, path, timestamp_column=None):
         super().__init__(dataset, format='csv', path=path)
         self._ds = None
+        self.timestamp_column = timestamp_column
 
     def _ensure_loaded(self):
         if self._ds is None:
             if self.dataset._dtypes is None:
-                df = read_csv(self.path)
+                df = read_csv(self.path, timestamp_column=self.timestamp_column)
                 # df = df.convert_dtypes()
                 dt_conversions = {name: 'category'
                                   for name, dtype in zip(df.dtypes.index, df.dtypes.values)
@@ -336,8 +337,9 @@ def _ensure_loaded(self):
 
                 self._ds = df
                 self.dataset._dtypes = self._ds.dtypes
+
             else:
-                self._ds = read_csv(self.path, dtype=self.dataset._dtypes.to_dict())
+                self._ds = read_csv(self.path, dtype=self.dataset._dtypes.to_dict(), timestamp_column=self.timestamp_column)
 
     @profile(logger=log)
     def load_metadata(self):
@@ -348,7 +350,7 @@ def load_metadata(self):
                                       else 'number' if pat.is_numeric_dtype(dt)
                                       else 'category' if pat.is_categorical_dtype(dt)
                                       else 'string' if pat.is_string_dtype(dt)
-                                      # else 'datetime' if pat.is_datetime64_dtype(dt)
+                                      else 'datetime' if pat.is_datetime64_dtype(dt)
                                       else 'object')
         features = [Feature(i, col, to_feature_type(dtypes[i]))
                     for i, col in enumerate(self._ds.columns)]
diff --git a/amlb/results.py b/amlb/results.py
index 2f547b4ec..f86a7eebe 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -20,7 +20,7 @@
 from .data import Dataset, DatasetType, Feature
 from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \
     mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, r2_score, roc_auc_score, \
-    read_csv, write_csv, is_data_frame, to_data_frame
+    read_csv, write_csv, is_data_frame, to_data_frame, get_seasonality
 from .resources import get as rget, config as rconfig, output_dirs
 from .utils import Namespace, backup_file, cached, datetime_iso, get_metadata, json_load, memoize, profile, set_metadata
 
@@ -228,12 +228,16 @@ def load_predictions(predictions_file):
             try:
                 df = read_csv(predictions_file, dtype=object)
                 log.debug("Predictions preview:\n %s\n", df.head(10).to_string())
-                if rconfig().test_mode:
-                    TaskResult.validate_predictions(df)
-                if df.shape[1] > 2:
-                    return ClassificationResult(df)
+                if  'y_past_period_error' in df.columns:
+                    return TimeSeriesResult(df)
                 else:
-                    return RegressionResult(df)
+                    if rconfig().test_mode:
+                        TaskResult.validate_predictions(df)
+
+                    if df.shape[1] > 2:
+                        return ClassificationResult(df)
+                    else:
+                        return RegressionResult(df)
             except Exception as e:
                 return ErrorResult(ResultError(e))
         else:
@@ -255,7 +259,8 @@ def save_predictions(dataset: Dataset, output_file: str,
                          predictions: Union[A, DF, S] = None, truth: Union[A, DF, S] = None,
                          probabilities: Union[A, DF] = None, probabilities_labels: Union[list, A] = None,
                          target_is_encoded: bool = False,
-                         preview: bool = True):
+                         preview: bool = True,
+                         quantiles: Union[A, DF] = None):
         """ Save class probabilities and predicted labels to file in csv format.
 
         :param dataset:
@@ -308,6 +313,16 @@ def save_predictions(dataset: Dataset, output_file: str,
 
         df = df.assign(predictions=preds)
         df = df.assign(truth=truth)
+        if quantiles is not None:
+            quantiles.reset_index(drop=True, inplace=True)
+            df = pd.concat([df, quantiles], axis=1)
+        if dataset.type == DatasetType.timeseries:
+            period_length = 1 # this period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this should be fine.
+            item_ids, inverse_item_ids = np.unique(dataset.test.X[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True)
+            y_past = [dataset.test.y.squeeze().to_numpy()[inverse_item_ids == i][:-dataset.prediction_length] for i in range(len(item_ids))]
+            y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past]
+            y_past_period_error_rep = np.repeat(y_past_period_error, dataset.prediction_length)
+            df = df.assign(y_past_period_error=y_past_period_error_rep)
         if preview:
             log.info("Predictions preview:\n %s\n", df.head(20).to_string())
         backup_file(output_file)
@@ -656,6 +671,91 @@ def r2(self):
         """R^2"""
         return float(r2_score(self.truth, self.predictions))
 
+class TimeSeriesResult(Result):
+    """Scores time series forecasts read from a predictions DataFrame.
+
+    The `predictions`, `truth` and `y_past_period_error` columns are read by
+    name; the columns between the second and the last one are used as quantile
+    forecasts, their labels being parsed as the quantile levels.
+    """
+
+    def __init__(self, predictions_df, info=None):
+        super().__init__(predictions_df, info)
+        self.truth = self.df['truth'].values.astype(float, copy=False)
+        self.predictions = self.df['predictions'].values.astype(float, copy=False)
+        self.y_past_period_error = self.df['y_past_period_error'].values.astype(float, copy=False)
+        # Positional slice: presumably the layout is
+        # [predictions, truth, <quantiles...>, y_past_period_error] -- TODO confirm.
+        quantile_df = self.df.iloc[:, 2:-1]
+        self.quantiles = quantile_df.values.astype(float, copy=False)
+        self.quantiles_probs = np.array([float(q) for q in quantile_df.columns])
+
+        self.target = Feature(0, 'target', 'real', is_target=True)
+        self.type = DatasetType.timeseries
+
+    @metric(higher_is_better=False)
+    def mae(self):
+        """Mean Absolute Error"""
+        return float(mean_absolute_error(self.truth, self.predictions))
+
+    @metric(higher_is_better=False)
+    def mse(self):
+        """Mean Squared Error"""
+        return float(mean_squared_error(self.truth, self.predictions))
+
+    @metric(higher_is_better=False)
+    def msle(self):
+        """Mean Squared Logarithmic Error"""
+        return float(mean_squared_log_error(self.truth, self.predictions))
+
+    @metric(higher_is_better=False)
+    def rmse(self):
+        """Root Mean Square Error"""
+        return math.sqrt(self.mse())
+
+    @metric(higher_is_better=False)
+    def rmsle(self):
+        """Root Mean Square Logarithmic Error"""
+        return math.sqrt(self.msle())
+
+    @metric(higher_is_better=True)
+    def r2(self):
+        """R^2"""
+        return float(r2_score(self.truth, self.predictions))
+
+    @metric(higher_is_better=False)
+    def mase(self):
+        """Mean Absolute Scaled Error"""
+        return float(np.nanmean(np.abs(self.truth/self.y_past_period_error - self.predictions/self.y_past_period_error)))
+
+    @metric(higher_is_better=False)
+    def smape(self):
+        """Symmetric Mean Absolute Percentage Error"""
+        num = np.abs(self.truth - self.predictions)
+        denom = (np.abs(self.truth) + np.abs(self.predictions)) / 2
+        # A zero denominator is replaced by inf so the term contributes 0 instead of NaN.
+        denom[denom == 0] = math.inf
+        return float(np.mean(num / denom))
+
+    @metric(higher_is_better=False)
+    def nrmse(self):
+        """Normalized Root Mean Square Error"""
+        return float(self.rmse() / np.mean(np.abs(self.truth)))
+
+    @metric(higher_is_better=False)
+    def nd(self):
+        """Normalized Deviation"""
+        return float(np.sum(np.abs(self.truth - self.predictions)) / np.sum(np.abs(self.truth)))
+
+    @metric(higher_is_better=False)
+    def ncrps(self):
+        """Normalized Continuous Ranked Probability Score"""
+        errors = self.quantiles - self.truth[:, None]
+        over = (self.quantiles >= self.truth[:, None]) - self.quantiles_probs[None, :]
+        # Pinball loss summed over rows -> one loss per quantile, shape [num_quantiles].
+        quantile_losses = 2 * np.sum(np.abs(errors * over), axis=0)
+        # Average (not sum) over quantiles, normalized by the total absolute truth.
+        return float(np.mean(quantile_losses / np.sum(np.abs(self.truth))))
 
 _encode_predictions_and_truth_ = False
 
diff --git a/frameworks/AutoGluonTS/__init__.py b/frameworks/AutoGluonTS/__init__.py
index 4e3f16e1f..3e1744ee7 100644
--- a/frameworks/AutoGluonTS/__init__.py
+++ b/frameworks/AutoGluonTS/__init__.py
@@ -19,9 +19,11 @@ def run(dataset: Dataset, config: TaskConfig):
             name=dataset.target.name,
             classes=dataset.target.values
         ),
-        problem_type=dataset.type.name  # AutoGluon problem_type is using same names as amlb.data.DatasetType
+        problem_type=dataset.type.name,  # AutoGluon problem_type is using same names as amlb.data.DatasetType
+        timestamp_column=dataset.timestamp_column if dataset.timestamp_column is not None else None,
+        id_column=dataset.id_column if dataset.id_column is not None else None,
+        prediction_length=dataset.prediction_length if dataset.prediction_length is not None else None
     )
 
     return run_in_venv(__file__, "exec.py",
                        input_data=data, dataset=dataset, config=config)
-
diff --git a/frameworks/AutoGluonTS/exec.py b/frameworks/AutoGluonTS/exec.py
index 88c46c0fb..f14579515 100644
--- a/frameworks/AutoGluonTS/exec.py
+++ b/frameworks/AutoGluonTS/exec.py
@@ -30,9 +30,9 @@ def run(dataset, config):
 
     #################
     # TODO: Need to pass the following info somehow
-    timestamp_column = "Date"
-    id_column = "name"
-    prediction_length = 30
+    timestamp_column = dataset.timestamp_column
+    id_column = dataset.id_column
+    prediction_length = dataset.prediction_length
     #################
 
     eval_metric = get_eval_metric(config)
@@ -90,7 +90,8 @@ def run(dataset, config):
                   target_is_encoded=False,
                   models_count=num_models_trained,
                   training_duration=training.duration,
-                  predict_duration=predict.duration)
+                  predict_duration=predict.duration,
+                  quantiles=predictions.iloc[:, 1:])
 
 
 def load_data(train_path, test_path, timestamp_column, id_column):
@@ -125,6 +126,7 @@ def get_eval_metric(config):
     metrics_mapping = dict(
         mse="MSE",
         rmse="RMSE",
+        mase="MASE",
     )
 
     eval_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py
index 3bf70dd3c..70b5a3be0 100644
--- a/frameworks/shared/callee.py
+++ b/frameworks/shared/callee.py
@@ -22,6 +22,7 @@ def result(output_file=None,
            models_count=None,
            training_duration=None,
            predict_duration=None,
+           quantiles=None,
            **others):
     return locals()
 
@@ -69,7 +70,7 @@ def load_data(name, path, **_):
                               wait_retry_secs=10):
             result = run_fn(ds, config)
             res = dict(result)
-            for name in ['predictions', 'truth', 'probabilities']:
+            for name in ['predictions', 'truth', 'probabilities', 'quantiles']:
                 arr = result[name]
                 if arr is not None:
                     path = os.path.join(config.result_dir, '.'.join([name, 'data']))
diff --git a/frameworks/shared/caller.py b/frameworks/shared/caller.py
index da8cea0e5..68963a820 100644
--- a/frameworks/shared/caller.py
+++ b/frameworks/shared/caller.py
@@ -149,7 +149,7 @@ def run_in_venv(caller_file, script_file: str, *args,
         if res.error_message is not None:
             raise NoResultError(res.error_message)
 
-        for name in ['predictions', 'truth', 'probabilities']:
+        for name in ['predictions', 'truth', 'probabilities', 'quantiles']:
             res[name] = deserialize_data(res[name], config=ser_config) if res[name] is not None else None
 
         if callable(process_results):
@@ -164,7 +164,8 @@ def run_in_venv(caller_file, script_file: str, *args,
                                     else dataset.test.y),
                              probabilities=res.probabilities,
                              probabilities_labels=res.probabilities_labels,
-                             target_is_encoded=res.target_is_encoded)
+                             target_is_encoded=res.target_is_encoded,
+                             quantiles=res.quantiles)
 
         return dict(
             models_count=res.models_count if res.models_count is not None else 1,
diff --git a/resources/benchmarks/ts.yaml b/resources/benchmarks/ts.yaml
index 5a67366b7..04cb86bb9 100644
--- a/resources/benchmarks/ts.yaml
+++ b/resources/benchmarks/ts.yaml
@@ -3,7 +3,11 @@
 - name: covid
   dataset:
     train: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
-    test:  https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
+    test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
     target: ConfirmedCases
-  folds: 1
+    type: timeseries
+    prediction_length: 30
+    id_column: name
+    timestamp_column: Date
 
+  folds: 1
diff --git a/resources/config.yaml b/resources/config.yaml
index ba3a9f930..835758c76 100644
--- a/resources/config.yaml
+++ b/resources/config.yaml
@@ -54,6 +54,7 @@ benchmarks:                     # configuration namespace for the benchmarks def
     binary: ['auc', 'logloss', 'acc', 'balacc']     # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error).
     multiclass: ['logloss', 'acc', 'balacc']        # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average.
     regression: ['rmse', 'r2', 'mae']               # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2).
+    timeseries: ['mase', 'smape', 'nrmse', 'nd', 'ncrps', 'rmse']
   defaults:            # the default constraints, usually overridden by a constraint.
     folds: 10          # the amount of fold-runs executed for each dataset.
     max_runtime_seconds: 3600   # default time allocated to the framework to train a model.

From 0f3898678f2d6ba6c1ed751f0cb2fe7167520d24 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Fri, 16 Sep 2022 15:58:48 +0000
Subject: [PATCH 05/30] remove unused code

---
 amlb/datautils.py | 36 ------------------------------------
 amlb/results.py   |  2 +-
 2 files changed, 1 insertion(+), 37 deletions(-)

diff --git a/amlb/datautils.py b/amlb/datautils.py
index 1c489ef8b..efc5b2c6d 100644
--- a/amlb/datautils.py
+++ b/amlb/datautils.py
@@ -348,39 +348,3 @@ def _restore_dtypes(X_np, X_ori):
         return X_np.astype(X_ori.dtype, copy=False)
     else:
         return X_np
-
-
-DEFAULT_SEASONALITIES = {
-    "S": 3600,  # 1 hour
-    "T": 1440,  # 1 day
-    "H": 24,  # 1 day
-    "D": 1,  # 1 day
-    "W": 1,  # 1 week
-    "M": 12,
-    "B": 5,
-    "Q": 4,
-}
-
-
-def norm_freq_str(freq_str: str) -> str:
-    return freq_str.split("-")[0]
-
-def get_seasonality(freq: str, seasonalities=DEFAULT_SEASONALITIES) -> int:
-    """
-    Return the seasonality of a given frequency:
-    >>> get_seasonality("2H")
-    12
-    """
-    offset = pd.tseries.frequencies.to_offset(freq)
-
-    base_seasonality = seasonalities.get(norm_freq_str(offset.name), 1)
-
-    seasonality, remainder = divmod(base_seasonality, offset.n)
-    if not remainder:
-        return seasonality
-
-    log.warning(
-        f"Multiple {offset.n} does not divide base seasonality "
-        f"{base_seasonality}. Falling back to seasonality 1."
-    )
-    return 1
diff --git a/amlb/results.py b/amlb/results.py
index f86a7eebe..186c5a6c7 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -20,7 +20,7 @@
 from .data import Dataset, DatasetType, Feature
 from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \
     mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, r2_score, roc_auc_score, \
-    read_csv, write_csv, is_data_frame, to_data_frame, get_seasonality
+    read_csv, write_csv, is_data_frame, to_data_frame
 from .resources import get as rget, config as rconfig, output_dirs
 from .utils import Namespace, backup_file, cached, datetime_iso, get_metadata, json_load, memoize, profile, set_metadata
 

From f93266994634c23fb3d1466481419ca60f8d8c37 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Tue, 20 Sep 2022 14:02:57 +0000
Subject: [PATCH 06/30] add readability, remove slice from timer

---
 frameworks/AutoGluonTS/exec.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/frameworks/AutoGluonTS/exec.py b/frameworks/AutoGluonTS/exec.py
index f14579515..4720ac26a 100644
--- a/frameworks/AutoGluonTS/exec.py
+++ b/frameworks/AutoGluonTS/exec.py
@@ -22,18 +22,12 @@
 log = logging.getLogger(__name__)
 
 
-# FIXME: Why does leaderboard claim a different test score than AMLB for RMSE?
-# FIXME: Currently ignoring test_path, just using train data for evaluation
-# TODO: How to evaluate more complex metrics?
 def run(dataset, config):
     log.info(f"\n**** AutoGluon TimeSeries [v{__version__}] ****\n")
 
-    #################
-    # TODO: Need to pass the following info somehow
     timestamp_column = dataset.timestamp_column
     id_column = dataset.id_column
     prediction_length = dataset.prediction_length
-    #################
 
     eval_metric = get_eval_metric(config)
     label = dataset.target.name
@@ -45,6 +39,7 @@ def run(dataset, config):
                                       test_path=dataset.test.path,
                                       timestamp_column=timestamp_column,
                                       id_column=id_column)
+    test_data_past = test_data.copy().slice_by_timestep(slice(None, -prediction_length))
 
     predictor_path = tempfile.mkdtemp() + os.sep
     with Timer() as training:
@@ -61,7 +56,6 @@ def run(dataset, config):
         )
 
     with Timer() as predict:
-        test_data_past = test_data.copy().slice_by_timestep(slice(None, -prediction_length))
         predictions = predictor.predict(test_data_past)
     log.info(predictions)
 
@@ -72,7 +66,7 @@ def run(dataset, config):
     log.info(predictions_only)
     log.info(truth_only)
 
-    leaderboard = predictor.leaderboard(test_data)
+    leaderboard = predictor.leaderboard(test_data, silent=True)
 
     with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000):
         log.info(leaderboard)
@@ -91,7 +85,7 @@ def run(dataset, config):
                   models_count=num_models_trained,
                   training_duration=training.duration,
                   predict_duration=predict.duration,
-                  quantiles=predictions.iloc[:, 1:])
+                  quantiles=predictions.drop(columns=['mean']))
 
 
 def load_data(train_path, test_path, timestamp_column, id_column):

From 16a165b890f9499dd4e07af6113876fde547b5cc Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Tue, 20 Sep 2022 14:04:41 +0000
Subject: [PATCH 07/30] ensure autogluonts has required info

---
 frameworks/AutoGluonTS/__init__.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/frameworks/AutoGluonTS/__init__.py b/frameworks/AutoGluonTS/__init__.py
index 3e1744ee7..70283c3e5 100644
--- a/frameworks/AutoGluonTS/__init__.py
+++ b/frameworks/AutoGluonTS/__init__.py
@@ -1,5 +1,5 @@
 from amlb.benchmark import TaskConfig
-from amlb.data import Dataset
+from amlb.data import Dataset, DatasetType
 from amlb.utils import call_script_in_same_dir
 
 
@@ -10,6 +10,15 @@ def setup(*args, **kwargs):
 def run(dataset: Dataset, config: TaskConfig):
     from frameworks.shared.caller import run_in_venv
 
+    if hasattr(dataset, 'timestamp_column') is False:
+        dataset.timestamp_column = None
+    if hasattr(dataset, 'id_column') is False:
+        dataset.id_column = None
+    if hasattr(dataset, 'prediction_length') is False:
+        raise AttributeError("Unspecified `prediction_length`.")
+    if dataset.type is not DatasetType.timeseries:
+        raise ValueError("AutoGluonTS only supports timeseries.")
+
     data = dict(
         # train=dict(path=dataset.train.data_path('parquet')),
         # test=dict(path=dataset.test.data_path('parquet')),
@@ -20,9 +29,9 @@ def run(dataset: Dataset, config: TaskConfig):
             classes=dataset.target.values
         ),
         problem_type=dataset.type.name,  # AutoGluon problem_type is using same names as amlb.data.DatasetType
-        timestamp_column=dataset.timestamp_column if dataset.timestamp_column is not None else None,
-        id_column=dataset.id_column if dataset.id_column is not None else None,
-        prediction_length=dataset.prediction_length if dataset.prediction_length is not None else None
+        timestamp_column=dataset.timestamp_column,
+        id_column=dataset.id_column,
+        prediction_length=dataset.prediction_length
     )
 
     return run_in_venv(__file__, "exec.py",

From 758b92d25d38e0978f9f0ee6d23318e91f1bc666 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Tue, 20 Sep 2022 14:05:04 +0000
Subject: [PATCH 08/30] add comments for readability

---
 amlb/results.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/amlb/results.py b/amlb/results.py
index 186c5a6c7..814c204ff 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -271,6 +271,7 @@ def save_predictions(dataset: Dataset, output_file: str,
         :param probabilities_labels:
         :param target_is_encoded:
         :param preview:
+        :param qunatiles:
         :return: None
         """
         log.debug("Saving predictions to `%s`.", output_file)
@@ -313,14 +314,22 @@ def save_predictions(dataset: Dataset, output_file: str,
 
         df = df.assign(predictions=preds)
         df = df.assign(truth=truth)
-        if quantiles is not None:
-            quantiles.reset_index(drop=True, inplace=True)
-            df = pd.concat([df, quantiles], axis=1)
+
         if dataset.type == DatasetType.timeseries:
-            period_length = 1 # this period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this should be fine.
+            if quantiles is not None:
+                quantiles = quantiles.reset_index(drop=True)
+                df = pd.concat([df, quantiles], axis=1)
+
+            period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works.
+
+            # we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)|
+            # 1. retrieve item_ids for each sequence/item
             item_ids, inverse_item_ids = np.unique(dataset.test.X[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True)
+            # 2. capture sequences in a list
             y_past = [dataset.test.y.squeeze().to_numpy()[inverse_item_ids == i][:-dataset.prediction_length] for i in range(len(item_ids))]
+            # 3. calculate period error per sequence
             y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past]
+            # 4. repeat period error for each sequence, to save one for each element
             y_past_period_error_rep = np.repeat(y_past_period_error, dataset.prediction_length)
             df = df.assign(y_past_period_error=y_past_period_error_rep)
         if preview:

From 04872e7b96a6d25836a876da1d098e3d02adcd74 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Tue, 20 Sep 2022 14:05:52 +0000
Subject: [PATCH 09/30] setting defaults for timeseries task

---
 amlb/benchmark.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 43f64e9f9..4ed79d6a1 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -489,8 +489,17 @@ def load_data(self):
             # TODO
             raise NotImplementedError("OpenML datasets without task_id are not supported yet.")
         elif hasattr(self._task_def, 'dataset'):
+            if self._task_def.dataset['type'] == 'timeseries' and self._task_def.dataset['timestamp_column'] is None:
+                log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.")
+                self._task_def.dataset['timestamp_column'] = "timestamp"
             self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=self._task_def.dataset, fold=self.fold, timestamp_column=self._task_def.dataset['timestamp_column'])
             if self._dataset.type == DatasetType.timeseries:
+                if self._task_def.dataset['id_column'] is None:
+                    log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.")
+                    self._task_def.dataset['id_column'] = "item_id"
+                if self._task_def.dataset['prediction_length'] is None:
+                    log.warning("Warning: For timeseries task setting undefined prediction length to `1`.")
+                    self._task_def.dataset['prediction_length'] = "1"
                 self._dataset.timestamp_column=self._task_def.dataset['timestamp_column']
                 self._dataset.id_column=self._task_def.dataset['id_column']
                 self._dataset.prediction_length=self._task_def.dataset['prediction_length']

From 888a1cb8f44c4b647d8d4b885a155fbee3a37efb Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Tue, 20 Sep 2022 14:06:24 +0000
Subject: [PATCH 10/30] remove outer context manipulation

---
 amlb/datautils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/amlb/datautils.py b/amlb/datautils.py
index efc5b2c6d..7946fdc1c 100644
--- a/amlb/datautils.py
+++ b/amlb/datautils.py
@@ -37,9 +37,11 @@ def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dty
     :param header: if the columns header should be read.
     :param as_data_frame: if the result should be returned as a data frame (default) or a numpy array.
     :param dtype: data type for columns.
+    :param timestamp_column: column name for timestamp, to ensure dates are correctly parsed by pandas.
     :return: a DataFrame
     """
     if dtype is not None and timestamp_column is not None and timestamp_column in dtype:
+            dtype = dtype.copy() # to avoid outer context manipulation
             del dtype[timestamp_column]
 
     df = pd.read_csv(path,

From e15de3eb4c4b5be5cc27eb3b5789abe04f2c367b Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Tue, 20 Sep 2022 14:08:59 +0000
Subject: [PATCH 11/30] corrected spelling error for quantiles

---
 amlb/results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/amlb/results.py b/amlb/results.py
index 814c204ff..3f7320fa2 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -271,7 +271,7 @@ def save_predictions(dataset: Dataset, output_file: str,
         :param probabilities_labels:
         :param target_is_encoded:
         :param preview:
-        :param qunatiles:
+        :param quantiles:
         :return: None
         """
         log.debug("Saving predictions to `%s`.", output_file)

From 866492fddfa110828201d4e0b77cf9f324ce1a9b Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 21 Sep 2022 12:34:51 +0000
Subject: [PATCH 12/30] adding mape, correct available metrics

---
 amlb/results.py                | 46 ++++++++++------------------------
 frameworks/AutoGluonTS/exec.py |  4 ++-
 resources/benchmarks/ts.yaml   | 10 ++++----
 resources/config.yaml          |  2 +-
 4 files changed, 22 insertions(+), 40 deletions(-)

diff --git a/amlb/results.py b/amlb/results.py
index 3f7320fa2..91228ca4e 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -680,7 +680,7 @@ def r2(self):
         """R^2"""
         return float(r2_score(self.truth, self.predictions))
 
-class TimeSeriesResult(Result):
+class TimeSeriesResult(RegressionResult):
 
     def __init__(self, predictions_df, info=None):
         super().__init__(predictions_df, info)
@@ -697,36 +697,6 @@ def __init__(self, predictions_df, info=None):
         self.target = Feature(0, 'target', 'real', is_target=True)
         self.type = DatasetType.timeseries
 
-    @metric(higher_is_better=False)
-    def mae(self):
-        """Mean Absolute Error"""
-        return float(mean_absolute_error(self.truth, self.predictions))
-
-    @metric(higher_is_better=False)
-    def mse(self):
-        """Mean Squared Error"""
-        return float(mean_squared_error(self.truth, self.predictions))
-
-    @metric(higher_is_better=False)
-    def msle(self):
-        """Mean Squared Logarithmic Error"""
-        return float(mean_squared_log_error(self.truth, self.predictions))
-
-    @metric(higher_is_better=False)
-    def rmse(self):
-        """Root Mean Square Error"""
-        return math.sqrt(self.mse())
-
-    @metric(higher_is_better=False)
-    def rmsle(self):
-        """Root Mean Square Logarithmic Error"""
-        return math.sqrt(self.msle())
-
-    @metric(higher_is_better=True)
-    def r2(self):
-        """R^2"""
-        return float(r2_score(self.truth, self.predictions))
-
     @metric(higher_is_better=False)
     def mase(self):
         """Mean Absolute Scaled Error"""
@@ -742,14 +712,24 @@ def smape(self):
         denom[denom == 0] = math.inf
         return np.mean(num / denom)
 
+    @metric(higher_is_better=False)
+    def mape(self):
+        """Mean Absolute Percentage Error"""
+        num = np.abs(self.truth - self.predictions)
+        denom = np.abs(self.truth)
+        # If the denominator is 0, we set it to float('inf') such that any division yields 0 (this
+        # might not be fully mathematically correct, but at least we don't get NaNs)
+        denom[denom == 0] = math.inf
+        return np.mean(num / denom)
+
     @metric(higher_is_better=False)
     def nrmse(self):
         """Normalized Root Mean Square Error"""
         return self.rmse() / np.mean(np.abs(self.truth))
 
     @metric(higher_is_better=False)
-    def nd(self):
-        """nd = ?"""
+    def wape(self):
+        """Weighted Average Percentage Error"""
         return np.sum(np.abs(self.truth - self.predictions)) / np.sum(np.abs(self.truth))
 
     @metric(higher_is_better=False)
diff --git a/frameworks/AutoGluonTS/exec.py b/frameworks/AutoGluonTS/exec.py
index 4720ac26a..87e7f44f3 100644
--- a/frameworks/AutoGluonTS/exec.py
+++ b/frameworks/AutoGluonTS/exec.py
@@ -118,9 +118,11 @@ def load_data(train_path, test_path, timestamp_column, id_column):
 def get_eval_metric(config):
     # TODO: Support more metrics
     metrics_mapping = dict(
+        mape="MAPE",
+        smape="sMAPE",
+        mase="MASE",
         mse="MSE",
         rmse="RMSE",
-        mase="MASE",
     )
 
     eval_metric = metrics_mapping[config.metric] if config.metric in metrics_mapping else None
diff --git a/resources/benchmarks/ts.yaml b/resources/benchmarks/ts.yaml
index 04cb86bb9..1b4850a44 100644
--- a/resources/benchmarks/ts.yaml
+++ b/resources/benchmarks/ts.yaml
@@ -2,12 +2,12 @@
 
 - name: covid
   dataset:
-    train: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
-    test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
-    target: ConfirmedCases
+    train: s3://autogluon-ts-bench/data/covid_deaths/csv/train.csv # /tmp/gluonts/train_df.csv # https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
+    test: s3://autogluon-ts-bench/data/covid_deaths/csv/test.csv # /tmp/gluonts/test_df.csv # https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
+    #target: target #target ConfirmedCases
     type: timeseries
     prediction_length: 30
-    id_column: name
-    timestamp_column: Date
+    #id_column: item_id # item_id name
+    #timestamp_column: timestamp # timestamp Date
 
   folds: 1
diff --git a/resources/config.yaml b/resources/config.yaml
index 835758c76..0e237584e 100644
--- a/resources/config.yaml
+++ b/resources/config.yaml
@@ -54,7 +54,7 @@ benchmarks:                     # configuration namespace for the benchmarks def
     binary: ['auc', 'logloss', 'acc', 'balacc']     # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error).
     multiclass: ['logloss', 'acc', 'balacc']        # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average.
     regression: ['rmse', 'r2', 'mae']               # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2).
-    timeseries: ['mase', 'smape', 'nrmse', 'nd', 'ncrps', 'rmse']
+    timeseries: ['mase', 'mape', 'smape', 'rmse', 'mse', 'nrmse', 'wape', 'ncrps']
   defaults:            # the default constraints, usually overridden by a constraint.
     folds: 10          # the amount of fold-runs executed for each dataset.
     max_runtime_seconds: 3600   # default time allocated to the framework to train a model.

From 9252835982b58cb6fdaf23b1c2de4cdbf18c1050 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 21 Sep 2022 12:39:53 +0000
Subject: [PATCH 13/30] beautify config options

---
 resources/benchmarks/ts.yaml | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/resources/benchmarks/ts.yaml b/resources/benchmarks/ts.yaml
index 1b4850a44..463f243c9 100644
--- a/resources/benchmarks/ts.yaml
+++ b/resources/benchmarks/ts.yaml
@@ -2,12 +2,14 @@
 
 - name: covid
   dataset:
-    train: s3://autogluon-ts-bench/data/covid_deaths/csv/train.csv # /tmp/gluonts/train_df.csv # https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
-    test: s3://autogluon-ts-bench/data/covid_deaths/csv/test.csv # /tmp/gluonts/test_df.csv # https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
-    #target: target #target ConfirmedCases
+    train: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
+    # s3://autogluon-ts-bench/data/covid_deaths/csv/train.csv | https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
+    test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
+    # s3://autogluon-ts-bench/data/covid_deaths/csv/test.csv  |  https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
+    target: target              # target    | ConfirmedCases
     type: timeseries
     prediction_length: 30
-    #id_column: item_id # item_id name
-    #timestamp_column: timestamp # timestamp Date
+    id_column: item_id          # item_id   | name
+    timestamp_column: timestamp # timestamp | Date
 
   folds: 1

From 18cc6aff7873e69e0887946561933c71235e0066 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 21 Sep 2022 12:49:37 +0000
Subject: [PATCH 14/30] fixed config for public access

---
 resources/benchmarks/ts.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/resources/benchmarks/ts.yaml b/resources/benchmarks/ts.yaml
index 463f243c9..0a73c81fb 100644
--- a/resources/benchmarks/ts.yaml
+++ b/resources/benchmarks/ts.yaml
@@ -6,10 +6,10 @@
     # s3://autogluon-ts-bench/data/covid_deaths/csv/train.csv | https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
     test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
     # s3://autogluon-ts-bench/data/covid_deaths/csv/test.csv  |  https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
-    target: target              # target    | ConfirmedCases
+    target: ConfirmedCases              # target    | ConfirmedCases
     type: timeseries
     prediction_length: 30
-    id_column: item_id          # item_id   | name
-    timestamp_column: timestamp # timestamp | Date
+    id_column: name          # item_id   | name
+    timestamp_column: Date # timestamp | Date
 
   folds: 1

From 3e8945a78852b8b2a10d7bf091fb728697e25db3 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Fri, 23 Sep 2022 09:56:17 +0000
Subject: [PATCH 15/30] no outer context manipulation, add dataset subdir

---
 amlb/benchmark.py                  |  4 +++-
 amlb/datasets/file.py              | 19 +++++++++++--------
 frameworks/AutoGluonTS/__init__.py |  3 ++-
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 4ed79d6a1..16b5a671d 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -492,7 +492,9 @@ def load_data(self):
             if self._task_def.dataset['type'] == 'timeseries' and self._task_def.dataset['timestamp_column'] is None:
                 log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.")
                 self._task_def.dataset['timestamp_column'] = "timestamp"
-            self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=self._task_def.dataset, fold=self.fold, timestamp_column=self._task_def.dataset['timestamp_column'])
+            dataset_name_and_config = copy(self._task_def.dataset)
+            dataset_name_and_config.name = self._task_def.name
+            self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=dataset_name_and_config, fold=self.fold)
             if self._dataset.type == DatasetType.timeseries:
                 if self._task_def.dataset['id_column'] is None:
                     log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.")
diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py
index 0bfa9453b..7c1080de8 100644
--- a/amlb/datasets/file.py
+++ b/amlb/datasets/file.py
@@ -30,10 +30,10 @@ def __init__(self, cache_dir=None):
         self._cache_dir = cache_dir if cache_dir else tempfile.mkdtemp(prefix='amlb_cache')
 
     @profile(logger=log)
-    def load(self, dataset, fold=0, timestamp_column=None):
+    def load(self, dataset, fold=0):
         dataset = dataset if isinstance(dataset, ns) else ns(path=dataset)
         log.debug("Loading dataset %s", dataset)
-        paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold)
+        paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold, name=dataset['name'] if 'name' in dataset else None)
         assert fold < len(paths['train']), f"No training dataset available for fold {fold} among dataset files {paths['train']}"
         # seed = rget().seed(fold)
         # if len(paths['test']) == 0:
@@ -51,21 +51,21 @@ def load(self, dataset, fold=0, timestamp_column=None):
         if ext == '.arff':
             return ArffDataset(train_path, test_path, target=target, features=features, type=type_)
         elif ext == '.csv':
-            return CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=timestamp_column)
+            return CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=dataset['timestamp_column'] if 'timestamp_column' in dataset else None)
         else:
             raise ValueError(f"Unsupported file type: {ext}")
 
-    def _extract_train_test_paths(self, dataset, fold=None):
+    def _extract_train_test_paths(self, dataset, fold=None, name=None):
         if isinstance(dataset, (tuple, list)):
             assert len(dataset) % 2 == 0, "dataset list must contain an even number of paths: [train_0, test_0, train_1, test_1, ...]."
             return self._extract_train_test_paths(ns(train=[p for i, p in enumerate(dataset) if i % 2 == 0],
                                                      test=[p for i, p in enumerate(dataset) if i % 2 == 1]),
-                                                  fold=fold)
+                                                  fold=fold, name=name)
         elif isinstance(dataset, ns):
-            return dict(train=[self._extract_train_test_paths(p)['train'][0]
+            return dict(train=[self._extract_train_test_paths(p, name=name)['train'][0]
                                if i == fold else None
                                for i, p in enumerate(as_list(dataset.train))],
-                        test=[self._extract_train_test_paths(p)['train'][0]
+                        test=[self._extract_train_test_paths(p, name=name)['train'][0]
                               if i == fold else None
                               for i, p in enumerate(as_list(dataset.test))])
         else:
@@ -116,7 +116,10 @@ def _extract_train_test_paths(self, dataset, fold=None):
                 assert len(paths) > 0, f"No dataset file found in {dataset}: they should follow the naming xxxx_train.ext, xxxx_test.ext or xxxx_train_0.ext, xxxx_test_0.ext, xxxx_train_1.ext, ..."
                 return paths
         elif is_valid_url(dataset):
-            cached_file = os.path.join(self._cache_dir, os.path.basename(dataset))
+            if name is None:
+                cached_file = os.path.join(self._cache_dir, os.path.basename(dataset))
+            else:
+                cached_file = os.path.join(self._cache_dir, name, os.path.basename(dataset))
             if not os.path.exists(cached_file):  # don't download if previously done
                 handler = get_file_handler(dataset)
                 assert handler.exists(dataset), f"Invalid path/url: {dataset}"
diff --git a/frameworks/AutoGluonTS/__init__.py b/frameworks/AutoGluonTS/__init__.py
index 70283c3e5..5be567305 100644
--- a/frameworks/AutoGluonTS/__init__.py
+++ b/frameworks/AutoGluonTS/__init__.py
@@ -1,7 +1,7 @@
 from amlb.benchmark import TaskConfig
 from amlb.data import Dataset, DatasetType
 from amlb.utils import call_script_in_same_dir
-
+from copy import deepcopy
 
 def setup(*args, **kwargs):
     call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs)
@@ -10,6 +10,7 @@ def setup(*args, **kwargs):
 def run(dataset: Dataset, config: TaskConfig):
     from frameworks.shared.caller import run_in_venv
 
+    dataset = deepcopy(dataset)
     if hasattr(dataset, 'timestamp_column') is False:
         dataset.timestamp_column = None
     if hasattr(dataset, 'id_column') is False:

From 4ca2118793a59001f1652daa40b6b730da6a1d88 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Fri, 23 Sep 2022 09:58:55 +0000
Subject: [PATCH 16/30] add more datasets

---
 resources/benchmarks/ts.yaml | 98 +++++++++++++++++++++++++++++++++---
 1 file changed, 90 insertions(+), 8 deletions(-)

diff --git a/resources/benchmarks/ts.yaml b/resources/benchmarks/ts.yaml
index 0a73c81fb..b800e8bf8 100644
--- a/resources/benchmarks/ts.yaml
+++ b/resources/benchmarks/ts.yaml
@@ -1,15 +1,97 @@
 ---
 
-- name: covid
+- name: covid_deaths
   dataset:
-    train: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
-    # s3://autogluon-ts-bench/data/covid_deaths/csv/train.csv | https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
-    test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
-    # s3://autogluon-ts-bench/data/covid_deaths/csv/test.csv  |  https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
-    target: ConfirmedCases              # target    | ConfirmedCases
+    train: s3://autogluon-ts-bench/data/covid_deaths/csv/train.csv
+    test: s3://autogluon-ts-bench/data/covid_deaths/csv/test.csv
     type: timeseries
     prediction_length: 30
-    id_column: name          # item_id   | name
-    timestamp_column: Date # timestamp | Date
+  folds: 1
+
+- name: hospital
+  dataset:
+    train: s3://autogluon-ts-bench/data/hospital/csv/train.csv
+    test: s3://autogluon-ts-bench/data/hospital/csv/test.csv
+    type: timeseries
+    prediction_length: 12
+  folds: 1
+
+- name: kdd_2018
+  dataset:
+    train: s3://autogluon-ts-bench/data/kdd_2018/csv/train.csv
+    test: s3://autogluon-ts-bench/data/kdd_2018/csv/test.csv
+    type: timeseries
+    prediction_length: 48
+  folds: 1
+
+- name: m3_monthly
+  dataset:
+    train: s3://autogluon-ts-bench/data/m3_monthly/csv/train.csv
+    test: s3://autogluon-ts-bench/data/m3_monthly/csv/test.csv
+    type: timeseries
+    prediction_length: 18
+  folds: 1
+
+- name: m3_other
+  dataset:
+    train: s3://autogluon-ts-bench/data/m3_other/csv/train.csv
+    test: s3://autogluon-ts-bench/data/m3_other/csv/test.csv
+    type: timeseries
+    prediction_length: 8
+  folds: 1
 
+- name: m3_quarterly
+  dataset:
+    train: s3://autogluon-ts-bench/data/m3_quarterly/csv/train.csv
+    test: s3://autogluon-ts-bench/data/m3_quarterly/csv/test.csv
+    type: timeseries
+    prediction_length: 8
+  folds: 1
+
+- name: m4_hourly
+  dataset:
+    train: s3://autogluon-ts-bench/data/m4_hourly/csv/train.csv
+    test: s3://autogluon-ts-bench/data/m4_hourly/csv/test.csv
+    type: timeseries
+    prediction_length: 48
+  folds: 1
+
+- name: m4_weekly
+  dataset:
+    train: s3://autogluon-ts-bench/data/m4_weekly/csv/train.csv
+    test: s3://autogluon-ts-bench/data/m4_weekly/csv/test.csv
+    type: timeseries
+    prediction_length: 13
+  folds: 1
+
+- name: nn5
+  dataset:
+    train: s3://autogluon-ts-bench/data/nn5/csv/train.csv
+    test: s3://autogluon-ts-bench/data/nn5/csv/test.csv
+    type: timeseries
+    prediction_length: 56
+  folds: 1
+
+- name: solar
+  dataset:
+    train: s3://autogluon-ts-bench/data/solar/csv/train.csv
+    test: s3://autogluon-ts-bench/data/solar/csv/test.csv
+    type: timeseries
+    prediction_length: 24
+  folds: 1
+
+- name: tourism_monthly
+  dataset:
+    train: s3://autogluon-ts-bench/data/tourism_monthly/csv/train.csv
+    test: s3://autogluon-ts-bench/data/tourism_monthly/csv/test.csv
+    type: timeseries
+    prediction_length: 24
+  folds: 1
+
+- name: tourism_quarterly
+  dataset:
+    train: s3://autogluon-ts-bench/data/tourism_quarterly/csv/train.csv
+    test: s3://autogluon-ts-bench/data/tourism_quarterly/csv/test.csv
+    type: timeseries
+    prediction_length: 8
   folds: 1

From f7f21fcb96fe964b341c6a3250313da337b53038 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Mon, 26 Sep 2022 10:13:42 +0000
Subject: [PATCH 17/30] include error raising for too large pred. length.

---
 amlb/benchmark.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 16b5a671d..43997513f 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -505,6 +505,20 @@ def load_data(self):
                 self._dataset.timestamp_column=self._task_def.dataset['timestamp_column']
                 self._dataset.id_column=self._task_def.dataset['id_column']
                 self._dataset.prediction_length=self._task_def.dataset['prediction_length']
+
+                train_seqs_lengths = self._dataset.train.X.groupby(self._dataset.id_column).count()
+                test_seqs_lengths = self._dataset.test.X.groupby(self._dataset.id_column).count()
+                prediction_length_max_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean())
+                prediction_length_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1
+                if not self._dataset.prediction_length == prediction_length_max_diff_train_test:
+                    log.warning("Warning: Prediction length {}, does not equal difference between test and train sequence lengths {}.".format(self._dataset.prediction_length, prediction_length_max_diff_train_test))
+                if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.:
+                    raise ValueError("Error: Not all sequences of train and test set have same sequence length difference.")
+                if self._dataset.prediction_length > prediction_length_max_diff_train_test:
+                    raise ValueError("Error: Prediction length {} longer than at least one difference between train and test sequence length.")
+                if self._dataset.prediction_length > prediction_length_max_min_train_test:
+                    raise ValueError("Error: Prediction length {} longer than minimum sequence length + 1.".format())
+
         else:
             raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].")
 

From fb429c65067c636d4a2e65146fd681e90ffbf910 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 5 Oct 2022 08:36:27 +0000
Subject: [PATCH 18/30] merging AutoGluonTS framework folder into AutoGluon

---
 frameworks/AutoGluon/README.md                | 16 +++++
 frameworks/AutoGluon/__init__.py              | 58 ++++++++++++++-----
 .../exec.py => AutoGluon/exec_ts.py}          |  1 +
 frameworks/AutoGluon/setup.sh                 |  9 ++-
 frameworks/AutoGluonTS/README.md              | 36 ------------
 frameworks/AutoGluonTS/__init__.py            | 39 -------------
 frameworks/AutoGluonTS/setup.sh               | 36 ------------
 resources/frameworks.yaml                     | 17 +++---
 8 files changed, 78 insertions(+), 134 deletions(-)
 create mode 100644 frameworks/AutoGluon/README.md
 rename frameworks/{AutoGluonTS/exec.py => AutoGluon/exec_ts.py} (99%)
 delete mode 100644 frameworks/AutoGluonTS/README.md
 delete mode 100644 frameworks/AutoGluonTS/__init__.py
 delete mode 100755 frameworks/AutoGluonTS/setup.sh

diff --git a/frameworks/AutoGluon/README.md b/frameworks/AutoGluon/README.md
new file mode 100644
index 000000000..51286533e
--- /dev/null
+++ b/frameworks/AutoGluon/README.md
@@ -0,0 +1,16 @@
+# AutoGluon
+
+To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...```
+
+To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluon:latest ...```
+
+
+# AutoGluonTS
+
+AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems.
+
+## Run Steps
+
+To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts timeseries ...```
+
+To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest timeseries ...```
diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py
index be2c15147..bee3b99a2 100644
--- a/frameworks/AutoGluon/__init__.py
+++ b/frameworks/AutoGluon/__init__.py
@@ -1,25 +1,53 @@
-from amlb.benchmark import TaskConfig
-from amlb.data import Dataset
+
 from amlb.utils import call_script_in_same_dir
+from amlb.benchmark import TaskConfig
+from amlb.data import Dataset, DatasetType
+from copy import deepcopy
 
 
 def setup(*args, **kwargs):
     call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs)
 
-
 def run(dataset: Dataset, config: TaskConfig):
     from frameworks.shared.caller import run_in_venv
 
-    data = dict(
-        train=dict(path=dataset.train.data_path('parquet')),
-        test=dict(path=dataset.test.data_path('parquet')),
-        target=dict(
-            name=dataset.target.name,
-            classes=dataset.target.values
-        ),
-        problem_type=dataset.type.name  # AutoGluon problem_type is using same names as amlb.data.DatasetType
-    )
-
-    return run_in_venv(__file__, "exec.py",
-                       input_data=data, dataset=dataset, config=config)
+    if dataset.type is not DatasetType.timeseries:
 
+        data = dict(
+            train=dict(path=dataset.train.data_path('parquet')),
+            test=dict(path=dataset.test.data_path('parquet')),
+            target=dict(
+                name=dataset.target.name,
+                classes=dataset.target.values
+            ),
+            problem_type=dataset.type.name  # AutoGluon problem_type is using same names as amlb.data.DatasetType
+        )
+        exec_file = "exec.py"
+
+    else:
+        dataset = deepcopy(dataset)
+        if not hasattr(dataset, 'timestamp_column'):
+            dataset.timestamp_column = None
+        if not hasattr(dataset, 'id_column'):
+            dataset.id_column = None
+        if not hasattr(dataset, 'prediction_length'):
+            raise AttributeError("Unspecified `prediction_length`.")
+
+        data = dict(
+            # train=dict(path=dataset.train.data_path('parquet')),
+            # test=dict(path=dataset.test.data_path('parquet')),
+            train=dict(path=dataset.train.path),
+            test=dict(path=dataset.test.path),
+            target=dict(
+                name=dataset.target.name,
+                classes=dataset.target.values
+            ),
+            problem_type=dataset.type.name,  # AutoGluon problem_type is using same names as amlb.data.DatasetType
+            timestamp_column=dataset.timestamp_column,
+            id_column=dataset.id_column,
+            prediction_length=dataset.prediction_length
+        )
+        exec_file = "exec_ts.py"
+
+    return run_in_venv(__file__, exec_file,
+                       input_data=data, dataset=dataset, config=config)
diff --git a/frameworks/AutoGluonTS/exec.py b/frameworks/AutoGluon/exec_ts.py
similarity index 99%
rename from frameworks/AutoGluonTS/exec.py
rename to frameworks/AutoGluon/exec_ts.py
index 87e7f44f3..85593932d 100644
--- a/frameworks/AutoGluonTS/exec.py
+++ b/frameworks/AutoGluon/exec_ts.py
@@ -32,6 +32,7 @@ def run(dataset, config):
     eval_metric = get_eval_metric(config)
     label = dataset.target.name
     time_limit = config.max_runtime_seconds
+    time_limit = 10.
 
     training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
 
diff --git a/frameworks/AutoGluon/setup.sh b/frameworks/AutoGluon/setup.sh
index 6ef50ed8c..7cbccbee9 100755
--- a/frameworks/AutoGluon/setup.sh
+++ b/frameworks/AutoGluon/setup.sh
@@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+
 HERE=$(dirname "$0")
 VERSION=${1:-"stable"}
 REPO=${2:-"https://github.com/awslabs/autogluon.git"}
@@ -36,4 +37,10 @@ else
     PIP install -e tabular/[skex]
 fi
 
-PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
+if [[ ${MODULE} == "timeseries" ]]; then
+    # Install mxnet first (same order as the old AutoGluonTS setup). TODO: GPU version install
+    PIP install "mxnet<2.0"
+    PY -c "from autogluon.timeseries.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
+else
+    PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
+fi
diff --git a/frameworks/AutoGluonTS/README.md b/frameworks/AutoGluonTS/README.md
deleted file mode 100644
index 627f8231c..000000000
--- a/frameworks/AutoGluonTS/README.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# AutoGluonTS
-
-AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems.
-
-This code is currently a prototype, since time series support is not fully defined in AutoMLBenchmark yet.
-Consider the code a proof of concept.
-
-## Run Steps
-
-To run AutoGluonTS in AutoMLBenchmark on the covid dataset from the AutoGluon tutorial, do the following:
-
-1. Create a fresh Python environment
-2. Follow automlbenchmark install instructions
-3. Run the following command in terminal: ```python3 ../automlbenchmark/runbenchmark.py autogluonts ts test```
-4. Done.
-
-To run mainline AutoGluonTS instead of v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest ts test```
-
-## TODO
-
-### FIXME: Why does leaderboard claim a different test score than AutoMLBenchmark for RMSE?
-### FIXME: Currently ignoring test_path, just using train data for evaluation
-### TODO: How to evaluate more complex metrics like MAPE?
-### How to pass timestamp_column?
-### How to pass id_column?
-### How to pass prediction_length?
-
-
-
-
-
-
-
-
-
-
diff --git a/frameworks/AutoGluonTS/__init__.py b/frameworks/AutoGluonTS/__init__.py
deleted file mode 100644
index 5be567305..000000000
--- a/frameworks/AutoGluonTS/__init__.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from amlb.benchmark import TaskConfig
-from amlb.data import Dataset, DatasetType
-from amlb.utils import call_script_in_same_dir
-from copy import deepcopy
-
-def setup(*args, **kwargs):
-    call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs)
-
-
-def run(dataset: Dataset, config: TaskConfig):
-    from frameworks.shared.caller import run_in_venv
-
-    dataset = deepcopy(dataset)
-    if hasattr(dataset, 'timestamp_column') is False:
-        dataset.timestamp_column = None
-    if hasattr(dataset, 'id_column') is False:
-        dataset.id_column = None
-    if hasattr(dataset, 'prediction_length') is False:
-        raise AttributeError("Unspecified `prediction_length`.")
-    if dataset.type is not DatasetType.timeseries:
-        raise ValueError("AutoGluonTS only supports timeseries.")
-
-    data = dict(
-        # train=dict(path=dataset.train.data_path('parquet')),
-        # test=dict(path=dataset.test.data_path('parquet')),
-        train=dict(path=dataset.train.path),
-        test=dict(path=dataset.test.path),
-        target=dict(
-            name=dataset.target.name,
-            classes=dataset.target.values
-        ),
-        problem_type=dataset.type.name,  # AutoGluon problem_type is using same names as amlb.data.DatasetType
-        timestamp_column=dataset.timestamp_column,
-        id_column=dataset.id_column,
-        prediction_length=dataset.prediction_length
-    )
-
-    return run_in_venv(__file__, "exec.py",
-                       input_data=data, dataset=dataset, config=config)
diff --git a/frameworks/AutoGluonTS/setup.sh b/frameworks/AutoGluonTS/setup.sh
deleted file mode 100755
index d9fc7e8da..000000000
--- a/frameworks/AutoGluonTS/setup.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/usr/bin/env bash
-HERE=$(dirname "$0")
-VERSION=${1:-"stable"}
-REPO=${2:-"https://github.com/awslabs/autogluon.git"}
-PKG=${3:-"autogluon"}
-if [[ "$VERSION" == "latest" ]]; then
-    VERSION="master"
-fi
-
-# creating local venv
-. ${HERE}/../shared/setup.sh ${HERE} true
-
-PIP install --upgrade pip
-PIP install --upgrade setuptools wheel
-
-if [[ "$VERSION" == "stable" ]]; then
-    PIP install --no-cache-dir -U "${PKG}"
-    PIP install --no-cache-dir -U "${PKG}.tabular[skex]"
-elif [[ "$VERSION" =~ ^[0-9] ]]; then
-    PIP install --no-cache-dir -U "${PKG}==${VERSION}"
-    PIP install --no-cache-dir -U "${PKG}.tabular[skex]==${VERSION}"
-else
-    TARGET_DIR="${HERE}/lib/${PKG}"
-    rm -Rf ${TARGET_DIR}
-    git clone --depth 1 --single-branch --branch ${VERSION} --recurse-submodules ${REPO} ${TARGET_DIR}
-    cd ${TARGET_DIR}
-    PY_EXEC_NO_ARGS="$(cut -d' ' -f1 <<<"$py_exec")"
-    PY_EXEC_DIR=$(dirname "$PY_EXEC_NO_ARGS")
-    env PATH="$PY_EXEC_DIR:$PATH" bash -c ./full_install.sh
-    PIP install -e tabular/[skex]
-fi
-
-# TODO: GPU version install
-PIP install "mxnet<2.0"
-
-PY -c "from autogluon.timeseries.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
diff --git a/resources/frameworks.yaml b/resources/frameworks.yaml
index eb59c44bf..4358b7515 100644
--- a/resources/frameworks.yaml
+++ b/resources/frameworks.yaml
@@ -86,9 +86,9 @@ autoxgboost:
 flaml:
   version: 'stable'
   description: |
-    FLAML is a lightweight Python library that finds accurate machine learning models 
-    automatically, efficiently and economically. It frees users from selecting learners 
-    and hyperparameters for each learner. It is fast and cheap. 
+    FLAML is a lightweight Python library that finds accurate machine learning models
+    automatically, efficiently and economically. It frees users from selecting learners
+    and hyperparameters for each learner. It is fast and cheap.
   project: https://github.com/microsoft/FLAML
   refs: [https://arxiv.org/pdf/1911.04706.pdf]
 
@@ -139,12 +139,12 @@ mljarsupervised_compete:
   description: "MLJAR is using 'Compete' mode to provide the most accurate predictor"
   params:
     mode: Compete   # set mode for Compete, default mode is Explain
-    
+
 MLNet:
   version: 'latest'
   description: |
     MLNET.CLI is a automated machine learning tool implemented by ml.net.
-    
+
 MLPlan:
   version: 'stable'
   abstract: true
@@ -196,10 +196,14 @@ TPOT:
 ####################################
 
 AutoGluonTS:
+  extends: AutoGluon
   version: "stable"
   description: |
     AutoGluon-TimeSeries
-  project: https://auto.gluon.ai
+  setup_env:
+    MODULE: timeseries
+  params:
+    presets: good_quality
 
 #######################################
 ### Non AutoML reference frameworks ###
@@ -242,4 +246,3 @@ TunedRandomForest:
 #    _n_jobs: 1  # cf. RandomForest
 #    _tuning:
 #      n_estimators: 500
-

From 23d057a86b0ae213c21b8f7f20c8c57d3519dd11 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 5 Oct 2022 08:38:42 +0000
Subject: [PATCH 19/30] renaming ts.yaml to timeseries.yaml, plus ext.

---
 resources/benchmarks/{ts.yaml => timeseries.yaml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename resources/benchmarks/{ts.yaml => timeseries.yaml} (100%)

diff --git a/resources/benchmarks/ts.yaml b/resources/benchmarks/timeseries.yaml
similarity index 100%
rename from resources/benchmarks/ts.yaml
rename to resources/benchmarks/timeseries.yaml

From 1396d2007baedffa9db158bbe29d8801081303e9 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 5 Oct 2022 09:35:28 +0000
Subject: [PATCH 20/30] removing presets, correct latest config for AGTS

---
 resources/frameworks.yaml        | 2 --
 resources/frameworks_latest.yaml | 5 +++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/resources/frameworks.yaml b/resources/frameworks.yaml
index 4358b7515..513c99586 100644
--- a/resources/frameworks.yaml
+++ b/resources/frameworks.yaml
@@ -202,8 +202,6 @@ AutoGluonTS:
     AutoGluon-TimeSeries
   setup_env:
     MODULE: timeseries
-  params:
-    presets: good_quality
 
 #######################################
 ### Non AutoML reference frameworks ###
diff --git a/resources/frameworks_latest.yaml b/resources/frameworks_latest.yaml
index b23bf72b0..becdc4e3e 100644
--- a/resources/frameworks_latest.yaml
+++ b/resources/frameworks_latest.yaml
@@ -85,10 +85,12 @@ TPOT:
 ####################################
 
 AutoGluonTS:
+  extends: AutoGluon
   version: "latest"
   description: |
     AutoGluon-TimeSeries
-  project: https://auto.gluon.ai
+  setup_env:
+    MODULE: timeseries
 
 #######################################
 ### Non AutoML reference frameworks ###
@@ -111,4 +113,3 @@ TunedRandomForest:
   version: 'latest'
   params:
     n_estimators: 2000
-

From 8332960ea36ba2ad5a5367c909d4041001017759 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 5 Oct 2022 09:45:56 +0000
Subject: [PATCH 21/30] move dataset timeseries ext to datasets/file.py

---
 amlb/benchmark.py     | 26 --------------------------
 amlb/datasets/file.py | 39 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 43997513f..39458efe5 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -489,35 +489,9 @@ def load_data(self):
             # TODO
             raise NotImplementedError("OpenML datasets without task_id are not supported yet.")
         elif hasattr(self._task_def, 'dataset'):
-            if self._task_def.dataset['type'] == 'timeseries' and self._task_def.dataset['timestamp_column'] is None:
-                log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.")
-                self._task_def.dataset['timestamp_column'] = "timestamp"
             dataset_name_and_config = copy(self._task_def.dataset)
             dataset_name_and_config.name = self._task_def.name
             self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=dataset_name_and_config, fold=self.fold)
-            if self._dataset.type == DatasetType.timeseries:
-                if self._task_def.dataset['id_column'] is None:
-                    log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.")
-                    self._task_def.dataset['id_column'] = "item_id"
-                if self._task_def.dataset['prediction_length'] is None:
-                    log.warning("Warning: For timeseries task setting undefined prediction length to `1`.")
-                    self._task_def.dataset['prediction_length'] = "1"
-                self._dataset.timestamp_column=self._task_def.dataset['timestamp_column']
-                self._dataset.id_column=self._task_def.dataset['id_column']
-                self._dataset.prediction_length=self._task_def.dataset['prediction_length']
-
-                train_seqs_lengths = self._dataset.train.X.groupby(self._dataset.id_column).count()
-                test_seqs_lengths = self._dataset.test.X.groupby(self._dataset.id_column).count()
-                prediction_length_max_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean())
-                prediction_length_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1
-                if not self._dataset.prediction_length == prediction_length_max_diff_train_test:
-                    log.warning("Warning: Prediction length {}, does not equal difference between test and train sequence lengths {}.".format(self._dataset.prediction_length, prediction_length_max_diff_train_test))
-                if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.:
-                    raise ValueError("Error: Not all sequences of train and test set have same sequence length difference.")
-                if self._dataset.prediction_length > prediction_length_max_diff_train_test:
-                    raise ValueError("Error: Prediction length {} longer than at least one difference between train and test sequence length.")
-                if self._dataset.prediction_length > prediction_length_max_min_train_test:
-                    raise ValueError("Error: Prediction length {} longer than minimum sequence length + 1.".format())
 
         else:
             raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].")
diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py
index 7c1080de8..c7154a76b 100644
--- a/amlb/datasets/file.py
+++ b/amlb/datasets/file.py
@@ -51,7 +51,16 @@ def load(self, dataset, fold=0):
         if ext == '.arff':
             return ArffDataset(train_path, test_path, target=target, features=features, type=type_)
         elif ext == '.csv':
-            return CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=dataset['timestamp_column'] if 'timestamp_column' in dataset else None)
+            if DatasetType[dataset['type']] == DatasetType.timeseries and dataset['timestamp_column'] is None:
+                log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.")
+                dataset['timestamp_column'] = "timestamp"
+
+            csv_dataset = CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=dataset['timestamp_column'] if 'timestamp_column' in dataset else None)
+
+            if csv_dataset.type == DatasetType.timeseries:
+                csv_dataset = self.extend_dataset_with_timeseries_config(csv_dataset, dataset)
+
+            return csv_dataset
         else:
             raise ValueError(f"Unsupported file type: {ext}")
 
@@ -132,6 +141,34 @@ def __repr__(self):
         return repr_def(self)
 
 
+    def extend_dataset_with_timeseries_config(self, dataset, dataset_config):  # copies id/timestamp/prediction_length from the config onto `dataset`, validates them against the train/test splits, returns `dataset`
+        if dataset_config['id_column'] is None:
+            log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.")
+            dataset_config['id_column'] = "item_id"
+        if dataset_config['prediction_length'] is None:
+            log.warning("Warning: For timeseries task setting undefined prediction length to `1`.")
+            dataset_config['prediction_length'] = 1  # int (not "1"): compared with integer sequence lengths below
+
+        dataset.timestamp_column=dataset_config['timestamp_column']
+        dataset.id_column=dataset_config['id_column']
+        dataset.prediction_length=dataset_config['prediction_length']
+
+        train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).count()
+        test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).count()
+        prediction_length_max_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean())
+        prediction_length_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1
+        if not dataset.prediction_length == prediction_length_max_diff_train_test:
+            log.warning("Warning: Prediction length {}, does not equal difference between test and train sequence lengths {}.".format(dataset.prediction_length, prediction_length_max_diff_train_test))
+        if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.:
+            raise ValueError("Error: Not all sequences of train and test set have same sequence length difference.")
+        if dataset.prediction_length > prediction_length_max_diff_train_test:
+            raise ValueError("Error: Prediction length {} longer than at least one difference between train and test sequence length.".format(dataset.prediction_length))
+        if dataset.prediction_length > prediction_length_max_min_train_test:
+            raise ValueError("Error: Prediction length {} longer than minimum sequence length + 1.".format(dataset.prediction_length))  # a bare .format() here raised IndexError instead of this ValueError
+        return dataset
+
+
+
 class FileDataset(Dataset):
 
     def __init__(self, train: Datasplit, test: Datasplit,

From d41f6328965a1ca2a8e4b210a12417b6e28c7fcf Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 5 Oct 2022 09:54:31 +0000
Subject: [PATCH 22/30] dont bypass test mode

---
 amlb/results.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/amlb/results.py b/amlb/results.py
index 91228ca4e..2af80dca4 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -228,12 +228,13 @@ def load_predictions(predictions_file):
             try:
                 df = read_csv(predictions_file, dtype=object)
                 log.debug("Predictions preview:\n %s\n", df.head(10).to_string())
+
+                if rconfig().test_mode:
+                    TaskResult.validate_predictions(df)
+
                 if  'y_past_period_error' in df.columns:
                     return TimeSeriesResult(df)
                 else:
-                    if rconfig().test_mode:
-                        TaskResult.validate_predictions(df)
-
                     if df.shape[1] > 2:
                         return ClassificationResult(df)
                     else:

From 3935e9e688913737b001047693f0d92d923d44eb Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 5 Oct 2022 16:21:50 +0000
Subject: [PATCH 23/30] move quantiles and y_past_period_error to opt_cols

---
 amlb/results.py                 | 26 ++++++--------------------
 frameworks/AutoGluon/exec_ts.py | 21 +++++++++++++++++++--
 frameworks/shared/callee.py     |  4 ++--
 frameworks/shared/caller.py     |  6 +++---
 4 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/amlb/results.py b/amlb/results.py
index 2af80dca4..3887203f6 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -259,9 +259,9 @@ def load_metadata(metadata_file):
     def save_predictions(dataset: Dataset, output_file: str,
                          predictions: Union[A, DF, S] = None, truth: Union[A, DF, S] = None,
                          probabilities: Union[A, DF] = None, probabilities_labels: Union[list, A] = None,
+                         optional_columns: Union[A, DF] = None,
                          target_is_encoded: bool = False,
-                         preview: bool = True,
-                         quantiles: Union[A, DF] = None):
+                         preview: bool = True):
         """ Save class probabilities and predicted labels to file in csv format.
 
         :param dataset:
@@ -270,9 +270,9 @@ def save_predictions(dataset: Dataset, output_file: str,
         :param predictions:
         :param truth:
         :param probabilities_labels:
+        :param optional_columns:
         :param target_is_encoded:
         :param preview:
-        :param quantiles:
         :return: None
         """
         log.debug("Saving predictions to `%s`.", output_file)
@@ -316,23 +316,9 @@ def save_predictions(dataset: Dataset, output_file: str,
         df = df.assign(predictions=preds)
         df = df.assign(truth=truth)
 
-        if dataset.type == DatasetType.timeseries:
-            if quantiles is not None:
-                quantiles = quantiles.reset_index(drop=True)
-                df = pd.concat([df, quantiles], axis=1)
-
-            period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works.
-
-            # we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)|
-            # 1. retrieve item_ids for each sequence/item
-            item_ids, inverse_item_ids = np.unique(dataset.test.X[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True)
-            # 2. capture sequences in a list
-            y_past = [dataset.test.y.squeeze().to_numpy()[inverse_item_ids == i][:-dataset.prediction_length] for i in range(len(item_ids))]
-            # 3. calculate period error per sequence
-            y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past]
-            # 4. repeat period error for each sequence, to save one for each element
-            y_past_period_error_rep = np.repeat(y_past_period_error, dataset.prediction_length)
-            df = df.assign(y_past_period_error=y_past_period_error_rep)
+        if optional_columns is not None:
+            df = pd.concat([df, optional_columns], axis=1)
+
         if preview:
             log.info("Predictions preview:\n %s\n", df.head(20).to_string())
         backup_file(output_file)
diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py
index 85593932d..6ec5a77ae 100644
--- a/frameworks/AutoGluon/exec_ts.py
+++ b/frameworks/AutoGluon/exec_ts.py
@@ -4,6 +4,7 @@
 import warnings
 import sys
 import tempfile
+import numpy as np
 warnings.simplefilter("ignore")
 
 if sys.platform == 'darwin':
@@ -77,6 +78,23 @@ def run(dataset, config):
     save_artifacts(predictor=predictor, leaderboard=leaderboard, config=config)
     shutil.rmtree(predictor.path, ignore_errors=True)
 
+    quantiles = predictions.drop(columns=['mean']).reset_index(drop=True)
+    period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works.
+
+    # we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)|
+    # 1. retrieve item_ids for each sequence/item
+    #dataset..X /. y
+    item_ids, inverse_item_ids = np.unique(test_data.reset_index()[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True)
+    # 2. capture sequences in a list
+    y_past = [test_data[label].squeeze().to_numpy()[inverse_item_ids == i][:-dataset.prediction_length] for i in range(len(item_ids))]
+    # 3. calculate period error per sequence
+    y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past]
+    # 4. repeat period error for each sequence, to save one for each element
+    y_past_period_error_rep = np.repeat(y_past_period_error, dataset.prediction_length)
+
+    optional_columns = quantiles
+    optional_columns = optional_columns.assign(y_past_period_error=y_past_period_error_rep)
+
     return result(output_file=config.output_predictions_file,
                   predictions=predictions_only,
                   truth=truth_only,
@@ -86,8 +104,7 @@ def run(dataset, config):
                   models_count=num_models_trained,
                   training_duration=training.duration,
                   predict_duration=predict.duration,
-                  quantiles=predictions.drop(columns=['mean']))
-
+                  optional_columns=optional_columns)
 
 def load_data(train_path, test_path, timestamp_column, id_column):
 
diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py
index 70b5a3be0..c596e01c5 100644
--- a/frameworks/shared/callee.py
+++ b/frameworks/shared/callee.py
@@ -17,12 +17,12 @@ class FrameworkError(Exception):
 def result(output_file=None,
            predictions=None, truth=None,
            probabilities=None, probabilities_labels=None,
+           optional_columns=None,
            target_is_encoded=False,
            error_message=None,
            models_count=None,
            training_duration=None,
            predict_duration=None,
-           quantiles=None,
            **others):
     return locals()
 
@@ -70,7 +70,7 @@ def load_data(name, path, **_):
                               wait_retry_secs=10):
             result = run_fn(ds, config)
             res = dict(result)
-            for name in ['predictions', 'truth', 'probabilities', 'quantiles']:
+            for name in ['predictions', 'truth', 'probabilities', 'optional_columns']:
                 arr = result[name]
                 if arr is not None:
                     path = os.path.join(config.result_dir, '.'.join([name, 'data']))
diff --git a/frameworks/shared/caller.py b/frameworks/shared/caller.py
index 68963a820..09654dc32 100644
--- a/frameworks/shared/caller.py
+++ b/frameworks/shared/caller.py
@@ -149,7 +149,7 @@ def run_in_venv(caller_file, script_file: str, *args,
         if res.error_message is not None:
             raise NoResultError(res.error_message)
 
-        for name in ['predictions', 'truth', 'probabilities', 'quantiles']:
+        for name in ['predictions', 'truth', 'probabilities', 'optional_columns']:
             res[name] = deserialize_data(res[name], config=ser_config) if res[name] is not None else None
 
         if callable(process_results):
@@ -164,8 +164,8 @@ def run_in_venv(caller_file, script_file: str, *args,
                                     else dataset.test.y),
                              probabilities=res.probabilities,
                              probabilities_labels=res.probabilities_labels,
-                             target_is_encoded=res.target_is_encoded,
-                             quantiles=res.quantiles)
+                             optional_columns=res.optional_columns,
+                             target_is_encoded=res.target_is_encoded)
 
         return dict(
             models_count=res.models_count if res.models_count is not None else 1,

From 1f7c5748ac90d401100f2f998ac9c98c65f23136 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Wed, 5 Oct 2022 16:22:04 +0000
Subject: [PATCH 24/30] remove whitespaces

---
 amlb/benchmark.py     | 1 -
 amlb/datasets/file.py | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/amlb/benchmark.py b/amlb/benchmark.py
index 39458efe5..7c54a344c 100644
--- a/amlb/benchmark.py
+++ b/amlb/benchmark.py
@@ -492,7 +492,6 @@ def load_data(self):
             dataset_name_and_config = copy(self._task_def.dataset)
             dataset_name_and_config.name = self._task_def.name
             self._dataset = Benchmark.data_loader.load(DataSourceType.file, dataset=dataset_name_and_config, fold=self.fold)
-
         else:
             raise ValueError("Tasks should have one property among [openml_task_id, openml_dataset_id, dataset].")
 
diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py
index c7154a76b..77d6474e0 100644
--- a/amlb/datasets/file.py
+++ b/amlb/datasets/file.py
@@ -54,12 +54,9 @@ def load(self, dataset, fold=0):
             if DatasetType[dataset['type']] == DatasetType.timeseries and dataset['timestamp_column'] is None:
                 log.warning("Warning: For timeseries task setting undefined timestamp column to `timestamp`.")
                 dataset['timestamp_column'] = "timestamp"
-
             csv_dataset = CsvDataset(train_path, test_path, target=target, features=features, type=type_, timestamp_column=dataset['timestamp_column'] if 'timestamp_column' in dataset else None)
-
             if csv_dataset.type == DatasetType.timeseries:
                 csv_dataset = self.extend_dataset_with_timeseries_config(csv_dataset, dataset)
-
             return csv_dataset
         else:
             raise ValueError(f"Unsupported file type: {ext}")

From 79e54c93dd9223e1fa91c77f2f15054c89552964 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Thu, 6 Oct 2022 09:44:29 +0000
Subject: [PATCH 25/30] deleting merge artifacts

---
 frameworks/AutoGluonTS/README.md   | 16 -------------
 frameworks/AutoGluonTS/__init__.py | 38 ------------------------------
 2 files changed, 54 deletions(-)
 delete mode 100644 frameworks/AutoGluonTS/README.md
 delete mode 100644 frameworks/AutoGluonTS/__init__.py

diff --git a/frameworks/AutoGluonTS/README.md b/frameworks/AutoGluonTS/README.md
deleted file mode 100644
index 938b459c4..000000000
--- a/frameworks/AutoGluonTS/README.md
+++ /dev/null
@@ -1,16 +0,0 @@
-# AutoGluonTS
-
-AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems.
-
-This code is currently a prototype, since time series support is not fully defined in AutoMLBenchmark yet.
-Consider the code a proof of concept.
-
-## Run Steps
-
-To run AutoGluonTS in AutoMLBenchmark on the covid dataset from the AutoGluon tutorial, do the following:
-
-1. Create a fresh Python environment
-2. Follow automlbenchmark install instructions
-3. Run the following command in terminal: ```python3 ../automlbenchmark/runbenchmark.py autogluonts ts test```
-
-To run mainline AutoGluonTS instead of v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest ts test```
diff --git a/frameworks/AutoGluonTS/__init__.py b/frameworks/AutoGluonTS/__init__.py
deleted file mode 100644
index 70283c3e5..000000000
--- a/frameworks/AutoGluonTS/__init__.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from amlb.benchmark import TaskConfig
-from amlb.data import Dataset, DatasetType
-from amlb.utils import call_script_in_same_dir
-
-
-def setup(*args, **kwargs):
-    call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs)
-
-
-def run(dataset: Dataset, config: TaskConfig):
-    from frameworks.shared.caller import run_in_venv
-
-    if hasattr(dataset, 'timestamp_column') is False:
-        dataset.timestamp_column = None
-    if hasattr(dataset, 'id_column') is False:
-        dataset.id_column = None
-    if hasattr(dataset, 'prediction_length') is False:
-        raise AttributeError("Unspecified `prediction_length`.")
-    if dataset.type is not DatasetType.timeseries:
-        raise ValueError("AutoGluonTS only supports timeseries.")
-
-    data = dict(
-        # train=dict(path=dataset.train.data_path('parquet')),
-        # test=dict(path=dataset.test.data_path('parquet')),
-        train=dict(path=dataset.train.path),
-        test=dict(path=dataset.test.path),
-        target=dict(
-            name=dataset.target.name,
-            classes=dataset.target.values
-        ),
-        problem_type=dataset.type.name,  # AutoGluon problem_type is using same names as amlb.data.DatasetType
-        timestamp_column=dataset.timestamp_column,
-        id_column=dataset.id_column,
-        prediction_length=dataset.prediction_length
-    )
-
-    return run_in_venv(__file__, "exec.py",
-                       input_data=data, dataset=dataset, config=config)

From 6a251707a710b3af0de149d13ef9a3b2c8ceb800 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Thu, 6 Oct 2022 09:48:03 +0000
Subject: [PATCH 26/30] delete merge artifacts

---
 resources/benchmarks/ts.yaml | 15 ---------------
 1 file changed, 15 deletions(-)
 delete mode 100644 resources/benchmarks/ts.yaml

diff --git a/resources/benchmarks/ts.yaml b/resources/benchmarks/ts.yaml
deleted file mode 100644
index 0a73c81fb..000000000
--- a/resources/benchmarks/ts.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
----
-
-- name: covid
-  dataset:
-    train: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
-    # s3://autogluon-ts-bench/data/covid_deaths/csv/train.csv | https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
-    test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
-    # s3://autogluon-ts-bench/data/covid_deaths/csv/test.csv  |  https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
-    target: ConfirmedCases              # target    | ConfirmedCases
-    type: timeseries
-    prediction_length: 30
-    id_column: name          # item_id   | name
-    timestamp_column: Date # timestamp | Date
-
-  folds: 1

From 928c2cf4bde2f9119768f9a7602c2155f2e89c50 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Thu, 6 Oct 2022 10:23:05 +0000
Subject: [PATCH 27/30] renaming prediction_length to forecast_range_in_steps

---
 amlb/datasets/file.py                | 24 ++++++++++++------------
 frameworks/AutoGluon/__init__.py     |  6 +++---
 frameworks/AutoGluon/exec_ts.py      |  6 +++---
 resources/benchmarks/timeseries.yaml | 24 ++++++++++++------------
 4 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py
index 77d6474e0..4c7ba9b34 100644
--- a/amlb/datasets/file.py
+++ b/amlb/datasets/file.py
@@ -142,26 +142,26 @@ def extend_dataset_with_timeseries_config(self, dataset, dataset_config):
         if dataset_config['id_column'] is None:
             log.warning("Warning: For timeseries task setting undefined itemid column to `item_id`.")
             dataset_config['id_column'] = "item_id"
-        if dataset_config['prediction_length'] is None:
-            log.warning("Warning: For timeseries task setting undefined prediction length to `1`.")
-            dataset_config['prediction_length'] = "1"
+        if dataset_config['forecast_range_in_steps'] is None:
+            log.warning("Warning: For timeseries task setting undefined forecast_range_in_steps to `1`.")
+            dataset_config['forecast_range_in_steps'] = "1"
 
         dataset.timestamp_column=dataset_config['timestamp_column']
         dataset.id_column=dataset_config['id_column']
-        dataset.prediction_length=dataset_config['prediction_length']
+        dataset.forecast_range_in_steps=int(dataset_config['forecast_range_in_steps'])
 
         train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).count()
         test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).count()
-        prediction_length_max_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean())
-        prediction_length_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1
-        if not dataset.prediction_length == prediction_length_max_diff_train_test:
-            log.warning("Warning: Prediction length {}, does not equal difference between test and train sequence lengths {}.".format(dataset.prediction_length, prediction_length_max_diff_train_test))
+        forecast_range_in_steps_max_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean())
+        forecast_range_in_steps_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1
+        if not dataset.forecast_range_in_steps == forecast_range_in_steps_max_diff_train_test:
+            log.warning("Warning: Forecast range {}, does not equal difference between test and train sequence lengths {}.".format(dataset.forecast_range_in_steps, forecast_range_in_steps_max_diff_train_test))
         if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.:
             raise ValueError("Error: Not all sequences of train and test set have same sequence length difference.")
-        if dataset.prediction_length > prediction_length_max_diff_train_test:
-            raise ValueError("Error: Prediction length {} longer than at least one difference between train and test sequence length.")
-        if dataset.prediction_length > prediction_length_max_min_train_test:
-            raise ValueError("Error: Prediction length {} longer than minimum sequence length + 1.".format())
+        if dataset.forecast_range_in_steps > forecast_range_in_steps_max_diff_train_test:
+            raise ValueError("Error: Forecast range {} longer than at least one difference between train and test sequence length.")
+        if dataset.forecast_range_in_steps > forecast_range_in_steps_max_min_train_test:
+            raise ValueError("Error: Forecast range {} longer than minimum sequence length + 1.".format())
         return dataset
 
 
diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py
index bee3b99a2..025f16590 100644
--- a/frameworks/AutoGluon/__init__.py
+++ b/frameworks/AutoGluon/__init__.py
@@ -30,8 +30,8 @@ def run(dataset: Dataset, config: TaskConfig):
             dataset.timestamp_column = None
         if not hasattr(dataset, 'id_column'):
             dataset.id_column = None
-        if not hasattr(dataset, 'prediction_length'):
-            raise AttributeError("Unspecified `prediction_length`.")
+        if not hasattr(dataset, 'forecast_range_in_steps'):
+            raise AttributeError("Unspecified `forecast_range_in_steps`.")
 
         data = dict(
             # train=dict(path=dataset.train.data_path('parquet')),
@@ -45,7 +45,7 @@ def run(dataset: Dataset, config: TaskConfig):
             problem_type=dataset.type.name,  # AutoGluon problem_type is using same names as amlb.data.DatasetType
             timestamp_column=dataset.timestamp_column,
             id_column=dataset.id_column,
-            prediction_length=dataset.prediction_length
+            forecast_range_in_steps=dataset.forecast_range_in_steps
         )
         exec_file = "exec_ts.py"
 
diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py
index 6ec5a77ae..ac1b65b6b 100644
--- a/frameworks/AutoGluon/exec_ts.py
+++ b/frameworks/AutoGluon/exec_ts.py
@@ -28,7 +28,7 @@ def run(dataset, config):
 
     timestamp_column = dataset.timestamp_column
     id_column = dataset.id_column
-    prediction_length = dataset.prediction_length
+    prediction_length = dataset.forecast_range_in_steps
 
     eval_metric = get_eval_metric(config)
     label = dataset.target.name
@@ -86,11 +86,11 @@ def run(dataset, config):
     #dataset..X /. y
     item_ids, inverse_item_ids = np.unique(test_data.reset_index()[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True)
     # 2. capture sequences in a list
-    y_past = [test_data[label].squeeze().to_numpy()[inverse_item_ids == i][:-dataset.prediction_length] for i in range(len(item_ids))]
+    y_past = [test_data[label].squeeze().to_numpy()[inverse_item_ids == i][:-prediction_length] for i in range(len(item_ids))]
     # 3. calculate period error per sequence
     y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past]
     # 4. repeat period error for each sequence, to save one for each element
-    y_past_period_error_rep = np.repeat(y_past_period_error, dataset.prediction_length)
+    y_past_period_error_rep = np.repeat(y_past_period_error, prediction_length)
 
     optional_columns = quantiles
     optional_columns = optional_columns.assign(y_past_period_error=y_past_period_error_rep)
diff --git a/resources/benchmarks/timeseries.yaml b/resources/benchmarks/timeseries.yaml
index b800e8bf8..a6cd3b51e 100644
--- a/resources/benchmarks/timeseries.yaml
+++ b/resources/benchmarks/timeseries.yaml
@@ -5,7 +5,7 @@
     train: s3://autogluon-ts-bench/data/covid_deaths/csv/train.csv
     test: s3://autogluon-ts-bench/data/covid_deaths/csv/test.csv
     type: timeseries
-    prediction_length: 30
+    forecast_range_in_steps: 30
   folds: 1
 
 - name: hospital
@@ -13,7 +13,7 @@
     train: s3://autogluon-ts-bench/data/hospital/csv/train.csv
     test: s3://autogluon-ts-bench/data/hospital/csv/test.csv
     type: timeseries
-    prediction_length: 12
+    forecast_range_in_steps: 12
   folds: 1
 
 - name: kdd_2018
@@ -21,7 +21,7 @@
     train: s3://autogluon-ts-bench/data/kdd_2018/csv/train.csv
     test: s3://autogluon-ts-bench/data/kdd_2018/csv/test.csv
     type: timeseries
-    prediction_length: 48
+    forecast_range_in_steps: 48
   folds: 1
 
 - name: m3_monthly
@@ -29,7 +29,7 @@
     train: s3://autogluon-ts-bench/data/m3_monthly/csv/train.csv
     test: s3://autogluon-ts-bench/data/m3_monthly/csv/test.csv
     type: timeseries
-    prediction_length: 18
+    forecast_range_in_steps: 18
   folds: 1
 
 - name: m3_other
@@ -37,7 +37,7 @@
     train: s3://autogluon-ts-bench/data/m3_other/csv/train.csv
     test: s3://autogluon-ts-bench/data/m3_other/csv/test.csv
     type: timeseries
-    prediction_length: 8
+    forecast_range_in_steps: 8
   folds: 1
 
 - name: m3_quarterly
@@ -45,7 +45,7 @@
     train: s3://autogluon-ts-bench/data/m3_quarterly/csv/train.csv
     test: s3://autogluon-ts-bench/data/m3_quarterly/csv/test.csv
     type: timeseries
-    prediction_length: 8
+    forecast_range_in_steps: 8
   folds: 1
 
 - name: m4_hourly
@@ -53,7 +53,7 @@
     train: s3://autogluon-ts-bench/data/m4_hourly/csv/train.csv
     test: s3://autogluon-ts-bench/data/m4_hourly/csv/test.csv
     type: timeseries
-    prediction_length: 48
+    forecast_range_in_steps: 48
   folds: 1
 
 - name: m4_weekly
@@ -61,7 +61,7 @@
     train: s3://autogluon-ts-bench/data/m4_weekly/csv/train.csv
     test: s3://autogluon-ts-bench/data/m4_weekly/csv/test.csv
     type: timeseries
-    prediction_length: 13
+    forecast_range_in_steps: 13
   folds: 1
 
 - name: nn5
@@ -69,7 +69,7 @@
     train: s3://autogluon-ts-bench/data/nn5/csv/train.csv
     test: s3://autogluon-ts-bench/data/nn5/csv/test.csv
     type: timeseries
-    prediction_length: 56
+    forecast_range_in_steps: 56
   folds: 1
 
 - name: solar
@@ -77,7 +77,7 @@
     train: s3://autogluon-ts-bench/data/solar/csv/train.csv
     test: s3://autogluon-ts-bench/data/solar/csv/test.csv
     type: timeseries
-    prediction_length: 24
+    forecast_range_in_steps: 24
   folds: 1
 
 - name: tourism_monthly
@@ -85,7 +85,7 @@
     train: s3://autogluon-ts-bench/data/tourism_monthly/csv/train.csv
     test: s3://autogluon-ts-bench/data/tourism_monthly/csv/test.csv
     type: timeseries
-    prediction_length: 24
+    forecast_range_in_steps: 24
   folds: 1
 
 - name: tourism_quarterly
@@ -93,5 +93,5 @@
     train: s3://autogluon-ts-bench/data/tourism_quarterly/csv/train.csv
     test: s3://autogluon-ts-bench/data/tourism_quarterly/csv/test.csv
     type: timeseries
-    prediction_length: 8
+    forecast_range_in_steps: 8
   folds: 1

From 47d311c3a09db52389ca0f87e8ae4d46cd00e501 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Thu, 6 Oct 2022 10:59:30 +0000
Subject: [PATCH 28/30] use public dataset, reduced range to maximum

---
 resources/benchmarks/timeseries.yaml | 98 ++--------------------------
 1 file changed, 7 insertions(+), 91 deletions(-)

diff --git a/resources/benchmarks/timeseries.yaml b/resources/benchmarks/timeseries.yaml
index a6cd3b51e..26af06497 100644
--- a/resources/benchmarks/timeseries.yaml
+++ b/resources/benchmarks/timeseries.yaml
@@ -1,97 +1,13 @@
 ---
 
-- name: covid_deaths
+- name: covid
   dataset:
-    train: s3://autogluon-ts-bench/data/covid_deaths/csv/train.csv
-    test: s3://autogluon-ts-bench/data/covid_deaths/csv/test.csv
+    train: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv
+    test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv
+    target: ConfirmedCases
     type: timeseries
-    forecast_range_in_steps: 30
-  folds: 1
-
-- name: hospital
-  dataset:
-    train: s3://autogluon-ts-bench/data/hospital/csv/train.csv
-    test: s3://autogluon-ts-bench/data/hospital/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 12
-  folds: 1
-
-- name: kdd_2018
-  dataset:
-    train: s3://autogluon-ts-bench/data/kdd_2018/csv/train.csv
-    test: s3://autogluon-ts-bench/data/kdd_2018/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 48
-  folds: 1
-
-- name: m3_monthly
-  dataset:
-    train: s3://autogluon-ts-bench/data/m3_monthly/csv/train.csv
-    test: s3://autogluon-ts-bench/data/m3_monthly/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 18
-  folds: 1
-
-- name: m3_other
-  dataset:
-    train: s3://autogluon-ts-bench/data/m3_other/csv/train.csv
-    test: s3://autogluon-ts-bench/data/m3_other/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 8
-  folds: 1
+    forecast_range_in_steps: 19
+    id_column: name
+    timestamp_column: Date
 
-- name: m3_quarterly
-  dataset:
-    train: s3://autogluon-ts-bench/data/m3_quarterly/csv/train.csv
-    test: s3://autogluon-ts-bench/data/m3_quarterly/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 8
-  folds: 1
-
-- name: m4_hourly
-  dataset:
-    train: s3://autogluon-ts-bench/data/m4_hourly/csv/train.csv
-    test: s3://autogluon-ts-bench/data/m4_hourly/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 48
-  folds: 1
-
-- name: m4_weekly
-  dataset:
-    train: s3://autogluon-ts-bench/data/m4_weekly/csv/train.csv
-    test: s3://autogluon-ts-bench/data/m4_weekly/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 13
-  folds: 1
-
-- name: nn5
-  dataset:
-    train: s3://autogluon-ts-bench/data/nn5/csv/train.csv
-    test: s3://autogluon-ts-bench/data/nn5/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 56
-  folds: 1
-
-- name: solar
-  dataset:
-    train: s3://autogluon-ts-bench/data/solar/csv/train.csv
-    test: s3://autogluon-ts-bench/data/solar/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 24
-  folds: 1
-
-- name: tourism_monthly
-  dataset:
-    train: s3://autogluon-ts-bench/data/tourism_monthly/csv/train.csv
-    test: s3://autogluon-ts-bench/data/tourism_monthly/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 24
-  folds: 1
-
-- name: tourism_quarterly
-  dataset:
-    train: s3://autogluon-ts-bench/data/tourism_quarterly/csv/train.csv
-    test: s3://autogluon-ts-bench/data/tourism_quarterly/csv/test.csv
-    type: timeseries
-    forecast_range_in_steps: 8
   folds: 1

From b244e9c9d02cc7c2d2f9c0d1841c72375c4608fd Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Thu, 6 Oct 2022 11:00:22 +0000
Subject: [PATCH 29/30] fix format strings in forecast range messages

---
 amlb/datasets/file.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py
index 4c7ba9b34..ffcfa8094 100644
--- a/amlb/datasets/file.py
+++ b/amlb/datasets/file.py
@@ -152,16 +152,20 @@ def extend_dataset_with_timeseries_config(self, dataset, dataset_config):
 
         train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).count()
         test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).count()
-        forecast_range_in_steps_max_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean())
+        forecast_range_in_steps_mean_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean())
         forecast_range_in_steps_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1
-        if not dataset.forecast_range_in_steps == forecast_range_in_steps_max_diff_train_test:
-            log.warning("Warning: Forecast range {}, does not equal difference between test and train sequence lengths {}.".format(dataset.forecast_range_in_steps, forecast_range_in_steps_max_diff_train_test))
+        if not dataset.forecast_range_in_steps == forecast_range_in_steps_mean_diff_train_test:
+            msg = f"Warning: Forecast range {dataset.forecast_range_in_steps}, does not equal mean difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
+            log.warning(msg)
         if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.:
-            raise ValueError("Error: Not all sequences of train and test set have same sequence length difference.")
-        if dataset.forecast_range_in_steps > forecast_range_in_steps_max_diff_train_test:
-            raise ValueError("Error: Forecast range {} longer than at least one difference between train and test sequence length.")
+            msg = f"Error: Not all sequences of train and test set have same sequence length difference."
+            raise ValueError(msg)
+        if dataset.forecast_range_in_steps > forecast_range_in_steps_mean_diff_train_test:
+            msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
+            raise ValueError(msg)
         if dataset.forecast_range_in_steps > forecast_range_in_steps_max_min_train_test:
-            raise ValueError("Error: Forecast range {} longer than minimum sequence length + 1.".format())
+            msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than minimum sequence length + 1, {forecast_range_in_steps_max_min_train_test}."
+            raise ValueError(msg)
         return dataset
 
 

From 3074f4212ce90af0f8dc4269ff1388aae21b57c9 Mon Sep 17 00:00:00 2001
From: sommerle <sommerle@amazon.com>
Date: Thu, 6 Oct 2022 11:57:33 +0000
Subject: [PATCH 30/30] fix key error bug, remove magic time limit

---
 frameworks/AutoGluon/exec_ts.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py
index ac1b65b6b..ab7c4110f 100644
--- a/frameworks/AutoGluon/exec_ts.py
+++ b/frameworks/AutoGluon/exec_ts.py
@@ -33,7 +33,6 @@ def run(dataset, config):
     eval_metric = get_eval_metric(config)
     label = dataset.target.name
     time_limit = config.max_runtime_seconds
-    time_limit = 10.
 
     training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')}
 
@@ -84,7 +83,7 @@ def run(dataset, config):
     # we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)|
     # 1. retrieve item_ids for each sequence/item
     #dataset..X /. y
-    item_ids, inverse_item_ids = np.unique(test_data.reset_index()[dataset.id_column].squeeze().to_numpy(), return_index=False, return_inverse=True)
+    item_ids, inverse_item_ids = np.unique(test_data.reset_index()["item_id"].squeeze().to_numpy(), return_index=False, return_inverse=True)
     # 2. capture sequences in a list
     y_past = [test_data[label].squeeze().to_numpy()[inverse_item_ids == i][:-prediction_length] for i in range(len(item_ids))]
     # 3. calculate period error per sequence