From 648b9eb5e939652e165223ba1b8f880b2d13da4e Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 22 Aug 2024 11:37:19 +0800 Subject: [PATCH 1/6] refactor: remove useless comments; --- benchpots/datasets/beijing_multisite_air_quality.py | 1 - benchpots/datasets/electricity_load_diagrams.py | 1 - benchpots/datasets/electricity_transformer_temperature.py | 2 +- benchpots/datasets/italy_air_quality.py | 1 - benchpots/datasets/pems_traffic.py | 1 - benchpots/datasets/physionet_2012.py | 1 - benchpots/datasets/physionet_2019.py | 1 - benchpots/datasets/solar_alabama.py | 1 - benchpots/datasets/ucr_uea_datasets.py | 1 - benchpots/utils/missingness.py | 2 +- 10 files changed, 2 insertions(+), 10 deletions(-) diff --git a/benchpots/datasets/beijing_multisite_air_quality.py b/benchpots/datasets/beijing_multisite_air_quality.py index 3a58983..34832c5 100644 --- a/benchpots/datasets/beijing_multisite_air_quality.py +++ b/benchpots/datasets/beijing_multisite_air_quality.py @@ -140,7 +140,6 @@ def preprocess_beijing_air_quality( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/datasets/electricity_load_diagrams.py b/benchpots/datasets/electricity_load_diagrams.py index 65b8da8..3c5bf92 100644 --- a/benchpots/datasets/electricity_load_diagrams.py +++ b/benchpots/datasets/electricity_load_diagrams.py @@ -108,7 +108,6 @@ def preprocess_electricity_load_diagrams( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/datasets/electricity_transformer_temperature.py b/benchpots/datasets/electricity_transformer_temperature.py index 2a3c133..44c7741 100644 --- a/benchpots/datasets/electricity_transformer_temperature.py +++ b/benchpots/datasets/electricity_transformer_temperature.py @@ -115,7 +115,7 @@ def preprocess_ett( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs + processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/datasets/italy_air_quality.py b/benchpots/datasets/italy_air_quality.py index 9e333aa..0881345 100644 --- a/benchpots/datasets/italy_air_quality.py +++ b/benchpots/datasets/italy_air_quality.py @@ -99,7 +99,6 @@ def preprocess_italy_air_quality( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/datasets/pems_traffic.py b/benchpots/datasets/pems_traffic.py index 736f835..d63373d 100644 --- a/benchpots/datasets/pems_traffic.py +++ b/benchpots/datasets/pems_traffic.py @@ -110,7 +110,6 @@ def preprocess_pems_traffic( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no 
missing values are artificially added.") diff --git a/benchpots/datasets/physionet_2012.py b/benchpots/datasets/physionet_2012.py index dd07034..e4b4402 100644 --- a/benchpots/datasets/physionet_2012.py +++ b/benchpots/datasets/physionet_2012.py @@ -227,7 +227,6 @@ def apply_func(df_temp): # pad and truncate to set the max length of samples as ) processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori test_X_indicating_mask = np.isnan(test_X_ori) ^ np.isnan(test_X) diff --git a/benchpots/datasets/physionet_2019.py b/benchpots/datasets/physionet_2019.py index d3fa274..c2085d1 100644 --- a/benchpots/datasets/physionet_2019.py +++ b/benchpots/datasets/physionet_2019.py @@ -192,7 +192,6 @@ def apply_func(df_temp): # pad and truncate to set the max length of samples as processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori test_X_indicating_mask = np.isnan(test_X_ori) ^ np.isnan(test_X) diff --git a/benchpots/datasets/solar_alabama.py b/benchpots/datasets/solar_alabama.py index 2614f8c..1ae87d4 100644 --- a/benchpots/datasets/solar_alabama.py +++ b/benchpots/datasets/solar_alabama.py @@ -108,7 +108,6 @@ def preprocess_solar_alabama( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/datasets/ucr_uea_datasets.py b/benchpots/datasets/ucr_uea_datasets.py index df4a333..28be2a3 100644 --- a/benchpots/datasets/ucr_uea_datasets.py +++ b/benchpots/datasets/ucr_uea_datasets.py @@ -123,7 +123,6 @@ def preprocess_ucr_uea_datasets( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/utils/missingness.py b/benchpots/utils/missingness.py index a55fe6e..223ff55 100644 --- a/benchpots/utils/missingness.py +++ b/benchpots/utils/missingness.py @@ -32,7 +32,7 @@ def create_missingness(X, rate, pattern, **kwargs): assert 0 < rate < 1, "rate must be in [0, 1)" assert ( - pattern.lower() in supported_missing_pattern + pattern.lower() in supported_missing_pattern ), f"pattern must be one of {supported_missing_pattern}, but got {pattern}" if pattern == "point": From e15f2d40a4026bd481afda56c7cc2424f0521cbf Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 22 Aug 2024 12:53:05 +0800 Subject: [PATCH 2/6] feat: add pre-commit config file; --- .pre-commit-config.yaml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..8479c6a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: + # hooks for checking files + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + + # hooks for linting code + - repo: https://github.com/psf/black + rev: 24.8.0 + hooks: + - id: black + args: [ + --line-length=120, # refer to 
pyproject.toml + ] + + - repo: https://github.com/PyCQA/flake8 + rev: 7.1.1 + hooks: + - id: flake8 + args: [ + --max-line-length=120, # refer to pyproject.toml + --extend-ignore=E203,E231 + ] From eb47094897e366794f158e1c6f73067795e6f20d Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 23 Aug 2024 15:01:26 +0800 Subject: [PATCH 3/6] feat: update pyproject.toml; --- pyproject.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 337b769..b932fe3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,9 @@ version = { attr = "benchpots.version.__version__" } readme = { file = "README.md", content-type = "text/markdown" } dependencies = { file = "requirements.txt" } +[tool.black] +line-length = 120 + [tool.flake8] # People may argue that coding style is personal. This may be true if the project is personal and one works like a # hermit, but to PyPOTS and its community, the answer is NO. @@ -69,6 +72,8 @@ dependencies = { file = "requirements.txt" } # who prefer the default setting can keep using 88 or 79 while coding. Please ensure your code lines not exceeding 120. max-line-length = 120 # why ignore E203? Refer to https://github.com/PyCQA/pycodestyle/issues/373 +# why ignore E231? Bad trailing comma, conflict with Black extend-ignore = """ - E203 + E203, + E231, """ \ No newline at end of file From 8fc3470f0cf0158ca5ce9dc5b6e3cfc5792776d2 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sat, 24 Aug 2024 12:18:40 +0800 Subject: [PATCH 4/6] feat: add preprocessing func for random walk dataset; --- benchpots/datasets/random_walk.py | 331 ++++++++++++++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100644 benchpots/datasets/random_walk.py diff --git a/benchpots/datasets/random_walk.py b/benchpots/datasets/random_walk.py new file mode 100644 index 0000000..79028fa --- /dev/null +++ b/benchpots/datasets/random_walk.py @@ -0,0 +1,331 @@ +""" +Preprocessing func for the generated random walk dataset. + +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + + +import math +from typing import Optional, Tuple + +import numpy as np +from pygrinder import mcar +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.utils import check_random_state + +from ..utils.logging import logger, print_final_dataset_info +from ..utils.missingness import create_missingness + + +def gene_complete_random_walk( + n_samples: int = 1000, + n_steps: int = 24, + n_features: int = 10, + mu: float = 0.0, + std: float = 1.0, + random_state: Optional[int] = None, +) -> np.ndarray: + """Generate complete random walk time-series data, i.e. having no missing values. + + Parameters + ---------- + n_samples : int, default=1000 + The number of training time-series samples to generate. + + n_steps: int, default=24 + The number of time steps (length) of generated time-series samples. + + n_features : int, default=10 + The number of features (dimensions) of generated time-series samples. + + mu : float, default=0.0 + Mean of the normal distribution, which random walk steps are sampled from. + + std : float, default=1.0 + Standard deviation of the normal distribution, which random walk steps are sampled from. + + random_state : int, default=None + Random seed for data generation. + + Returns + ------- + ts_samples: array, shape of [n_samples, n_steps, n_features] + Generated random walk time series. 
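+
+    Examples
+    --------
+    A minimal sanity check of the returned shape:
+
+    >>> samples = gene_complete_random_walk(n_samples=2, n_steps=4, n_features=3, random_state=42)
+    >>> samples.shape
+    (2, 4, 3)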
+    """
+    seed = check_random_state(random_state)
+    ts_samples = np.zeros([n_samples, n_steps, n_features])
+    random_values = seed.randn(n_samples, n_steps, n_features) * std + mu
+    ts_samples[:, 0, :] = random_values[:, 0, :]
+    for t in range(1, n_steps):
+        ts_samples[:, t, :] = ts_samples[:, t - 1, :] + random_values[:, t, :]
+    ts_samples = np.asarray(ts_samples)
+    return ts_samples
+
+
+def gene_complete_random_walk_for_classification(
+    n_classes: int = 2,
+    n_samples_each_class: int = 500,
+    n_steps: int = 24,
+    n_features: int = 10,
+    shuffle: bool = True,
+    random_state: Optional[int] = None,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Generate complete random walk time-series data for the classification task.
+
+    Parameters
+    ----------
+    n_classes : int, must >= 2, default=2
+        Number of classes (types) of the generated data.
+
+    n_samples_each_class : int, default=500
+        Number of samples for each class to generate.
+
+    n_steps : int, default=24
+        Number of time steps in each sample.
+
+    n_features : int, default=10
+        Number of features.
+
+    shuffle : bool, default=True
+        Whether to shuffle generated samples.
+        If not, you can separate samples of each class according to `n_samples_each_class`.
+        For example,
+        X_class0=X[:n_samples_each_class],
+        X_class1=X[n_samples_each_class:n_samples_each_class*2]
+
+    random_state : int, default=None
+        Random seed for data generation.
+
+    Returns
+    -------
+    X : array, shape of [n_samples, n_steps, n_features]
+        Generated time-series data.
+
+    y : array, shape of [n_samples]
+        Labels indicating classes of time-series samples.
+
+    """
+    assert n_classes > 1, f"n_classes should be >1, but got {n_classes}"
+
+    ts_collector = []
+    label_collector = []
+
+    mu = 0
+    std = 1
+
+    for c_ in range(n_classes):
+        ts_samples = gene_complete_random_walk(n_samples_each_class, n_steps, n_features, mu, std, random_state)
+        label_samples = np.asarray([1 for _ in range(n_samples_each_class)]) * c_
+        ts_collector.extend(ts_samples)
+        label_collector.extend(label_samples)
+        mu += 1
+
+    X = np.asarray(ts_collector)
+    y = np.asarray(label_collector)
+
+    # if shuffling, then shuffle the order of samples
+    if shuffle:
+        indices = np.arange(len(X))
+        np.random.shuffle(indices)
+        X = X[indices]
+        y = y[indices]
+
+    return X, y
+
+
+def gene_complete_random_walk_for_anomaly_detection(
+    n_samples: int = 1000,
+    n_steps: int = 24,
+    n_features: int = 10,
+    mu: float = 0.0,
+    std: float = 1.0,
+    anomaly_proportion: float = 0.1,
+    anomaly_fraction: float = 0.02,
+    anomaly_scale_factor: float = 2.0,
+    random_state: Optional[int] = None,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Generate random walk time-series data for the anomaly-detection task.
+
+    Parameters
+    ----------
+    n_samples : int, default=1000
+        The number of training time-series samples to generate.
+
+    n_steps : int, default=24
+        The number of time steps (length) of generated time-series samples.
+
+    n_features : int, default=10
+        The number of features (dimensions) of generated time-series samples.
+
+    mu : float, default=0.0
+        Mean of the normal distribution, which random walk steps are sampled from.
+
+    std : float, default=1.0
+        Standard deviation of the normal distribution, which random walk steps are sampled from.
+
+    anomaly_proportion : float, default=0.1
+        Proportion of anomaly samples in all samples.
+
+    anomaly_fraction : float, default=0.02
+        Fraction of anomaly points in each anomaly sample.
+
+    anomaly_scale_factor : float, default=2.0
+        Scale factor for value scaling to create anomaly points in time-series samples.
+
+    random_state : int, default=None
+        Random seed for data generation.
+
+    Returns
+    -------
+    X : array, shape of [n_samples, n_steps, n_features]
+        Generated time-series data.
+
+    y : array, shape of [n_samples]
+        Labels indicating if time-series samples are anomalies.
+    """
+    assert 0 < anomaly_proportion < 1, f"anomaly_proportion should be >0 and <1, but got {anomaly_proportion}"
+    assert 0 < anomaly_fraction < 1, f"anomaly_fraction should be >0 and <1, but got {anomaly_fraction}"
+    seed = check_random_state(random_state)
+    X = seed.randn(n_samples, n_steps, n_features) * std + mu
+    n_anomaly = math.floor(n_samples * anomaly_proportion)
+    anomaly_indices = np.random.choice(n_samples, size=n_anomaly, replace=False)
+    for a_i in anomaly_indices:
+        anomaly_sample = X[a_i]
+        anomaly_sample = anomaly_sample.flatten()
+        min_val = anomaly_sample.min()
+        max_val = anomaly_sample.max()
+        # the value range of this sample, used to push anomaly points outside it
+        max_difference = max_val - min_val
+        n_points = n_steps * n_features
+        n_anomaly_points = int(n_points * anomaly_fraction)
+        point_indices = np.random.choice(a=n_points, size=n_anomaly_points, replace=False)
+        for p_i in point_indices:
+            anomaly_sample[p_i] = mu + np.random.uniform(
+                low=min_val - anomaly_scale_factor * max_difference,
+                high=max_val + anomaly_scale_factor * max_difference,
+            )
+        X[a_i] = anomaly_sample.reshape(n_steps, n_features)
+
+    # create labels
+    y = np.zeros(n_samples)
+    y[anomaly_indices] = 1
+
+    # shuffling
+    indices = np.arange(n_samples)
+    np.random.shuffle(indices)
+    X = X[indices]
+    y = y[indices]
+
+    return X, y
+
+
+def preprocess_random_walk(
+    n_steps=24,
+    n_features=10,
+    n_classes=2,
+    n_samples_each_class=1000,
+    missing_rate=0.1,
+    pattern: str = "point",
+    **kwargs,
+) -> dict:
+    """Generate a random-walk dataset.
+
+    Parameters
+    ----------
+    n_steps : int, default=24
+        Number of time steps in each sample.
+
+    n_features : int, default=10
+        Number of features.
+
+    n_classes : int, default=2
+        Number of classes (types) of the generated data.
+
+    n_samples_each_class : int, default=1000
+        Number of samples for each class to generate.
+
+    missing_rate : float, default=0.1
+        The rate of randomly missing values to generate, should be in [0,1).
+
+    Returns
+    -------
+    data: dict,
+        A dictionary containing the generated data.
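+        Keys include 'n_classes', 'n_steps', 'n_features', the fitted 'scaler',
+        and the splits 'train_X'/'train_y', 'val_X'/'val_y', 'test_X'/'test_y';
+        when missing_rate > 0, the ground-truth arrays 'train_X_ori',
+        'val_X_ori', and 'test_X_ori' are also included.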
+    """
+
+    assert 0 <= missing_rate < 1, "missing_rate must be in [0,1)"
+
+    # generate samples
+    X, y = gene_complete_random_walk_for_classification(
+        n_classes=n_classes,
+        n_samples_each_class=n_samples_each_class,
+        n_steps=n_steps,
+        n_features=n_features,
+    )
+    # split into train/val/test sets
+    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
+    train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.2)
+
+    if missing_rate > 0:
+        # create random missing values
+        train_X_ori = train_X
+        train_X = mcar(train_X, missing_rate)
+        # the val and test sets are left to be masked after normalization
+
+    train_X = train_X.reshape(-1, n_features)
+    val_X = val_X.reshape(-1, n_features)
+    test_X = test_X.reshape(-1, n_features)
+    # normalization
+    scaler = StandardScaler()
+    train_X = scaler.fit_transform(train_X)
+    val_X = scaler.transform(val_X)
+    test_X = scaler.transform(test_X)
+    # reshape into time series samples
+    train_X = train_X.reshape(-1, n_steps, n_features)
+    val_X = val_X.reshape(-1, n_steps, n_features)
+    test_X = test_X.reshape(-1, n_steps, n_features)
+    processed_dataset = {
+        # general info
+        "n_classes": n_classes,
+        "n_steps": n_steps,
+        "n_features": n_features,
+        "scaler": scaler,
+        # train set
+        "train_X": train_X,
+        "train_y": train_y,
+        # val set
+        "val_X": val_X,
+        "val_y": val_y,
+        # test set
+        "test_X": test_X,
+        "test_y": test_y,
+    }
+
+    if missing_rate > 0:
+        # hold out ground truth in the original data for evaluation
+        train_X_ori = scaler.transform(train_X_ori.reshape(-1, n_features)).reshape(-1, n_steps, n_features)
+        val_X_ori = val_X
+        test_X_ori = test_X
+
+        # mask values in the train set, consistent with the masking of the validation and test sets below
+        train_X = create_missingness(train_X, missing_rate, pattern, **kwargs)
+        # mask values in the validation set as ground truth
+        val_X = create_missingness(val_X, missing_rate, pattern, **kwargs)
+        # mask values in the test set as ground truth
+        test_X = create_missingness(test_X, missing_rate, pattern, **kwargs)
+
+        processed_dataset["train_X"] = train_X
+        processed_dataset["train_X_ori"] = train_X_ori
+
+        processed_dataset["val_X"] = val_X
+        processed_dataset["val_X_ori"] = val_X_ori
+
+        processed_dataset["test_X"] = test_X
+        processed_dataset["test_X_ori"] = test_X_ori
+    else:
+        logger.warning("rate is 0, no missing values are artificially added.")
+
+    print_final_dataset_info(train_X, val_X, test_X)
+    return processed_dataset

From 0498e34dcce51af511710567c5f38bbeaa7e75fd Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Sat, 24 Aug 2024 13:02:31 +0800
Subject: [PATCH 5/6] test: add testing case for random walk preprocessing;

---
 benchpots/datasets/random_walk.py |  5 +++++
 tests/test_benchpots.py           | 10 ++++++++++
 2 files changed, 15 insertions(+)

diff --git a/benchpots/datasets/random_walk.py b/benchpots/datasets/random_walk.py
index 79028fa..4cc45eb 100644
--- a/benchpots/datasets/random_walk.py
+++ b/benchpots/datasets/random_walk.py
@@ -249,6 +249,11 @@ def preprocess_random_walk(
     missing_rate : float, default=0.1
         The rate of randomly missing values to generate, should be in [0,1).
 
+    pattern : str, default='point'
+        The missing pattern to apply to the dataset.
+        Must be one of ['point', 'subseq', 'block'].
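+
+    **kwargs :
+        Additional keyword arguments passed through to the missingness creation
+        function create_missingness() in benchpots.utils.missingness,
+        e.g. options specific to the chosen pattern.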
+ + Returns ------- data: dict, diff --git a/tests/test_benchpots.py b/tests/test_benchpots.py index e4b8119..4d27080 100644 --- a/tests/test_benchpots.py +++ b/tests/test_benchpots.py @@ -9,6 +9,7 @@ import unittest from benchpots.datasets import ( + preprocess_random_walk, preprocess_physionet2012, preprocess_physionet2019, preprocess_ett, @@ -20,6 +21,15 @@ class TestBenchPOTS(unittest.TestCase): + def test_random_walk(self): + preprocess_random_walk( + n_steps=8, + n_features=5, + n_classes=2, + n_samples_each_class=100, + missing_rate=0.1, + ) + def test_physionet2012(self): preprocess_physionet2012(subset="set-a", rate=0.1) From 975e0eb74978470c0f7ef2ac1b9f29f317e8036f Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sun, 8 Sep 2024 17:15:51 +0800 Subject: [PATCH 6/6] feat: release v0.3; --- benchpots/datasets/__init__.py | 2 ++ benchpots/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/benchpots/datasets/__init__.py b/benchpots/datasets/__init__.py index da25cc0..d2e0800 100644 --- a/benchpots/datasets/__init__.py +++ b/benchpots/datasets/__init__.py @@ -14,6 +14,7 @@ from .physionet_2019 import preprocess_physionet2019 from .ucr_uea_datasets import preprocess_ucr_uea_datasets from .solar_alabama import preprocess_solar_alabama +from .random_walk import preprocess_random_walk __all__ = [ "preprocess_physionet2012", @@ -25,4 +26,5 @@ "preprocess_pems_traffic", "preprocess_ucr_uea_datasets", "preprocess_solar_alabama", + "preprocess_random_walk", ] diff --git a/benchpots/version.py b/benchpots/version.py index 41151fe..996d4f9 100644 --- a/benchpots/version.py +++ b/benchpots/version.py @@ -22,4 +22,4 @@ # # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' -__version__ = "0.2.2" +__version__ = "0.3"
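
A minimal usage sketch of the API added in this series, mirroring the new test case (only preprocess_random_walk and its documented parameters are assumed; the expected shapes follow from the two 80/20 splits over 200 samples):

    from benchpots.datasets import preprocess_random_walk

    # Build a 2-class random-walk dataset with 10% point missingness:
    # 200 samples in total (100 per class), each of shape [8 steps, 5 features].
    data = preprocess_random_walk(
        n_steps=8,
        n_features=5,
        n_classes=2,
        n_samples_each_class=100,
        missing_rate=0.1,
        pattern="point",
    )

    # Splits come back as [n_samples, n_steps, n_features] arrays; the *_X_ori
    # entries keep the pre-masking values for imputation error calculation.
    print(data["train_X"].shape)     # (128, 8, 5)
    print(data["val_X"].shape)       # (32, 8, 5)
    print(data["test_X_ori"].shape)  # (40, 8, 5)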