From 648b9eb5e939652e165223ba1b8f880b2d13da4e Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 22 Aug 2024 11:37:19 +0800 Subject: [PATCH 1/6] refactor: remove useless comments; --- benchpots/datasets/beijing_multisite_air_quality.py | 1 - benchpots/datasets/electricity_load_diagrams.py | 1 - benchpots/datasets/electricity_transformer_temperature.py | 2 +- benchpots/datasets/italy_air_quality.py | 1 - benchpots/datasets/pems_traffic.py | 1 - benchpots/datasets/physionet_2012.py | 1 - benchpots/datasets/physionet_2019.py | 1 - benchpots/datasets/solar_alabama.py | 1 - benchpots/datasets/ucr_uea_datasets.py | 1 - benchpots/utils/missingness.py | 2 +- 10 files changed, 2 insertions(+), 10 deletions(-) diff --git a/benchpots/datasets/beijing_multisite_air_quality.py b/benchpots/datasets/beijing_multisite_air_quality.py index 3a58983..34832c5 100644 --- a/benchpots/datasets/beijing_multisite_air_quality.py +++ b/benchpots/datasets/beijing_multisite_air_quality.py @@ -140,7 +140,6 @@ def preprocess_beijing_air_quality( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/datasets/electricity_load_diagrams.py b/benchpots/datasets/electricity_load_diagrams.py index 65b8da8..3c5bf92 100644 --- a/benchpots/datasets/electricity_load_diagrams.py +++ b/benchpots/datasets/electricity_load_diagrams.py @@ -108,7 +108,6 @@ def preprocess_electricity_load_diagrams( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/datasets/electricity_transformer_temperature.py b/benchpots/datasets/electricity_transformer_temperature.py index 2a3c133..44c7741 100644 --- a/benchpots/datasets/electricity_transformer_temperature.py +++ b/benchpots/datasets/electricity_transformer_temperature.py @@ -115,7 +115,7 @@ def preprocess_ett( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs + processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/datasets/italy_air_quality.py b/benchpots/datasets/italy_air_quality.py index 9e333aa..0881345 100644 --- a/benchpots/datasets/italy_air_quality.py +++ b/benchpots/datasets/italy_air_quality.py @@ -99,7 +99,6 @@ def preprocess_italy_air_quality( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/datasets/pems_traffic.py b/benchpots/datasets/pems_traffic.py index 736f835..d63373d 100644 --- a/benchpots/datasets/pems_traffic.py +++ b/benchpots/datasets/pems_traffic.py @@ -110,7 +110,6 @@ def preprocess_pems_traffic( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no 
missing values are artificially added.") diff --git a/benchpots/datasets/physionet_2012.py b/benchpots/datasets/physionet_2012.py index dd07034..e4b4402 100644 --- a/benchpots/datasets/physionet_2012.py +++ b/benchpots/datasets/physionet_2012.py @@ -227,7 +227,6 @@ def apply_func(df_temp): # pad and truncate to set the max length of samples as ) processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori test_X_indicating_mask = np.isnan(test_X_ori) ^ np.isnan(test_X) diff --git a/benchpots/datasets/physionet_2019.py b/benchpots/datasets/physionet_2019.py index d3fa274..c2085d1 100644 --- a/benchpots/datasets/physionet_2019.py +++ b/benchpots/datasets/physionet_2019.py @@ -192,7 +192,6 @@ def apply_func(df_temp): # pad and truncate to set the max length of samples as processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori test_X_indicating_mask = np.isnan(test_X_ori) ^ np.isnan(test_X) diff --git a/benchpots/datasets/solar_alabama.py b/benchpots/datasets/solar_alabama.py index 2614f8c..1ae87d4 100644 --- a/benchpots/datasets/solar_alabama.py +++ b/benchpots/datasets/solar_alabama.py @@ -108,7 +108,6 @@ def preprocess_solar_alabama( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/datasets/ucr_uea_datasets.py b/benchpots/datasets/ucr_uea_datasets.py index df4a333..28be2a3 100644 --- a/benchpots/datasets/ucr_uea_datasets.py +++ b/benchpots/datasets/ucr_uea_datasets.py @@ -123,7 +123,6 @@ def preprocess_ucr_uea_datasets( processed_dataset["val_X_ori"] = val_X_ori processed_dataset["test_X"] = test_X - # test_X_ori is for error calc, not for model input, hence mustn't have NaNs processed_dataset["test_X_ori"] = test_X_ori else: logger.warning("rate is 0, no missing values are artificially added.") diff --git a/benchpots/utils/missingness.py b/benchpots/utils/missingness.py index a55fe6e..223ff55 100644 --- a/benchpots/utils/missingness.py +++ b/benchpots/utils/missingness.py @@ -32,7 +32,7 @@ def create_missingness(X, rate, pattern, **kwargs): assert 0 < rate < 1, "rate must be in [0, 1)" assert ( - pattern.lower() in supported_missing_pattern + pattern.lower() in supported_missing_pattern ), f"pattern must be one of {supported_missing_pattern}, but got {pattern}" if pattern == "point": From e15f2d40a4026bd481afda56c7cc2424f0521cbf Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Thu, 22 Aug 2024 12:53:05 +0800 Subject: [PATCH 2/6] feat: add pre-commit config file; --- .pre-commit-config.yaml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..8479c6a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,26 @@ +repos: + # hooks for checking files + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + + # hooks for linting code + - repo: https://github.com/psf/black + rev: 24.8.0 + hooks: + - id: black + args: [ + --line-length=120, # refer to 
pyproject.toml + ] + + - repo: https://github.com/PyCQA/flake8 + rev: 7.1.1 + hooks: + - id: flake8 + args: [ + --max-line-length=120, # refer to pyproject.toml + --extend-ignore=E203,E231 + ] From eb47094897e366794f158e1c6f73067795e6f20d Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Fri, 23 Aug 2024 15:01:26 +0800 Subject: [PATCH 3/6] feat: update pyproject.toml; --- pyproject.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 337b769..b932fe3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,9 @@ version = { attr = "benchpots.version.__version__" } readme = { file = "README.md", content-type = "text/markdown" } dependencies = { file = "requirements.txt" } +[tool.black] +line-length = 120 + [tool.flake8] # People may argue that coding style is personal. This may be true if the project is personal and one works like a # hermit, but to PyPOTS and its community, the answer is NO. @@ -69,6 +72,8 @@ dependencies = { file = "requirements.txt" } # who prefer the default setting can keep using 88 or 79 while coding. Please ensure your code lines not exceeding 120. max-line-length = 120 # why ignore E203? Refer to https://github.com/PyCQA/pycodestyle/issues/373 +# why ignore E231? Bad trailing comma, conflict with Black extend-ignore = """ - E203 + E203, + E231, """ \ No newline at end of file From 8fc3470f0cf0158ca5ce9dc5b6e3cfc5792776d2 Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sat, 24 Aug 2024 12:18:40 +0800 Subject: [PATCH 4/6] feat: add preprocessing func for random walk dataset; --- benchpots/datasets/random_walk.py | 331 ++++++++++++++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100644 benchpots/datasets/random_walk.py diff --git a/benchpots/datasets/random_walk.py b/benchpots/datasets/random_walk.py new file mode 100644 index 0000000..79028fa --- /dev/null +++ b/benchpots/datasets/random_walk.py @@ -0,0 +1,331 @@ +""" +Preprocessing func for the generated random walk dataset. + +""" + +# Created by Wenjie Du +# License: BSD-3-Clause + + +import math +from typing import Optional, Tuple + +import numpy as np +from pygrinder import mcar +from sklearn.model_selection import train_test_split +from sklearn.preprocessing import StandardScaler +from sklearn.utils import check_random_state + +from ..utils.logging import logger, print_final_dataset_info +from ..utils.missingness import create_missingness + + +def gene_complete_random_walk( + n_samples: int = 1000, + n_steps: int = 24, + n_features: int = 10, + mu: float = 0.0, + std: float = 1.0, + random_state: Optional[int] = None, +) -> np.ndarray: + """Generate complete random walk time-series data, i.e. having no missing values. + + Parameters + ---------- + n_samples : int, default=1000 + The number of training time-series samples to generate. + + n_steps: int, default=24 + The number of time steps (length) of generated time-series samples. + + n_features : int, default=10 + The number of features (dimensions) of generated time-series samples. + + mu : float, default=0.0 + Mean of the normal distribution, which random walk steps are sampled from. + + std : float, default=1.0 + Standard deviation of the normal distribution, which random walk steps are sampled from. + + random_state : int, default=None + Random seed for data generation. + + Returns + ------- + ts_samples: array, shape of [n_samples, n_steps, n_features] + Generated random walk time series. 
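+
+    Examples
+    --------
+    A minimal sanity check of the returned shape:
+
+    >>> samples = gene_complete_random_walk(n_samples=2, n_steps=4, n_features=3, random_state=42)
+    >>> samples.shape
+    (2, 4, 3)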
+    """
+    seed = check_random_state(random_state)
+    ts_samples = np.zeros([n_samples, n_steps, n_features])
+    random_values = seed.randn(n_samples, n_steps, n_features) * std + mu
+    ts_samples[:, 0, :] = random_values[:, 0, :]
+    for t in range(1, n_steps):
+        ts_samples[:, t, :] = ts_samples[:, t - 1, :] + random_values[:, t, :]
+    ts_samples = np.asarray(ts_samples)
+    return ts_samples
+
+
+def gene_complete_random_walk_for_classification(
+    n_classes: int = 2,
+    n_samples_each_class: int = 500,
+    n_steps: int = 24,
+    n_features: int = 10,
+    shuffle: bool = True,
+    random_state: Optional[int] = None,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Generate complete random walk time-series data for the classification task.
+
+    Parameters
+    ----------
+    n_classes : int, must >= 2, default=2
+        Number of classes (types) of the generated data.
+
+    n_samples_each_class : int, default=500
+        Number of samples for each class to generate.
+
+    n_steps : int, default=24
+        Number of time steps in each sample.
+
+    n_features : int, default=10
+        Number of features.
+
+    shuffle : bool, default=True
+        Whether to shuffle generated samples.
+        If not, you can separate samples of each class according to `n_samples_each_class`.
+        For example,
+        X_class0=X[:n_samples_each_class],
+        X_class1=X[n_samples_each_class:n_samples_each_class*2]
+
+    random_state : int, default=None
+        Random seed for data generation.
+
+    Returns
+    -------
+    X : array, shape of [n_samples, n_steps, n_features]
+        Generated time-series data.
+
+    y : array, shape of [n_samples]
+        Labels indicating classes of time-series samples.
+
+    """
+    assert n_classes > 1, f"n_classes should be >1, but got {n_classes}"
+
+    ts_collector = []
+    label_collector = []
+
+    mu = 0
+    std = 1
+
+    for c_ in range(n_classes):
+        ts_samples = gene_complete_random_walk(n_samples_each_class, n_steps, n_features, mu, std, random_state)
+        label_samples = np.asarray([1 for _ in range(n_samples_each_class)]) * c_
+        ts_collector.extend(ts_samples)
+        label_collector.extend(label_samples)
+        mu += 1
+
+    X = np.asarray(ts_collector)
+    y = np.asarray(label_collector)
+
+    # if shuffling, then shuffle the order of samples
+    if shuffle:
+        indices = np.arange(len(X))
+        np.random.shuffle(indices)
+        X = X[indices]
+        y = y[indices]
+
+    return X, y
+
+
+def gene_complete_random_walk_for_anomaly_detection(
+    n_samples: int = 1000,
+    n_steps: int = 24,
+    n_features: int = 10,
+    mu: float = 0.0,
+    std: float = 1.0,
+    anomaly_proportion: float = 0.1,
+    anomaly_fraction: float = 0.02,
+    anomaly_scale_factor: float = 2.0,
+    random_state: Optional[int] = None,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Generate random walk time-series data for the anomaly-detection task.
+
+    Parameters
+    ----------
+    n_samples : int, default=1000
+        The number of training time-series samples to generate.
+
+    n_steps : int, default=24
+        The number of time steps (length) of generated time-series samples.
+
+    n_features : int, default=10
+        The number of features (dimensions) of generated time-series samples.
+
+    mu : float, default=0.0
+        Mean of the normal distribution, which random walk steps are sampled from.
+
+    std : float, default=1.0
+        Standard deviation of the normal distribution, which random walk steps are sampled from.
+
+    anomaly_proportion : float, default=0.1
+        Proportion of anomaly samples in all samples.
+
+    anomaly_fraction : float, default=0.02
+        Fraction of anomaly points in each anomaly sample.
+
+    anomaly_scale_factor : float, default=2.0
+        Scale factor for value scaling to create anomaly points in time-series samples.
+
+    random_state : int, default=None
+        Random seed for data generation.
+
+    Returns
+    -------
+    X : array, shape of [n_samples, n_steps, n_features]
+        Generated time-series data.
+
+    y : array, shape of [n_samples]
+        Labels indicating if time-series samples are anomalies.
+    """
+    assert 0 < anomaly_proportion < 1, f"anomaly_proportion should be >0 and <1, but got {anomaly_proportion}"
+    assert 0 < anomaly_fraction < 1, f"anomaly_fraction should be >0 and <1, but got {anomaly_fraction}"
+    seed = check_random_state(random_state)
+    X = seed.randn(n_samples, n_steps, n_features) * std + mu
+    n_anomaly = math.floor(n_samples * anomaly_proportion)
+    anomaly_indices = np.random.choice(n_samples, size=n_anomaly, replace=False)
+    for a_i in anomaly_indices:
+        anomaly_sample = X[a_i]
+        anomaly_sample = anomaly_sample.flatten()
+        min_val = anomaly_sample.min()
+        max_val = anomaly_sample.max()
+        # the value range of this sample, used to push anomaly points outside it
+        max_difference = max_val - min_val
+        n_points = n_steps * n_features
+        n_anomaly_points = int(n_points * anomaly_fraction)
+        point_indices = np.random.choice(a=n_points, size=n_anomaly_points, replace=False)
+        for p_i in point_indices:
+            anomaly_sample[p_i] = mu + np.random.uniform(
+                low=min_val - anomaly_scale_factor * max_difference,
+                high=max_val + anomaly_scale_factor * max_difference,
+            )
+        X[a_i] = anomaly_sample.reshape(n_steps, n_features)
+
+    # create labels
+    y = np.zeros(n_samples)
+    y[anomaly_indices] = 1
+
+    # shuffling
+    indices = np.arange(n_samples)
+    np.random.shuffle(indices)
+    X = X[indices]
+    y = y[indices]
+
+    return X, y
+
+
+def preprocess_random_walk(
+    n_steps=24,
+    n_features=10,
+    n_classes=2,
+    n_samples_each_class=1000,
+    missing_rate=0.1,
+    pattern: str = "point",
+    **kwargs,
+) -> dict:
+    """Generate a random-walk dataset.
+
+    Parameters
+    ----------
+    n_steps : int, default=24
+        Number of time steps in each sample.
+
+    n_features : int, default=10
+        Number of features.
+
+    n_classes : int, default=2
+        Number of classes (types) of the generated data.
+
+    n_samples_each_class : int, default=1000
+        Number of samples for each class to generate.
+
+    missing_rate : float, default=0.1
+        The rate of randomly missing values to generate, should be in [0,1).
+
+    Returns
+    -------
+    data: dict,
+        A dictionary containing the generated data.
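+        Keys include 'n_classes', 'n_steps', 'n_features', the fitted 'scaler',
+        and the splits 'train_X'/'train_y', 'val_X'/'val_y', 'test_X'/'test_y';
+        when missing_rate > 0, the ground-truth arrays 'train_X_ori',
+        'val_X_ori', and 'test_X_ori' are also included.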
+    """
+
+    assert 0 <= missing_rate < 1, "missing_rate must be in [0,1)"
+
+    # generate samples
+    X, y = gene_complete_random_walk_for_classification(
+        n_classes=n_classes,
+        n_samples_each_class=n_samples_each_class,
+        n_steps=n_steps,
+        n_features=n_features,
+    )
+    # split into train/val/test sets
+    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
+    train_X, val_X, train_y, val_y = train_test_split(train_X, train_y, test_size=0.2)
+
+    if missing_rate > 0:
+        # create random missing values
+        train_X_ori = train_X
+        train_X = mcar(train_X, missing_rate)
+        # the val and test sets are left to be masked after normalization
+
+    train_X = train_X.reshape(-1, n_features)
+    val_X = val_X.reshape(-1, n_features)
+    test_X = test_X.reshape(-1, n_features)
+    # normalization
+    scaler = StandardScaler()
+    train_X = scaler.fit_transform(train_X)
+    val_X = scaler.transform(val_X)
+    test_X = scaler.transform(test_X)
+    # reshape into time series samples
+    train_X = train_X.reshape(-1, n_steps, n_features)
+    val_X = val_X.reshape(-1, n_steps, n_features)
+    test_X = test_X.reshape(-1, n_steps, n_features)
+    processed_dataset = {
+        # general info
+        "n_classes": n_classes,
+        "n_steps": n_steps,
+        "n_features": n_features,
+        "scaler": scaler,
+        # train set
+        "train_X": train_X,
+        "train_y": train_y,
+        # val set
+        "val_X": val_X,
+        "val_y": val_y,
+        # test set
+        "test_X": test_X,
+        "test_y": test_y,
+    }
+
+    if missing_rate > 0:
+        # hold out ground truth in the original data for evaluation
+        train_X_ori = scaler.transform(train_X_ori.reshape(-1, n_features)).reshape(-1, n_steps, n_features)
+        val_X_ori = val_X
+        test_X_ori = test_X
+
+        # mask values in the train set, consistent with the masking of the validation and test sets below
+        train_X = create_missingness(train_X, missing_rate, pattern, **kwargs)
+        # mask values in the validation set as ground truth
+        val_X = create_missingness(val_X, missing_rate, pattern, **kwargs)
+        # mask values in the test set as ground truth
+        test_X = create_missingness(test_X, missing_rate, pattern, **kwargs)
+
+        processed_dataset["train_X"] = train_X
+        processed_dataset["train_X_ori"] = train_X_ori
+
+        processed_dataset["val_X"] = val_X
+        processed_dataset["val_X_ori"] = val_X_ori
+
+        processed_dataset["test_X"] = test_X
+        processed_dataset["test_X_ori"] = test_X_ori
+    else:
+        logger.warning("rate is 0, no missing values are artificially added.")
+
+    print_final_dataset_info(train_X, val_X, test_X)
+    return processed_dataset

From 0498e34dcce51af511710567c5f38bbeaa7e75fd Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Sat, 24 Aug 2024 13:02:31 +0800
Subject: [PATCH 5/6] test: add testing case for random walk preprocessing;

---
 benchpots/datasets/random_walk.py |  5 +++++
 tests/test_benchpots.py           | 10 ++++++++++
 2 files changed, 15 insertions(+)

diff --git a/benchpots/datasets/random_walk.py b/benchpots/datasets/random_walk.py
index 79028fa..4cc45eb 100644
--- a/benchpots/datasets/random_walk.py
+++ b/benchpots/datasets/random_walk.py
@@ -249,6 +249,11 @@ def preprocess_random_walk(
     missing_rate : float, default=0.1
         The rate of randomly missing values to generate, should be in [0,1).
 
+    pattern : str, default='point'
+        The missing pattern to apply to the dataset.
+        Must be one of ['point', 'subseq', 'block'].
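+
+    **kwargs :
+        Additional keyword arguments passed through to the missingness creation
+        function create_missingness() in benchpots.utils.missingness,
+        e.g. options specific to the chosen pattern.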
+ + Returns ------- data: dict, diff --git a/tests/test_benchpots.py b/tests/test_benchpots.py index e4b8119..4d27080 100644 --- a/tests/test_benchpots.py +++ b/tests/test_benchpots.py @@ -9,6 +9,7 @@ import unittest from benchpots.datasets import ( + preprocess_random_walk, preprocess_physionet2012, preprocess_physionet2019, preprocess_ett, @@ -20,6 +21,15 @@ class TestBenchPOTS(unittest.TestCase): + def test_random_walk(self): + preprocess_random_walk( + n_steps=8, + n_features=5, + n_classes=2, + n_samples_each_class=100, + missing_rate=0.1, + ) + def test_physionet2012(self): preprocess_physionet2012(subset="set-a", rate=0.1) From 975e0eb74978470c0f7ef2ac1b9f29f317e8036f Mon Sep 17 00:00:00 2001 From: Wenjie Du Date: Sun, 8 Sep 2024 17:15:51 +0800 Subject: [PATCH 6/6] feat: release v0.3; --- benchpots/datasets/__init__.py | 2 ++ benchpots/version.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/benchpots/datasets/__init__.py b/benchpots/datasets/__init__.py index da25cc0..d2e0800 100644 --- a/benchpots/datasets/__init__.py +++ b/benchpots/datasets/__init__.py @@ -14,6 +14,7 @@ from .physionet_2019 import preprocess_physionet2019 from .ucr_uea_datasets import preprocess_ucr_uea_datasets from .solar_alabama import preprocess_solar_alabama +from .random_walk import preprocess_random_walk __all__ = [ "preprocess_physionet2012", @@ -25,4 +26,5 @@ "preprocess_pems_traffic", "preprocess_ucr_uea_datasets", "preprocess_solar_alabama", + "preprocess_random_walk", ] diff --git a/benchpots/version.py b/benchpots/version.py index 41151fe..996d4f9 100644 --- a/benchpots/version.py +++ b/benchpots/version.py @@ -22,4 +22,4 @@ # # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' -__version__ = "0.2.2" +__version__ = "0.3"
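
A minimal usage sketch of the API added in this series, mirroring the new test case (only preprocess_random_walk and its documented parameters are assumed; the expected shapes follow from the two 80/20 splits over 200 samples):

    from benchpots.datasets import preprocess_random_walk

    # Build a 2-class random-walk dataset with 10% point missingness:
    # 200 samples in total (100 per class), each of shape [8 steps, 5 features].
    data = preprocess_random_walk(
        n_steps=8,
        n_features=5,
        n_classes=2,
        n_samples_each_class=100,
        missing_rate=0.1,
        pattern="point",
    )

    # Splits come back as [n_samples, n_steps, n_features] arrays; the *_X_ori
    # entries keep the pre-masking values for imputation error calculation.
    print(data["train_X"].shape)     # (128, 8, 5)
    print(data["val_X"].shape)       # (32, 8, 5)
    print(data["test_X_ori"].shape)  # (40, 8, 5)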