diff --git a/aeon/datasets/__init__.py b/aeon/datasets/__init__.py
index 30d91f27c2..52cc29cbfd 100644
--- a/aeon/datasets/__init__.py
+++ b/aeon/datasets/__init__.py
@@ -12,6 +12,8 @@
     "load_forecasting",
     "load_regression",
     "download_all_regression",
+    "load_time_series_segmentation_benchmark",
+    "load_human_activity_segmentation_datasets",
     # Write functions
     "write_to_tsfile",
     "write_to_tsf_file",
@@ -104,3 +106,7 @@
     load_from_timeeval_csv_file,
     load_kdd_tsad_135,
 )
+from aeon.datasets._tss_data_loaders import (
+    load_human_activity_segmentation_datasets,
+    load_time_series_segmentation_benchmark,
+)
diff --git a/aeon/datasets/_tss_data_loaders.py b/aeon/datasets/_tss_data_loaders.py
new file mode 100644
index 0000000000..77e2fb42bd
--- /dev/null
+++ b/aeon/datasets/_tss_data_loaders.py
@@ -0,0 +1,254 @@
+"""Dataset loading functions for segmentation."""
+
+__all__ = [
+    "load_time_series_segmentation_benchmark",
+    "load_human_activity_segmentation_datasets",
+]
+
+from os import PathLike
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import pandas as pd
+
+import aeon
+
+_DATA_FOLDER = Path(aeon.__file__).parent / "datasets" / "local_data"
+_TSSB_URL = (
+    "https://raw.githubusercontent.com/ermshaua/time-series-segmentation"
+    "-benchmark/main/tssb/datasets/tssb.csv.zip"
+)
+_HAS_URL = (
+    "https://raw.githubusercontent.com/patrickzib"
+    "/human_activity_segmentation_challenge/main/datasets/has2023_master.csv"
+    ".zip"
+)
+
+
+def load_time_series_segmentation_benchmark(
+    extract_path: Optional[PathLike] = None,
+    return_metadata: bool = False,
+) -> Union[
+    Tuple[List[np.ndarray], List[np.ndarray]],
+    Tuple[List[np.ndarray], List[np.ndarray], List[Tuple[str, int]]],
+]:
+    """Load the Time Series Segmentation Benchmark (TSSB).
+
+    This function loads the Time Series Segmentation Benchmark (TSSB) into memory,
+    downloading from GitHub (https://github.com/ermshaua/time-series-segmentation
+    -benchmark) [1] if the data is not available at the specified ``extract_path``.
+    The benchmark contains 75 annotated TS with 1-9 segments. Each TS is constructed
+    from one of the UEA & UCR time series classification datasets. TS are grouped by
+    label and concatenated to create segments with distinctive temporal patterns and
+    statistical properties. Offsets at which segments change are annotated as CPs.
+    Additionally, resampling is applied to control the data resolution. Approximate,
+    hand-selected window sizes are provided that capture temporal patterns.
+
+    If you do not specify ``extract_path``, it will set the path to
+    ``aeon/datasets/local_data``. If the problem is not present in ``extract_path``, it
+    will attempt to download the data.
+
+    Parameters
+    ----------
+    extract_path : str or PathLike, default=None
+        The folder to look for the data, which is stored there as ``tssb.csv``. If no
+        path is provided, the function looks in `aeon/datasets/local_data/`. If a path
+        is given, it can be absolute, e.g., C:/Temp/ or relative, e.g. Temp/ or
+        ./Temp/. The folder is created if the data has to be downloaded.
+    return_metadata : boolean, default = False
+        If True, returns a tuple (X, y, metadata).
+
+    Returns
+    -------
+    X: list of np.ndarray
+        The list of univariate (1d) time series with variable shape (n_timepoints,).
+    y: list of np.ndarray
+        The list of change points for every time series.
+    metadata: optional
+        The list of tuples containing data set names and window sizes.
+
+    Raises
+    ------
+    URLError or HTTPError
+        If the GitHub repository is not accessible.
+
+    Examples
+    --------
+    >>> from aeon.datasets import load_time_series_segmentation_benchmark
+    >>> X, y = load_time_series_segmentation_benchmark()  # doctest: +SKIP
+
+    References
+    ----------
+    .. [1] Arik Ermshaus, Patrick Schäfer, Ulf Leser: ClaSP: parameter-free
+    time series segmentation. Data Mining and Knowledge Discovery, 2023,
+    DOI:10.1007/s10618-023-00923-x.
+    """
+    # set default/custom data folder
+    data_folder = _DATA_FOLDER if extract_path is None else Path(extract_path)
+
+    # look for (and cache) the benchmark inside the selected data folder
+    benchmark_path = data_folder / "tssb.csv"
+
+    # converters to correctly load benchmark
+    # NOTE(security): ``eval`` executes the content of every parsed CSV cell,
+    # including cells downloaded from GitHub. The locally cached file stores plain
+    # list literals, so ``ast.literal_eval`` is a candidate replacement.
+    np_cols = ["change_points", "time_series"]
+    converters = {col: lambda val: np.array(eval(val)) for col in np_cols}
+
+    # load benchmark from git repo (and save locally) / or load locally
+    if not benchmark_path.exists():
+        data_folder.mkdir(parents=True, exist_ok=True)
+        df = pd.read_csv(_TSSB_URL, converters=converters, compression="zip")
+
+        # make sure numerical data is correctly saved
+        for np_col in np_cols:
+            df[np_col] = df[np_col].apply(np.ndarray.tolist)
+        df.to_csv(benchmark_path, index=False)
+
+    df = pd.read_csv(benchmark_path, converters=converters)
+
+    # construct return data
+    X = df.time_series.tolist()
+    y = df.change_points.tolist()
+
+    # construct meta data
+    if return_metadata:
+        metadata = [tuple(row) for _, row in df[["dataset", "window_size"]].iterrows()]
+        return X, y, metadata
+
+    return X, y
+
+
+def load_human_activity_segmentation_datasets(
+    extract_path: Optional[PathLike] = None,
+    return_metadata: bool = False,
+) -> Union[
+    Tuple[List[np.ndarray], List[np.ndarray]],
+    Tuple[
+        List[np.ndarray], List[np.ndarray], List[Tuple[str, str, str, int, np.ndarray]]
+    ],
+]:
+    """Load the Human Activity Segmentation Challenge data sets.
+
+    This function loads the Human Activity Segmentation challenge data sets into
+    memory, downloading from GitHub
+    (https://github.com/patrickzib/human_activity_segmentation_challenge) [1] if the
+    data is not available at the specified ``extract_path``. The data sets were used
+    in the discovery challenge held at ECML/PKDD and AALTD 2023. They contain 250
+    annotated TS with 1-15 segments, capturing a total of 15 students performing 6
+    distinct motion sequences. TS are sampled at 50 Hz, multivariate and consist of
+    measurements from 9 out of 12 smartphone sensors: triaxial accelerometer, gyroscope,
+    magnetometer as well as latitude, longitude, and speed. Annotations include
+    information about the challenge split (public / private), groups and subjects,
+    as well as activity transition offsets (the change points) and activity labels.
+
+    If you do not specify ``extract_path``, it will set the path to
+    ``aeon/datasets/local_data``. If the problem is not present in ``extract_path``, it
+    will attempt to download the data.
+
+    Parameters
+    ----------
+    extract_path : str or PathLike, default=None
+        The folder to look for the data, which is stored there as ``has.csv``. If no
+        path is provided, the function looks in `aeon/datasets/local_data/`. If a path
+        is given, it can be absolute, e.g., C:/Temp/ or relative, e.g. Temp/ or
+        ./Temp/. The folder is created if the data has to be downloaded.
+    return_metadata : boolean, default = False
+        If True, returns a tuple (X, y, metadata).
+
+    Returns
+    -------
+    X: list of np.ndarray
+        The list of multivariate (2d) time series with variable shape (n_timepoints, 9).
+    y: list of np.ndarray
+        The list of change points for every time series.
+    metadata: optional
+        The list of tuples containing data set names, splits, groups, subjects, and
+        activities information.
+
+    Raises
+    ------
+    URLError or HTTPError
+        If the GitHub repository is not accessible.
+    ValueError
+        If a time series belongs to a group other than "indoor" or "outdoor".
+
+    Examples
+    --------
+    >>> from aeon.datasets import load_human_activity_segmentation_datasets
+    >>> X, y = load_human_activity_segmentation_datasets()  # doctest: +SKIP
+
+    References
+    ----------
+    .. [1] Arik Ermshaus, Patrick Schäfer, Anthony Bagnall, Thomas Guyet,
+    Georgiana Ifrim, Vincent Lemaire, Ulf Leser, Colin Leverger,
+    Simon Malinowski: Human Activity Segmentation Challenge @ ECML/PKDD’23.
+    AALTD@ECML, 2023, DOI:10.1007/978-3-031-49896-1_1.
+    """
+    # set default/custom data folder
+    data_folder = _DATA_FOLDER if extract_path is None else Path(extract_path)
+
+    # look for (and cache) the data sets inside the selected data folder
+    benchmark_path = data_folder / "has.csv"
+
+    # sensor channels, and the 9 channels that make up a TS for each group
+    acc_cols = ["x-acc", "y-acc", "z-acc"]
+    gyro_cols = ["x-gyro", "y-gyro", "z-gyro"]
+    mag_cols = ["x-mag", "y-mag", "z-mag"]
+    gps_cols = ["lat", "lon", "speed"]
+    group_cols = {
+        "indoor": acc_cols + gyro_cols + mag_cols,
+        "outdoor": acc_cols + mag_cols + gps_cols,
+    }
+
+    # converters to correctly load benchmark
+    # NOTE(security): ``eval`` executes the content of every parsed CSV cell,
+    # including cells downloaded from GitHub. The locally cached file stores plain
+    # list literals, so ``ast.literal_eval`` is a candidate replacement.
+    np_cols = ["change_points", "activities"]
+    np_cols += acc_cols + gyro_cols + mag_cols + gps_cols
+    converters = {
+        col: lambda val: np.array([]) if len(val) == 0 else np.array(eval(val))
+        for col in np_cols
+    }
+
+    # load activity data from git repo (and save locally) / or load locally
+    if not benchmark_path.exists():
+        data_folder.mkdir(parents=True, exist_ok=True)
+        df = pd.read_csv(_HAS_URL, converters=converters, compression="zip")
+
+        # make sure numerical data is correctly saved
+        for np_col in np_cols:
+            df[np_col] = df[np_col].apply(np.ndarray.tolist)
+        df.to_csv(benchmark_path, index=False)
+
+    df = pd.read_csv(benchmark_path, converters=converters)
+
+    # construct return data
+    X, y, metadata = [], [], []
+
+    for _, row in df.iterrows():
+        if row.group not in group_cols:
+            raise ValueError(
+                f"Unknown group '{row.group}' for time series "
+                f"id{row.ts_challenge_id}; expected 'indoor' or 'outdoor'."
+            )
+
+        dataset = (
+            f"{row.group}_subject{row.subject}_routine{row.routine} "
+            f"(id{row.ts_challenge_id})"
+        )
+
+        # stack the sensor channels of this group as the columns of one 2d array
+        ts = np.column_stack([row[col] for col in group_cols[row.group]])
+
+        X.append(ts)
+        y.append(row.change_points)
+        metadata.append((dataset, row.split, row.group, row.subject, row.activities))
+
+    if return_metadata:
+        return X, y, metadata
+
+    return X, y
diff --git a/aeon/datasets/tests/test_tss_data_loader.py b/aeon/datasets/tests/test_tss_data_loader.py
new file mode 100644
index 0000000000..219df84ad7
--- /dev/null
+++ b/aeon/datasets/tests/test_tss_data_loader.py
@@ -0,0 +1,76 @@
+"""Test segmentation dataset loaders."""
+
+import tempfile
+from pathlib import Path
+
+import numpy as np
+
+from aeon.datasets import (
+    load_human_activity_segmentation_datasets,
+    load_time_series_segmentation_benchmark,
+)
+from aeon.segmentation import ClaSPSegmenter
+
+
+def test_load_tssb(mocker):
+    """Test load time series segmentation benchmark."""
+    with tempfile.TemporaryDirectory() as tmp:
+        tmp = Path(tmp)
+        mocker.patch("aeon.datasets._tss_data_loaders._DATA_FOLDER", tmp)
+
+        # test download
+        X, y = load_time_series_segmentation_benchmark()
+
+        assert isinstance(X, list)
+        assert all(isinstance(ts, np.ndarray) for ts in X)
+        assert all(ts.ndim == 1 for ts in X)
+        assert len(X) == 75
+
+        assert isinstance(y, list)
+        assert all(isinstance(cps, np.ndarray) for cps in y)
+        assert all(cps.ndim == 1 for cps in y)
+        assert len(y) == 75
+
+        # test load + meta data
+        X, y, metadata = load_time_series_segmentation_benchmark(return_metadata=True)
+
+        assert isinstance(metadata, list)
+        assert len(metadata) == 75
+
+        # test that segmentation works
+        ts, cps, _, window_size = X[0], y[0], *metadata[0]
+        clasp = ClaSPSegmenter(period_length=window_size, n_cps=cps.shape[0])
+        found_cps = clasp.fit_predict(ts)
+        assert cps.shape[0] == found_cps.shape[0]
+
+
+def test_load_has_datasets(mocker):
+    """Test load human activity segmentation data sets."""
+    with tempfile.TemporaryDirectory() as tmp:
+        tmp = Path(tmp)
+        mocker.patch("aeon.datasets._tss_data_loaders._DATA_FOLDER", tmp)
+
+        # test download
+        X, y = load_human_activity_segmentation_datasets()
+
+        assert isinstance(X, list)
+        assert all(isinstance(ts, np.ndarray) for ts in X)
+        assert all(ts.ndim == 2 for ts in X)
+        assert len(X) == 250
+
+        assert isinstance(y, list)
+        assert all(isinstance(cps, np.ndarray) for cps in y)
+        assert all(cps.ndim == 1 for cps in y)
+        assert len(y) == 250
+
+        # test load + meta data
+        X, y, metadata = load_human_activity_segmentation_datasets(return_metadata=True)
+
+        assert isinstance(metadata, list)
+        assert len(metadata) == 250
+
+        # test that segmentation works
+        ts, cps, sample_rate = X[0], y[0], 50
+        clasp = ClaSPSegmenter(period_length=sample_rate, n_cps=cps.shape[0])
+        found_cps = clasp.fit_predict(ts[:, 0])
+        assert cps.shape[0] == found_cps.shape[0]