diff --git a/docs/index.md b/docs/index.md
index b85e2c8..26ddc68 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,7 +1,7 @@
 # RUL Datasets

 This library contains a collection of common benchmark datasets for **remaining useful lifetime (RUL)** estimation.
-They are provided as [LightningDataModules][pytorch_lightning.core.LightningDataModule] to be readily used in [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/latest/).
+They are provided as [LightningDataModules][lightning.pytorch.core.LightningDataModule] to be readily used in [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/latest/).

 Currently, four datasets are supported:
diff --git a/docs/use_cases/libraries.md b/docs/use_cases/libraries.md
index cf16133..af4b109 100644
--- a/docs/use_cases/libraries.md
+++ b/docs/use_cases/libraries.md
@@ -2,7 +2,7 @@

 This library was developed to be used in [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/) first and foremost.
 Lightning helps writing clean and reproducible deep learning code that can run on most common training hardware.
-Datasets are represented by [LightningDataModules][pytorch_lightning.core.LightningDataModule] which give access to data loaders for each data split.
+Datasets are represented by [LightningDataModules][lightning.pytorch.core.LightningDataModule] which give access to data loaders for each data split.
 The RUL Datasets library implements several data modules that are 100% compatible with Lightning:

 ```python
diff --git a/poetry.lock b/poetry.lock
index 8472026..0867ac5 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.

 [[package]]
 name = "absl-py"
@@ -2896,6 +2896,23 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
 [package.extras]
 testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"]

+[[package]]
+name = "pytest-mock"
+version = "3.12.0"
+description = "Thin-wrapper around the mock package for easier use with pytest"
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "pytest-mock-3.12.0.tar.gz", hash = "sha256:31a40f038c22cad32287bb43932054451ff5583ff094bca6f675df2f8bc1a6e9"},
+    {file = "pytest_mock-3.12.0-py3-none-any.whl", hash = "sha256:0972719a7263072da3a21c7f4773069bcc7486027d7e8e1f81d98a47e701bc4f"},
+]
+
+[package.dependencies]
+pytest = ">=5.0"
+
+[package.extras]
+dev = ["pre-commit", "pytest-asyncio", "tox"]
+
 [[package]]
 name = "python-dateutil"
 version = "2.8.2"
@@ -3882,4 +3899,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.8"
-content-hash = "ab4b3b00123b796736a61ec071c620d9c42946dfcc5d0103bdfc7b1770d35d82"
+content-hash = "155cb7ba42d7b76d46c3ce100b66d45bbcfd0cd2c0d5c73a482cd3e85c6adcaa"
diff --git a/pyproject.toml b/pyproject.toml
index 42c5d0c..a8c2bec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "rul_datasets"
-version = "0.0.0"
+version = "0.11.3"
 description = "A collection of datasets for RUL estimation as Lightning Data Modules."
 authors = ["Krokotsch, Tilman "]
 license = "MIT"
@@ -25,6 +25,7 @@ flake8 = "^5.0.4"
 mypy = "^1.0.0"
 hydra-core = "^1.1.1"
 pytest = "^7.1.3"
+pytest-mock = "^3.12.0"

 [tool.poetry.group.docs]
 optional = true
diff --git a/rul_datasets/adaption.py b/rul_datasets/adaption.py
index a5d2d00..2088d8c 100644
--- a/rul_datasets/adaption.py
+++ b/rul_datasets/adaption.py
@@ -16,7 +16,7 @@ class DomainAdaptionDataModule(pl.LightningDataModule):
     """
-    A higher-order [data module][pytorch_lightning.core.LightningDataModule] used for
+    A higher-order [data module][lightning.pytorch.core.LightningDataModule] used for
     unsupervised domain adaption of a labeled source to an unlabeled target domain.

     The training data of both domains is wrapped in a [AdaptionDataset]
     [rul_datasets.adaption.AdaptionDataset] which provides a random sample of the
@@ -217,7 +217,7 @@ def _get_paired_dataset(self) -> PairedRulDataset:

 class LatentAlignDataModule(DomainAdaptionDataModule):
     """
-    A higher-order [data module][pytorch_lightning.core.LightningDataModule] based on
+    A higher-order [data module][lightning.pytorch.core.LightningDataModule] based on
     [DomainAdaptionDataModule][rul_datasets.adaption.DomainAdaptionDataModule].

     It is specifically made to work with the latent space alignment approach by Zhang
diff --git a/rul_datasets/baseline.py b/rul_datasets/baseline.py
index c569976..253655d 100644
--- a/rul_datasets/baseline.py
+++ b/rul_datasets/baseline.py
@@ -13,7 +13,7 @@ class BaselineDataModule(pl.LightningDataModule):
     """
-    A higher-order [data module][pytorch_lightning.core.LightningDataModule] that
+    A higher-order [data module][lightning.pytorch.core.LightningDataModule] that
     takes a [RulDataModule][rul_datasets.core.RulDataModule]. It provides the
     training and validation splits of the sub-dataset selected in the underlying data
     module but provides the test splits of all available subsets of the dataset. This
diff --git a/rul_datasets/core.py b/rul_datasets/core.py
index b6c5ce0..08734f2 100644
--- a/rul_datasets/core.py
+++ b/rul_datasets/core.py
@@ -1,7 +1,7 @@
 """Basic data modules for experiments involving only a single subset of any RUL
 dataset. """
-from typing import Dict, List, Optional, Tuple, Any, Callable, cast, Union
+from typing import Dict, List, Optional, Tuple, Any, Callable, cast, Union, Literal

 import numpy as np
 import pytorch_lightning as pl
@@ -14,7 +14,7 @@ class RulDataModule(pl.LightningDataModule):
     """
-    A [data module][pytorch_lightning.core.LightningDataModule] to provide windowed
+    A [data module][lightning.pytorch.core.LightningDataModule] to provide windowed
     time series features with RUL targets. It exposes the splits of the underlying
     dataset for easy usage with PyTorch and PyTorch Lightning.
@@ -52,6 +52,12 @@ class RulDataModule(pl.LightningDataModule):
     ...     feature_extractor=lambda x: np.mean(x, axis=1),
     ...     window_size=10
     ... )
+
+    Only Degraded Validation and Test Samples
+
+    >>> import rul_datasets
+    >>> cmapss = rul_datasets.reader.CmapssReader(fd=1)
+    >>> dm = rul_datasets.RulDataModule(cmapss, 32, degraded_only=["val", "test"])
     """

     _data: Dict[str, Tuple[torch.Tensor, torch.Tensor]]
@@ -62,13 +68,14 @@ def __init__(
         self,
         reader: AbstractReader,
         batch_size: int,
         feature_extractor: Optional[Callable] = None,
         window_size: Optional[int] = None,
+        degraded_only: Optional[List[Literal["dev", "val", "test"]]] = None,
     ):
         """
         Create a new RUL data module from a reader.

         This data module exposes a training, validation and test data loader for the
         underlying dataset. First, `prepare_data` is called to download and
-        pre-process the dataset. Afterwards, `setup_data` is called to load all
+        pre-process the dataset. Afterward, `setup_data` is called to load all
         splits into memory.

         If a `feature_extractor` is supplied, the data module extracts new features
@@ -85,11 +92,13 @@
         `[num_windows, window_size, features]`.

         Args:
-            reader: The dataset reader for the desired dataset, e.g. CmapssLoader.
-            batch_size: The size of the batches build by the data loaders.
+            reader: The dataset reader for the desired dataset, e.g., CmapssLoader.
+            batch_size: The size of the batches built by the data loaders.
             feature_extractor: A feature extractor that extracts feature vectors from
                 windows.
             window_size: The new window size to apply after the feature extractor.
+            degraded_only: Whether to load only degraded samples for the `dev`, `val`
+                or `test` split.
         """
         super().__init__()

@@ -97,6 +106,7 @@
         self.reader = reader
         self.batch_size = batch_size
         self.feature_extractor = feature_extractor
         self.window_size = window_size
+        self.degraded_only = degraded_only

         if (self.feature_extractor is None) and (self.window_size is not None):
             raise ValueError(
@@ -111,6 +121,7 @@
                 str(self.feature_extractor) if self.feature_extractor else None
             ),
             "window_size": self.window_size,
+            "degraded_only": self.degraded_only,
         }
         self.save_hyperparameters(hparams)
@@ -217,24 +228,40 @@ def setup(self, stage: Optional[str] = None) -> None:
         }

     def load_split(
-        self, split: str, alias: Optional[str] = None
+        self,
+        split: str,
+        alias: Optional[str] = None,
+        degraded_only: Optional[bool] = None,
     ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
         """
         Load a split from the underlying reader and apply the feature extractor.

         By setting alias, it is possible to load a split aliased as another split,
-        e.g. load the test split and treat it as the dev split. The data of the split is
-        loaded but all pre-processing steps of alias are carried out.
+        e.g., load the test split and treat it as the dev split. The data of the split
+        is loaded, but all pre-processing steps of alias are carried out.
+
+        If `degraded_only` is set, only degraded samples are loaded. This is only
+        possible if the underlying reader has a `max_rul` set or `norm_rul` is set to
+        `True`. The `degraded_only` argument takes precedence over the data module's
+        own `degraded_only` setting.

         Args:
             split: The desired split to load.
             alias: The split as which the loaded data should be treated.
+            degraded_only: Whether to load only degraded samples.

         Returns:
             The feature and target tensors of the split's runs.
         """
         features, targets = self.reader.load_split(split, alias)
         features, targets = self._apply_feature_extractor_per_run(features, targets)
         tensor_features, tensor_targets = utils.to_tensor(features, targets)
+        if degraded_only is None:
+            degraded_only = (
+                self.degraded_only is not None
+                and (alias or split) in self.degraded_only
+            )
+        if degraded_only:
+            self._filter_out_healthy(tensor_features, tensor_targets)

         return tensor_features, tensor_targets
@@ -269,6 +296,21 @@ def _extract_and_window(
         return features, targets

+    def _filter_out_healthy(self, tensor_features, tensor_targets):
+        if self.reader.max_rul is not None:
+            thresh = self.reader.max_rul
+        elif hasattr(self.reader, "norm_rul") and self.reader.norm_rul:
+            thresh = 1.0
+        else:
+            raise ValueError(
+                "Cannot filter degraded samples if no max_rul is set and "
+                "norm_rul is False."
+            )
+        for i in range(len(tensor_targets)):
+            degraded = tensor_targets[i] < thresh
+            tensor_features[i] = tensor_features[i][degraded]
+            tensor_targets[i] = tensor_targets[i][degraded]
+
     def train_dataloader(self, *args: Any, **kwargs: Any) -> DataLoader:
         """
         Create a [data loader][torch.utils.data.DataLoader] for the training split.
@@ -383,6 +425,7 @@ def __init__(
         min_distance: int,
         deterministic: bool = False,
         mode: str = "linear",
+        degraded_only: bool = False,
     ):
         super().__init__()

@@ -392,6 +435,7 @@
         self.num_samples = num_samples
         self.deterministic = deterministic
         self.mode = mode
+        self.degraded_only = degraded_only

         for dm in self.dms:
             dm.check_compatibility(self.dms[0])
@@ -430,7 +474,9 @@ def _prepare_datasets(self):
         features = []
         labels = []
         for domain_idx, dm in enumerate(self.dms):
-            run_features, run_labels = dm.load_split(self.split)
+            run_features, run_labels = dm.load_split(
+                self.split, degraded_only=self.degraded_only
+            )
             for feat, lab in zip(run_features, run_labels):
                 if len(feat) > self.min_distance:
                     run_domain_idx.append(domain_idx)
diff --git a/rul_datasets/ssl.py b/rul_datasets/ssl.py
index 4b8ad55..bb7dfc2 100644
--- a/rul_datasets/ssl.py
+++ b/rul_datasets/ssl.py
@@ -11,7 +11,7 @@ class SemiSupervisedDataModule(pl.LightningDataModule):
     """
-    A higher-order [data module][pytorch_lightning.core.LightningDataModule] used for
+    A higher-order [data module][lightning.pytorch.core.LightningDataModule] used for
     semi-supervised learning with a labeled data module and an unlabeled one. It
     makes sure that both data modules come from the same sub-dataset.
diff --git a/tests/test_core.py b/tests/test_core.py
index 6cc8c2b..cca7d1e 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -37,6 +37,7 @@ def test_created_correctly(self, mock_loader):
             "batch_size": 16,
             "window_size": None,
             "feature_extractor": None,
+            "degraded_only": None,
         }

     @pytest.mark.parametrize("window_size", [2, None])
@@ -53,8 +54,18 @@ def test_created_correctly_with_feature_extractor(self, mock_loader, window_size
             "batch_size": 16,
             "window_size": window_size,
             "feature_extractor": str(fe),
+            "degraded_only": None,
         }

+    @pytest.mark.parametrize("degraded_only", [None, ["dev"], ["val", "test"]])
+    def test_created_correctly_with_degraded_only(self, mock_loader, degraded_only):
+        dataset = core.RulDataModule(
+            mock_loader, batch_size=16, degraded_only=degraded_only
+        )
+
+        assert "degraded_only" in dataset.hparams
+        assert dataset.hparams["degraded_only"] == degraded_only
+
     def test_prepare_data(self, mock_loader):
         dataset = core.RulDataModule(mock_loader, batch_size=16)
         dataset.prepare_data()
@@ -71,6 +82,36 @@ def test_setup(self, mock_loader, mock_runs):
         mock_runs = tuple(torch.tensor(np.concatenate(r)) for r in mock_runs)
         assert dataset._data == {"dev": mock_runs, "val": mock_runs, "test": mock_runs}

+    @pytest.mark.parametrize("split", ["dev", "val", "test"])
+    def test_load_split_degraded_only(self, mock_loader, mocker, split):
+        mock_loader.max_rul = 125
+        dataset = core.RulDataModule(mock_loader, batch_size=16, degraded_only=[split])
+        spy__filter_out_healthy = mocker.spy(dataset, "_filter_out_healthy")
+
+        dataset.load_split(split)
+        spy__filter_out_healthy.assert_called()
+        spy__filter_out_healthy.reset_mock()
+        for other_split in {"dev", "val", "test"} - {split}:
+            dataset.load_split(other_split)
+            spy__filter_out_healthy.assert_not_called()
+
+    @pytest.mark.parametrize("degraded_only", [True, False])
+    def test_load_split_degraded_only_override(
+        self, mock_loader, mocker, degraded_only
+    ):
+        mock_loader.max_rul = 125
+        dataset = core.RulDataModule(
+            mock_loader, batch_size=16, degraded_only=None if degraded_only else ["dev"]
+        )
+        spy__filter_out_healthy = mocker.spy(dataset, "_filter_out_healthy")
+
+        dataset.load_split("dev", degraded_only=degraded_only)
+
+        if degraded_only:
+            spy__filter_out_healthy.assert_called()
+        else:
+            spy__filter_out_healthy.assert_not_called()
+
     def test_empty_dataset(self, mock_loader):
         """Should not crash on empty dataset."""
         mock_loader.load_split.return_value = [], []
@@ -248,21 +289,23 @@ def test_feature_extractor_no_rewindowing(self, mock_loader):

 class DummyRul(reader.AbstractReader):
     fd: int = 1
     window_size: int = 30
-    max_rul: int = 125
     percent_broken = None
     percent_fail_runs = None
     truncate_val = False
     truncate_degraded_only = False

-    def __init__(self, length):
+    def __init__(self, length, norm_rul=False):
+        self.norm_rul = norm_rul
+        self.max_rul = None if norm_rul else 125
+        norm = 125 if norm_rul else 1
         self.data = {
             "dev": (
                 [np.zeros((length, self.window_size, 5))],
-                [np.clip(np.arange(length, 0, step=-1), a_min=None, a_max=125)],
+                [np.clip(np.arange(length, 0, step=-1), a_min=None, a_max=125) / norm],
             ),
             "val": (
                 [np.zeros((100, self.window_size, 5))],
-                [np.clip(np.arange(100, 0, step=-1), a_min=None, a_max=125)],
+                [np.clip(np.arange(100, 0, step=-1), a_min=None, a_max=125) / norm],
             ),
         }
@@ -357,6 +400,11 @@ def cmapss_normal(length):
     return RulDataModule(DummyRul(length), 32)


+@pytest.fixture
+def cmapss_normed(length):
+    return RulDataModule(DummyRul(length, norm_rul=True), 32)
+
+
 @pytest.fixture
 def cmapss_short():
     return RulDataModule(DummyRulShortRuns(), 32)
@@ -537,3 +585,12 @@ def test_compatability_check(self):
             core.PairedRulDataset(dms, "dev", 1000, 1)

         assert 2 == mock_check_compat.call_count
+
+    @pytest.mark.parametrize("degraded_only", [True, False])
+    def test_degraded_only(self, degraded_only, cmapss_normal, mocker):
+        spy_load_split = mocker.spy(cmapss_normal, "load_split")
+        core.PairedRulDataset(
+            [cmapss_normal], "dev", 1000, 1, degraded_only=degraded_only
+        )
+
+        spy_load_split.assert_called_with("dev", degraded_only=degraded_only)
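A minimal usage sketch of the `degraded_only` option introduced by this patch, based on the docstrings and doctest added above; the chosen reader, batch size, and split names are only illustrative:

```python
import rul_datasets

# Reader with the default max_rul of 125, so "degraded" means RUL < 125.
cmapss = rul_datasets.reader.CmapssReader(fd=1)

# Keep only degraded samples for the validation and test splits.
dm = rul_datasets.RulDataModule(cmapss, batch_size=32, degraded_only=["val", "test"])
dm.prepare_data()  # download and pre-process the dataset first

# The per-call argument overrides the data module's own setting.
dev_features, dev_targets = dm.load_split("dev", degraded_only=True)
```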