From c52149f9a88afb2a2a94955242f775225a8c7fb7 Mon Sep 17 00:00:00 2001
From: melisande-c
Date: Fri, 11 Oct 2024 17:43:29 +0200
Subject: [PATCH 1/3] feat: predict to disk with outer loop implementation
 (file by file)

---
 src/careamics/careamist.py | 183 +++++++++++++++++++++++++++++++++++--
 1 file changed, 175 insertions(+), 8 deletions(-)

diff --git a/src/careamics/careamist.py b/src/careamics/careamist.py
index 6601bd46..42420b8d 100644
--- a/src/careamics/careamist.py
+++ b/src/careamics/careamist.py
@@ -20,7 +20,8 @@
     SupportedData,
     SupportedLogger,
 )
-from careamics.dataset.dataset_utils import reshape_array
+from careamics.dataset.dataset_utils import list_files, reshape_array
+from careamics.file_io import WriteFunc, get_write_func
 from careamics.lightning import (
     FCNModule,
     HyperParametersCallback,
@@ -518,9 +519,11 @@ def predict( # numpydoc ignore=GL08
         *,
         batch_size: int = 1,
         tile_size: Optional[tuple[int, ...]] = None,
-        tile_overlap: tuple[int, ...] = (48, 48),
+        tile_overlap: Optional[tuple[int, ...]] = (48, 48),
         axes: Optional[str] = None,
-        data_type: Optional[Literal["tiff", "custom"]] = None,
+        data_type: Optional[
+            Union[Literal["array", "tiff", "custom"], SupportedData]
+        ] = None,
         tta_transforms: bool = True,
         dataloader_params: Optional[dict] = None,
         read_source_func: Optional[Callable] = None,
@@ -534,9 +537,11 @@ def predict( # numpydoc ignore=GL08
         *,
         batch_size: int = 1,
         tile_size: Optional[tuple[int, ...]] = None,
-        tile_overlap: tuple[int, ...] = (48, 48),
+        tile_overlap: Optional[tuple[int, ...]] = (48, 48),
         axes: Optional[str] = None,
-        data_type: Optional[Literal["array"]] = None,
+        data_type: Optional[
+            Union[Literal["array", "tiff", "custom"], SupportedData]
+        ] = None,
         tta_transforms: bool = True,
         dataloader_params: Optional[dict] = None,
     ) -> Union[list[NDArray], NDArray]: ...
@@ -545,11 +550,13 @@ def predict(
         self,
         source: Union[PredictDataModule, Path, str, NDArray],
         *,
-        batch_size: Optional[int] = None,
+        batch_size: int = 1,
         tile_size: Optional[tuple[int, ...]] = None,
         tile_overlap: Optional[tuple[int, ...]] = (48, 48),
         axes: Optional[str] = None,
-        data_type: Optional[Literal["array", "tiff", "custom"]] = None,
+        data_type: Optional[
+            Union[Literal["array", "tiff", "custom"], SupportedData]
+        ] = None,
         tta_transforms: bool = True,
         dataloader_params: Optional[dict] = None,
         read_source_func: Optional[Callable] = None,
@@ -579,7 +586,7 @@ def predict(
 
         Parameters
         ----------
-        source : CAREamicsPredData, pathlib.Path, str or numpy.ndarray
+        source : PredictDataModule, pathlib.Path, str or numpy.ndarray
             Data to predict on.
         batch_size : int, default=1
             Batch size for prediction.
@@ -667,6 +674,166 @@ def predict(
         )
         return convert_outputs(predictions, self.pred_datamodule.tiled)
 
+    def predict_to_disk(
+        self,
+        source: Union[PredictDataModule, Path, str],
+        *,
+        batch_size: int = 1,
+        tile_size: Optional[tuple[int, ...]] = None,
+        tile_overlap: Optional[tuple[int, ...]] = (48, 48),
+        axes: Optional[str] = None,
+        data_type: Optional[
+            Union[Literal["array", "tiff", "custom"], SupportedData]
+        ] = None,
+        tta_transforms: bool = True,
+        dataloader_params: Optional[dict] = None,
+        read_source_func: Optional[Callable] = None,
+        extension_filter: str = "",
+        write_type: Literal["tiff", "custom"] = "tiff",
+        write_extension: Optional[str] = None,
+        write_func: Optional[WriteFunc] = None,
+        write_func_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        """
+        Make predictions on the provided data and save outputs to files.
+
+        The predictions will be saved in a new directory 'predictions' within the set
+        working directory.
+
+        The `source` can be a PredictDataModule instance or a path to a data file or
+        directory; unlike `predict`, numpy arrays are not supported. The file names of
+        the predictions will match those of the source. If there is more than one
+        sample within a file, the samples will be saved to separate files. The file
+        names of samples will have the name of the corresponding source file but with
+        the sample index appended, e.g. if the source file name is 'image.tiff' then
+        the first sample's prediction will be saved with the file name 'image_0.tiff'.
+
+        If `data_type`, `axes` and `tile_size` are not provided, the training
+        configuration parameters will be used, with the `patch_size` instead of
+        `tile_size`.
+
+        Test-time augmentation (TTA) can be switched off using the `tta_transforms`
+        parameter. The TTA augmentation applies all possible flip and 90 degrees
+        rotations to the prediction input and averages the predictions. TTA augmentation
+        should not be used if you did not train with these augmentations.
+
+        Note that if you are using a UNet model and tiling, the tile size must be
+        divisible in every dimension by 2**d, where d is the depth of the model. This
+        avoids artefacts arising from the broken shift invariance induced by the
+        pooling layers of the UNet. If your image has fewer dimensions, as it may
+        happen in the Z dimension, consider padding your image.
+
+        Parameters
+        ----------
+        source : PredictDataModule, pathlib.Path or str
+            Data to predict on.
+        batch_size : int, default=1
+            Batch size for prediction.
+        tile_size : tuple of int, optional
+            Size of the tiles to use for prediction.
+        tile_overlap : tuple of int, default=(48, 48)
+            Overlap between tiles.
+        axes : str, optional
+            Axes of the input data, by default None.
+        data_type : {"array", "tiff", "custom"}, optional
+            Type of the input data.
+        tta_transforms : bool, default=True
+            Whether to apply test-time augmentation.
+        dataloader_params : dict, optional
+            Parameters to pass to the dataloader.
+        read_source_func : Callable, optional
+            Function to read the source data.
+        extension_filter : str, default=""
+            Filter for the file extension.
+        write_type : {"tiff", "custom"}, default="tiff"
+            The file type to save the predictions as.
+        write_extension : str, optional
+            If a known `write_type` is selected, this argument is ignored. For a custom
+            `write_type`, an extension to save the data with must be passed.
+        write_func : WriteFunc, optional
+            If a known `write_type` is selected, this argument is ignored. For a custom
+            `write_type`, a function to save the data must be passed.
+        write_func_kwargs : dict of {str: any}, optional
+            Additional keyword arguments to be passed to the save function.
+        **kwargs : Any
+            Unused.
+
+        Raises
+        ------
+        ValueError
+            If `write_type` is custom and `write_extension` is None.
+        ValueError
+            If `write_type` is custom and `write_func` is None.
+        ValueError
+            If `source` is not a `str`, `Path` or `PredictDataModule`.
+        """
+        if write_func_kwargs is None:
+            write_func_kwargs = {}
+
+        data_type = SupportedData(data_type)
+        # TODO: make configurable?
+
+        write_dir = self.work_dir / "predictions"
+
+        # guards for custom types
+        if write_type == SupportedData.CUSTOM:
+            if write_extension is None:
+                raise ValueError(
+                    "A `write_extension` must be provided for custom write types."
+                )
+            if write_func is None:
+                raise ValueError(
+                    "A `write_func` must be provided for custom write types."
+                )
+        else:
+            write_func = get_write_func(write_type)
+            write_extension = SupportedData.get_extension(write_type)
+
+        # extract file names
+        if isinstance(source, PredictDataModule):
+            # assert not isinstance(source.pred_data, )
+            data_type = SupportedData(source.data_type)
+            extension_filter = source.extension_filter
+            source_file_paths = list_files(
+                source.pred_data, data_type, extension_filter
+            )
+        elif isinstance(source, (str, Path)):
+            source_file_paths = list_files(source, data_type, extension_filter)
+        else:
+            raise ValueError(f"Unsupported source type: '{type(source)}'.")
+
+        # predict and write each file in turn
+        for source_path in source_file_paths:
+            # source_path is relative to original source path...
+            # should mirror original directory structure
+            prediction = self.predict(
+                source=source_path,
+                batch_size=batch_size,
+                tile_size=tile_size,
+                tile_overlap=tile_overlap,
+                axes=axes,
+                data_type=data_type,
+                tta_transforms=tta_transforms,
+                dataloader_params=dataloader_params,
+                read_source_func=read_source_func,
+                extension_filter=extension_filter,
+                **kwargs,
+            )
+            # TODO: cast to float16?
+            write_data = np.concatenate(prediction)
+
+            # create directory structure and write path
+            file_write_dir = write_dir / source_path.parent
+            file_write_dir.mkdir(parents=True, exist_ok=True)
+            write_path = (file_write_dir / source_path.name).with_suffix(
+                write_extension
+            )
+
+            # write data
+            write_func(file_path=write_path, img=write_data)
+
     def export_to_bmz(
         self,
         path_to_archive: Union[Path, str],

From a16259d10cf68433c1d444764e7e3d740ce7b7b4 Mon Sep 17 00:00:00 2001
From: melisande-c
Date: Mon, 14 Oct 2024 12:50:43 +0200
Subject: [PATCH 2/3] test: add predict_to_disk tests

---
 src/careamics/careamist.py |  14 ++--
 tests/test_careamist.py    | 168 +++++++++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+), 7 deletions(-)

diff --git a/src/careamics/careamist.py b/src/careamics/careamist.py
index 42420b8d..f238eef8 100644
--- a/src/careamics/careamist.py
+++ b/src/careamics/careamist.py
@@ -682,9 +682,7 @@ def predict_to_disk(
         tile_size: Optional[tuple[int, ...]] = None,
         tile_overlap: Optional[tuple[int, ...]] = (48, 48),
         axes: Optional[str] = None,
-        data_type: Optional[
-            Union[Literal["array", "tiff", "custom"], SupportedData]
-        ] = None,
+        data_type: Optional[Literal["array", "tiff", "custom"]] = None,
         tta_transforms: bool = True,
         dataloader_params: Optional[dict] = None,
         read_source_func: Optional[Callable] = None,
@@ -772,9 +770,9 @@ def predict_to_disk(
         if write_func_kwargs is None:
             write_func_kwargs = {}
 
-        data_type = SupportedData(data_type)
-        # TODO: make configurable?
+        # write_type = SupportedData(write_type)
 
+        # TODO: make configurable?
         write_dir = self.work_dir / "predictions"
 
         # guards for custom types
         if write_type == SupportedData.CUSTOM:
@@ -794,12 +792,14 @@ def predict_to_disk(
         # extract file names
         if isinstance(source, PredictDataModule):
             # assert not isinstance(source.pred_data, )
-            data_type = SupportedData(source.data_type)
+            data_type = source.data_type
             extension_filter = source.extension_filter
             source_file_paths = list_files(
                 source.pred_data, data_type, extension_filter
             )
         elif isinstance(source, (str, Path)):
+            data_type = data_type or self.cfg.data_config.data_type
+            extension_filter = SupportedData.get_extension_pattern(data_type)
             source_file_paths = list_files(source, data_type, extension_filter)
         else:
             raise ValueError(f"Unsupported source type: '{type(source)}'.")
@@ -825,7 +825,7 @@ def predict_to_disk(
             write_data = np.concatenate(prediction)
 
             # create directory structure and write path
-            file_write_dir = write_dir / source_path.parent
+            file_write_dir = write_dir / source_path.parent.name
             file_write_dir.mkdir(parents=True, exist_ok=True)
             write_path = (file_write_dir / source_path.name).with_suffix(
                 write_extension
diff --git a/tests/test_careamist.py b/tests/test_careamist.py
index 85e5345e..78f65e69 100644
--- a/tests/test_careamist.py
+++ b/tests/test_careamist.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pytest
 import tifffile
+from numpy.typing import NDArray
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import Callback, EarlyStopping, ModelCheckpoint
 
@@ -11,6 +12,7 @@
 from careamics.config.support import SupportedAlgorithm, SupportedData
 from careamics.dataset.dataset_utils import reshape_array
 from careamics.lightning.callbacks import HyperParametersCallback, ProgressBarCallback
+from careamics.lightning.predict_data_module import create_predict_datamodule
 
 
 def random_array(shape: Tuple[int, ...], seed: int = 42):
@@ -818,6 +820,172 @@ def test_export_bmz_pretrained_with_array(tmp_path: Path, pre_trained: Path):
     assert (tmp_path / "model2.zip").exists()
 
 
+def test_predict_to_disk_path_tiff(tmp_path, minimum_configuration):
+    """Test predict_to_disk function with path source and tiff write type."""
+
+    # prepare dummy data
+    train_array = random_array((32, 32))
+
+    image_dir = tmp_path / "images"
+    image_dir.mkdir()
+    n_samples = 2
+    # save files
+    for i in range(n_samples):
+        train_file = image_dir / f"image_{i}.tiff"
+        tifffile.imwrite(train_file, train_array)
+
+    # create configuration
+    config = Configuration(**minimum_configuration)
+    config.training_config.num_epochs = 1
+    config.data_config.axes = "YX"
+    config.data_config.batch_size = 2
+    config.data_config.data_type = SupportedData.TIFF.value
+    config.data_config.patch_size = (8, 8)
+
+    # train
+    careamist = CAREamist(source=config, work_dir=tmp_path)
+    careamist.train(train_source=image_dir)
+
+    # predict to disk
+    careamist.predict_to_disk(source=image_dir)
+
+    for i in range(n_samples):
+        assert (tmp_path / "predictions" / "images" / f"image_{i}.tiff").is_file()
+
+
+def test_predict_to_disk_datamodule_tiff(tmp_path, minimum_configuration):
+    """Test predict_to_disk function with datamodule source and tiff write type."""
+
+    # prepare dummy data
+    train_array = random_array((32, 32))
+
+    image_dir = tmp_path / "images"
+    image_dir.mkdir()
+    n_samples = 2
+    # save files
+    for i in range(n_samples):
+        train_file = image_dir / f"image_{i}.tiff"
+        tifffile.imwrite(train_file, train_array)
+
+    # create configuration
+    config = Configuration(**minimum_configuration)
+    config.training_config.num_epochs = 1
+    config.data_config.axes = "YX"
+    config.data_config.batch_size = 2
+    config.data_config.data_type = SupportedData.TIFF.value
+    config.data_config.patch_size = (8, 8)
+
+    # train
+    careamist = CAREamist(source=config, work_dir=tmp_path)
+    careamist.train(train_source=image_dir)
+
+    datamodule = create_predict_datamodule(
+        pred_data=image_dir,
+        data_type=config.data_config.data_type,
+        axes=config.data_config.axes,
+        image_means=careamist.cfg.data_config.image_means,
+        image_stds=careamist.cfg.data_config.image_stds,
+    )
+
+    # predict to disk
+    careamist.predict_to_disk(source=datamodule)
+
+    for i in range(n_samples):
+        assert (tmp_path / "predictions" / "images" / f"image_{i}.tiff").is_file()
+
+
+def test_predict_to_disk_custom(tmp_path, minimum_configuration):
+    """Test predict_to_disk function with custom write type."""
+
+    def write_numpy(file_path: Path, img: NDArray, *args, **kwargs) -> None:
+        np.save(file=file_path, arr=img)
+
+    # prepare dummy data
+    train_array = random_array((32, 32))
+
+    image_dir = tmp_path / "images"
+    image_dir.mkdir()
+    n_samples = 2
+    # save files
+    for i in range(n_samples):
+        train_file = image_dir / f"image_{i}.tiff"
+        tifffile.imwrite(train_file, train_array)
+
+    # create configuration
+    config = Configuration(**minimum_configuration)
+    config.training_config.num_epochs = 1
+    config.data_config.axes = "YX"
+    config.data_config.batch_size = 2
+    config.data_config.data_type = SupportedData.TIFF.value
+    config.data_config.patch_size = (8, 8)
+
+    # train
+    careamist = CAREamist(source=config, work_dir=tmp_path)
+    careamist.train(train_source=image_dir)
+
+    # predict to disk
+    careamist.predict_to_disk(
+        source=image_dir,
+        write_type=SupportedData.CUSTOM,
+        write_extension=".npy",
+        write_func=write_numpy,
+    )
+
+    for i in range(n_samples):
+        assert (tmp_path / "predictions" / "images" / f"image_{i}.npy").is_file()
+
+
+def test_predict_to_disk_custom_raises(tmp_path, minimum_configuration):
+    """
+    Test predict_to_disk custom write type raises ValueError.
+
+    ValueError should be raised if no write_extension or no write_func is provided.
+    """
+
+    def write_numpy(file_path: Path, img: NDArray, *args, **kwargs) -> None:
+        np.save(file=file_path, arr=img)
+
+    # prepare dummy data
+    train_array = random_array((32, 32))
+
+    image_dir = tmp_path / "images"
+    image_dir.mkdir()
+    n_samples = 2
+    # save files
+    for i in range(n_samples):
+        train_file = image_dir / f"image_{i}.tiff"
+        tifffile.imwrite(train_file, train_array)
+
+    # create configuration
+    config = Configuration(**minimum_configuration)
+    config.training_config.num_epochs = 1
+    config.data_config.axes = "YX"
+    config.data_config.batch_size = 2
+    config.data_config.data_type = SupportedData.TIFF.value
+    config.data_config.patch_size = (8, 8)
+
+    # train
+    careamist = CAREamist(source=config, work_dir=tmp_path)
+    careamist.train(train_source=image_dir)
+
+    with pytest.raises(ValueError):
+        # no write extension provided
+        careamist.predict_to_disk(
+            source=image_dir,
+            write_type=SupportedData.CUSTOM,
+            write_extension=None,
+            write_func=write_numpy,
+        )
+    with pytest.raises(ValueError):
+        # no write func provided
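+        # write_extension is supplied here, so it is the missing write_func
+        # that must trigger the error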
+        careamist.predict_to_disk(
+            source=image_dir,
+            write_type=SupportedData.CUSTOM,
+            write_extension=".npy",
+            write_func=None,
+        )
+
+
 def test_add_custom_callback(tmp_path, minimum_configuration):
     """Test that custom callback can be added to the CAREamist."""
 

From 3df36e3539c0e3a01973393ffcbb457c941e5f67 Mon Sep 17 00:00:00 2001
From: melisande-c
Date: Tue, 15 Oct 2024 18:58:30 +0200
Subject: [PATCH 3/3] feat: configurable save directory param; docs: update to
 reflect previous changes.

---
 src/careamics/careamist.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/careamics/careamist.py b/src/careamics/careamist.py
index 7020c834..73985f61 100644
--- a/src/careamics/careamist.py
+++ b/src/careamics/careamist.py
@@ -568,7 +568,7 @@ def predict(
         configuration parameters will be used, with the `patch_size` instead of
         `tile_size`.
 
-        Test-time augmentation (TTA) can be switched off using the `tta_transforms`
+        Test-time augmentation (TTA) can be turned on or off using the `tta_transforms`
         parameter. The TTA augmentation applies all possible flip and 90 degrees
         rotations to the prediction input and averages the predictions. TTA augmentation
         should not be used if you did not train with these augmentations.
@@ -686,6 +686,7 @@ def predict_to_disk(
         write_extension: Optional[str] = None,
         write_func: Optional[WriteFunc] = None,
         write_func_kwargs: Optional[dict[str, Any]] = None,
+        prediction_dir: Union[Path, str] = "predictions",
         **kwargs,
     ) -> None:
         """
@@ -708,7 +709,7 @@ def predict_to_disk(
         configuration parameters will be used, with the `patch_size` instead of
         `tile_size`.
 
-        Test-time augmentation (TTA) can be switched off using the `tta_transforms`
+        Test-time augmentation (TTA) can be turned on or off using the `tta_transforms`
         parameter. The TTA augmentation applies all possible flip and 90 degrees
         rotations to the prediction input and averages the predictions. TTA augmentation
         should not be used if you did not train with these augmentations.
@@ -751,6 +752,10 @@ def predict_to_disk(
         write_func_kwargs : dict of {str: any}, optional
             Additional keyword arguments to be passed to the save function.
+        prediction_dir : pathlib.Path or str, default="predictions"
+            The directory to save the prediction results to. If `prediction_dir` is
+            not absolute, it is assumed to be relative to the pre-set `work_dir`.
+            If the directory does not exist, it will be created.
         **kwargs : Any
             Unused.
@@ -766,10 +771,11 @@ def predict_to_disk(
         if write_func_kwargs is None:
             write_func_kwargs = {}
 
-        # write_type = SupportedData(write_type)
-
-        # TODO: make configurable?
-        write_dir = self.work_dir / "predictions"
+        if Path(prediction_dir).is_absolute():
+            write_dir = Path(prediction_dir)
+        else:
+            write_dir = self.work_dir / prediction_dir
+        write_dir.mkdir(exist_ok=True, parents=True)
 
         # guards for custom types
         if write_type == SupportedData.CUSTOM:
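
Taken together, the three patches give the following end-to-end usage. This is a
minimal sketch, not part of the patch series: the config path, the image directory
and the `write_numpy` helper (mirroring the one in the tests above) are illustrative
assumptions.

    from pathlib import Path

    import numpy as np
    from numpy.typing import NDArray

    from careamics import CAREamist
    from careamics.config.support import SupportedData


    # hypothetical custom writer; predict_to_disk invokes it as
    # write_func(file_path=..., img=...)
    def write_numpy(file_path: Path, img: NDArray, *args, **kwargs) -> None:
        np.save(file=file_path, arr=img)


    careamist = CAREamist(source="config.yml", work_dir="runs")  # assumed config file
    careamist.train(train_source="images/")

    # default write type: TIFF files written to <work_dir>/predictions,
    # mirroring the source file names
    careamist.predict_to_disk(source="images/")

    # custom write type, using the configurable output directory from patch 3
    careamist.predict_to_disk(
        source="images/",
        write_type=SupportedData.CUSTOM,
        write_extension=".npy",
        write_func=write_numpy,
        prediction_dir="npy_predictions",  # relative, so resolved against work_dir
    )

Because `predict_to_disk` delegates to `predict` one file at a time, tiling arguments
such as `tile_size` and `tile_overlap` pass through unchanged, and a multi-sample
source file such as 'image.tiff' is split into 'image_0.tiff', 'image_1.tiff', and
so on.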