refac: refactored serialization of arrays for noise model and likelih…

…ood configs (CAREamics#232) ### Description - **What**: Implemented logic to serialize arrays defined within pydantic models (specifically with reference to `nm_model.py` and `likelihood_model.py`). - **Why**: Because during training we need to save info in the configs. Specifically, since arrays cannot be serialized by default we need to decide whether to keep them or not. - **How**: Excluded large arrays from serialization, wrote custom serializers for others. ### Changes Made - **Added**: - Custom serializer `_array_to_json()`. - 2 different custom deserializers: `_to_numpy()`, `_to_torch()`. - **Modified**: Excluded some arrays from serialization. - **Removed**: None. ### For further discussion NOTE1: why do the deserializers take in a list? Because in our use case we need deserializers to move configs, usually stored as dicts, into pydantic models. But such dicts are often the result of loading config files mainly stored as `json`, `pkl`, or `yml`. The loaders for these file types automatically deserialize strings into lists. That's why what is left to do for us is to move lists into arrays or tensors. NOTE2: why 2 different deserializers? Because in some cases we want to deserialize lists to torch tensors and in some other cases we prefer numpy arrays. After discussing with @jdeschamps we realized that some large arrays should not be part of configs (e.g., `signal`, `observation` in `nm_model.py`). Therefore, I left TODOs to remind about discussing this for future refactoring. --- **Please ensure your PR meets the following requirements:** - [x] Code builds and passes tests locally, including doctests - [x] New tests have been added (for bug fixes/features) - [x] Pre-commit passes - [ ] PR to the documentation exists (for bug fixes / features) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Joran Deschamps <6367888+jdeschamps@users.noreply.github.com>
federico-carrara · Sep 6, 2024 · 1b60f07 · 1b60f07
1 parent 44aee3e
commit 1b60f07
Show file tree

Hide file tree

Showing 4 changed files with 177 additions and 9 deletions.
diff --git a/src/careamics/config/likelihood_model.py b/src/careamics/config/likelihood_model.py
@@ -2,16 +2,30 @@
 
 from typing import Literal, Optional, Union
 
+import numpy as np
 import torch
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
+from typing_extensions import Annotated
 
 from careamics.models.lvae.noise_models import (
     GaussianMixtureNoiseModel,
     MultiChannelNoiseModel,
 )
+from careamics.utils.serializers import _array_to_json, _to_torch
 
 NoiseModel = Union[GaussianMixtureNoiseModel, MultiChannelNoiseModel]
 
+# TODO: this is a temporary solution to serialize and deserialize tensor fields
+# in pydantic models. Specifically, the aim is to enable saving and loading configs
+# with such tensors to/from JSON files during, resp., training and evaluation.
+Tensor = Annotated[
+    Union[np.ndarray, torch.Tensor],
+    PlainSerializer(_array_to_json, return_type=str),
+    PlainValidator(_to_torch),
+]
+"""Annotated tensor type, used to serialize arrays or tensors to JSON strings
+and deserialize them back to tensors."""
+
 
 class GaussianLikelihoodConfig(BaseModel):
     """Gaussian likelihood configuration."""
@@ -31,13 +45,16 @@ class NMLikelihoodConfig(BaseModel):
 
     model_config = ConfigDict(validate_assignment=True, arbitrary_types_allowed=True)
 
-    data_mean: Union[torch.Tensor] = torch.zeros(1)
+    # TODO remove and use as parameters to the likelihood functions?
+    data_mean: Tensor = torch.zeros(1)
     """The mean of the data, used to unnormalize data for noise model evaluation.
     Shape is (target_ch,) (or (1, target_ch, [1], 1, 1))."""
 
-    data_std: Union[torch.Tensor] = torch.ones(1)
+    # TODO remove and use as parameters to the likelihood functions?
+    data_std: Tensor = torch.ones(1)
     """The standard deviation of the data, used to unnormalize data for noise
     model evaluation. Shape is (target_ch,) (or (1, target_ch, [1], 1, 1))."""
 
-    noise_model: Union[NoiseModel, None] = None
+    # TODO: serialization/deserialization for this
+    noise_model: Optional[NoiseModel] = Field(default=None, exclude=True)
     """The noise model instance used to compute the likelihood."""
diff --git a/src/careamics/config/nm_model.py b/src/careamics/config/nm_model.py
@@ -4,8 +4,30 @@
 from typing import Literal, Optional, Union
 
 import numpy as np
-from pydantic import BaseModel, ConfigDict, Field, model_validator
-from typing_extensions import Self
+import torch
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    PlainSerializer,
+    PlainValidator,
+    model_validator,
+)
+from typing_extensions import Annotated, Self
+
+from careamics.utils.serializers import _array_to_json, _to_numpy
+
+# TODO: this is a temporary solution to serialize and deserialize array fields
+# in pydantic models. Specifically, the aim is to enable saving and loading configs
+# with such arrays to/from JSON files during, resp., training and evaluation.
+Array = Annotated[
+    Union[np.ndarray, torch.Tensor],
+    PlainSerializer(_array_to_json, return_type=str),
+    PlainValidator(_to_numpy),
+]
+"""Annotated array type, used to serialize arrays or tensors to JSON strings
+and deserialize them back to arrays."""
+
 
 # TODO: add histogram-based noise model
 
@@ -26,13 +48,17 @@ class GaussianMixtureNMConfig(BaseModel):
     """Path to the directory where the trained noise model (*.npz) is saved in the
     `train` method."""
 
-    signal: Optional[Union[str, Path, np.ndarray]] = None
+    # TODO remove and use as parameters to the NM functions?
+    signal: Optional[Union[str, Path, np.ndarray]] = Field(default=None, exclude=True)
     """Path to the file containing signal or respective numpy array."""
 
-    observation: Optional[Union[str, Path, np.ndarray]] = None
+    # TODO remove and use as parameters to the NM functions?
+    observation: Optional[Union[str, Path, np.ndarray]] = Field(
+        default=None, exclude=True
+    )
     """Path to the file containing observation or respective numpy array."""
 
-    weight: Optional[np.ndarray] = None
+    weight: Optional[Array] = None
     """A [3*n_gaussian, n_coeff] sized array containing the values of the weights
     describing the GMM noise model, with each row corresponding to one
     parameter of each gaussian, namely [mean, standard deviation and weight].

diff --git a/src/careamics/utils/serializers.py b/src/careamics/utils/serializers.py
@@ -0,0 +1,60 @@
+"""A script for serializers in the careamics package."""
+
+import ast
+import json
+from typing import Union
+
+import numpy as np
+import torch
+
+
+def _array_to_json(arr: Union[np.ndarray, torch.Tensor]) -> str:
+    """Convert an array to a list and then to a JSON string.
+
+    Parameters
+    ----------
+    arr : Union[np.ndarray, torch.Tensor]
+        Array to be serialized.
+
+    Returns
+    -------
+    str
+        JSON string representing the array.
+    """
+    return json.dumps(arr.tolist())
+
+
+def _to_numpy(lst: Union[str, list]) -> np.ndarray:
+    """Deserialize a list or string representing a list into `np.ndarray`.
+
+    Parameters
+    ----------
+    lst : Union[str, list]
+        List or string representing a list with the array content to be deserialized.
+
+    Returns
+    -------
+    np.ndarray
+        The deserialized array.
+    """
+    if isinstance(lst, str):
+        lst = ast.literal_eval(lst)
+    return np.asarray(lst)
+
+
+def _to_torch(lst: Union[str, list]) -> torch.Tensor:
+    """Deserialize list or string representing a list into `torch.Tensor`.
+
+    Parameters
+    ----------
+    lst : Union[str, list]
+        List or string representing a list with the array content to be deserialized.
+
+    Returns
+    -------
+    torch.Tensor
+        The deserialized tensor.
+    """
+    if isinstance(lst, str):
+        lst = ast.literal_eval(lst)
+    return torch.tensor(lst)
diff --git a/tests/utils/test_serializers.py b/tests/utils/test_serializers.py
@@ -0,0 +1,65 @@
+import json
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+import pytest
+import torch
+from pydantic import BaseModel, ConfigDict
+
+from careamics.config.likelihood_model import Tensor
+from careamics.config.nm_model import Array
+
+
+class MyArray(BaseModel):
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    arr: Array
+
+
+class MyTensor(BaseModel):
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    arr: Tensor
+
+
+@pytest.mark.parametrize("arr", [np.array([1, 2]), torch.tensor([1, 2])])
+def test_serialize_array(arr: Union[np.ndarray, torch.Tensor]):
+    """Test that the `Array` type serializes arrays and tensors via `_array_to_json`."""
+    arr_model = MyArray(arr=arr)
+    assert arr_model.model_dump() == {"arr": "[1, 2]"}
+
+
+@pytest.mark.parametrize("arr", [np.array([1, 2]), torch.tensor([1, 2])])
+def test_serialize_tensor(arr: Union[np.ndarray, torch.Tensor]):
+    """Test that the `Tensor` type serializes arrays and tensors via `_array_to_json`."""
+    arr_model = MyTensor(arr=arr)
+    assert arr_model.model_dump() == {"arr": "[1, 2]"}
+
+
+def test_deserialize_array(tmp_path: Path):
+    """Test the `_to_numpy` deserializer through a JSON round trip."""
+    arr_model = MyArray(arr=np.array([1, 2]))
+    # save to JSON
+    with open(tmp_path / "array_config.json", "w") as f:
+        f.write(arr_model.model_dump_json())
+    # load from JSON
+    with open(tmp_path / "array_config.json") as f:
+        config = json.load(f)
+    new_arr_model = MyArray(**config)
+    assert np.array_equal(new_arr_model.arr, np.array([1, 2]))
+
+
+def test_deserialize_tensor(tmp_path: Path):
+    """Test the `_to_torch` deserializer through a JSON round trip."""
+    arr_model = MyTensor(arr=torch.tensor([1, 2]))
+    # save to JSON
+    with open(tmp_path / "tensor_config.json", "w") as f:
+        f.write(arr_model.model_dump_json())
+    # load from JSON
+    with open(tmp_path / "tensor_config.json") as f:
+        config = json.load(f)
+    new_arr_model = MyTensor(**config)
+    assert torch.equal(new_arr_model.arr, torch.tensor([1, 2]))