Skip to content

Commit

Permalink
refac: refactored serialization of arrays for noise model and likelih…
Browse files Browse the repository at this point in the history
…ood configs (CAREamics#232)

### Description

- **What**: Implemented logic to serialize arrays defined within
pydantic models (specifically with reference to `nm_model.py` and
`likelihood_model.py`).
- **Why**: Because during training we need to save info in the configs.
Specifically, since arrays cannot be serialized or deserialized by default,
we need to decide whether to keep them or not.
- **How**: Excluded large arrays from serialization, wrote custom
serializers for others.

### Changes Made

- **Added**: 
  - Custom serializer `array_to_json()`.
- 2 different custom deserializers: `list_to_numpy()`,
`list_to_torch()`.
- **Modified**: Excluded some arrays from serialization.
- **Removed**: None.

### For further discussion

NOTE1: why does the deserializer take in a list? Because in our use case we
need deserializers to move configs, usually stored as dicts, into pydantic
models. But such dicts are often the result of loading a config file
stored as `json`, `pkl`, or `yml`. The loaders for these file
types automatically deserialize strings into lists. That is why all that is
left for us to do is to convert lists into arrays or tensors.

NOTE2: why 2 different deserializers? Because in some cases we want to
deserialize a list to a torch tensor, and in other cases we prefer numpy
arrays.

After discussing with @jdeschamps we realized that some large arrays
should not be part of configs (e.g., `signal`, `observation` in
`nm_model.py`). Therefore, I left TODOs to remind about discussing this
for future refactoring.

---

**Please ensure your PR meets the following requirements:**

- [x] Code builds and passes tests locally, including doctests
- [x] New tests have been added (for bug fixes/features)
- [x] Pre-commit passes
- [ ] PR to the documentation exists (for bug fixes / features)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Joran Deschamps <6367888+jdeschamps@users.noreply.github.com>
  • Loading branch information
3 people authored Sep 6, 2024
1 parent 44aee3e commit 1b60f07
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 9 deletions.
25 changes: 21 additions & 4 deletions src/careamics/config/likelihood_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,30 @@

from typing import Literal, Optional, Union

import numpy as np
import torch
from pydantic import BaseModel, ConfigDict
from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator
from typing_extensions import Annotated

from careamics.models.lvae.noise_models import (
GaussianMixtureNoiseModel,
MultiChannelNoiseModel,
)
from careamics.utils.serializers import _array_to_json, _to_torch

NoiseModel = Union[GaussianMixtureNoiseModel, MultiChannelNoiseModel]

# TODO: this is a temporary solution to serialize and deserialize tensor fields
# in pydantic models. Specifically, the aim is to enable saving and loading configs
# with such tensors to/from JSON files during, resp., training and evaluation.
Tensor = Annotated[
Union[np.ndarray, torch.Tensor],
PlainSerializer(_array_to_json, return_type=str),
PlainValidator(_to_torch),
]
"""Annotated tensor type, used to serialize arrays or tensors to JSON strings
and deserialize them back to tensors."""


class GaussianLikelihoodConfig(BaseModel):
"""Gaussian likelihood configuration."""
Expand All @@ -31,13 +45,16 @@ class NMLikelihoodConfig(BaseModel):

model_config = ConfigDict(validate_assignment=True, arbitrary_types_allowed=True)

data_mean: Union[torch.Tensor] = torch.zeros(1)
# TODO remove and use as parameters to the likelihood functions?
data_mean: Tensor = torch.zeros(1)
"""The mean of the data, used to unnormalize data for noise model evaluation.
Shape is (target_ch,) (or (1, target_ch, [1], 1, 1))."""

data_std: Union[torch.Tensor] = torch.ones(1)
# TODO remove and use as parameters to the likelihood functions?
data_std: Tensor = torch.ones(1)
"""The standard deviation of the data, used to unnormalize data for noise
model evaluation. Shape is (target_ch,) (or (1, target_ch, [1], 1, 1))."""

noise_model: Union[NoiseModel, None] = None
# TODO: serialization/deserialization for this
noise_model: Optional[NoiseModel] = Field(default=None, exclude=True)
"""The noise model instance used to compute the likelihood."""
36 changes: 31 additions & 5 deletions src/careamics/config/nm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,30 @@
from typing import Literal, Optional, Union

import numpy as np
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
import torch
from pydantic import (
BaseModel,
ConfigDict,
Field,
PlainSerializer,
PlainValidator,
model_validator,
)
from typing_extensions import Annotated, Self

from careamics.utils.serializers import _array_to_json, _to_numpy

# TODO: this is a temporary solution to serialize and deserialize array fields
# in pydantic models. Specifically, the aim is to enable saving and loading configs
# with such arrays to/from JSON files during, resp., training and evaluation.
Array = Annotated[
Union[np.ndarray, torch.Tensor],
PlainSerializer(_array_to_json, return_type=str),
PlainValidator(_to_numpy),
]
"""Annotated array type, used to serialize arrays or tensors to JSON strings
and deserialize them back to arrays."""


# TODO: add histogram-based noise model

Expand All @@ -26,13 +48,17 @@ class GaussianMixtureNMConfig(BaseModel):
"""Path to the directory where the trained noise model (*.npz) is saved in the
`train` method."""

signal: Optional[Union[str, Path, np.ndarray]] = None
# TODO remove and use as parameters to the NM functions?
signal: Optional[Union[str, Path, np.ndarray]] = Field(default=None, exclude=True)
"""Path to the file containing signal or respective numpy array."""

observation: Optional[Union[str, Path, np.ndarray]] = None
# TODO remove and use as parameters to the NM functions?
observation: Optional[Union[str, Path, np.ndarray]] = Field(
default=None, exclude=True
)
"""Path to the file containing observation or respective numpy array."""

weight: Optional[np.ndarray] = None
weight: Optional[Array] = None
"""A [3*n_gaussian, n_coeff] sized array containing the values of the weights
describing the GMM noise model, with each row corresponding to one
parameter of each gaussian, namely [mean, standard deviation and weight].
Expand Down
60 changes: 60 additions & 0 deletions src/careamics/utils/serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
"""A script for serializers in the careamics package."""

import ast
import json
from typing import Union

import numpy as np
import torch


def _array_to_json(arr: Union[np.ndarray, torch.Tensor]) -> str:
    """Serialize an array or tensor as a JSON string.

    The input is first converted to plain Python values with its ``tolist``
    method, and the result is then JSON-encoded.

    Parameters
    ----------
    arr : Union[np.ndarray, torch.Tensor]
        Array or tensor to be serialized.

    Returns
    -------
    str
        JSON string holding the content of `arr`.
    """
    plain_values = arr.tolist()
    encoded = json.dumps(plain_values)
    return encoded


def _to_numpy(lst: Union[str, list]) -> np.ndarray:
    """Deserialize a list, or a string representing a list, into a `np.ndarray`.

    Strings are parsed as JSON first, because that is the format written by
    `_array_to_json`. JSON tokens such as ``true``, ``false`` or ``NaN`` are
    not valid Python literals, so parsing them with `ast.literal_eval` alone
    would fail. Strings that are not valid JSON (e.g. ``"(1, 2)"``) are still
    accepted through a fallback to `ast.literal_eval`.

    Parameters
    ----------
    lst : Union[str, list]
        List, or string representing a list, with the array content to be
        deserialized. Any other value is passed to `np.asarray` unchanged.

    Returns
    -------
    np.ndarray
        The deserialized array.

    Raises
    ------
    ValueError
        If `lst` is a string that is neither valid JSON nor a valid Python
        literal.
    SyntaxError
        If `lst` is a string that is neither valid JSON nor syntactically
        valid Python.
    """
    if isinstance(lst, str):
        try:
            lst = json.loads(lst)
        except json.JSONDecodeError:
            # Backward compatibility: accept Python-literal strings that are
            # not valid JSON (tuples, trailing commas, `True`/`False`, ...).
            lst = ast.literal_eval(lst)
    return np.asarray(lst)


def _to_torch(lst: Union[str, list]) -> torch.Tensor:
    """Deserialize a list, or a string representing a list, into a `torch.Tensor`.

    Strings are parsed as JSON first, because that is the format written by
    `_array_to_json`. JSON tokens such as ``true``, ``false`` or ``NaN`` are
    not valid Python literals, so parsing them with `ast.literal_eval` alone
    would fail. Strings that are not valid JSON (e.g. ``"(1, 2)"``) are still
    accepted through a fallback to `ast.literal_eval`.

    Parameters
    ----------
    lst : Union[str, list]
        List, or string representing a list, with the array content to be
        deserialized. An existing `torch.Tensor` is copied; any other value
        is passed to `torch.tensor` unchanged.

    Returns
    -------
    torch.Tensor
        The deserialized tensor.

    Raises
    ------
    ValueError
        If `lst` is a string that is neither valid JSON nor a valid Python
        literal.
    SyntaxError
        If `lst` is a string that is neither valid JSON nor syntactically
        valid Python.
    """
    if isinstance(lst, torch.Tensor):
        # Same result as `torch.tensor(lst)` (a detached copy), but without
        # the copy-construct UserWarning PyTorch emits for tensor inputs.
        return lst.detach().clone()
    if isinstance(lst, str):
        try:
            lst = json.loads(lst)
        except json.JSONDecodeError:
            # Backward compatibility: accept Python-literal strings that are
            # not valid JSON (tuples, trailing commas, `True`/`False`, ...).
            lst = ast.literal_eval(lst)
    return torch.tensor(lst)
65 changes: 65 additions & 0 deletions tests/utils/test_serializers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import json
from pathlib import Path
from typing import Union

import numpy as np
import pytest
import torch
from pydantic import BaseModel, ConfigDict

from careamics.config.likelihood_model import Tensor
from careamics.config.nm_model import Array


class MyArray(BaseModel):
    """Minimal pydantic model with a single `Array`-annotated field."""

    # `Array` wraps non-pydantic types (numpy arrays / torch tensors).
    model_config = ConfigDict(arbitrary_types_allowed=True)

    arr: Array


class MyTensor(BaseModel):
    """Minimal pydantic model with a single `Tensor`-annotated field."""

    # `Tensor` wraps non-pydantic types (numpy arrays / torch tensors).
    model_config = ConfigDict(arbitrary_types_allowed=True)

    arr: Tensor


@pytest.mark.parametrize("arr", [np.array([1, 2]), torch.tensor([1, 2])])
def test_serialize_array(arr: Union[np.ndarray, torch.Tensor]):
    """Check that an `Array` field is dumped as a JSON string of its values."""
    expected = {"arr": "[1, 2]"}
    dumped = MyArray(arr=arr).model_dump()
    assert dumped == expected


@pytest.mark.parametrize("arr", [np.array([1, 2]), torch.tensor([1, 2])])
def test_serialize_tensor(arr: Union[np.ndarray, torch.Tensor]):
    """Check that a `Tensor` field is dumped as a JSON string of its values."""
    expected = {"arr": "[1, 2]"}
    dumped = MyTensor(arr=arr).model_dump()
    assert dumped == expected


def test_deserialize_array(tmp_path: Path):
    """Check that an `Array` field survives a round trip through a JSON file."""
    config_file = tmp_path / "array_config.json"
    original = MyArray(arr=np.array([1, 2]))

    # save to JSON
    config_file.write_text(original.model_dump_json())

    # load from JSON
    loaded = json.loads(config_file.read_text())

    restored = MyArray(**loaded)
    assert np.array_equal(restored.arr, np.array([1, 2]))


def test_deserialize_tensor(tmp_path: Path):
    """Check that a `Tensor` field survives a round trip through a JSON file."""
    config_file = tmp_path / "tensor_config.json"
    original = MyTensor(arr=torch.tensor([1, 2]))

    # save to JSON
    config_file.write_text(original.model_dump_json())

    # load from JSON
    loaded = json.loads(config_file.read_text())

    restored = MyTensor(**loaded)
    assert torch.equal(restored.arr, torch.tensor([1, 2]))

0 comments on commit 1b60f07

Please sign in to comment.