Skip to content

Commit

Permalink
Fix default fill_value for datetime64 (#206)
Browse files Browse the repository at this point in the history
* Set ZArray fill_value back to nan

* Set NaT as datetime64 default fill value

* Fixups

* Change back to 0
* Added integration test

* changelog

---------

Co-authored-by: Tom Augspurger <taugspurger@microsoft.com>
  • Loading branch information
thodson-usgs and Tom Augspurger authored Aug 5, 2024
1 parent b34a1ee commit 04a566c
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 7 deletions.
3 changes: 3 additions & 0 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ Bug fixes

- Exclude empty chunks during `ChunkDict` construction. (:pull:`198`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
- Fixed regression in `fill_value` handling for datetime dtypes making virtual
Zarr stores unreadable (:pr:`206`)
By `Timothy Hodson <https://github.com/thodson-usgs>`_

Documentation
~~~~~~~~~~~~~
Expand Down
47 changes: 47 additions & 0 deletions virtualizarr/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import xarray.testing as xrt

from virtualizarr import open_virtual_dataset
from virtualizarr.manifests.array import ManifestArray
from virtualizarr.manifests.manifest import ChunkManifest
from virtualizarr.zarr import ZArray


@pytest.mark.parametrize(
Expand Down Expand Up @@ -166,6 +169,50 @@ def test_non_dimension_coordinates(self, tmpdir, format):
# assert equal to original dataset
xrt.assert_identical(roundtrip, ds)

def test_datetime64_dtype_fill_value(self, tmpdir, format):
chunks_dict = {
"0.0.0": {"path": "foo.nc", "offset": 100, "length": 100},
}
manifest = ChunkManifest(entries=chunks_dict)
chunks = (1, 1, 1)
shape = (1, 1, 1)
zarray = ZArray(
chunks=chunks,
compressor={"id": "zlib", "level": 1},
dtype=np.dtype("<M8[ns]"),
# fill_value=0.0,
filters=None,
order="C",
shape=shape,
zarr_format=2,
)
marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest)
ds = xr.Dataset(
{
"a": xr.DataArray(
marr1,
attrs={
"_FillValue": np.datetime64("1970-01-01T00:00:00.000000000")
},
)
}
)

if format == "dict":
# write those references to an in-memory kerchunk-formatted references dictionary
ds_refs = ds.virtualize.to_kerchunk(format=format)

# use fsspec to read the dataset from the kerchunk references dict
roundtrip = xr.open_dataset(ds_refs, engine="kerchunk")
else:
# write those references to disk as kerchunk references format
ds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)

# use fsspec to read the dataset from disk via the kerchunk references
roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")

assert roundtrip.a.attrs == ds.a.attrs


def test_open_scalar_variable(tmpdir):
# regression test for GH issue #100
Expand Down
15 changes: 8 additions & 7 deletions virtualizarr/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,14 @@
) # just the .zattrs (for one array or for the whole store/group)
FillValueT = bool | str | float | int | list | None

ZARR_DEFAULT_FILL_VALUE: dict[np.dtype, FillValueT] = {
ZARR_DEFAULT_FILL_VALUE: dict[str, FillValueT] = {
# numpy dtypes's hierarchy lets us avoid checking for all the widths
# https://numpy.org/doc/stable/reference/arrays.scalars.html
np.dtype("bool"): False,
np.dtype("int"): 0,
np.dtype("float"): 0.0,
np.dtype("complex"): [0.0, 0.0],
np.dtype("bool").kind: False,
np.dtype("int").kind: 0,
np.dtype("float").kind: 0.0,
np.dtype("complex").kind: [0.0, 0.0],
np.dtype("datetime64").kind: 0,
}
"""
The value and format of the fill_value depend on the `data_type` of the array.
Expand Down Expand Up @@ -67,7 +68,7 @@ class ZArray(BaseModel):
chunks: tuple[int, ...]
compressor: dict | None = None
dtype: np.dtype
fill_value: FillValueT = Field(default=0.0, validate_default=True)
fill_value: FillValueT = Field(None, validate_default=True)
filters: list[dict] | None = None
order: Literal["C", "F"]
shape: tuple[int, ...]
Expand All @@ -90,7 +91,7 @@ def __post_init__(self) -> None:
@model_validator(mode="after")
def _check_fill_value(self) -> Self:
if self.fill_value is None:
self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0)
self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0)
return self

@property
Expand Down

0 comments on commit 04a566c

Please sign in to comment.