Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix v3 codec pipeline #4

Merged
merged 6 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions virtualizarr/writers/icechunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ async def write_variable_to_icechunk(
)
else:
# TODO is writing loadable_variables just normal xarray ds.to_zarr?
raise NotImplementedError()
# raise NotImplementedError()
print("skipping non-virtual variable", name)


async def write_virtual_variable_to_icechunk(
Expand All @@ -117,13 +118,12 @@ async def write_virtual_variable_to_icechunk(
zarray = ma.zarray

# creates array if it doesn't already exist
codecs = zarray._v3_codec_pipeline()
arr = group.require_array(
name=name,
shape=zarray.shape,
chunk_shape=zarray.chunks,
dtype=encode_dtype(zarray.dtype),
#codecs=codecs,
codecs=zarray._v3_codec_pipeline(),
dimension_names=var.dims,
fill_value=zarray.fill_value,
# TODO fill_value?
Expand Down
62 changes: 33 additions & 29 deletions virtualizarr/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,9 @@ def codec(self) -> Codec:
@classmethod
def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
# coerce type of fill_value as kerchunk can be inconsistent with this
dtype = np.dtype(decoded_arr_refs_zarray["dtype"])
fill_value = decoded_arr_refs_zarray["fill_value"]
if fill_value is None or fill_value == "NaN" or fill_value == "nan":
if np.issubdtype(dtype, np.floating) and (fill_value is None or fill_value == "NaN" or fill_value == "nan"):
fill_value = np.nan

compressor = decoded_arr_refs_zarray["compressor"]
Expand All @@ -86,7 +87,7 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
return ZArray(
chunks=tuple(decoded_arr_refs_zarray["chunks"]),
compressor=compressor,
dtype=np.dtype(decoded_arr_refs_zarray["dtype"]),
dtype=dtype,
fill_value=fill_value,
filters=decoded_arr_refs_zarray["filters"],
order=decoded_arr_refs_zarray["order"],
Expand Down Expand Up @@ -140,7 +141,7 @@ def replace(
replacements["zarr_format"] = zarr_format
return dataclasses.replace(self, **replacements)

def _v3_codec_pipeline(self) -> list:
def _v3_codec_pipeline(self) -> Any:
"""
VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes
from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects.
Expand All @@ -153,44 +154,46 @@ def _v3_codec_pipeline(self) -> list:
post_compressor: Iterable[BytesBytesCodec] #optional
```
"""
if self.filters:
filter_codecs_configs = [
numcodecs.get_codec(filter).get_config() for filter in self.filters
]
filters = [
dict(name=codec.pop("id"), configuration=codec)
for codec in filter_codecs_configs
]
else:
filters = []

# Noting here that zarr v3 has very few codecs specificed in the official spec,
# and that there are far more codecs in `numcodecs`. We take a gamble and assume
# that the codec names and configuration are simply mapped into zarrv3 "configurables".
if self.compressor:
compressor = [_num_codec_config_to_configurable(self.compressor)]
else:
compressor = []
try:
from zarr.core.metadata.v3 import parse_codecs
except ImportError:
raise ImportError(
"zarr v3 is required to generate v3 codec pipelines"
)

codec_configs = []

# https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1
# Either "C" or "F", defining the layout of bytes within each chunk of the array.
# "C" means row-major order, i.e., the last dimension varies fastest;
# "F" means column-major order, i.e., the first dimension varies fastest.
if self.order == "C":
order = tuple(range(len(self.shape)))
elif self.order == "F":
# For now, we only need transpose if the order is not "C"
if self.order == "F":
order = tuple(reversed(range(len(self.shape))))
transpose = dict(name="transpose", configuration=dict(order=order))
codec_configs.append(transpose)

transpose = dict(name="transpose", configuration=dict(order=order))
# https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097
# "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec"
bytes = dict(
name="bytes", configuration={}
) # TODO need to handle endianess configuration
codec_configs.append(bytes)

# Noting here that zarr v3 has very few codecs specificed in the official spec,
# and that there are far more codecs in `numcodecs`. We take a gamble and assume
# that the codec names and configuration are simply mapped into zarrv3 "configurables".
if self.filters:
codec_configs.extend([
_num_codec_config_to_configurable(filter) for filter in self.filters
])

if self.compressor:
codec_configs.append(_num_codec_config_to_configurable(self.compressor))

# convert the pipeline repr into actual v3 codec objects
codec_pipeline = parse_codecs(codec_configs)

# The order here is significant!
# [ArrayArray] -> ArrayBytes -> [BytesBytes]
codec_pipeline = [transpose, bytes] + compressor + filters
return codec_pipeline


Expand All @@ -213,4 +216,5 @@ def _num_codec_config_to_configurable(num_codec: dict) -> dict:
Convert a numcodecs codec into a zarr v3 configurable.
"""
num_codec_copy = num_codec.copy()
return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy}
name = "numcodecs." + num_codec_copy.pop("id")
Copy link
Author

@mpiannucci mpiannucci Oct 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is subject to zarr-developers/numcodecs#597 being installed.. but it works

return {"name": name, "configuration": num_codec_copy}