Add Icechunk Support (#256)

* move vds_with_manifest_arrays fixture up * sketch implementation * test that we can create an icechunk store * fixture to create icechunk filestore in temporary directory * get the async fixture working properly * split into more functions * change mode * try creating zarr group and arrays explicitly * create root group from store * todos * do away with the async pytest fixtures/functions * successfully writes root group attrs * check array metadata is correct * try to write array attributes * sketch test for checking virtual references have been set correctly * test setting single virtual ref * use async properly * better separation of handling of loadable variables * fix chunk key format * use require_array * check that store supports writes * removed outdated note about awaiting * fix incorrect chunk key in test * absolute path * convert to file URI before handing to icechunk * test that without encoding we can definitely read one chunk * Work on encoding test * Update test to match * Quick comment * more comprehensive * add attrtirbute encoding * Fix array dimensions * Fix v3 codec pipeline * Put xarray dep back * Handle codecs, but get bad results * Gzip an d zlib are not directly working * Get up working with numcodecs zarr 3 codecs * Update codec pipeline * oUdpate to latest icechunk using sync api * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Some type stuff * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update zarr and icechunk tests, fix zarr v3 metadata * Update import we dont need * Update kerhcunk tests * Check for v3 metadata import in zarr test * More tests * type checker * types * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * More types * [pre-commit.ci] auto fixes from 
pre-commit.com hooks for more information, see https://pre-commit.ci * ooops * One left * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Finally done being dumb * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Support loadables without tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add test for multiple chunks to check order * Add loadable varaible test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add accessor, simple docs * Update icechunk.py Co-authored-by: Tom Nicholas <tom@cworthy.org> * Update accessor.py Co-authored-by: Tom Nicholas <tom@cworthy.org> * Fix attributes when loadables are available * Protect zarr import * Fix import errors in icechunk writer * More protection * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * i am bad at this * Add xarray roundtrip asserts * Add icechunk to api.rst * Update virtualizarr/tests/test_writers/test_icechunk.py Co-authored-by: Tom Nicholas <tom@cworthy.org> * More test improvements, update realeses.rst * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * tmore testing * Figure out tests for real this time * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: TomNicholas <tom@cworthy.org> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
zarr-developers · Oct 22, 2024 · 775c2c8 · 775c2c8
1 parent 4b7612e
commit 775c2c8
Show file tree

Hide file tree

Showing 16 changed files with 622 additions and 59 deletions.
diff --git a/ci/upstream.yml b/ci/upstream.yml
@@ -24,7 +24,7 @@ dependencies:
   - fsspec
   - pip
   - pip:
-    - zarr==3.0.0b1  # beta release of zarr-python v3
+    - icechunk # Installs zarr v3 as dependency
     - git+https://github.com/pydata/xarray@zarr-v3  # zarr-v3 compatibility branch
     - git+https://github.com/zarr-developers/numcodecs@zarr3-codecs  # zarr-v3 compatibility branch
     # - git+https://github.com/fsspec/kerchunk@main  # kerchunk is currently incompatible with zarr-python v3 (https://github.com/fsspec/kerchunk/pull/516)
diff --git a/conftest.py b/conftest.py
@@ -1,6 +1,8 @@
 import h5py
+import numpy as np
 import pytest
 import xarray as xr
+from xarray.core.variable import Variable
 
 
 def pytest_addoption(parser):
@@ -96,3 +98,16 @@ def hdf5_scalar(tmpdir):
     dataset = f.create_dataset("scalar", data=0.1, dtype="float32")
     dataset.attrs["scalar"] = "true"
     return filepath
+
+
+@pytest.fixture
+def simple_netcdf4(tmpdir):
+    filepath = f"{tmpdir}/simple.nc"
+
+    arr = np.arange(12, dtype=np.dtype("int32")).reshape(3, 4)
+    var = Variable(data=arr, dims=["x", "y"])
+    ds = xr.Dataset({"foo": var})
+
+    ds.to_netcdf(filepath)
+
+    return filepath
diff --git a/docs/api.rst b/docs/api.rst
@@ -39,6 +39,7 @@ Serialization
 
     VirtualiZarrDatasetAccessor.to_kerchunk
     VirtualiZarrDatasetAccessor.to_zarr
+    VirtualiZarrDatasetAccessor.to_icechunk
 
 
 Rewriting

diff --git a/docs/releases.rst b/docs/releases.rst
@@ -31,6 +31,9 @@ New Features
 - Support empty files (:pull:`260`)
   By `Justus Magin <https://github.com/keewis>`_.
 
+- Can write virtual datasets to Icechunk stores using `virtualize.to_icechunk` (:pull:`256`)
+  By `Matt Iannucci <https://github.com/mpiannucci>`_.
+
 Breaking changes
 ~~~~~~~~~~~~~~~~
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -396,6 +396,23 @@ combined_ds = xr.open_dataset('combined.parq', engine="kerchunk")
 
 By default references are placed in separate parquet file when the total number of references exceeds `record_size`. If there are fewer than `categorical_threshold` unique urls referenced by a particular variable, url will be stored as a categorical variable.
 
+### Writing to an Icechunk Store
+
+We can also write these references out as an [IcechunkStore](https://icechunk.io/). `Icechunk` is an open-source, cloud-native transactional tensor storage engine that is compatible with Zarr version 3. To export our virtual dataset to an `Icechunk` Store, we simply use the {py:meth}`ds.virtualize.to_icechunk <virtualizarr.xarray.VirtualiZarrDatasetAccessor.to_icechunk>` accessor method.
+
+```python
+# create an icechunk store
+from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig
+storage = StorageConfig.filesystem(str('combined'))
+store = IcechunkStore.create(storage=storage, mode="w", config=StoreConfig(
+    virtual_ref_config=VirtualRefConfig.s3_anonymous(region='us-east-1'),
+))
+
+combined_vds.virtualize.to_icechunk(store)
+```
+
+See the [Icechunk documentation](https://icechunk.io/icechunk-python/virtual/#creating-a-virtual-dataset-with-virtualizarr) for more details.
+
 ### Writing as Zarr
 
 Alternatively, we can write these references out as an actual Zarr store, at least one that is compliant with the [proposed "Chunk Manifest" ZEP](https://github.com/zarr-developers/zarr-specs/issues/287). To do this we simply use the {py:meth}`ds.virtualize.to_zarr <virtualizarr.xarray.VirtualiZarrDatasetAccessor.to_zarr>` accessor method.

diff --git a/virtualizarr/accessor.py b/virtualizarr/accessor.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 from typing import (
+    TYPE_CHECKING,
     Callable,
     Literal,
     overload,
@@ -12,6 +13,9 @@
 from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs
 from virtualizarr.writers.zarr import dataset_to_zarr
 
+if TYPE_CHECKING:
+    from icechunk import IcechunkStore  # type: ignore[import-not-found]
+
 
 @register_dataset_accessor("virtualize")
 class VirtualiZarrDatasetAccessor:
@@ -39,6 +43,20 @@ def to_zarr(self, storepath: str) -> None:
         """
         dataset_to_zarr(self.ds, storepath)
 
+    def to_icechunk(self, store: "IcechunkStore") -> None:
+        """
+        Write an xarray dataset to an Icechunk store.
+
+        Any variables backed by ManifestArray objects will be written as virtual references; any other variables will be loaded into memory before their binary chunk data is written into the store.
+
+        Parameters
+        ----------
+        store: IcechunkStore
+        """
+        from virtualizarr.writers.icechunk import dataset_to_icechunk
+
+        dataset_to_icechunk(self.ds, store)
+
     @overload
     def to_kerchunk(
         self, filepath: None, format: Literal["dict"]

diff --git a/virtualizarr/readers/zarr_v3.py b/virtualizarr/readers/zarr_v3.py
@@ -150,5 +150,7 @@ def _configurable_to_num_codec_config(configurable: dict) -> dict:
     """
     configurable_copy = configurable.copy()
     codec_id = configurable_copy.pop("name")
+    if codec_id.startswith("numcodecs."):
+        codec_id = codec_id[len("numcodecs.") :]
     configuration = configurable_copy.pop("configuration")
     return numcodecs.get_codec({"id": codec_id, **configuration}).get_config()
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
@@ -27,7 +27,7 @@ def test_kerchunk_roundtrip_in_memory_no_concat():
             chunks=(2, 2),
             compressor=None,
             filters=None,
-            fill_value=np.nan,
+            fill_value=None,
             order="C",
         ),
         chunkmanifest=manifest,

diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py
@@ -47,7 +47,7 @@ def test_create_manifestarray_from_kerchunk_refs(self):
         assert marr.chunks == (2, 3)
         assert marr.dtype == np.dtype("int64")
         assert marr.zarray.compressor is None
-        assert marr.zarray.fill_value is np.nan
+        assert marr.zarray.fill_value == 0
         assert marr.zarray.filters is None
         assert marr.zarray.order == "C"
 

diff --git a/virtualizarr/tests/test_readers/test_kerchunk.py b/virtualizarr/tests/test_readers/test_kerchunk.py
@@ -37,7 +37,7 @@ def test_dataset_from_df_refs():
 
     assert da.data.zarray.compressor is None
     assert da.data.zarray.filters is None
-    assert da.data.zarray.fill_value is np.nan
+    assert da.data.zarray.fill_value == 0
     assert da.data.zarray.order == "C"
 
     assert da.data.manifest.dict() == {

diff --git a/virtualizarr/tests/test_writers/conftest.py b/virtualizarr/tests/test_writers/conftest.py
@@ -0,0 +1,27 @@
+import numpy as np
+import pytest
+from xarray import Dataset
+from xarray.core.variable import Variable
+
+from virtualizarr.manifests import ChunkManifest, ManifestArray
+
+
+@pytest.fixture
+def vds_with_manifest_arrays() -> Dataset:
+    arr = ManifestArray(
+        chunkmanifest=ChunkManifest(
+            entries={"0.0": dict(path="/test.nc", offset=6144, length=48)}
+        ),
+        zarray=dict(
+            shape=(2, 3),
+            dtype=np.dtype("<i8"),
+            chunks=(2, 3),
+            compressor={"id": "zlib", "level": 1},
+            filters=None,
+            fill_value=0,
+            order="C",
+            zarr_format=3,
+        ),
+    )
+    var = Variable(dims=["x", "y"], data=arr, attrs={"units": "km"})
+    return Dataset({"a": var}, attrs={"something": 0})