From deb2082ab6e648b7e87cd26a74d084262bd1cfdf Mon Sep 17 00:00:00 2001
From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com>
Date: Sat, 22 Jun 2024 11:03:37 -0700
Subject: [PATCH] Improve zarr chunks docs (#9140)

* Improve zarr chunks docs

Makes them more structured and consistent.

I think this removes a mistake re the default `chunks` arg in `open_zarr`
(it's not `None`, it's `auto`).

Adds a comment re performance with `chunks=None`, closing
https://github.com/pydata/xarray/issues/9111
---
 doc/whats-new.rst       |  2 ++
 xarray/backends/api.py  | 43 +++++++++++++++++++++++++----------------
 xarray/backends/zarr.py | 18 +++++++++++------
 3 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index e7a48458ae2..51a2c98fb9c 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -40,6 +40,8 @@ Bug fixes
 Documentation
 ~~~~~~~~~~~~~
 
+- Improvements to Zarr & chunking docs (:pull:`9139`, :pull:`9140`, :pull:`9132`)
+  By `Maximilian Roos `_
 
 Internal Changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index ea3639db5c4..7054c62126e 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -425,15 +425,19 @@ def open_dataset(
         is chosen based on available dependencies, with a preference for
         "netcdf4". A custom backend class (a subclass of ``BackendEntrypoint``)
         can also be used.
-    chunks : int, dict, 'auto' or None, optional
-        If chunks is provided, it is used to load the new dataset into dask
-        arrays. ``chunks=-1`` loads the dataset with dask using a single
-        chunk for all arrays. ``chunks={}`` loads the dataset with dask using
-        engine preferred chunks if exposed by the backend, otherwise with
-        a single chunk for all arrays. In order to reproduce the default behavior
-        of ``xr.open_zarr(...)`` use ``xr.open_dataset(..., engine='zarr', chunks={})``.
-        ``chunks='auto'`` will use dask ``auto`` chunking taking into account the
-        engine preferred chunks. See dask chunking for more details.
+    chunks : int, dict, 'auto' or None, default: None
+        If provided, used to load the data into dask arrays.
+
+        - ``chunks="auto"`` will use dask ``auto`` chunking taking into account the
+          engine preferred chunks.
+        - ``chunks=None`` skips using dask, which is generally faster for
+          small arrays.
+        - ``chunks=-1`` loads the data with dask using a single chunk for all arrays.
+        - ``chunks={}`` loads the data with dask using the engine's preferred chunk
+          size, generally identical to the format's chunk size. If not available, a
+          single chunk for all arrays.
+
+        See dask chunking for more details.
     cache : bool, optional
         If True, cache data loaded from the underlying datastore in memory as
         NumPy arrays when accessed to avoid reading from the underlying data-
@@ -631,14 +635,19 @@ def open_dataarray(
         Engine to use when reading files. If not provided, the default engine
         is chosen based on available dependencies, with a preference for
         "netcdf4".
-    chunks : int, dict, 'auto' or None, optional
-        If chunks is provided, it is used to load the new dataset into dask
-        arrays. ``chunks=-1`` loads the dataset with dask using a single
-        chunk for all arrays. `chunks={}`` loads the dataset with dask using
-        engine preferred chunks if exposed by the backend, otherwise with
-        a single chunk for all arrays.
-        ``chunks='auto'`` will use dask ``auto`` chunking taking into account the
-        engine preferred chunks. See dask chunking for more details.
+ chunks : int, dict, 'auto' or None, default: None + If provided, used to load the data into dask arrays. + + - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. + - ``chunks=None`` skips using dask, which is generally faster for + small arrays. + - ``chunks=-1`` loads the data with dask using a single chunk for all arrays. + - ``chunks={}`` loads the data with dask using engine preferred chunks if + exposed by the backend, otherwise with a single chunk for all arrays. + + See dask chunking for more details. + cache : bool, optional If True, cache data loaded from the underlying datastore in memory as NumPy arrays when accessed to avoid reading from the underlying data- diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 5f6aa0f119c..9796fcbf9e2 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -973,12 +973,18 @@ def open_zarr( Array synchronizer provided to zarr group : str, optional Group path. (a.k.a. `path` in zarr terminology.) - chunks : int or dict or tuple or {None, 'auto'}, optional - Chunk sizes along each dimension, e.g., ``5`` or - ``{'x': 5, 'y': 5}``. If `chunks='auto'`, dask chunks are created - based on the variable's zarr chunks. If `chunks=None`, zarr array - data will lazily convert to numpy arrays upon access. This accepts - all the chunk specifications as Dask does. + chunks : int, dict, 'auto' or None, default: 'auto' + If provided, used to load the data into dask arrays. + + - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. + - ``chunks=None`` skips using dask, which is generally faster for + small arrays. + - ``chunks=-1`` loads the data with dask using a single chunk for all arrays. + - ``chunks={}`` loads the data with dask using engine preferred chunks if + exposed by the backend, otherwise with a single chunk for all arrays. + + See dask chunking for more details. overwrite_encoded_chunks : bool, optional Whether to drop the zarr chunks encoded for each variable when a dataset is loaded with specified chunk sizes (default: False)