From f961606c48c50fd437a271e1b71c6816621e95dc Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Fri, 25 Sep 2020 19:05:48 +0200 Subject: [PATCH 01/46] add in api.open_dataset dispatching to stub apiv2 --- xarray/backends/api.py | 8 +- xarray/backends/apiv2.py | 307 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 314 insertions(+), 1 deletion(-) create mode 100644 xarray/backends/apiv2.py diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 9ea222954f4..3486981d0b2 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1,4 +1,4 @@ -import os.path +import os import warnings from glob import glob from io import BytesIO @@ -428,6 +428,12 @@ def open_dataset( -------- open_mfdataset """ + if os.environ.get("XARRAY_BACKEND_API", "v1") == "v1": + kwargs = locals().copy() + from . import apiv2 + if engine in apiv2.ENGINES: + return apiv2.open_dataset(**kwargs) + if autoclose is not None: warnings.warn( "The autoclose argument is no longer used by " diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py new file mode 100644 index 00000000000..13a70425739 --- /dev/null +++ b/xarray/backends/apiv2.py @@ -0,0 +1,307 @@ +import os +import warnings +from pathlib import Path +from typing import ( + TYPE_CHECKING, + +) + +from .. import backends, conventions + +from ..core.dataset import Dataset +from ..core.utils import close_on_error, is_grib_path, is_remote_uri +from .api import ( + _get_backend_cls, + _get_default_engine, + _get_engine_from_magic_number, + _normalize_path, + _protect_dataset_variables_inplace, +) +from .common import AbstractDataStore, ArrayWriter + +if TYPE_CHECKING: + try: + from dask.delayed import Delayed + except ImportError: + Delayed = None + + +DATAARRAY_NAME = "__xarray_dataarray_name__" +DATAARRAY_VARIABLE = "__xarray_dataarray_variable__" + +ENGINES = { + "h5netcdf": backends.H5NetCDFStore.open, +} + + +def open_dataset( + filename_or_obj, + group=None, + decode_cf=True, + mask_and_scale=None, + decode_times=True, + autoclose=None, + concat_characters=True, + decode_coords=True, + engine=None, + chunks=None, + lock=None, + cache=None, + drop_variables=None, + backend_kwargs=None, + use_cftime=None, + decode_timedelta=None, +): + """Open and decode a dataset from a file or file-like object. + + Parameters + ---------- + filename_or_obj : str, Path, file-like or DataStore + Strings and Path objects are interpreted as a path to a netCDF file + or an OpenDAP URL and opened with python-netCDF4, unless the filename + ends with .gz, in which case the file is gunzipped and opened with + scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like + objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). + group : str, optional + Path to the netCDF4 group in the given file to open (only works for + netCDF4 files). + decode_cf : bool, optional + Whether to decode these variables, assuming they were saved according + to CF conventions. + mask_and_scale : bool, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. mask_and_scale defaults to True except for the + pseudonetcdf backend. 
+ decode_times : bool, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, leave them encoded as numbers. + autoclose : bool, optional + If True, automatically close files to avoid OS Error of too many files + being open. However, this option doesn't work with streams, e.g., + BytesIO. + concat_characters : bool, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + decode_coords : bool, optional + If True, decode the 'coordinates' attribute to identify coordinates in + the resulting dataset. + engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ + "pseudonetcdf", "zarr"}, optional + Engine to use when reading files. If not provided, the default engine + is chosen based on available dependencies, with a preference for + "netcdf4". + chunks : int or dict, optional + If chunks is provided, it is used to load the new dataset into dask + arrays. ``chunks={}`` loads the dataset with dask using a single + chunk for all arrays. When using ``engine="zarr"``, setting + ``chunks='auto'`` will create dask chunks based on the variable's zarr + chunks. + lock : False or lock-like, optional + Resource lock to use when reading data from disk. Only relevant when + using dask or another form of parallelism. By default, appropriate + locks are chosen to safely read and write files with the currently + active dask scheduler. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. + drop_variables: str or iterable, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + backend_kwargs: dict, optional + A dictionary of keyword arguments to pass on to the backend. This + may be useful when backend options would improve performance or + allow user control of dataset processing. + use_cftime: bool, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. + decode_timedelta : bool, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of decode_time. + + Returns + ------- + dataset : Dataset + The newly created dataset. + + Notes + ----- + ``open_dataset`` opens the file with read-only access. 
When you modify + values of a Dataset, even one linked to files on disk, only the in-memory + copy you are manipulating in xarray is modified: the original file on disk + is never touched. + + See Also + -------- + open_mfdataset + """ + + if autoclose is not None: + warnings.warn( + "The autoclose argument is no longer used by " + "xarray.open_dataset() and is now ignored; it will be removed in " + "a future version of xarray. If necessary, you can control the " + "maximum number of simultaneous open files with " + "xarray.set_options(file_cache_maxsize=...).", + FutureWarning, + stacklevel=2, + ) + + if mask_and_scale is None: + mask_and_scale = not engine == "pseudonetcdf" + + if not decode_cf: + mask_and_scale = False + decode_times = False + concat_characters = False + decode_coords = False + decode_timedelta = False + + if cache is None: + cache = chunks is None + + if backend_kwargs is None: + backend_kwargs = {} + extra_kwargs = {} + + def maybe_decode_store(store, chunks, lock=False): + ds = conventions.decode_cf( + store, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + concat_characters=concat_characters, + decode_coords=decode_coords, + drop_variables=drop_variables, + use_cftime=use_cftime, + decode_timedelta=decode_timedelta, + ) + + _protect_dataset_variables_inplace(ds, cache) + + if chunks is not None and engine != "zarr": + from dask.base import tokenize + + # if passed an actual file path, augment the token with + # the file modification time + if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj): + mtime = os.path.getmtime(filename_or_obj) + else: + mtime = None + token = tokenize( + filename_or_obj, + mtime, + group, + decode_cf, + mask_and_scale, + decode_times, + concat_characters, + decode_coords, + engine, + chunks, + drop_variables, + use_cftime, + decode_timedelta, + ) + name_prefix = "open_dataset-%s" % token + ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) + + elif engine == "zarr": + # adapted from Dataset.Chunk() and taken from open_zarr + if not (isinstance(chunks, (int, dict)) or chunks is None): + if chunks != "auto": + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. 
" % chunks + ) + + if chunks == "auto": + try: + import dask.array # noqa + except ImportError: + chunks = None + + # auto chunking needs to be here and not in ZarrStore because + # the variable chunks does not survive decode_cf + # return trivial case + if chunks is None: + return ds + + if isinstance(chunks, int): + chunks = dict.fromkeys(ds.dims, chunks) + + variables = { + k: store.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) + for k, v in ds.variables.items() + } + ds2 = ds._replace(variables) + + else: + ds2 = ds + ds2._file_obj = ds._file_obj + return ds2 + + if isinstance(filename_or_obj, Path): + filename_or_obj = str(filename_or_obj) + + if isinstance(filename_or_obj, AbstractDataStore): + store = filename_or_obj + else: + if isinstance(filename_or_obj, str): + filename_or_obj = _normalize_path(filename_or_obj) + + if engine is None: + engine = _get_default_engine(filename_or_obj, allow_remote=True) + elif engine != "zarr": + if engine not in [None, "scipy", "h5netcdf"]: + raise ValueError( + "can only read bytes or file-like objects " + "with engine='scipy' or 'h5netcdf'" + ) + engine = _get_engine_from_magic_number(filename_or_obj) + + if engine in ["netcdf4", "h5netcdf"]: + extra_kwargs["group"] = group + extra_kwargs["lock"] = lock + elif engine in ["pynio", "pseudonetcdf", "cfgrib"]: + extra_kwargs["lock"] = lock + elif engine == "zarr": + backend_kwargs = backend_kwargs.copy() + overwrite_encoded_chunks = backend_kwargs.pop( + "overwrite_encoded_chunks", None + ) + extra_kwargs["mode"] = "r" + extra_kwargs["group"] = group + + opener = _get_backend_cls(engine) + store = opener(filename_or_obj, **extra_kwargs, **backend_kwargs) + + with close_on_error(store): + ds = maybe_decode_store(store, chunks) + + # Ensure source filename always stored in dataset object (GH issue #2550) + if "source" not in ds.encoding: + if isinstance(filename_or_obj, str): + ds.encoding["source"] = filename_or_obj + + return ds + From fb166faaad0c30aa5d504e7ff3485ba9ce27ef78 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Fri, 25 Sep 2020 19:17:23 +0200 Subject: [PATCH 02/46] remove in apiv2 check for input AbstractDataStore --- xarray/backends/apiv2.py | 56 +++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 13a70425739..65d3e528fe4 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -263,37 +263,34 @@ def maybe_decode_store(store, chunks, lock=False): if isinstance(filename_or_obj, Path): filename_or_obj = str(filename_or_obj) - if isinstance(filename_or_obj, AbstractDataStore): - store = filename_or_obj - else: - if isinstance(filename_or_obj, str): - filename_or_obj = _normalize_path(filename_or_obj) - - if engine is None: - engine = _get_default_engine(filename_or_obj, allow_remote=True) - elif engine != "zarr": - if engine not in [None, "scipy", "h5netcdf"]: - raise ValueError( - "can only read bytes or file-like objects " - "with engine='scipy' or 'h5netcdf'" - ) - engine = _get_engine_from_magic_number(filename_or_obj) - - if engine in ["netcdf4", "h5netcdf"]: - extra_kwargs["group"] = group - extra_kwargs["lock"] = lock - elif engine in ["pynio", "pseudonetcdf", "cfgrib"]: - extra_kwargs["lock"] = lock - elif engine == "zarr": - backend_kwargs = backend_kwargs.copy() - overwrite_encoded_chunks = backend_kwargs.pop( - "overwrite_encoded_chunks", None + if isinstance(filename_or_obj, str): + filename_or_obj = _normalize_path(filename_or_obj) + + if 
engine is None: + engine = _get_default_engine(filename_or_obj, allow_remote=True) + elif engine != "zarr": + if engine not in [None, "scipy", "h5netcdf"]: + raise ValueError( + "can only read bytes or file-like objects " + "with engine='scipy' or 'h5netcdf'" ) - extra_kwargs["mode"] = "r" - extra_kwargs["group"] = group + engine = _get_engine_from_magic_number(filename_or_obj) + + if engine in ["netcdf4", "h5netcdf"]: + extra_kwargs["group"] = group + extra_kwargs["lock"] = lock + elif engine in ["pynio", "pseudonetcdf", "cfgrib"]: + extra_kwargs["lock"] = lock + elif engine == "zarr": + backend_kwargs = backend_kwargs.copy() + overwrite_encoded_chunks = backend_kwargs.pop( + "overwrite_encoded_chunks", None + ) + extra_kwargs["mode"] = "r" + extra_kwargs["group"] = group - opener = _get_backend_cls(engine) - store = opener(filename_or_obj, **extra_kwargs, **backend_kwargs) + opener = _get_backend_cls(engine) + store = opener(filename_or_obj, **extra_kwargs, **backend_kwargs) with close_on_error(store): ds = maybe_decode_store(store, chunks) @@ -304,4 +301,3 @@ def maybe_decode_store(store, chunks, lock=False): ds.encoding["source"] = filename_or_obj return ds - From 0221eec647228ad5d296c614f4307d3707209be9 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Fri, 25 Sep 2020 20:15:18 +0200 Subject: [PATCH 03/46] bugfix typo --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 3486981d0b2..ce14c63de36 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -428,7 +428,7 @@ def open_dataset( -------- open_mfdataset """ - if os.environ.get("XARRAY_BACKEND_API", "v1") == "v1": + if os.environ.get("XARRAY_BACKEND_API", "v1") == "v2": kwargs = locals().copy() from . import apiv2 if engine in apiv2.ENGINES: From 36a02c706cf62808c5f49d2a9abfc8951cce9200 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Fri, 25 Sep 2020 20:17:26 +0200 Subject: [PATCH 04/46] add kwarg engines in _get_backend_cls needed by apiv2 --- xarray/backends/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index ce14c63de36..a177634fa5d 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -163,10 +163,10 @@ def _get_default_engine(path, allow_remote=False): return engine -def _get_backend_cls(engine): +def _get_backend_cls(engine, engines=ENGINES): """Select open_dataset method based on current engine""" try: - return ENGINES[engine] + return engines[engine] except KeyError: raise ValueError( "unrecognized engine for open_dataset: {}\n" From cfb8cb8b592e149079f32d72aa203ef743d95084 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Fri, 25 Sep 2020 20:29:51 +0200 Subject: [PATCH 05/46] add alpha support for h5netcdf --- xarray/backends/apiv2.py | 70 +++++++++++------------------------- xarray/backends/h5netcdf_.py | 43 +++++++++++++++++++++- 2 files changed, 62 insertions(+), 51 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 65d3e528fe4..92dbd2f307b 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -17,7 +17,7 @@ _normalize_path, _protect_dataset_variables_inplace, ) -from .common import AbstractDataStore, ArrayWriter +from . 
import h5netcdf_ if TYPE_CHECKING: try: @@ -30,7 +30,7 @@ DATAARRAY_VARIABLE = "__xarray_dataarray_variable__" ENGINES = { - "h5netcdf": backends.H5NetCDFStore.open, + "h5netcdf": h5netcdf_.open_dataset_h5necdf_, } @@ -184,21 +184,10 @@ def open_dataset( backend_kwargs = {} extra_kwargs = {} - def maybe_decode_store(store, chunks, lock=False): - ds = conventions.decode_cf( - store, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - concat_characters=concat_characters, - decode_coords=decode_coords, - drop_variables=drop_variables, - use_cftime=use_cftime, - decode_timedelta=decode_timedelta, - ) - + def chunk_backend_ds(ds, chunks, lock=False): _protect_dataset_variables_inplace(ds, cache) - if chunks is not None and engine != "zarr": + if chunks is not None: from dask.base import tokenize # if passed an actual file path, augment the token with @@ -225,36 +214,6 @@ def maybe_decode_store(store, chunks, lock=False): name_prefix = "open_dataset-%s" % token ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) - elif engine == "zarr": - # adapted from Dataset.Chunk() and taken from open_zarr - if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks - ) - - if chunks == "auto": - try: - import dask.array # noqa - except ImportError: - chunks = None - - # auto chunking needs to be here and not in ZarrStore because - # the variable chunks does not survive decode_cf - # return trivial case - if chunks is None: - return ds - - if isinstance(chunks, int): - chunks = dict.fromkeys(ds.dims, chunks) - - variables = { - k: store.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) - for k, v in ds.variables.items() - } - ds2 = ds._replace(variables) - else: ds2 = ds ds2._file_obj = ds._file_obj @@ -289,11 +248,22 @@ def maybe_decode_store(store, chunks, lock=False): extra_kwargs["mode"] = "r" extra_kwargs["group"] = group - opener = _get_backend_cls(engine) - store = opener(filename_or_obj, **extra_kwargs, **backend_kwargs) - - with close_on_error(store): - ds = maybe_decode_store(store, chunks) + opener = _get_backend_cls(engine, engines=ENGINES) + + backend_ds = opener( + filename_or_obj, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + concat_characters=concat_characters, + decode_coords=decode_coords, + drop_variables=drop_variables, + use_cftime=use_cftime, + decode_timedelta=decode_timedelta, + **backend_kwargs, + **extra_kwargs + ) + + ds = chunk_backend_ds(backend_ds, chunks, lock=False) # Ensure source filename always stored in dataset object (GH issue #2550) if "source" not in ds.encoding: diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index f3e61eeee74..4735d9d87ea 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -3,8 +3,10 @@ import numpy as np +from .. 
import conventions from ..core import indexing -from ..core.utils import FrozenDict, is_remote_uri +from ..core.dataset import Dataset +from ..core.utils import close_on_error, FrozenDict, is_remote_uri from ..core.variable import Variable from .common import WritableCFDataStore, find_root_and_group from .file_manager import CachingFileManager, DummyFileManager @@ -302,3 +304,42 @@ def sync(self): def close(self, **kwargs): self._manager.close(**kwargs) + + +def open_dataset_h5necdf_( + filename_or_obj, + mask_and_scale=None, + decode_times=None, + concat_characters=None, + decode_coords=None, + drop_variables=None, + use_cftime=None, + decode_timedelta=None, + **kwargs +): + store = H5NetCDFStore.open(filename_or_obj, **kwargs) + + with close_on_error(store): + vars, attrs = store.load() + extra_coords = set() + file_obj = store + encoding = store.get_encoding() + + vars, attrs, coord_names = conventions.decode_cf_variables( + vars, + attrs, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + concat_characters=concat_characters, + decode_coords=decode_coords, + drop_variables=drop_variables, + use_cftime=use_cftime, + decode_timedelta=decode_timedelta, + ) + + ds = Dataset(vars, attrs=attrs) + ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) + ds._file_obj = file_obj + ds.encoding = encoding + + return ds \ No newline at end of file From 4256bc89c645648ef0ef7d55ae38e2dd2cef5f06 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Mon, 28 Sep 2020 08:55:12 +0200 Subject: [PATCH 06/46] style: clean not used code, modify some variable/function name --- xarray/backends/apiv2.py | 22 +++------------------- xarray/backends/h5netcdf_.py | 2 +- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 92dbd2f307b..7e79d1246c4 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -1,12 +1,7 @@ import os import warnings from pathlib import Path -from typing import ( - TYPE_CHECKING, -) - -from .. import backends, conventions from ..core.dataset import Dataset from ..core.utils import close_on_error, is_grib_path, is_remote_uri @@ -19,18 +14,8 @@ ) from . 
import h5netcdf_ -if TYPE_CHECKING: - try: - from dask.delayed import Delayed - except ImportError: - Delayed = None - - -DATAARRAY_NAME = "__xarray_dataarray_name__" -DATAARRAY_VARIABLE = "__xarray_dataarray_variable__" - ENGINES = { - "h5netcdf": h5netcdf_.open_dataset_h5necdf_, + "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf, } @@ -155,7 +140,6 @@ def open_dataset( -------- open_mfdataset """ - if autoclose is not None: warnings.warn( "The autoclose argument is no longer used by " @@ -248,9 +232,9 @@ def chunk_backend_ds(ds, chunks, lock=False): extra_kwargs["mode"] = "r" extra_kwargs["group"] = group - opener = _get_backend_cls(engine, engines=ENGINES) + open_backend_dataset = _get_backend_cls(engine, engines=ENGINES) - backend_ds = opener( + backend_ds = open_backend_dataset( filename_or_obj, mask_and_scale=mask_and_scale, decode_times=decode_times, diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 4735d9d87ea..4af0947f16d 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -306,7 +306,7 @@ def close(self, **kwargs): self._manager.close(**kwargs) -def open_dataset_h5necdf_( +def open_backend_dataset_h5necdf( filename_or_obj, mask_and_scale=None, decode_times=None, From 1bc73919a85cb86b5d0d2e2c894e619edf838184 Mon Sep 17 00:00:00 2001 From: TheRed86 Date: Mon, 28 Sep 2020 11:51:59 +0200 Subject: [PATCH 07/46] Add ENGINES entry for cfgrib. --- xarray/backends/apiv2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 7e79d1246c4..6a259101db7 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -12,10 +12,11 @@ _normalize_path, _protect_dataset_variables_inplace, ) -from . import h5netcdf_ +from . import h5netcdf_, cfgrib_ ENGINES = { "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf, + "cfgrib": cfgrib_.open_backend_dataset_cfgrib, } From 748fe5acc80f4ee9302f2f62dd2adf03d0f4c80a Mon Sep 17 00:00:00 2001 From: TheRed86 Date: Mon, 28 Sep 2020 11:53:32 +0200 Subject: [PATCH 08/46] Define function open_backend_dataset_cfgrib() to be used in apiv2.py. Add necessary imports for this function. --- xarray/backends/cfgrib_.py | 43 +++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index bd946df89b2..1be7c4c7504 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -1,7 +1,9 @@ import numpy as np +from .. 
import conventions from ..core import indexing -from ..core.utils import Frozen, FrozenDict +from ..core.dataset import Dataset +from ..core.utils import close_on_error, Frozen, FrozenDict from ..core.variable import Variable from .common import AbstractDataStore, BackendArray from .locks import SerializableLock, ensure_lock @@ -69,3 +71,42 @@ def get_encoding(self): dims = self.get_dimensions() encoding = {"unlimited_dims": {k for k, v in dims.items() if v is None}} return encoding + + +def open_backend_dataset_cfgrib( + filename_or_obj, + mask_and_scale=None, + decode_times=None, + concat_characters=None, + decode_coords=None, + drop_variables=None, + use_cftime=None, + decode_timedelta=None, + **kwargs +): + store = CfGribDataStore(filename_or_obj, **kwargs) + + with close_on_error(store): + vars, attrs = store.load() + extra_coords = set() + file_obj = store + encoding = store.get_encoding() + + vars, attrs, coord_names = conventions.decode_cf_variables( + vars, + attrs, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + concat_characters=concat_characters, + decode_coords=decode_coords, + drop_variables=drop_variables, + use_cftime=use_cftime, + decode_timedelta=decode_timedelta, + ) + + ds = Dataset(vars, attrs=attrs) + ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) + ds._file_obj = file_obj + ds.encoding = encoding + + return ds From fb368feca2354f84b1bc91738d32fed90433b62d Mon Sep 17 00:00:00 2001 From: TheRed86 Date: Mon, 28 Sep 2020 11:55:22 +0200 Subject: [PATCH 09/46] Apply black to check formatting. --- xarray/backends/apiv2.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 6a259101db7..0d1ae216af5 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -227,9 +227,7 @@ def chunk_backend_ds(ds, chunks, lock=False): extra_kwargs["lock"] = lock elif engine == "zarr": backend_kwargs = backend_kwargs.copy() - overwrite_encoded_chunks = backend_kwargs.pop( - "overwrite_encoded_chunks", None - ) + overwrite_encoded_chunks = backend_kwargs.pop("overwrite_encoded_chunks", None) extra_kwargs["mode"] = "r" extra_kwargs["group"] = group @@ -245,7 +243,7 @@ def chunk_backend_ds(ds, chunks, lock=False): use_cftime=use_cftime, decode_timedelta=decode_timedelta, **backend_kwargs, - **extra_kwargs + **extra_kwargs, ) ds = chunk_backend_ds(backend_ds, chunks, lock=False) From 80e111cbd7a3393f1faef5645dc34878856e4351 Mon Sep 17 00:00:00 2001 From: TheRed86 Date: Mon, 28 Sep 2020 11:56:59 +0200 Subject: [PATCH 10/46] Apply black to check formatting. 
--- xarray/backends/cfgrib_.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 1be7c4c7504..6b038142ea2 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -74,15 +74,15 @@ def get_encoding(self): def open_backend_dataset_cfgrib( - filename_or_obj, - mask_and_scale=None, - decode_times=None, - concat_characters=None, - decode_coords=None, - drop_variables=None, - use_cftime=None, - decode_timedelta=None, - **kwargs + filename_or_obj, + mask_and_scale=None, + decode_times=None, + concat_characters=None, + decode_coords=None, + drop_variables=None, + use_cftime=None, + decode_timedelta=None, + **kwargs, ): store = CfGribDataStore(filename_or_obj, **kwargs) From e15ca6b3965cf846da438abb128a6b5d1128a03e Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Mon, 28 Sep 2020 18:17:06 +0200 Subject: [PATCH 11/46] add dummy zarr apiv2 backend --- xarray/backends/apiv2.py | 36 ++++++++++++++++++++++++++++++--- xarray/backends/zarr.py | 43 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 7e79d1246c4..728dd04cc6c 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -12,10 +12,11 @@ _normalize_path, _protect_dataset_variables_inplace, ) -from . import h5netcdf_ +from . import h5netcdf_, zarr ENGINES = { "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf, + "zaar": zarr.open_backend_dataset_zarr, } @@ -170,8 +171,7 @@ def open_dataset( def chunk_backend_ds(ds, chunks, lock=False): _protect_dataset_variables_inplace(ds, cache) - - if chunks is not None: + if chunks is not None and engine != "zarr": from dask.base import tokenize # if passed an actual file path, augment the token with @@ -198,6 +198,36 @@ def chunk_backend_ds(ds, chunks, lock=False): name_prefix = "open_dataset-%s" % token ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) + elif engine == "zarr": + # adapted from Dataset.Chunk() and taken from open_zarr + if not (isinstance(chunks, (int, dict)) or chunks is None): + if chunks != "auto": + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks + ) + + if chunks == "auto": + try: + import dask.array # noqa + except ImportError: + chunks = None + + # auto chunking needs to be here and not in ZarrStore because + # the variable chunks does not survive decode_cf + # return trivial case + if chunks is None: + return ds + + if isinstance(chunks, int): + chunks = dict.fromkeys(ds.dims, chunks) + + variables = { + k: store.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) + for k, v in ds.variables.items() + } + ds2 = ds._replace(variables) + else: ds2 = ds ds2._file_obj = ds._file_obj diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 260d27fbabe..bf80fa87bf2 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -2,10 +2,12 @@ import numpy as np + from .. 
import coding, conventions from ..core import indexing +from ..core.dataset import Dataset from ..core.pycompat import integer_types -from ..core.utils import FrozenDict, HiddenKeyDict +from ..core.utils import close_on_error, FrozenDict, HiddenKeyDict from ..core.variable import Variable from .common import AbstractWritableDataStore, BackendArray, _encode_variable_name @@ -677,3 +679,42 @@ def open_zarr( ) return ds + + +def open_backend_dataset_zarr( + filename_or_obj, + mask_and_scale=None, + decode_times=None, + concat_characters=None, + decode_coords=None, + drop_variables=None, + use_cftime=None, + decode_timedelta=None, + **kwargs +): + store = ZarrStore.open_group.open(filename_or_obj, **kwargs) + + with close_on_error(store): + vars, attrs = store.load() + extra_coords = set() + file_obj = store + encoding = store.get_encoding() + + vars, attrs, coord_names = conventions.decode_cf_variables( + vars, + attrs, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + concat_characters=concat_characters, + decode_coords=decode_coords, + drop_variables=drop_variables, + use_cftime=use_cftime, + decode_timedelta=decode_timedelta, + ) + + ds = Dataset(vars, attrs=attrs) + ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) + ds._file_obj = file_obj + ds.encoding = encoding + + return ds From 4b19399d17a10df55a0114f0d4dee41a4b3163b0 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Mon, 28 Sep 2020 18:48:35 +0200 Subject: [PATCH 12/46] align apiv2.open_dataset to api.open_dataset --- xarray/backends/api.py | 1 + xarray/backends/apiv2.py | 17 ++++------------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 58f1a6806d7..2c6949987c7 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -154,6 +154,7 @@ def _get_default_engine(path, allow_remote=False): engine = _get_default_engine_netcdf() return engine + def _autodetect_engine(filename_or_obj): if isinstance(filename_or_obj, str): engine = _get_default_engine(filename_or_obj, allow_remote=True) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 728dd04cc6c..92fc8e7fd67 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -11,6 +11,7 @@ _get_engine_from_magic_number, _normalize_path, _protect_dataset_variables_inplace, + _autodetect_engine, ) from . 
import h5netcdf_, zarr @@ -233,21 +234,11 @@ def chunk_backend_ds(ds, chunks, lock=False): ds2._file_obj = ds._file_obj return ds2 - if isinstance(filename_or_obj, Path): - filename_or_obj = str(filename_or_obj) - if isinstance(filename_or_obj, str): - filename_or_obj = _normalize_path(filename_or_obj) + filename_or_obj = _normalize_path(filename_or_obj) - if engine is None: - engine = _get_default_engine(filename_or_obj, allow_remote=True) - elif engine != "zarr": - if engine not in [None, "scipy", "h5netcdf"]: - raise ValueError( - "can only read bytes or file-like objects " - "with engine='scipy' or 'h5netcdf'" - ) - engine = _get_engine_from_magic_number(filename_or_obj) + if engine is None: + engine = _autodetect_engine(filename_or_obj) if engine in ["netcdf4", "h5netcdf"]: extra_kwargs["group"] = group From 572595f5e2aeb94b25cfd3e60c22667f59c72912 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Tue, 29 Sep 2020 09:16:00 +0200 Subject: [PATCH 13/46] remove unused extra_coords in open_backend_dataset_* --- xarray/backends/apiv2.py | 1 - xarray/backends/h5netcdf_.py | 3 +-- xarray/backends/zarr.py | 3 +-- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 92fc8e7fd67..f9d345ff486 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -234,7 +234,6 @@ def chunk_backend_ds(ds, chunks, lock=False): ds2._file_obj = ds._file_obj return ds2 - filename_or_obj = _normalize_path(filename_or_obj) if engine is None: diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index fc6480e41bb..8dcb0a28681 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -340,7 +340,6 @@ def open_backend_dataset_h5necdf( with close_on_error(store): vars, attrs = store.load() - extra_coords = set() file_obj = store encoding = store.get_encoding() @@ -357,7 +356,7 @@ def open_backend_dataset_h5necdf( ) ds = Dataset(vars, attrs=attrs) - ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) + ds = ds.set_coords(coord_names.intersection(vars)) ds._file_obj = file_obj ds.encoding = encoding diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index bf80fa87bf2..7e7e3fc0040 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -696,7 +696,6 @@ def open_backend_dataset_zarr( with close_on_error(store): vars, attrs = store.load() - extra_coords = set() file_obj = store encoding = store.get_encoding() @@ -713,7 +712,7 @@ def open_backend_dataset_zarr( ) ds = Dataset(vars, attrs=attrs) - ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) + ds = ds.set_coords(coord_names.intersection(vars)) ds._file_obj = file_obj ds.encoding = encoding From 74aba140a5dafd8fe366680fa368b9c49eaf818f Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Tue, 29 Sep 2020 09:38:10 +0200 Subject: [PATCH 14/46] remove extra_coords in open_backend_dataset_cfgrib --- xarray/backends/cfgrib_.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 6b038142ea2..6f44eee3827 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -88,7 +88,6 @@ def open_backend_dataset_cfgrib( with close_on_error(store): vars, attrs = store.load() - extra_coords = set() file_obj = store encoding = store.get_encoding() @@ -105,7 +104,7 @@ def open_backend_dataset_cfgrib( ) ds = Dataset(vars, attrs=attrs) - ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) + 
ds = ds.set_coords(coord_names.intersection(vars)) ds._file_obj = file_obj ds.encoding = encoding From d6280ec1e65659cf130fe30387d5f3e20dc3845a Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Tue, 29 Sep 2020 11:42:09 +0200 Subject: [PATCH 15/46] transform zarr maybe_chunk and get_chunks in classmethod - to be used in apiv2 without instantiate the object --- xarray/backends/zarr.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 7e7e3fc0040..76d6ac8a61b 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -363,7 +363,8 @@ def encode_variable(self, variable): def encode_attribute(self, a): return encode_zarr_attr_value(a) - def get_chunk(self, name, var, chunks): + @classmethod + def get_chunk(cls, name, var, chunks): chunk_spec = dict(zip(var.dims, var.encoding.get("chunks"))) # Coordinate labels aren't chunked @@ -392,8 +393,9 @@ def get_chunk(self, name, var, chunks): chunk_spec[dim] = chunks[dim] return chunk_spec - def maybe_chunk(self, name, var, chunks, overwrite_encoded_chunks): - chunk_spec = self.get_chunk(name, var, chunks) + @classmethod + def maybe_chunk(cls, name, var, chunks, overwrite_encoded_chunks): + chunk_spec = cls.get_chunk(name, var, chunks) if (var.ndim > 0) and (chunk_spec is not None): from dask.base import tokenize @@ -692,7 +694,7 @@ def open_backend_dataset_zarr( decode_timedelta=None, **kwargs ): - store = ZarrStore.open_group.open(filename_or_obj, **kwargs) + store = ZarrStore.open_group(filename_or_obj, **kwargs) with close_on_error(store): vars, attrs = store.load() From c0e0f344c1409152fb26558c4e94706b45e4ef41 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Tue, 29 Sep 2020 11:48:37 +0200 Subject: [PATCH 16/46] make alpha zarr apiv2 working --- xarray/backends/apiv2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index c159719b674..5e596d87059 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -17,7 +17,7 @@ ENGINES = { "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf, - "zaar": zarr.open_backend_dataset_zarr, + "zarr": zarr.open_backend_dataset_zarr, "cfgrib": cfgrib_.open_backend_dataset_cfgrib, } @@ -225,7 +225,7 @@ def chunk_backend_ds(ds, chunks, lock=False): chunks = dict.fromkeys(ds.dims, chunks) variables = { - k: store.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) + k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) for k, v in ds.variables.items() } ds2 = ds._replace(variables) From 6431101b61f4f9ac22b549e3fa25cb6f9f160977 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Tue, 29 Sep 2020 17:19:39 +0200 Subject: [PATCH 17/46] refactor apiv2.open_dataset: - modify signature - move default setting inside backends --- xarray/backends/apiv2.py | 72 +++++------------------------------- xarray/backends/cfgrib_.py | 37 +++++++++++++++++- xarray/backends/h5netcdf_.py | 36 ++++++++++++++++-- xarray/backends/zarr.py | 48 +++++++++++++++++++----- xarray/core/variable.py | 7 +--- 5 files changed, 116 insertions(+), 84 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 5e596d87059..405e23399c7 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -1,14 +1,9 @@ import os -import warnings -from pathlib import Path - from ..core.dataset import Dataset from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .api import ( _get_backend_cls, - 
_get_default_engine, - _get_engine_from_magic_number, _normalize_path, _protect_dataset_variables_inplace, _autodetect_engine, @@ -22,23 +17,18 @@ } +def filter_None(d): + return {k: v for k, v in d.items() if v is not None} + + def open_dataset( filename_or_obj, - group=None, - decode_cf=True, - mask_and_scale=None, - decode_times=True, - autoclose=None, - concat_characters=True, - decode_coords=True, + *, engine=None, chunks=None, - lock=None, cache=None, - drop_variables=None, backend_kwargs=None, - use_cftime=None, - decode_timedelta=None, + **kwargs ): """Open and decode a dataset from a file or file-like object. @@ -143,33 +133,12 @@ def open_dataset( -------- open_mfdataset """ - if autoclose is not None: - warnings.warn( - "The autoclose argument is no longer used by " - "xarray.open_dataset() and is now ignored; it will be removed in " - "a future version of xarray. If necessary, you can control the " - "maximum number of simultaneous open files with " - "xarray.set_options(file_cache_maxsize=...).", - FutureWarning, - stacklevel=2, - ) - - if mask_and_scale is None: - mask_and_scale = not engine == "pseudonetcdf" - - if not decode_cf: - mask_and_scale = False - decode_times = False - concat_characters = False - decode_coords = False - decode_timedelta = False if cache is None: cache = chunks is None if backend_kwargs is None: backend_kwargs = {} - extra_kwargs = {} def chunk_backend_ds(ds, chunks, lock=False): _protect_dataset_variables_inplace(ds, cache) @@ -185,17 +154,10 @@ def chunk_backend_ds(ds, chunks, lock=False): token = tokenize( filename_or_obj, mtime, - group, - decode_cf, - mask_and_scale, - decode_times, - concat_characters, - decode_coords, engine, chunks, - drop_variables, - use_cftime, - decode_timedelta, + **backend_kwargs, + **kwargs ) name_prefix = "open_dataset-%s" % token ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) @@ -240,32 +202,18 @@ def chunk_backend_ds(ds, chunks, lock=False): if engine is None: engine = _autodetect_engine(filename_or_obj) - if engine in ["netcdf4", "h5netcdf"]: - extra_kwargs["group"] = group - extra_kwargs["lock"] = lock - elif engine in ["pynio", "pseudonetcdf", "cfgrib"]: - extra_kwargs["lock"] = lock - elif engine == "zarr": + if engine == "zarr": backend_kwargs = backend_kwargs.copy() overwrite_encoded_chunks = backend_kwargs.pop( "overwrite_encoded_chunks", None ) - extra_kwargs["mode"] = "r" - extra_kwargs["group"] = group open_backend_dataset = _get_backend_cls(engine, engines=ENGINES) backend_ds = open_backend_dataset( filename_or_obj, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - concat_characters=concat_characters, - decode_coords=decode_coords, - drop_variables=drop_variables, - use_cftime=use_cftime, - decode_timedelta=decode_timedelta, **backend_kwargs, - **extra_kwargs + **filter_None(kwargs), ) ds = chunk_backend_ds(backend_ds, chunks, lock=False) diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 6f44eee3827..2764b0cab5a 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -1,3 +1,4 @@ +import warnings import numpy as np from .. 
import conventions @@ -75,16 +76,48 @@ def get_encoding(self): def open_backend_dataset_cfgrib( filename_or_obj, - mask_and_scale=None, + *, + decode_cf=True, + mask_and_scale=True, decode_times=None, concat_characters=None, decode_coords=None, drop_variables=None, use_cftime=None, decode_timedelta=None, + lock=None, + indexpath='{path}.{short_hash}.idx', + filter_by_keys={}, + read_keys=[], + encode_cf=('parameter', 'time', 'geography', 'vertical'), + squeeze=True, + time_dims=('time', 'step'), **kwargs, ): - store = CfGribDataStore(filename_or_obj, **kwargs) + if kwargs: + warnings.warn( + "The following keywords are not supported by cfgrib " + "backend and they will bw ignored:%r" % kwargs + ) + + if not decode_cf: + mask_and_scale = False + decode_times = False + concat_characters = False + decode_coords = False + decode_timedelta = False + + + store = CfGribDataStore( + filename_or_obj, + indexpath=indexpath, + filter_by_keys=filter_by_keys, + read_keys=read_keys, + encode_cf=encode_cf, + squeeze=squeeze, + time_dims=time_dims, + lock=lock, + ) with close_on_error(store): vars, attrs = store.load() diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 8dcb0a28681..980f8e25235 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -1,4 +1,5 @@ import functools +import warnings from distutils.version import LooseVersion import numpy as np @@ -327,16 +328,43 @@ def close(self, **kwargs): def open_backend_dataset_h5necdf( filename_or_obj, - mask_and_scale=None, + *, + decode_cf=True, + mask_and_scale=True, decode_times=None, concat_characters=None, decode_coords=None, drop_variables=None, use_cftime=None, decode_timedelta=None, - **kwargs + format=None, + group=None, + lock=None, + invalid_netcdf=None, + phony_dims=None, + **kwargs, ): - store = H5NetCDFStore.open(filename_or_obj, **kwargs) + if kwargs: + warnings.warn( + "The following keywords are not supported by h5netcdf " + "and they will bw ignored:%r" % kwargs + ) + + if not decode_cf: + mask_and_scale = False + decode_times = False + concat_characters = False + decode_coords = False + decode_timedelta = False + + store = H5NetCDFStore.open( + filename_or_obj, + format=format, + group=group, + lock=lock, + invalid_netcdf=invalid_netcdf, + phony_dims=phony_dims, + ) with close_on_error(store): vars, attrs = store.load() @@ -360,4 +388,4 @@ def open_backend_dataset_h5necdf( ds._file_obj = file_obj ds.encoding = encoding - return ds \ No newline at end of file + return ds diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 76d6ac8a61b..ed7972e5390 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -684,17 +684,45 @@ def open_zarr( def open_backend_dataset_zarr( - filename_or_obj, - mask_and_scale=None, - decode_times=None, - concat_characters=None, - decode_coords=None, - drop_variables=None, - use_cftime=None, - decode_timedelta=None, - **kwargs + filename_or_obj, + decode_cf=True, + mask_and_scale=True, + decode_times=None, + concat_characters=None, + decode_coords=None, + drop_variables=None, + use_cftime=None, + decode_timedelta=None, + group=None, + mode='r', + synchronizer=None, + consolidated=False, + consolidate_on_close=False, + chunk_store=None, + **kwargs, ): - store = ZarrStore.open_group(filename_or_obj, **kwargs) + if kwargs: + warnings.warn( + "The following keywords are not supported by zarr" + "and they will bw ignored:%r" % kwargs + ) + + if not decode_cf: + mask_and_scale = False + decode_times = False + concat_characters = False + 
decode_coords = False + decode_timedelta = False + + store = ZarrStore.open_group( + filename_or_obj, + group=group, + mode=mode, + synchronizer=synchronizer, + consolidated=consolidated, + consolidate_on_close=consolidate_on_close, + chunk_store=chunk_store, + ) with close_on_error(store): vars, attrs = store.load() diff --git a/xarray/core/variable.py b/xarray/core/variable.py index c55e61cb816..d880dbc6737 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -51,12 +51,7 @@ ) NON_NUMPY_SUPPORTED_ARRAY_TYPES = ( - ( - indexing.ExplicitlyIndexed, - pd.Index, - ) - + dask_array_type - + cupy_array_type + (indexing.ExplicitlyIndexed, pd.Index,) + dask_array_type + cupy_array_type ) # https://github.com/python/mypy/issues/224 BASIC_INDEXING_TYPES = integer_types + (slice,) # type: ignore From 50d1ebe4b71d1c646a4517711975ac1b138ecbc4 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 30 Sep 2020 12:06:17 +0200 Subject: [PATCH 18/46] move dataset_from_backend_dataset out of apiv2.open_dataset --- xarray/backends/apiv2.py | 146 +++++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 69 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 405e23399c7..55aa6b615da 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -21,6 +21,77 @@ def filter_None(d): return {k: v for k, v in d.items() if v is not None} +def dataset_from_backend_dataset ( + ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + backend_kwargs, + **kwargs, +): + if not (isinstance(chunks, (int, dict)) or chunks is None): + if chunks != "auto": + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks + ) + + _protect_dataset_variables_inplace(ds, cache) + if chunks is not None and engine != "zarr": + from dask.base import tokenize + + # if passed an actual file path, augment the token with + # the file modification time + if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj): + mtime = os.path.getmtime(filename_or_obj) + else: + mtime = None + token = tokenize( + filename_or_obj, + mtime, + engine, + chunks, + **backend_kwargs, + **kwargs + ) + name_prefix = "open_dataset-%s" % token + ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) + + elif engine == "zarr": + + if chunks == "auto": + try: + import dask.array # noqa + except ImportError: + chunks = None + + if chunks is None: + return ds + + if isinstance(chunks, int): + chunks = dict.fromkeys(ds.dims, chunks) + + variables = { + k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) + for k, v in ds.variables.items() + } + ds2 = ds._replace(variables) + + else: + ds2 = ds + ds2._file_obj = ds._file_obj + + # Ensure source filename always stored in dataset object (GH issue #2550) + if "source" not in ds.encoding: + if isinstance(filename_or_obj, str): + ds.encoding["source"] = filename_or_obj + + return ds2 + + + def open_dataset( filename_or_obj, *, @@ -140,73 +211,16 @@ def open_dataset( if backend_kwargs is None: backend_kwargs = {} - def chunk_backend_ds(ds, chunks, lock=False): - _protect_dataset_variables_inplace(ds, cache) - if chunks is not None and engine != "zarr": - from dask.base import tokenize - - # if passed an actual file path, augment the token with - # the file modification time - if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj): - mtime = os.path.getmtime(filename_or_obj) - else: - mtime = None - token = tokenize( - 
filename_or_obj, - mtime, - engine, - chunks, - **backend_kwargs, - **kwargs - ) - name_prefix = "open_dataset-%s" % token - ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) - - elif engine == "zarr": - # adapted from Dataset.Chunk() and taken from open_zarr - if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks - ) - - if chunks == "auto": - try: - import dask.array # noqa - except ImportError: - chunks = None - - # auto chunking needs to be here and not in ZarrStore because - # the variable chunks does not survive decode_cf - # return trivial case - if chunks is None: - return ds - - if isinstance(chunks, int): - chunks = dict.fromkeys(ds.dims, chunks) - - variables = { - k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) - for k, v in ds.variables.items() - } - ds2 = ds._replace(variables) - - else: - ds2 = ds - ds2._file_obj = ds._file_obj - return ds2 - filename_or_obj = _normalize_path(filename_or_obj) if engine is None: engine = _autodetect_engine(filename_or_obj) - if engine == "zarr": - backend_kwargs = backend_kwargs.copy() - overwrite_encoded_chunks = backend_kwargs.pop( - "overwrite_encoded_chunks", None - ) + + backend_kwargs = backend_kwargs.copy() + overwrite_encoded_chunks = backend_kwargs.pop( + "overwrite_encoded_chunks", None + ) open_backend_dataset = _get_backend_cls(engine, engines=ENGINES) @@ -215,12 +229,6 @@ def chunk_backend_ds(ds, chunks, lock=False): **backend_kwargs, **filter_None(kwargs), ) - - ds = chunk_backend_ds(backend_ds, chunks, lock=False) - - # Ensure source filename always stored in dataset object (GH issue #2550) - if "source" not in ds.encoding: - if isinstance(filename_or_obj, str): - ds.encoding["source"] = filename_or_obj + ds = dataset_from_backend_dataset(backend_ds, filename_or_obj, engine, chunks, cache, overwrite_encoded_chunks, backend_kwargs, **kwargs) return ds From 383d32303e8c478ea6dbc64e5ef80cf9aadaa48c Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 30 Sep 2020 13:18:34 +0200 Subject: [PATCH 19/46] remove blank lines --- xarray/backends/apiv2.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 55aa6b615da..e0c3010224f 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -216,14 +216,12 @@ def open_dataset( if engine is None: engine = _autodetect_engine(filename_or_obj) - backend_kwargs = backend_kwargs.copy() overwrite_encoded_chunks = backend_kwargs.pop( "overwrite_encoded_chunks", None ) open_backend_dataset = _get_backend_cls(engine, engines=ENGINES) - backend_ds = open_backend_dataset( filename_or_obj, **backend_kwargs, From 457a09cf02bea6571d761d975ed05c13df9557f8 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 30 Sep 2020 13:19:53 +0200 Subject: [PATCH 20/46] remove blank lines --- xarray/backends/apiv2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index e0c3010224f..b7587fc7c52 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -91,7 +91,6 @@ def dataset_from_backend_dataset ( return ds2 - def open_dataset( filename_or_obj, *, From 2803fe360bfebe307de21e54f70fcbf37c324a3d Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 30 Sep 2020 21:33:11 +0200 Subject: [PATCH 21/46] style --- xarray/backends/apiv2.py | 19 ++++++++++++------- xarray/backends/cfgrib_.py | 1 - 
xarray/backends/zarr.py | 1 - xarray/core/variable.py | 7 ++++++- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index b7587fc7c52..849459a338b 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -17,11 +17,7 @@ } -def filter_None(d): - return {k: v for k, v in d.items() if v is not None} - - -def dataset_from_backend_dataset ( +def dataset_from_backend_dataset( ds, filename_or_obj, engine, @@ -224,8 +220,17 @@ def open_dataset( backend_ds = open_backend_dataset( filename_or_obj, **backend_kwargs, - **filter_None(kwargs), + **{k: v for k, v in kwargs.items() if v is not None}, + ) + ds = dataset_from_backend_dataset( + backend_ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + backend_kwargs, + **kwargs ) - ds = dataset_from_backend_dataset(backend_ds, filename_or_obj, engine, chunks, cache, overwrite_encoded_chunks, backend_kwargs, **kwargs) return ds diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 2764b0cab5a..357290a0d9c 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -107,7 +107,6 @@ def open_backend_dataset_cfgrib( decode_coords = False decode_timedelta = False - store = CfGribDataStore( filename_or_obj, indexpath=indexpath, diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index ed7972e5390..ba8b3e86463 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -2,7 +2,6 @@ import numpy as np - from .. import coding, conventions from ..core import indexing from ..core.dataset import Dataset diff --git a/xarray/core/variable.py b/xarray/core/variable.py index d880dbc6737..c55e61cb816 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -51,7 +51,12 @@ ) NON_NUMPY_SUPPORTED_ARRAY_TYPES = ( - (indexing.ExplicitlyIndexed, pd.Index,) + dask_array_type + cupy_array_type + ( + indexing.ExplicitlyIndexed, + pd.Index, + ) + + dask_array_type + + cupy_array_type ) # https://github.com/python/mypy/issues/224 BASIC_INDEXING_TYPES = integer_types + (slice,) # type: ignore From 08db0bdc5abfdc1fa0517dda8685829d0b7bc3af Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Wed, 30 Sep 2020 21:42:19 +0200 Subject: [PATCH 22/46] Re-write error messages --- xarray/backends/cfgrib_.py | 4 ++-- xarray/backends/h5netcdf_.py | 4 ++-- xarray/backends/zarr.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 357290a0d9c..0b17b85c01d 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -96,8 +96,8 @@ def open_backend_dataset_cfgrib( ): if kwargs: warnings.warn( - "The following keywords are not supported by cfgrib " - "backend and they will bw ignored:%r" % kwargs + "The following keywords are not supported by the engine " + "and will be ignored: %r" % kwargs ) if not decode_cf: diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 980f8e25235..d157c5e96f1 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -346,8 +346,8 @@ def open_backend_dataset_h5necdf( ): if kwargs: warnings.warn( - "The following keywords are not supported by h5netcdf " - "and they will bw ignored:%r" % kwargs + "The following keywords are not supported by the engine " + "and will be ignored: %r" % kwargs ) if not decode_cf: diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index ba8b3e86463..b65429854ee 100644 --- a/xarray/backends/zarr.py +++ 
b/xarray/backends/zarr.py @@ -702,8 +702,8 @@ def open_backend_dataset_zarr( ): if kwargs: warnings.warn( - "The following keywords are not supported by zarr" - "and they will bw ignored:%r" % kwargs + "The following keywords are not supported by the engine " + "and will be ignored: %r" % kwargs ) if not decode_cf: From 1f11845f6c617326606ab24a59e8699045be0b37 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Wed, 30 Sep 2020 21:48:17 +0200 Subject: [PATCH 23/46] Fix code style --- xarray/backends/api.py | 1 + xarray/backends/apiv2.py | 31 ++++++++++++------------------- xarray/backends/cfgrib_.py | 6 +++--- xarray/backends/zarr.py | 2 +- 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 2c6949987c7..e5031705837 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -434,6 +434,7 @@ def open_dataset( if os.environ.get("XARRAY_BACKEND_API", "v1") == "v2": kwargs = locals().copy() from . import apiv2 + if engine in apiv2.ENGINES: return apiv2.open_dataset(**kwargs) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 849459a338b..7086f8d499e 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -18,14 +18,14 @@ def dataset_from_backend_dataset( - ds, - filename_or_obj, - engine, - chunks, - cache, - overwrite_encoded_chunks, - backend_kwargs, - **kwargs, + ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + backend_kwargs, + **kwargs, ): if not (isinstance(chunks, (int, dict)) or chunks is None): if chunks != "auto": @@ -45,12 +45,7 @@ def dataset_from_backend_dataset( else: mtime = None token = tokenize( - filename_or_obj, - mtime, - engine, - chunks, - **backend_kwargs, - **kwargs + filename_or_obj, mtime, engine, chunks, **backend_kwargs, **kwargs ) name_prefix = "open_dataset-%s" % token ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) @@ -94,7 +89,7 @@ def open_dataset( chunks=None, cache=None, backend_kwargs=None, - **kwargs + **kwargs, ): """Open and decode a dataset from a file or file-like object. 
@@ -212,9 +207,7 @@ def open_dataset( engine = _autodetect_engine(filename_or_obj) backend_kwargs = backend_kwargs.copy() - overwrite_encoded_chunks = backend_kwargs.pop( - "overwrite_encoded_chunks", None - ) + overwrite_encoded_chunks = backend_kwargs.pop("overwrite_encoded_chunks", None) open_backend_dataset = _get_backend_cls(engine, engines=ENGINES) backend_ds = open_backend_dataset( @@ -230,7 +223,7 @@ def open_dataset( cache, overwrite_encoded_chunks, backend_kwargs, - **kwargs + **kwargs, ) return ds diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 0b17b85c01d..5e201a90cb9 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -86,12 +86,12 @@ def open_backend_dataset_cfgrib( use_cftime=None, decode_timedelta=None, lock=None, - indexpath='{path}.{short_hash}.idx', + indexpath="{path}.{short_hash}.idx", filter_by_keys={}, read_keys=[], - encode_cf=('parameter', 'time', 'geography', 'vertical'), + encode_cf=("parameter", "time", "geography", "vertical"), squeeze=True, - time_dims=('time', 'step'), + time_dims=("time", "step"), **kwargs, ): if kwargs: diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b65429854ee..708b88288da 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -693,7 +693,7 @@ def open_backend_dataset_zarr( use_cftime=None, decode_timedelta=None, group=None, - mode='r', + mode="r", synchronizer=None, consolidated=False, consolidate_on_close=False, From 93303b14b40e8d869504620bfee12e4318614206 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Wed, 30 Sep 2020 21:49:52 +0200 Subject: [PATCH 24/46] Fix code style --- xarray/backends/apiv2.py | 4 ++-- xarray/backends/cfgrib_.py | 3 ++- xarray/backends/h5netcdf_.py | 2 +- xarray/backends/zarr.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 7086f8d499e..820cc7f105c 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -2,13 +2,13 @@ from ..core.dataset import Dataset from ..core.utils import close_on_error, is_grib_path, is_remote_uri +from . import cfgrib_, h5netcdf_, zarr from .api import ( + _autodetect_engine, _get_backend_cls, _normalize_path, _protect_dataset_variables_inplace, - _autodetect_engine, ) -from . import h5netcdf_, zarr, cfgrib_ ENGINES = { "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf, diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 5e201a90cb9..9fd2d92f2c8 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -1,10 +1,11 @@ import warnings + import numpy as np from .. import conventions from ..core import indexing from ..core.dataset import Dataset -from ..core.utils import close_on_error, Frozen, FrozenDict +from ..core.utils import Frozen, FrozenDict, close_on_error from ..core.variable import Variable from .common import AbstractDataStore, BackendArray from .locks import SerializableLock, ensure_lock diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index d157c5e96f1..5e83754575d 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -7,7 +7,7 @@ from .. 
import conventions from ..core import indexing from ..core.dataset import Dataset -from ..core.utils import close_on_error, FrozenDict, is_remote_uri +from ..core.utils import FrozenDict, close_on_error, is_remote_uri from ..core.variable import Variable from .common import WritableCFDataStore, find_root_and_group from .file_manager import CachingFileManager, DummyFileManager diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 708b88288da..0cef8aba8fc 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -6,7 +6,7 @@ from ..core import indexing from ..core.dataset import Dataset from ..core.pycompat import integer_types -from ..core.utils import close_on_error, FrozenDict, HiddenKeyDict +from ..core.utils import FrozenDict, HiddenKeyDict, close_on_error from ..core.variable import Variable from .common import AbstractWritableDataStore, BackendArray, _encode_variable_name From bc2fe008d171b742becd2d750914b958de482f51 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Wed, 30 Sep 2020 21:54:33 +0200 Subject: [PATCH 25/46] remove unused import --- xarray/backends/apiv2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 820cc7f105c..f30eb240625 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -1,7 +1,6 @@ import os -from ..core.dataset import Dataset -from ..core.utils import close_on_error, is_grib_path, is_remote_uri +from ..core.utils import is_remote_uri from . import cfgrib_, h5netcdf_, zarr from .api import ( _autodetect_engine, From 102b00adb290d1b4fb5714977ebbbad26332f563 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 1 Oct 2020 08:52:54 +0200 Subject: [PATCH 26/46] zarr chunking refactor draft not working --- xarray/backends/apiv2.py | 77 +++++++++++++++++++----------------- xarray/backends/cfgrib_.py | 2 +- xarray/backends/h5netcdf_.py | 2 +- xarray/backends/zarr.py | 5 ++- 4 files changed, 45 insertions(+), 41 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index f30eb240625..f7318029045 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -15,6 +15,23 @@ "cfgrib": cfgrib_.open_backend_dataset_cfgrib, } +def get_mtime(filename_or_obj): + # if passed an actual file path, augment the token with + # the file modification time + if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj): + mtime = os.path.getmtime(filename_or_obj) + else: + mtime = None + return mtime + + +def add_source(ds, filename_or_obj): + if "source" not in ds.encoding: + if isinstance(filename_or_obj, str): + ds.encoding["source"] = filename_or_obj + return ds + + def dataset_from_backend_dataset( ds, @@ -26,59 +43,45 @@ def dataset_from_backend_dataset( backend_kwargs, **kwargs, ): - if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks - ) - + if not (isinstance(chunks, (int, dict)) or (chunks is None) or (chunks == "auto")): + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. 
" % chunks + ) _protect_dataset_variables_inplace(ds, cache) - if chunks is not None and engine != "zarr": - from dask.base import tokenize - # if passed an actual file path, augment the token with - # the file modification time - if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj): - mtime = os.path.getmtime(filename_or_obj) - else: - mtime = None + if chunks is None: + return add_source(ds, filename_or_obj) + + file_obj = ds._file_obj + if engine != "zarr": + from dask.base import tokenize + mtime = get_mtime(filename_or_obj) token = tokenize( - filename_or_obj, mtime, engine, chunks, **backend_kwargs, **kwargs + filename_or_obj, + mtime, + engine, + chunks, + **backend_kwargs, + **kwargs ) name_prefix = "open_dataset-%s" % token - ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) - - elif engine == "zarr": - + ds = ds.chunk(chunks, name_prefix=name_prefix, token=token) + else: if chunks == "auto": try: import dask.array # noqa except ImportError: chunks = None - - if chunks is None: - return ds - if isinstance(chunks, int): chunks = dict.fromkeys(ds.dims, chunks) - variables = { k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) for k, v in ds.variables.items() } - ds2 = ds._replace(variables) - - else: - ds2 = ds - ds2._file_obj = ds._file_obj - - # Ensure source filename always stored in dataset object (GH issue #2550) - if "source" not in ds.encoding: - if isinstance(filename_or_obj, str): - ds.encoding["source"] = filename_or_obj - - return ds2 + ds = ds._replace(variables) + ds._file_obj = file_obj + return add_source(ds, filename_or_obj) def open_dataset( diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 9fd2d92f2c8..7c00b880d76 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -121,7 +121,7 @@ def open_backend_dataset_cfgrib( with close_on_error(store): vars, attrs = store.load() - file_obj = store + # file_obj = store encoding = store.get_encoding() vars, attrs, coord_names = conventions.decode_cf_variables( diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 5e83754575d..3b49f137138 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -368,7 +368,7 @@ def open_backend_dataset_h5necdf( with close_on_error(store): vars, attrs = store.load() - file_obj = store + # file_obj = store encoding = store.get_encoding() vars, attrs, coord_names = conventions.decode_cf_variables( diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 0cef8aba8fc..0f56310fde2 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -369,7 +369,6 @@ def get_chunk(cls, name, var, chunks): # Coordinate labels aren't chunked if var.ndim == 1 and var.dims[0] == name: return chunk_spec - if chunks == "auto": return chunk_spec @@ -378,7 +377,9 @@ def get_chunk(cls, name, var, chunks): spec = chunks[dim] if isinstance(spec, int): spec = (spec,) + if isinstance(spec, (tuple, list)) and chunk_spec[dim]: + if len(spec) > 1: if any(s % chunk_spec[dim] for s in spec): warnings.warn( "Specified Dask chunks %r would " @@ -725,7 +726,7 @@ def open_backend_dataset_zarr( with close_on_error(store): vars, attrs = store.load() - file_obj = store + #file_obj = store encoding = store.get_encoding() vars, attrs, coord_names = conventions.decode_cf_variables( From f47605af5e6c6546693377a671ece337eef7a594 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Fri, 2 Oct 2020 11:11:47 +0200 Subject: [PATCH 27/46] refactor dataset_from_backend_dataset 
--- xarray/backends/apiv2.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index f7318029045..8cf617ccf6a 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -25,16 +25,15 @@ def get_mtime(filename_or_obj): return mtime -def add_source(ds, filename_or_obj): +def set_source(ds, filename_or_obj): if "source" not in ds.encoding: if isinstance(filename_or_obj, str): ds.encoding["source"] = filename_or_obj return ds - def dataset_from_backend_dataset( - ds, + backend_ds, filename_or_obj, engine, chunks, @@ -48,25 +47,19 @@ def dataset_from_backend_dataset( "chunks must be an int, dict, 'auto', or None. " "Instead found %s. " % chunks ) - _protect_dataset_variables_inplace(ds, cache) + _protect_dataset_variables_inplace(backend_ds, cache) if chunks is None: - return add_source(ds, filename_or_obj) + return set_source(backend_ds, filename_or_obj) - file_obj = ds._file_obj if engine != "zarr": from dask.base import tokenize mtime = get_mtime(filename_or_obj) token = tokenize( - filename_or_obj, - mtime, - engine, - chunks, - **backend_kwargs, - **kwargs + filename_or_obj, mtime, engine, chunks, **backend_kwargs, **kwargs ) name_prefix = "open_dataset-%s" % token - ds = ds.chunk(chunks, name_prefix=name_prefix, token=token) + ds = backend_ds.chunk(chunks, name_prefix=name_prefix, token=token) else: if chunks == "auto": try: @@ -74,14 +67,14 @@ def dataset_from_backend_dataset( except ImportError: chunks = None if isinstance(chunks, int): - chunks = dict.fromkeys(ds.dims, chunks) + chunks = dict.fromkeys(backend_ds.dims, chunks) variables = { k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) - for k, v in ds.variables.items() + for k, v in backend_ds.variables.items() } - ds = ds._replace(variables) - ds._file_obj = file_obj - return add_source(ds, filename_or_obj) + ds = backend_ds._replace(variables) + ds._file_obj = backend_ds._file_obj + return set_source(ds, filename_or_obj) def open_dataset( From b632b054ed6376ca5a4315553d6c6ae45b237ee1 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Fri, 2 Oct 2020 11:12:52 +0200 Subject: [PATCH 28/46] fix wrong commit --- xarray/backends/cfgrib_.py | 2 +- xarray/backends/h5netcdf_.py | 2 +- xarray/backends/zarr.py | 4 +--- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 7c00b880d76..9fd2d92f2c8 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -121,7 +121,7 @@ def open_backend_dataset_cfgrib( with close_on_error(store): vars, attrs = store.load() - # file_obj = store + file_obj = store encoding = store.get_encoding() vars, attrs, coord_names = conventions.decode_cf_variables( diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 3b49f137138..5e83754575d 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -368,7 +368,7 @@ def open_backend_dataset_h5necdf( with close_on_error(store): vars, attrs = store.load() - # file_obj = store + file_obj = store encoding = store.get_encoding() vars, attrs, coord_names = conventions.decode_cf_variables( diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 0f56310fde2..bd122f8d89d 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -377,9 +377,7 @@ def get_chunk(cls, name, var, chunks): spec = chunks[dim] if isinstance(spec, int): spec = (spec,) - if isinstance(spec, (tuple, list)) and 
chunk_spec[dim]: - if len(spec) > 1: if any(s % chunk_spec[dim] for s in spec): warnings.warn( "Specified Dask chunks %r would " @@ -726,7 +724,7 @@ def open_backend_dataset_zarr( with close_on_error(store): vars, attrs = store.load() - #file_obj = store + file_obj = store encoding = store.get_encoding() vars, attrs, coord_names = conventions.decode_cf_variables( From b437f025dd3ec53eca84b5711be2a5033fb08199 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Fri, 2 Oct 2020 14:19:15 +0200 Subject: [PATCH 29/46] add get_chunk in apiv2 --- xarray/backends/apiv2.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 8cf617ccf6a..2c6b4a5b7b6 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -1,4 +1,5 @@ import os +import warnings from ..core.utils import is_remote_uri from . import cfgrib_, h5netcdf_, zarr @@ -15,6 +16,36 @@ "cfgrib": cfgrib_.open_backend_dataset_cfgrib, } + +def check_chunks_compatibility(chunks, chunk_spec): + for dim in chunk_spec: + if dim in chunks: + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if any(s % chunk_spec[dim] for s in spec): + warnings.warn( + "Specified Dask chunks %r would " + "separate on disks chunk shape %r for " + "dimension %r. This could " + "degrades performance. Consider " + "rechunking after loading instead." + % (chunks[dim], chunk_spec[dim], dim), + stacklevel=2, + ) + + +def get_chunk(name, var, chunks): + chunk_spec = dict(zip(var.dims, var.encoding.get("chunks", {}))) + if chunks == 'auto': + return dict(zip(var.dims, var.encoding.get("chunks", {}))) + # Coordinate labels aren't chunked + if var.ndim == 1 and var.dims[0] == name: + return chunk_spec + check_chunks_compatibility(chunks, chunk_spec) + return chunk_spec + + def get_mtime(filename_or_obj): # if passed an actual file path, augment the token with # the file modification time From d69414666c9e262233daf7efb7dde243941e67b4 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 8 Oct 2020 07:03:03 +0200 Subject: [PATCH 30/46] replace warning with ValueError for not supported kwargs in backends --- xarray/backends/cfgrib_.py | 2 +- xarray/backends/h5netcdf_.py | 2 +- xarray/backends/zarr.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index 9fd2d92f2c8..e2e994bbc53 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -96,7 +96,7 @@ def open_backend_dataset_cfgrib( **kwargs, ): if kwargs: - warnings.warn( + raise ValueError( "The following keywords are not supported by the engine " "and will be ignored: %r" % kwargs ) diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 5e83754575d..4fbfc583298 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -345,7 +345,7 @@ def open_backend_dataset_h5necdf( **kwargs, ): if kwargs: - warnings.warn( + raise ValueError( "The following keywords are not supported by the engine " "and will be ignored: %r" % kwargs ) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 0cef8aba8fc..70987490548 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -701,7 +701,7 @@ def open_backend_dataset_zarr( **kwargs, ): if kwargs: - warnings.warn( + raise ValueError( "The following keywords are not supported by the engine " "and will be ignored: %r" % kwargs ) From 56f4d3f0ad8ade78d850f31da5342fbbe41c25a3 Mon Sep 17 00:00:00 2001 From: Aureliana 
Barghini Date: Thu, 8 Oct 2020 07:25:09 +0200 Subject: [PATCH 31/46] change zarr.ZarrStore.get_chunk into a static method --- xarray/backends/zarr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 70987490548..1c0f10dfd16 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -362,8 +362,8 @@ def encode_variable(self, variable): def encode_attribute(self, a): return encode_zarr_attr_value(a) - @classmethod - def get_chunk(cls, name, var, chunks): + @staticmethod + def get_chunk(name, var, chunks): chunk_spec = dict(zip(var.dims, var.encoding.get("chunks"))) # Coordinate labels aren't chunked From df23b1825a2762e854eb4b8fa6ebcadb33a54876 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 8 Oct 2020 08:09:39 +0200 Subject: [PATCH 32/46] group `backend_kwargs` and `kwargs` into the `extra_tokens` argument of `apiv2.dataset_from_backend_dataset` --- xarray/backends/apiv2.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index f30eb240625..4c58995947e 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -23,8 +23,7 @@ def dataset_from_backend_dataset( chunks, cache, overwrite_encoded_chunks, - backend_kwargs, - **kwargs, + extra_tokens, ): if not (isinstance(chunks, (int, dict)) or chunks is None): if chunks != "auto": raise ValueError( "chunks must be an int, dict, 'auto', or None. " "Instead found %s. " % chunks ) @@ -44,7 +43,7 @@ def dataset_from_backend_dataset( else: mtime = None token = tokenize( - filename_or_obj, mtime, engine, chunks, **backend_kwargs, **kwargs + filename_or_obj, mtime, engine, chunks, **extra_tokens ) name_prefix = "open_dataset-%s" % token ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) @@ -221,8 +220,7 @@ def open_dataset( chunks, cache, overwrite_encoded_chunks, - backend_kwargs, - **kwargs, + {**backend_kwargs, **kwargs}, ) return ds From a04e6acb61fb5cecc582fdf5105e3425790bd4ec Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 8 Oct 2020 08:14:04 +0200 Subject: [PATCH 33/46] remove kwargs from the open_backend_dataset_${engine} signatures and the related error message --- xarray/backends/cfgrib_.py | 8 -------- xarray/backends/h5netcdf_.py | 7 ------- xarray/backends/zarr.py | 6 ------ 3 files changed, 21 deletions(-) diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index e2e994bbc53..cfff1b146e3 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -1,5 +1,3 @@ -import warnings - import numpy as np from ..
import conventions @@ -93,13 +91,7 @@ def open_backend_dataset_cfgrib( encode_cf=("parameter", "time", "geography", "vertical"), squeeze=True, time_dims=("time", "step"), - **kwargs, ): - if kwargs: - raise ValueError( - "The following keywords are not supported by the engine " - "and will be ignored: %r" % kwargs - ) if not decode_cf: mask_and_scale = False diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 4fbfc583298..c4d23741a77 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -1,5 +1,4 @@ import functools -import warnings from distutils.version import LooseVersion import numpy as np @@ -342,13 +341,7 @@ def open_backend_dataset_h5necdf( lock=None, invalid_netcdf=None, phony_dims=None, - **kwargs, ): - if kwargs: - raise ValueError( - "The following keywords are not supported by the engine " - "and will be ignored: %r" % kwargs - ) if not decode_cf: mask_and_scale = False diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 1c0f10dfd16..0f60a436d80 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -698,13 +698,7 @@ def open_backend_dataset_zarr( consolidated=False, consolidate_on_close=False, chunk_store=None, - **kwargs, ): - if kwargs: - raise ValueError( - "The following keywords are not supported by the engine " - "and will be ignored: %r" % kwargs - ) if not decode_cf: mask_and_scale = False From de29a4cc49904323eab99637def7b50a4beb203b Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 8 Oct 2020 10:46:56 +0200 Subject: [PATCH 34/46] black --- xarray/backends/apiv2.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 4c58995947e..27b0b3f7ee2 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -17,13 +17,7 @@ def dataset_from_backend_dataset( - ds, - filename_or_obj, - engine, - chunks, - cache, - overwrite_encoded_chunks, - extra_tokens, + ds, filename_or_obj, engine, chunks, cache, overwrite_encoded_chunks, extra_tokens, ): if not (isinstance(chunks, (int, dict)) or chunks is None): if chunks != "auto": @@ -42,9 +36,7 @@ def dataset_from_backend_dataset( mtime = os.path.getmtime(filename_or_obj) else: mtime = None - token = tokenize( - filename_or_obj, mtime, engine, chunks, **extra_tokens - ) + token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) name_prefix = "open_dataset-%s" % token ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) From cf77dc3ee07964ccd89e1733aa7ab28775c718d6 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 8 Oct 2020 12:16:48 +0200 Subject: [PATCH 35/46] remove not used apiv2.set_source --- xarray/backends/apiv2.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 40b33997462..660543a1bb6 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -35,15 +35,13 @@ def check_chunks_compatibility(chunks, chunk_spec): ) -def get_chunk(name, var, chunks): +def get_chunk(var, chunks): chunk_spec = dict(zip(var.dims, var.encoding.get("chunks", {}))) if chunks == 'auto': - return dict(zip(var.dims, var.encoding.get("chunks", {}))) - # Coordinate labels aren't chunked - if var.ndim == 1 and var.dims[0] == name: - return chunk_spec - check_chunks_compatibility(chunks, chunk_spec) - return chunk_spec + chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) + if chunks is not None: + 
check_chunks_compatibility(chunks, chunk_spec) + return chunks def get_mtime(filename_or_obj): @@ -56,13 +54,6 @@ def get_mtime(filename_or_obj): return mtime -def set_source(ds, filename_or_obj): - if "source" not in ds.encoding: - if isinstance(filename_or_obj, str): - ds.encoding["source"] = filename_or_obj - return ds - - def dataset_from_backend_dataset( backend_ds, filename_or_obj, engine, chunks, cache, overwrite_encoded_chunks, extra_tokens, ): @@ -72,12 +63,16 @@ def dataset_from_backend_dataset( "chunks must be an int, dict, 'auto', or None. " "Instead found %s. " % chunks ) + if chunks == "auto": + try: + import dask.array # noqa + except ImportError: + chunks = None _protect_dataset_variables_inplace(backend_ds, cache) if chunks is None: - return set_source(backend_ds, filename_or_obj) - - if engine != "zarr": + ds = backend_ds + elif engine != "zarr": from dask.base import tokenize mtime = get_mtime(filename_or_obj) token = tokenize( From c1b763a9a34b78a4b886e451978cf783bfa494fb Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Fri, 9 Oct 2020 11:36:10 +0200 Subject: [PATCH 36/46] remove `auto` as chunk value --- xarray/backends/apiv2.py | 64 ++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 660543a1bb6..42bfb7f6051 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -17,30 +17,30 @@ } -def check_chunks_compatibility(chunks, chunk_spec): - for dim in chunk_spec: - if dim in chunks: - spec = chunks[dim] - if isinstance(spec, int): - spec = (spec,) - if any(s % chunk_spec[dim] for s in spec): - warnings.warn( - "Specified Dask chunks %r would " - "separate on disks chunk shape %r for " - "dimension %r. This could " - "degrades performance. Consider " - "rechunking after loading instead." - % (chunks[dim], chunk_spec[dim], dim), - stacklevel=2, - ) +def check_chunks_compatibility(dim, chunks, chunk_spec): + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if any(s % chunk_spec[dim] for s in spec): + warnings.warn( + "Specified Dask chunks %r would " + "separate on disks chunk shape %r for " + "dimension %r. This could " + "degrades performance. Consider " + "rechunking after loading instead." + % (chunks[dim], chunk_spec[dim], dim), + stacklevel=2, + ) def get_chunk(var, chunks): chunk_spec = dict(zip(var.dims, var.encoding.get("chunks", {}))) - if chunks == 'auto': - chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) if chunks is not None: - check_chunks_compatibility(chunks, chunk_spec) + for dim in chunk_spec: + if dim in chunks: + check_chunks_compatibility(chunks, chunk_spec) + else: + chunks[dim] = chunk_spec[dim] return chunks @@ -58,16 +58,10 @@ def dataset_from_backend_dataset( backend_ds, filename_or_obj, engine, chunks, cache, overwrite_encoded_chunks, extra_tokens, ): if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks - ) - if chunks == "auto": - try: - import dask.array # noqa - except ImportError: - chunks = None + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. 
" % chunks + ) _protect_dataset_variables_inplace(backend_ds, cache) if chunks is None: @@ -81,11 +75,7 @@ def dataset_from_backend_dataset( name_prefix = "open_dataset-%s" % token ds = backend_ds.chunk(chunks, name_prefix=name_prefix, token=token) else: - if chunks == "auto": - try: - import dask.array # noqa - except ImportError: - chunks = None + if isinstance(chunks, int): chunks = dict.fromkeys(backend_ds.dims, chunks) variables = { @@ -94,7 +84,11 @@ def dataset_from_backend_dataset( } ds = backend_ds._replace(variables) ds._file_obj = backend_ds._file_obj - return set_source(ds, filename_or_obj) + + if "source" not in ds.encoding: + if isinstance(filename_or_obj, str): + ds.encoding["source"] = filename_or_obj + return ds def open_dataset( From 5eb0dafbd05394887ce389bedaf177db8f7388ce Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 29 Oct 2020 06:52:30 +0100 Subject: [PATCH 37/46] - align with api.py --- xarray/backends/apiv2.py | 56 ++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 16 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 42bfb7f6051..2d2d61cc714 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -2,6 +2,7 @@ import warnings from ..core.utils import is_remote_uri +from ..core.dataset import _maybe_chunk from . import cfgrib_, h5netcdf_, zarr from .api import ( _autodetect_engine, @@ -33,15 +34,20 @@ def check_chunks_compatibility(dim, chunks, chunk_spec): ) -def get_chunk(var, chunks): - chunk_spec = dict(zip(var.dims, var.encoding.get("chunks", {}))) +def get_chunk(name, var, chunks): + preferred_chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) + if var.ndim == 1 and var.dims[0] == name: + return preferred_chunks + + output_chunks = {} if chunks is not None: - for dim in chunk_spec: + for dim in preferred_chunks: if dim in chunks: - check_chunks_compatibility(chunks, chunk_spec) + check_chunks_compatibility(dim, chunks, preferred_chunks) + output_chunks[dim] = chunks[dim] else: - chunks[dim] = chunk_spec[dim] - return chunks + output_chunks[dim] = preferred_chunks[dim] + return output_chunks def get_mtime(filename_or_obj): @@ -55,13 +61,14 @@ def get_mtime(filename_or_obj): def dataset_from_backend_dataset( - backend_ds, filename_or_obj, engine, chunks, cache, overwrite_encoded_chunks, extra_tokens, + backend_ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + extra_tokens, ): - if not (isinstance(chunks, (int, dict)) or chunks is None): - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. 
" % chunks - ) _protect_dataset_variables_inplace(backend_ds, cache) if chunks is None: @@ -74,20 +81,28 @@ def dataset_from_backend_dataset( ) name_prefix = "open_dataset-%s" % token ds = backend_ds.chunk(chunks, name_prefix=name_prefix, token=token) - else: + else: if isinstance(chunks, int): chunks = dict.fromkeys(backend_ds.dims, chunks) + variables = { - k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks) - for k, v in backend_ds.variables.items() + name: _maybe_chunk( + name, + var, + get_chunk(name, var, chunks), + overwrite_encoded_chunks=overwrite_encoded_chunks, + ) + for name, var in backend_ds.variables.items() } ds = backend_ds._replace(variables) + ds._file_obj = backend_ds._file_obj if "source" not in ds.encoding: if isinstance(filename_or_obj, str): - ds.encoding["source"] = filename_or_obj + backend_ds.encoding["source"] = filename_or_obj + return ds @@ -204,6 +219,15 @@ def open_dataset( open_mfdataset """ + if chunks == "auto": + chunks = {} + + if not (isinstance(chunks, (int, dict)) or (chunks is None)): + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks + ) + if cache is None: cache = chunks is None From 4fc1d8da6b8130048f7b923cbcbabb826e0831f5 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 29 Oct 2020 11:37:30 +0100 Subject: [PATCH 38/46] unify backends chunking --- xarray/backends/apiv2.py | 7 +++---- xarray/backends/zarr.py | 45 ---------------------------------------- 2 files changed, 3 insertions(+), 49 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 2d2d61cc714..3319746a7bd 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -73,16 +73,13 @@ def dataset_from_backend_dataset( _protect_dataset_variables_inplace(backend_ds, cache) if chunks is None: ds = backend_ds - elif engine != "zarr": + else: from dask.base import tokenize mtime = get_mtime(filename_or_obj) token = tokenize( filename_or_obj, mtime, engine, chunks, **extra_tokens ) name_prefix = "open_dataset-%s" % token - ds = backend_ds.chunk(chunks, name_prefix=name_prefix, token=token) - - else: if isinstance(chunks, int): chunks = dict.fromkeys(backend_ds.dims, chunks) @@ -92,6 +89,8 @@ def dataset_from_backend_dataset( var, get_chunk(name, var, chunks), overwrite_encoded_chunks=overwrite_encoded_chunks, + name_prefix=name_prefix, + token=token, ) for name, var in backend_ds.variables.items() } diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 33dd7fc5c90..54cf40b4c17 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -362,51 +362,6 @@ def encode_variable(self, variable): def encode_attribute(self, a): return encode_zarr_attr_value(a) - @staticmethod - def get_chunk(name, var, chunks): - chunk_spec = dict(zip(var.dims, var.encoding.get("chunks"))) - - # Coordinate labels aren't chunked - if var.ndim == 1 and var.dims[0] == name: - return chunk_spec - if chunks == "auto": - return chunk_spec - - for dim in var.dims: - if dim in chunks: - spec = chunks[dim] - if isinstance(spec, int): - spec = (spec,) - if isinstance(spec, (tuple, list)) and chunk_spec[dim]: - if any(s % chunk_spec[dim] for s in spec): - warnings.warn( - "Specified Dask chunks %r would " - "separate Zarr chunk shape %r for " - "dimension %r. This significantly " - "degrades performance. Consider " - "rechunking after loading instead." 
- % (chunks[dim], chunk_spec[dim], dim), - stacklevel=2, - ) - chunk_spec[dim] = chunks[dim] - return chunk_spec - - @classmethod - def maybe_chunk(cls, name, var, chunks, overwrite_encoded_chunks): - chunk_spec = cls.get_chunk(name, var, chunks) - - if (var.ndim > 0) and (chunk_spec is not None): - from dask.base import tokenize - - # does this cause any data to be read? - token2 = tokenize(name, var._data, chunks) - name2 = f"xarray-{name}-{token2}" - var = var.chunk(chunk_spec, name=name2, lock=None) - if overwrite_encoded_chunks and var.chunks is not None: - var.encoding["chunks"] = tuple(x[0] for x in var.chunks) - return var - else: - return var def store( self, From 1cf69682346ea79eb4d528654dfdf393c990de16 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 29 Oct 2020 11:53:41 +0100 Subject: [PATCH 39/46] move get_chunk funtion in dataset --- xarray/backends/api.py | 4 ++-- xarray/backends/apiv2.py | 41 ++-------------------------------------- xarray/core/dataset.py | 35 ++++++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 41 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 8dd431c5f62..90d83284632 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -25,7 +25,7 @@ combine_by_coords, ) from ..core.dataarray import DataArray -from ..core.dataset import Dataset, _maybe_chunk +from ..core.dataset import Dataset, _maybe_chunk, _get_chunk from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler @@ -535,7 +535,7 @@ def maybe_decode_store(store, chunks): k: _maybe_chunk( k, v, - store.get_chunk(k, v, chunks), + _get_chunk(k, v, chunks), overwrite_encoded_chunks=overwrite_encoded_chunks, ) for k, v in ds.variables.items() diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 3319746a7bd..8dd7b6556f3 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -2,7 +2,7 @@ import warnings from ..core.utils import is_remote_uri -from ..core.dataset import _maybe_chunk +from ..core.dataset import _maybe_chunk, _get_chunk from . import cfgrib_, h5netcdf_, zarr from .api import ( _autodetect_engine, @@ -18,37 +18,6 @@ } -def check_chunks_compatibility(dim, chunks, chunk_spec): - spec = chunks[dim] - if isinstance(spec, int): - spec = (spec,) - if any(s % chunk_spec[dim] for s in spec): - warnings.warn( - "Specified Dask chunks %r would " - "separate on disks chunk shape %r for " - "dimension %r. This could " - "degrades performance. Consider " - "rechunking after loading instead." 
- % (chunks[dim], chunk_spec[dim], dim), - stacklevel=2, - ) - - -def get_chunk(name, var, chunks): - preferred_chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) - if var.ndim == 1 and var.dims[0] == name: - return preferred_chunks - - output_chunks = {} - if chunks is not None: - for dim in preferred_chunks: - if dim in chunks: - check_chunks_compatibility(dim, chunks, preferred_chunks) - output_chunks[dim] = chunks[dim] - else: - output_chunks[dim] = preferred_chunks[dim] - return output_chunks - def get_mtime(filename_or_obj): # if passed an actual file path, augment the token with @@ -87,7 +56,7 @@ def dataset_from_backend_dataset( name: _maybe_chunk( name, var, - get_chunk(name, var, chunks), + _get_chunk(name, var, chunks), overwrite_encoded_chunks=overwrite_encoded_chunks, name_prefix=name_prefix, token=token, @@ -221,12 +190,6 @@ def open_dataset( if chunks == "auto": chunks = {} - if not (isinstance(chunks, (int, dict)) or (chunks is None)): - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks - ) - if cache is None: cache = chunks is None diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1ceb5623abd..8617b02b2da 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -359,6 +359,41 @@ def _assert_empty(args: tuple, msg: str = "%s") -> None: raise ValueError(msg % args) +def _check_chunks_compatibility(dim, chunks, chunk_spec): + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if any(s % chunk_spec[dim] for s in spec): + warnings.warn( + "Specified Dask chunks %r would " + "separate on disks chunk shape %r for " + "dimension %r. This could " + "degrades performance. Consider " + "rechunking after loading instead." + % (chunks[dim], chunk_spec[dim], dim), + stacklevel=2, + ) + + +def _get_chunk(name, var, chunks): + if chunks == "auto": + chunks = {} + + preferred_chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) + if var.ndim == 1 and var.dims[0] == name: + return preferred_chunks + + output_chunks = {} + if chunks is not None: + for dim in preferred_chunks: + if dim in chunks: + _check_chunks_compatibility(dim, chunks, preferred_chunks) + output_chunks[dim] = chunks[dim] + else: + output_chunks[dim] = preferred_chunks[dim] + return output_chunks + + def _maybe_chunk( name, var, From a780efc093c0cc2537b8bb8201bd06e22f7bb9d7 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 29 Oct 2020 11:53:41 +0100 Subject: [PATCH 40/46] move get_chunk funtion in dataset --- xarray/backends/api.py | 4 ++-- xarray/backends/apiv2.py | 47 ++++------------------------------------ xarray/backends/zarr.py | 1 - xarray/core/dataset.py | 34 +++++++++++++++++++++++++++++ 4 files changed, 40 insertions(+), 46 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 8dd431c5f62..90d83284632 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -25,7 +25,7 @@ combine_by_coords, ) from ..core.dataarray import DataArray -from ..core.dataset import Dataset, _maybe_chunk +from ..core.dataset import Dataset, _maybe_chunk, _get_chunk from ..core.utils import close_on_error, is_grib_path, is_remote_uri from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler @@ -535,7 +535,7 @@ def maybe_decode_store(store, chunks): k: _maybe_chunk( k, v, - store.get_chunk(k, v, chunks), + _get_chunk(k, v, chunks), overwrite_encoded_chunks=overwrite_encoded_chunks, ) for k, v in ds.variables.items() diff --git a/xarray/backends/apiv2.py 
b/xarray/backends/apiv2.py index 3319746a7bd..fce782c8dc5 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -1,8 +1,8 @@ import os import warnings +from ..core.dataset import _get_chunk, _maybe_chunk from ..core.utils import is_remote_uri -from ..core.dataset import _maybe_chunk from . import cfgrib_, h5netcdf_, zarr from .api import ( _autodetect_engine, @@ -18,38 +18,6 @@ } -def check_chunks_compatibility(dim, chunks, chunk_spec): - spec = chunks[dim] - if isinstance(spec, int): - spec = (spec,) - if any(s % chunk_spec[dim] for s in spec): - warnings.warn( - "Specified Dask chunks %r would " - "separate on disks chunk shape %r for " - "dimension %r. This could " - "degrades performance. Consider " - "rechunking after loading instead." - % (chunks[dim], chunk_spec[dim], dim), - stacklevel=2, - ) - - -def get_chunk(name, var, chunks): - preferred_chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) - if var.ndim == 1 and var.dims[0] == name: - return preferred_chunks - - output_chunks = {} - if chunks is not None: - for dim in preferred_chunks: - if dim in chunks: - check_chunks_compatibility(dim, chunks, preferred_chunks) - output_chunks[dim] = chunks[dim] - else: - output_chunks[dim] = preferred_chunks[dim] - return output_chunks - - def get_mtime(filename_or_obj): # if passed an actual file path, augment the token with # the file modification time @@ -75,10 +43,9 @@ def dataset_from_backend_dataset( ds = backend_ds else: from dask.base import tokenize + mtime = get_mtime(filename_or_obj) - token = tokenize( - filename_or_obj, mtime, engine, chunks, **extra_tokens - ) + token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) name_prefix = "open_dataset-%s" % token if isinstance(chunks, int): chunks = dict.fromkeys(backend_ds.dims, chunks) @@ -87,7 +54,7 @@ def dataset_from_backend_dataset( name: _maybe_chunk( name, var, - get_chunk(name, var, chunks), + _get_chunk(name, var, chunks), overwrite_encoded_chunks=overwrite_encoded_chunks, name_prefix=name_prefix, token=token, @@ -221,12 +188,6 @@ def open_dataset( if chunks == "auto": chunks = {} - if not (isinstance(chunks, (int, dict)) or (chunks is None)): - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks - ) - if cache is None: cache = chunks is None diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 54cf40b4c17..2145685b2b2 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -362,7 +362,6 @@ def encode_variable(self, variable): def encode_attribute(self, a): return encode_zarr_attr_value(a) - def store( self, variables, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1ceb5623abd..562a99b9028 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -359,6 +359,40 @@ def _assert_empty(args: tuple, msg: str = "%s") -> None: raise ValueError(msg % args) +def _check_chunks_compatibility(dim, chunks, chunk_spec): + spec = chunks[dim] + if isinstance(spec, int): + spec = (spec,) + if any(s % chunk_spec[dim] for s in spec): + warnings.warn( + "Specified Dask chunks %r would " + "separate on disks chunk shape %r for " + "dimension %r. This could " + "degrades performance. Consider " + "rechunking after loading instead." 
% (chunks[dim], chunk_spec[dim], dim), + stacklevel=2, + ) + + +def _get_chunk(name, var, chunks): + if chunks == "auto": + chunks = {} + + preferred_chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) + if var.ndim == 1 and var.dims[0] == name: + return preferred_chunks + + output_chunks = {} + if chunks is not None: + for dim in preferred_chunks: + if dim in chunks: + _check_chunks_compatibility(dim, chunks, preferred_chunks) + output_chunks[dim] = chunks[dim] + else: + output_chunks[dim] = preferred_chunks[dim] + return output_chunks + + def _maybe_chunk( name, var, From 93cadeef1e24a5a2d1658b1a81d35b11c500da8f Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 29 Oct 2020 12:03:03 +0100 Subject: [PATCH 41/46] black --- xarray/core/dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ec8aaaf898a..d72df555597 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -403,8 +403,7 @@ def _check_chunks_compatibility(dim, chunks, chunk_spec): "separate on disks chunk shape %r for " "dimension %r. This could " "degrades performance. Consider " - "rechunking after loading instead." - % (chunks[dim], chunk_spec[dim], dim), + "rechunking after loading instead." % (chunks[dim], chunk_spec[dim], dim), stacklevel=2, ) From 6e9a56214484548bdbfac673aef765ef5c59e78a Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 29 Oct 2020 14:04:38 +0100 Subject: [PATCH 42/46] remove unused import re-add check on chunks type --- xarray/backends/apiv2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index fce782c8dc5..63be14caef7 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -1,5 +1,4 @@ import os -import warnings from ..core.dataset import _get_chunk, _maybe_chunk from ..core.utils import is_remote_uri @@ -187,6 +186,11 @@ def open_dataset( if chunks == "auto": chunks = {} + if not (isinstance(chunks, (int, dict)) or chunks is None): + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks + ) if cache is None: cache = chunks is None From 69c2790124e97c8f2c79c2ceb97fe49e820583dd Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 29 Oct 2020 14:04:38 +0100 Subject: [PATCH 43/46] remove unused import re-add check on chunks type --- xarray/backends/apiv2.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index fce782c8dc5..496b7b3e484 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -1,5 +1,4 @@ import os -import warnings from ..core.dataset import _get_chunk, _maybe_chunk from ..core.utils import is_remote_uri @@ -187,6 +186,11 @@ def open_dataset( if chunks == "auto": chunks = {} + if not (isinstance(chunks, (int, dict)) or chunks is None): + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks + ) if cache is None: cache = chunks is None From a1cfa297166c9131fd9268687876b6d96a280287 Mon Sep 17 00:00:00 2001 From: TheRed86 Date: Mon, 2 Nov 2020 12:49:09 +0100 Subject: [PATCH 44/46] Pass isort test. 
--- xarray/backends/apiv2.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 496b7b3e484..9ee802336a4 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -3,12 +3,8 @@ from ..core.dataset import _get_chunk, _maybe_chunk from ..core.utils import is_remote_uri from . import cfgrib_, h5netcdf_, zarr -from .api import ( - _autodetect_engine, - _get_backend_cls, - _normalize_path, - _protect_dataset_variables_inplace, -) +from .api import (_autodetect_engine, _get_backend_cls, _normalize_path, + _protect_dataset_variables_inplace) ENGINES = { "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf, From c590e05a20778ae6c940fa8666e4364929268b08 Mon Sep 17 00:00:00 2001 From: TheRed86 Date: Mon, 2 Nov 2020 12:52:59 +0100 Subject: [PATCH 45/46] Pass black -t py36 test. --- xarray/backends/apiv2.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py index 9ee802336a4..496b7b3e484 100644 --- a/xarray/backends/apiv2.py +++ b/xarray/backends/apiv2.py @@ -3,8 +3,12 @@ from ..core.dataset import _get_chunk, _maybe_chunk from ..core.utils import is_remote_uri from . import cfgrib_, h5netcdf_, zarr -from .api import (_autodetect_engine, _get_backend_cls, _normalize_path, - _protect_dataset_variables_inplace) +from .api import ( + _autodetect_engine, + _get_backend_cls, + _normalize_path, + _protect_dataset_variables_inplace, +) ENGINES = { "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf, From c6d341c7ad0190588184d4126f2f8236fc162da8 Mon Sep 17 00:00:00 2001 From: TheRed86 Date: Mon, 2 Nov 2020 13:05:12 +0100 Subject: [PATCH 46/46] Remove duplicated functions. --- xarray/core/dataset.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d72df555597..562a99b9028 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -393,40 +393,6 @@ def _get_chunk(name, var, chunks): return output_chunks -def _check_chunks_compatibility(dim, chunks, chunk_spec): - spec = chunks[dim] - if isinstance(spec, int): - spec = (spec,) - if any(s % chunk_spec[dim] for s in spec): - warnings.warn( - "Specified Dask chunks %r would " - "separate on disks chunk shape %r for " - "dimension %r. This could " - "degrades performance. Consider " - "rechunking after loading instead." % (chunks[dim], chunk_spec[dim], dim), - stacklevel=2, - ) - - -def _get_chunk(name, var, chunks): - if chunks == "auto": - chunks = {} - - preferred_chunks = dict(zip(var.dims, var.encoding.get("chunks", {}))) - if var.ndim == 1 and var.dims[0] == name: - return preferred_chunks - - output_chunks = {} - if chunks is not None: - for dim in preferred_chunks: - if dim in chunks: - _check_chunks_compatibility(dim, chunks, preferred_chunks) - output_chunks[dim] = chunks[dim] - else: - output_chunks[dim] = preferred_chunks[dim] - return output_chunks - - def _maybe_chunk( name, var,
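The patch series above leaves the v2 code path opt-in. As a rough end-to-end illustration only (not part of any patch), the snippet below assumes an xarray checkout with these patches applied and a hypothetical local file "example.nc"; it exercises the XARRAY_BACKEND_API dispatch added to api.open_dataset and the chunking behaviour of apiv2.dataset_from_backend_dataset.

import os

os.environ["XARRAY_BACKEND_API"] = "v2"  # opt in to the apiv2 dispatch in api.open_dataset

import xarray as xr

# Engines registered in apiv2.ENGINES ("h5netcdf", "zarr", "cfgrib") are routed to
# apiv2.open_dataset; any other engine falls back to the existing v1 code path.
ds_lazy = xr.open_dataset("example.nc", engine="h5netcdf", chunks={"time": 100})

# With the default chunks=None, dask is bypassed: the backend dataset is returned
# after cache protection, with ds.encoding["source"] set to the file name.
ds_eager = xr.open_dataset("example.nc", engine="h5netcdf")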