From c210f8b9e3356590ee0d4e25dbb21b93cf7a5309 Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Thu, 28 Oct 2021 13:46:03 +0200
Subject: [PATCH 1/8] [test-upstream] fix pd skipna=None (#5899)

---
 xarray/tests/test_duck_array_ops.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py
index 6d49e20909d..c032a781e47 100644
--- a/xarray/tests/test_duck_array_ops.py
+++ b/xarray/tests/test_duck_array_ops.py
@@ -258,6 +258,11 @@ def from_series_or_scalar(se):
 def series_reduce(da, func, dim, **kwargs):
     """convert DataArray to pd.Series, apply pd.func, then convert back to
     a DataArray. Multiple dims cannot be specified."""
+
+    # pd no longer accepts skipna=None https://github.com/pandas-dev/pandas/issues/44178
+    if kwargs.get("skipna", True) is None:
+        kwargs["skipna"] = True
+
     if dim is None or da.ndim == 1:
         se = da.to_series()
         return from_series_or_scalar(getattr(se, func)(**kwargs))

From 36f05d70c864ee7c61603c8a43ba721bf7f434b3 Mon Sep 17 00:00:00 2001
From: Tom Nicholas <35968931+TomNicholas@users.noreply.github.com>
Date: Thu, 28 Oct 2021 18:41:58 -0400
Subject: [PATCH 2/8] Use .to_numpy() for quantified facetgrids (#5886)

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
---
 doc/whats-new.rst          |  3 +++
 xarray/plot/facetgrid.py   | 14 +++++++-------
 xarray/plot/plot.py        | 10 +++++-----
 xarray/tests/test_units.py | 26 +++++++++++++++++++++++++-
 4 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index c75a4e4de31..9811ab0163a 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -82,6 +82,9 @@ Bug fixes
   By `Jimmy Westling `_.
 - Numbers are properly formatted in a plot's title (:issue:`5788`, :pull:`5789`).
   By `Maxime Liquet `_.
+- Faceted plots will no longer raise a `pint.UnitStrippedWarning` when a `pint.Quantity` array is plotted,
+  and will correctly display the units of the data in the colorbar (if there is one) (:pull:`5886`).
+  By `Tom Nicholas `_.
 - With backends, check for path-like objects rather than ``pathlib.Path``
   type, use ``os.fspath`` (:pull:`5879`).
   By `Mike Taves `_.
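[Editor's note: illustration only, not part of the patch. A minimal sketch of why the diffs below swap .values for .to_numpy() on quantified data, assuming pint is installed; the array mirrors the ones used in the new tests. .values coerces through np.asarray, which strips the units and triggers the pint.UnitStrippedWarning this patch fixes, while .to_numpy() unwraps the duck array deliberately.]

    import warnings

    import numpy as np
    import pint
    import xarray as xr

    ureg = pint.UnitRegistry()
    da = xr.DataArray(np.ones((2, 3)) * ureg.Pa, dims=["x", "y"], name="pressure")

    # .values funnels through np.asarray, so pint warns that the units are
    # being discarded silently.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        stripped = da.values
    assert any(issubclass(w.category, pint.UnitStrippedWarning) for w in caught)

    # .to_numpy() asks the wrapped array to convert itself (magnitude only for
    # pint), so no warning is raised -- hence the change below.
    plain = da.to_numpy()
    assert isinstance(plain, np.ndarray)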
diff --git a/xarray/plot/facetgrid.py b/xarray/plot/facetgrid.py
index b384dea0571..a518a78dbf6 100644
--- a/xarray/plot/facetgrid.py
+++ b/xarray/plot/facetgrid.py
@@ -173,11 +173,11 @@ def __init__(
         )

         # Set up the lists of names for the row and column facet variables
-        col_names = list(data[col].values) if col else []
-        row_names = list(data[row].values) if row else []
+        col_names = list(data[col].to_numpy()) if col else []
+        row_names = list(data[row].to_numpy()) if row else []

         if single_group:
-            full = [{single_group: x} for x in data[single_group].values]
+            full = [{single_group: x} for x in data[single_group].to_numpy()]
             empty = [None for x in range(nrow * ncol - len(full))]
             name_dicts = full + empty
         else:
@@ -251,7 +251,7 @@ def map_dataarray(self, func, x, y, **kwargs):
             raise ValueError("cbar_ax not supported by FacetGrid.")

         cmap_params, cbar_kwargs = _process_cmap_cbar_kwargs(
-            func, self.data.values, **kwargs
+            func, self.data.to_numpy(), **kwargs
         )

         self._cmap_extend = cmap_params.get("extend")
@@ -347,7 +347,7 @@ def map_dataset(

         if hue and meta_data["hue_style"] == "continuous":
             cmap_params, cbar_kwargs = _process_cmap_cbar_kwargs(
-                func, self.data[hue].values, **kwargs
+                func, self.data[hue].to_numpy(), **kwargs
             )
             kwargs["meta_data"]["cmap_params"] = cmap_params
             kwargs["meta_data"]["cbar_kwargs"] = cbar_kwargs
@@ -423,7 +423,7 @@ def _adjust_fig_for_guide(self, guide):
     def add_legend(self, **kwargs):
         self.figlegend = self.fig.legend(
             handles=self._mappables[-1],
-            labels=list(self._hue_var.values),
+            labels=list(self._hue_var.to_numpy()),
             title=self._hue_label,
             loc="center right",
             **kwargs,
@@ -619,7 +619,7 @@ def map(self, func, *args, **kwargs):
             if namedict is not None:
                 data = self.data.loc[namedict]
                 plt.sca(ax)
-                innerargs = [data[a].values for a in args]
+                innerargs = [data[a].to_numpy() for a in args]
                 maybe_mappable = func(*innerargs, **kwargs)
                 # TODO: better way to verify that an artist is mappable?
                 # https://stackoverflow.com/questions/33023036/is-it-possible-to-detect-if-a-matplotlib-artist-is-a-mappable-suitable-for-use-w#33023522

diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py
index 1e1e59e2f71..60f132d07e1 100644
--- a/xarray/plot/plot.py
+++ b/xarray/plot/plot.py
@@ -1075,7 +1075,7 @@ def newplotfunc(
             # Matplotlib does not support normalising RGB data, so do it here.
             # See eg. https://github.com/matplotlib/matplotlib/pull/10220
             if robust or vmax is not None or vmin is not None:
-                darray = _rescale_imshow_rgb(darray, vmin, vmax, robust)
+                darray = _rescale_imshow_rgb(darray.as_numpy(), vmin, vmax, robust)
                 vmin, vmax, robust = None, None, False

         if subplot_kws is None:
@@ -1146,10 +1146,6 @@ def newplotfunc(
         else:
             dims = (yval.dims[0], xval.dims[0])

-        # better to pass the ndarrays directly to plotting functions
-        xval = xval.to_numpy()
-        yval = yval.to_numpy()
-
         # May need to transpose for correct x, y labels
         # xlab may be the name of a coord, we have to check for dim names
         if imshow_rgb:
@@ -1162,6 +1158,10 @@ def newplotfunc(
         if dims != darray.dims:
             darray = darray.transpose(*dims, transpose_coords=True)

+        # better to pass the ndarrays directly to plotting functions
+        xval = xval.to_numpy()
+        yval = yval.to_numpy()
+
         # Pass the data as a masked ndarray too
         zval = darray.to_masked_array(copy=False)

diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py
index 7bde6ce8b9f..8be20c5f81c 100644
--- a/xarray/tests/test_units.py
+++ b/xarray/tests/test_units.py
@@ -5614,7 +5614,7 @@ def test_units_in_line_plot_labels(self):
         assert ax.get_ylabel() == "pressure [pascal]"
         assert ax.get_xlabel() == "x [meters]"

-    def test_units_in_2d_plot_labels(self):
+    def test_units_in_2d_plot_colorbar_label(self):
         arr = np.ones((2, 3)) * unit_registry.Pa
         da = xr.DataArray(data=arr, dims=["x", "y"], name="pressure")

@@ -5622,3 +5622,27 @@ def test_units_in_2d_plot_labels(self):
         ax = da.plot.contourf(ax=ax, cbar_ax=cax, add_colorbar=True)

         assert cax.get_ylabel() == "pressure [pascal]"
+
+    def test_units_facetgrid_plot_labels(self):
+        arr = np.ones((2, 3)) * unit_registry.Pa
+        da = xr.DataArray(data=arr, dims=["x", "y"], name="pressure")
+
+        fig, (ax, cax) = plt.subplots(1, 2)
+        fgrid = da.plot.line(x="x", col="y")
+
+        assert fgrid.axes[0, 0].get_ylabel() == "pressure [pascal]"
+
+    def test_units_facetgrid_2d_imshow_plot_colorbar_labels(self):
+        arr = np.ones((2, 3, 4, 5)) * unit_registry.Pa
+        da = xr.DataArray(data=arr, dims=["x", "y", "z", "w"], name="pressure")
+
+        da.plot.imshow(x="x", y="y", col="w")  # no colorbar to check labels of
+
+    def test_units_facetgrid_2d_contourf_plot_colorbar_labels(self):
+        arr = np.ones((2, 3, 4)) * unit_registry.Pa
+        da = xr.DataArray(data=arr, dims=["x", "y", "z"], name="pressure")
+
+        fig, (ax1, ax2, ax3, cax) = plt.subplots(1, 4)
+        fgrid = da.plot.contourf(x="x", y="y", col="z")
+
+        assert fgrid.cbar.ax.get_ylabel() == "pressure [pascal]"

From 3bfa8c01490ec82506b66bc64446425f43b0362b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kai=20M=C3=BChlbauer?=
Date: Fri, 29 Oct 2021 13:29:18 +0200
Subject: [PATCH 3/8] Add wradlib to ecosystem in docs (#5915)

---
 doc/ecosystem.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/ecosystem.rst b/doc/ecosystem.rst
index 1c919b3040b..460541e91d7 100644
--- a/doc/ecosystem.rst
+++ b/doc/ecosystem.rst
@@ -37,6 +37,7 @@ Geosciences
 - `Spyfit `_: FTIR spectroscopy of the atmosphere
 - `windspharm `_: Spherical harmonic wind analysis in Python.
+- `wradlib `_: An Open Source Library for Weather Radar Data Processing.
 - `wrf-python `_: A collection of diagnostic and interpolation routines for use with output of the Weather Research and Forecasting (WRF-ARW) Model.
 - `xarray-simlab `_: xarray extension for computer model simulations.
 - `xarray-spatial `_: Numba-accelerated raster-based spatial processing tools (NDVI, curvature, zonal-statistics, proximity, hillshading, viewshed, etc.)
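[Editor's note: illustration only, not part of any patch. The next patch stops Dataset.unstack from touching .data on index variables. A small sketch of the cost involved, using the standard xarray/pandas API of this era (sizes are arbitrary): materialising .data for a MultiIndex coordinate builds an object-dtype ndarray of tuples, which is expensive for large indexes.]

    import numpy as np
    import pandas as pd
    import xarray as xr

    index = pd.MultiIndex.from_product([range(500), range(500)], names=["a", "b"])
    ds = xr.Dataset({"var": ("z", np.arange(len(index)))}, coords={"z": index})

    # Touching .data on the MultiIndex coordinate allocates a 250_000-element
    # object array of tuples just so its type can be inspected -- the slow path
    # the patch sidesteps by type-checking only the non-index variables.
    z_data = ds["z"].variable.data
    assert z_data.dtype == object

    # unstack itself needs only the per-level codes and values, not that array.
    unstacked = ds.unstack("z")
    assert unstacked["var"].shape == (500, 500)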
From b2ed62e95e452894dfd0a0aa156c3c7b0236c257 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 29 Oct 2021 10:14:43 -0500
Subject: [PATCH 4/8] Avoid accessing slow .data in unstack (#5906)

---
 doc/whats-new.rst      |  1 +
 xarray/core/dataset.py | 54 +++++++++++++++++++++---------------------
 2 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 9811ab0163a..8dfecdd2aa8 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -90,6 +90,7 @@ Bug fixes
   By `Mike Taves `_.
 - ``open_mfdataset()`` now accepts a single ``pathlib.Path`` object (:issue:`5881`).
   By `Panos Mavrogiorgos `_.
+- Improved performance of :py:meth:`Dataset.unstack` (:pull:`5906`). By `Tom Augspurger `_.

 Documentation
 ~~~~~~~~~~~~~

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 3055e50aaf5..0e6ae905aa8 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -4153,34 +4153,34 @@ def unstack(
         )

         result = self.copy(deep=False)
-        for dim in dims:
-            if (
-                # Dask arrays don't support assignment by index, which the fast unstack
-                # function requires.
-                # https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
-                any(is_duck_dask_array(v.data) for v in self.variables.values())
-                # Sparse doesn't currently support (though we could special-case
-                # it)
-                # https://github.com/pydata/sparse/issues/422
-                or any(
-                    isinstance(v.data, sparse_array_type)
-                    for v in self.variables.values()
-                )
-                or sparse
-                # Until https://github.com/pydata/xarray/pull/4751 is resolved,
-                # we check explicitly whether it's a numpy array. Once that is
-                # resolved, explicitly exclude pint arrays.
-                # # pint doesn't implement `np.full_like` in a way that's
-                # # currently compatible.
-                # # https://github.com/pydata/xarray/pull/4746#issuecomment-753425173
-                # # or any(
-                # #     isinstance(v.data, pint_array_type) for v in self.variables.values()
-                # # )
-                or any(
-                    not isinstance(v.data, np.ndarray) for v in self.variables.values()
-                )
-            ):
+        # we want to avoid allocating an object-dtype ndarray for a MultiIndex,
+        # so we can't just access self.variables[v].data for every variable.
+        # We only check the non-index variables.
+        # https://github.com/pydata/xarray/issues/5902
+        nonindexes = [
+            self.variables[k] for k in set(self.variables) - set(self.xindexes)
+        ]
+        # Notes for each of these cases:
+        # 1. Dask arrays don't support assignment by index, which the fast unstack
+        #    function requires.
+        #    https://github.com/pydata/xarray/pull/4746#issuecomment-753282125
+        # 2. Sparse doesn't currently support (though we could special-case it)
+        #    https://github.com/pydata/sparse/issues/422
+        # 3. pint requires checking if it's a NumPy array until
+        #    https://github.com/pydata/xarray/pull/4751 is resolved.
+        #    Once that is resolved, explicitly exclude pint arrays;
+        #    pint doesn't implement `np.full_like` in a way that's
+        #    currently compatible.
+        needs_full_reindex = sparse or any(
+            is_duck_dask_array(v.data)
+            or isinstance(v.data, sparse_array_type)
+            or not isinstance(v.data, np.ndarray)
+            for v in nonindexes
+        )
+
+        for dim in dims:
+            if needs_full_reindex:
                 result = result._unstack_full_reindex(dim, fill_value, sparse)
             else:
                 result = result._unstack_once(dim, fill_value)

From ba00852d061a330adbb922a2485c4de92a99540d Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Fri, 29 Oct 2021 17:16:02 +0200
Subject: [PATCH 5/8] #5740 follow up: suppress xr.ufunc warnings in tests (#5914)

---
 xarray/tests/test_dask.py   | 6 +++---
 xarray/tests/test_sparse.py | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index d5d460056aa..de69c972fc6 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -255,13 +255,13 @@ def test_missing_methods(self):
         except NotImplementedError as err:
             assert "dask" in str(err)

-    @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning")
+    @pytest.mark.filterwarnings("ignore::FutureWarning")
     def test_univariate_ufunc(self):
         u = self.eager_var
         v = self.lazy_var
         self.assertLazyAndAllClose(np.sin(u), xu.sin(v))

-    @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning")
+    @pytest.mark.filterwarnings("ignore::FutureWarning")
     def test_bivariate_ufunc(self):
         u = self.eager_var
         v = self.lazy_var
@@ -563,7 +563,7 @@ def duplicate_and_merge(array):
         actual = duplicate_and_merge(self.lazy_array)
         self.assertLazyAndEqual(expected, actual)

-    @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning")
+    @pytest.mark.filterwarnings("ignore::FutureWarning")
     def test_ufuncs(self):
         u = self.eager_array
         v = self.lazy_array

diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py
index 3d57d3dc961..ad0aafff15e 100644
--- a/xarray/tests/test_sparse.py
+++ b/xarray/tests/test_sparse.py
@@ -276,11 +276,11 @@ def test_unary_op(self):
         assert_sparse_equal(abs(self.var).data, abs(self.data))
         assert_sparse_equal(self.var.round().data, self.data.round())

-    @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning")
+    @pytest.mark.filterwarnings("ignore::FutureWarning")
     def test_univariate_ufunc(self):
         assert_sparse_equal(np.sin(self.data), xu.sin(self.var).data)

-    @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning")
+    @pytest.mark.filterwarnings("ignore::FutureWarning")
     def test_bivariate_ufunc(self):
         assert_sparse_equal(np.maximum(self.data, 0), xu.maximum(self.var, 0).data)
         assert_sparse_equal(np.maximum(self.data, 0), xu.maximum(0, self.var).data)
@@ -664,7 +664,7 @@ def test_stack(self):
         roundtripped = stacked.unstack()
         assert_identical(arr, roundtripped)

-    @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning")
+    @pytest.mark.filterwarnings("ignore::FutureWarning")
     def test_ufuncs(self):
         x = self.sp_xr
         assert_equal(np.sin(x), xu.sin(x))

From bcb96ce8fc1012b05ba6966fff70bc687cd6125f Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Fri, 29 Oct 2021 22:11:21 +0530
Subject: [PATCH 6/8] Add typing_extensions as a required dependency (#5911)

---
 ci/requirements/environment-windows.yml  |  1 +
 ci/requirements/environment.yml          |  1 +
 ci/requirements/py37-bare-minimum.yml    |  1 +
 ci/requirements/py37-min-all-deps.yml    |  1 +
 ci/requirements/py38-all-but-dask.yml    |  1 +
 doc/getting-started-guide/installing.rst |  1 +
 requirements.txt                         |  2 +-
 setup.cfg                                |  1 +
 xarray/core/npcompat.py                  | 11 +---
 xarray/core/options.py                   | 83 +++++++-----------------
 10 files changed, 34 insertions(+), 69 deletions(-)

diff --git a/ci/requirements/environment-windows.yml b/ci/requirements/environment-windows.yml
index 78ead40d5a2..2468ec6267e 100644
--- a/ci/requirements/environment-windows.yml
+++ b/ci/requirements/environment-windows.yml
@@ -39,6 +39,7 @@ dependencies:
   - setuptools
   - sparse
   - toolz
+  - typing_extensions
   - zarr
   - pip:
     - numbagg

diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml
index f64ca3677cc..162faa7b74d 100644
--- a/ci/requirements/environment.yml
+++ b/ci/requirements/environment.yml
@@ -43,6 +43,7 @@ dependencies:
   - setuptools
   - sparse
   - toolz
+  - typing_extensions
   - zarr
   - pip:
     - numbagg

diff --git a/ci/requirements/py37-bare-minimum.yml b/ci/requirements/py37-bare-minimum.yml
index 408cf76fdd6..0cecf885436 100644
--- a/ci/requirements/py37-bare-minimum.yml
+++ b/ci/requirements/py37-bare-minimum.yml
@@ -13,3 +13,4 @@ dependencies:
   - numpy=1.17
   - pandas=1.0
   - setuptools=40.4
+  - typing_extensions=3.7

diff --git a/ci/requirements/py37-min-all-deps.yml b/ci/requirements/py37-min-all-deps.yml
index 7c3230f87b0..c73c5327d3b 100644
--- a/ci/requirements/py37-min-all-deps.yml
+++ b/ci/requirements/py37-min-all-deps.yml
@@ -47,6 +47,7 @@ dependencies:
   - setuptools=40.4
   - sparse=0.8
   - toolz=0.10
+  - typing_extensions=3.7
   - zarr=2.4
   - pip:
     - numbagg==0.1

diff --git a/ci/requirements/py38-all-but-dask.yml b/ci/requirements/py38-all-but-dask.yml
index 3f82990f3b5..688dfb7a2bc 100644
--- a/ci/requirements/py38-all-but-dask.yml
+++ b/ci/requirements/py38-all-but-dask.yml
@@ -39,6 +39,7 @@ dependencies:
   - setuptools
   - sparse
   - toolz
+  - typing_extensions
   - zarr
   - pip:
     - numbagg

diff --git a/doc/getting-started-guide/installing.rst b/doc/getting-started-guide/installing.rst
index 93738da9d9b..c6bc84e6ddb 100644
--- a/doc/getting-started-guide/installing.rst
+++ b/doc/getting-started-guide/installing.rst
@@ -8,6 +8,7 @@ Required dependencies

 - Python (3.7 or later)
 - setuptools (40.4 or later)
+- ``typing_extensions`` (3.7 or later)
 - `numpy `__ (1.17 or later)
 - `pandas `__ (1.0 or later)

diff --git a/requirements.txt b/requirements.txt
index 732d40cde18..0fa83c8ccc1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,4 @@
 numpy >= 1.17
 pandas >= 1.0
 setuptools >= 40.4
-typing-extensions >= 3.10
+typing-extensions >= 3.7

diff --git a/setup.cfg b/setup.cfg
index aa8ca8df0ff..2dc1b7ffeca 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -78,6 +78,7 @@ python_requires = >=3.7
 install_requires =
     numpy >= 1.17
     pandas >= 1.0
+    typing_extensions >= 3.7
     setuptools >= 40.4  # For pkg_resources

 [options.extras_require]

diff --git a/xarray/core/npcompat.py b/xarray/core/npcompat.py
index 6e22c8cf0a4..09f78c5971c 100644
--- a/xarray/core/npcompat.py
+++ b/xarray/core/npcompat.py
@@ -41,17 +41,10 @@
     # fall back for numpy < 1.20, ArrayLike adapted from numpy.typing._array_like
     if sys.version_info >= (3, 8):
         from typing import Protocol
-
-        HAVE_PROTOCOL = True
     else:
-        try:
-            from typing_extensions import Protocol
-        except ImportError:
-            HAVE_PROTOCOL = False
-        else:
-            HAVE_PROTOCOL = True
+        from typing_extensions import Protocol

-    if TYPE_CHECKING or HAVE_PROTOCOL:
+    if TYPE_CHECKING:

         class _SupportsArray(Protocol):
             def __array__(self) -> np.ndarray:

diff --git a/xarray/core/options.py b/xarray/core/options.py
index 14f77306316..c9e037e6fd6 100644
--- a/xarray/core/options.py
+++ b/xarray/core/options.py
@@ -6,70 +6,35 @@
 # TODO: Remove this check once python 3.7 is not supported:
 if sys.version_info >= (3, 8):
     from typing import TYPE_CHECKING, Literal, TypedDict, Union
+else:
+    from typing import TYPE_CHECKING, Union

-    if TYPE_CHECKING:
-        try:
-            from matplotlib.colors import Colormap
-        except ImportError:
-            Colormap = str
-
-    class T_Options(TypedDict):
-        arithmetic_join: Literal["inner", "outer", "left", "right", "exact"]
-        cmap_divergent: Union[str, "Colormap"]
-        cmap_sequential: Union[str, "Colormap"]
-        display_max_rows: int
-        display_style: Literal["text", "html"]
-        display_width: int
-        display_expand_attrs: Literal["default", True, False]
-        display_expand_coords: Literal["default", True, False]
-        display_expand_data_vars: Literal["default", True, False]
-        display_expand_data: Literal["default", True, False]
-        enable_cftimeindex: bool
-        file_cache_maxsize: int
-        keep_attrs: Literal["default", True, False]
-        warn_for_unclosed_files: bool
-        use_bottleneck: bool
+    from typing_extensions import Literal, TypedDict

-else:
-    # See GH5624, this is a convoluted way to allow type-checking to use
-    # `TypedDict` and `Literal` without requiring typing_extensions as a
-    # required dependency to _run_ the code (it is required to type-check).
+if TYPE_CHECKING:
     try:
-        from typing import TYPE_CHECKING, Union
-
-        from typing_extensions import Literal, TypedDict
-
-        if TYPE_CHECKING:
-            try:
-                from matplotlib.colors import Colormap
-            except ImportError:
-                Colormap = str
-
-        class T_Options(TypedDict):
-            arithmetic_join: Literal["inner", "outer", "left", "right", "exact"]
-            cmap_divergent: Union[str, "Colormap"]
-            cmap_sequential: Union[str, "Colormap"]
-            display_max_rows: int
-            display_style: Literal["text", "html"]
-            display_width: int
-            display_expand_attrs: Literal["default", True, False]
-            display_expand_coords: Literal["default", True, False]
-            display_expand_data_vars: Literal["default", True, False]
-            display_expand_data: Literal["default", True, False]
-            enable_cftimeindex: bool
-            file_cache_maxsize: int
-            keep_attrs: Literal["default", True, False]
-            warn_for_unclosed_files: bool
-            use_bottleneck: bool
-
+        from matplotlib.colors import Colormap
     except ImportError:
-        from typing import TYPE_CHECKING, Any, Dict, Hashable
-
-        if TYPE_CHECKING:
-            raise
-        else:
-            T_Options = Dict[Hashable, Any]
+        Colormap = str
+
+
+class T_Options(TypedDict):
+    arithmetic_join: Literal["inner", "outer", "left", "right", "exact"]
+    cmap_divergent: Union[str, "Colormap"]
+    cmap_sequential: Union[str, "Colormap"]
+    display_max_rows: int
+    display_style: Literal["text", "html"]
+    display_width: int
+    display_expand_attrs: Literal["default", True, False]
+    display_expand_coords: Literal["default", True, False]
+    display_expand_data_vars: Literal["default", True, False]
+    display_expand_data: Literal["default", True, False]
+    enable_cftimeindex: bool
+    file_cache_maxsize: int
+    keep_attrs: Literal["default", True, False]
+    warn_for_unclosed_files: bool
+    use_bottleneck: bool


 OPTIONS: T_Options = {

From 1d94b1ebc323f55448c07d3778cd51d72666683b Mon Sep 17 00:00:00 2001
From: Tom Nicholas <35968931+TomNicholas@users.noreply.github.com>
Date: Fri, 29 Oct 2021 14:12:21 -0400
Subject: [PATCH 7/8] Add .chunksizes property (#5900)

* added chunksizes property
* fix typing via Hashable->Any
* add chunksizes to API doc
* whatsnew
* grammar
* Update doc/whats-new.rst

Co-authored-by: Deepak Cherian

* Update doc/whats-new.rst

Co-authored-by: Deepak Cherian

* removed the word consistent
* test .chunksizes

Co-authored-by: Deepak Cherian
---
 doc/api.rst               |  2 ++
 doc/whats-new.rst         |  4 +++
 xarray/core/common.py     | 17 +++++++++++++
 xarray/core/dataarray.py  | 32 +++++++++++++++++++++---
 xarray/core/dataset.py    | 51 ++++++++++++++++++++++++++++-----------
 xarray/core/variable.py   | 37 +++++++++++++++++++++++++---
 xarray/tests/test_dask.py | 37 ++++++++++++++++++++++++++++
 7 files changed, 159 insertions(+), 21 deletions(-)

diff --git a/doc/api.rst b/doc/api.rst
index 83015cb3993..183e9d1425e 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -65,6 +65,7 @@ Attributes
    Dataset.indexes
    Dataset.get_index
    Dataset.chunks
+   Dataset.chunksizes
    Dataset.nbytes

 Dictionary interface
@@ -271,6 +272,7 @@ Attributes
    DataArray.encoding
    DataArray.indexes
    DataArray.get_index
+   DataArray.chunksizes

 **ndarray attributes**:
 :py:attr:`~DataArray.ndim`

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 8dfecdd2aa8..cd0d4461aa2 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -38,6 +38,10 @@ New Features
   `Nathan Lis `_.
 - Histogram plots are set with a title displaying the scalar coords if any, similarly to the other plots (:issue:`5791`, :pull:`5792`).
   By `Maxime Liquet `_.
+- Added new :py:attr:`Dataset.chunksizes`, :py:attr:`DataArray.chunksizes`, and :py:attr:`Variable.chunksizes`
+  properties, which will always return a mapping from dimension names to chunking pattern along that dimension,
+  regardless of whether the object is a Dataset, DataArray, or Variable. (:issue:`5846`, :pull:`5900`)
+  By `Tom Nicholas `_.

 Breaking changes
 ~~~~~~~~~~~~~~~~

diff --git a/xarray/core/common.py b/xarray/core/common.py
index 2c5d7900ef8..b5dc3bf0e20 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -1813,6 +1813,23 @@ def ones_like(other, dtype: DTypeLike = None):
     return full_like(other, 1, dtype)


+def get_chunksizes(
+    variables: Iterable[Variable],
+) -> Mapping[Any, Tuple[int, ...]]:
+
+    chunks: Dict[Any, Tuple[int, ...]] = {}
+    for v in variables:
+        if hasattr(v.data, "chunks"):
+            for dim, c in v.chunksizes.items():
+                if dim in chunks and c != chunks[dim]:
+                    raise ValueError(
+                        f"Object has inconsistent chunks along dimension {dim}. "
+                        "This can be fixed by calling unify_chunks()."
+                    )
+                chunks[dim] = c
+    return Frozen(chunks)
+
+
 def is_np_datetime_like(dtype: DTypeLike) -> bool:
     """Check if a dtype is a subclass of the numpy datetime types"""
     return np.issubdtype(dtype, np.datetime64) or np.issubdtype(dtype, np.timedelta64)

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index ed8b393628d..c6f534796d1 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -43,7 +43,7 @@
     reindex_like_indexers,
 )
 from .arithmetic import DataArrayArithmetic
-from .common import AbstractArray, DataWithCoords
+from .common import AbstractArray, DataWithCoords, get_chunksizes
 from .computation import unify_chunks
 from .coordinates import (
     DataArrayCoordinates,
@@ -1058,11 +1058,37 @@ def __deepcopy__(self, memo=None) -> "DataArray":

     @property
     def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
-        """Block dimensions for this array's data or None if it's not a dask
-        array.
+        """
+        Tuple of block lengths for this dataarray's data, in order of dimensions, or None if
+        the underlying data is not a dask array.
+
+        See Also
+        --------
+        DataArray.chunk
+        DataArray.chunksizes
+        xarray.unify_chunks
         """
         return self.variable.chunks

+    @property
+    def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
+        """
+        Mapping from dimension names to block lengths for this dataarray's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Differs from DataArray.chunks because it returns a mapping of dimensions to chunk shapes
+        instead of a tuple of chunk shapes.
+
+        See Also
+        --------
+        DataArray.chunk
+        DataArray.chunks
+        xarray.unify_chunks
+        """
+        all_variables = [self.variable] + [c.variable for c in self.coords.values()]
+        return get_chunksizes(all_variables)
+
     def chunk(
         self,
         chunks: Union[

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 0e6ae905aa8..e882495dce5 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -52,7 +52,7 @@
 )
 from .alignment import _broadcast_helper, _get_broadcast_dims_map_common_coords, align
 from .arithmetic import DatasetArithmetic
-from .common import DataWithCoords, _contains_datetime_like_objects
+from .common import DataWithCoords, _contains_datetime_like_objects, get_chunksizes
 from .computation import unify_chunks
 from .coordinates import (
     DatasetCoordinates,
@@ -2095,20 +2095,37 @@ def info(self, buf=None) -> None:

     @property
     def chunks(self) -> Mapping[Hashable, Tuple[int, ...]]:
-        """Block dimensions for this dataset's data or None if it's not a dask
-        array.
         """
-        chunks: Dict[Hashable, Tuple[int, ...]] = {}
-        for v in self.variables.values():
-            if v.chunks is not None:
-                for dim, c in zip(v.dims, v.chunks):
-                    if dim in chunks and c != chunks[dim]:
-                        raise ValueError(
-                            f"Object has inconsistent chunks along dimension {dim}. "
-                            "This can be fixed by calling unify_chunks()."
-                        )
-                    chunks[dim] = c
-        return Frozen(chunks)
+        Mapping from dimension names to block lengths for this dataset's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Same as Dataset.chunksizes, but maintained for backwards compatibility.
+
+        See Also
+        --------
+        Dataset.chunk
+        Dataset.chunksizes
+        xarray.unify_chunks
+        """
+        return get_chunksizes(self.variables.values())
+
+    @property
+    def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
+        """
+        Mapping from dimension names to block lengths for this dataset's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Same as Dataset.chunks.
+
+        See Also
+        --------
+        Dataset.chunk
+        Dataset.chunks
+        xarray.unify_chunks
+        """
+        return get_chunksizes(self.variables.values())

     def chunk(
         self,
@@ -2147,6 +2164,12 @@ def chunk(
         Returns
         -------
         chunked : xarray.Dataset
+
+        See Also
+        --------
+        Dataset.chunks
+        Dataset.chunksizes
+        xarray.unify_chunks
         """
         if chunks is None:
             warnings.warn(

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 191bb4059f5..a96adb31e64 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -45,6 +45,7 @@
     sparse_array_type,
 )
 from .utils import (
+    Frozen,
     NdimSizeLenMixin,
     OrderedSet,
     _default,
@@ -996,16 +997,44 @@ def __deepcopy__(self, memo=None):
     __hash__ = None  # type: ignore[assignment]

     @property
-    def chunks(self):
-        """Block dimensions for this array's data or None if it's not a dask
-        array.
+    def chunks(self) -> Optional[Tuple[Tuple[int, ...], ...]]:
+        """
+        Tuple of block lengths for this variable's data, in order of dimensions, or None if
+        the underlying data is not a dask array.
+
+        See Also
+        --------
+        Variable.chunk
+        Variable.chunksizes
+        xarray.unify_chunks
         """
         return getattr(self._data, "chunks", None)

+    @property
+    def chunksizes(self) -> Mapping[Any, Tuple[int, ...]]:
+        """
+        Mapping from dimension names to block lengths for this variable's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Differs from Variable.chunks because it returns a mapping of dimensions to chunk shapes
+        instead of a tuple of chunk shapes.
+
+        See Also
+        --------
+        Variable.chunk
+        Variable.chunks
+        xarray.unify_chunks
+        """
+        if hasattr(self._data, "chunks"):
+            return Frozen({dim: c for dim, c in zip(self.dims, self.data.chunks)})
+        else:
+            return {}
+
     _array_counter = itertools.count()

     def chunk(self, chunks={}, name=None, lock=False):
-        """Coerce this array's data into a dask arrays with the given chunks.
+        """Coerce this array's data into a dask array with the given chunks.

         If this variable is a non-dask array, it will be converted to dask
         array. If it's a dask array, it will be rechunked to the given chunk

diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py
index de69c972fc6..9eda6ccdf19 100644
--- a/xarray/tests/test_dask.py
+++ b/xarray/tests/test_dask.py
@@ -104,6 +104,11 @@ def test_chunk(self):
             assert rechunked.chunks == expected
             self.assertLazyAndIdentical(self.eager_var, rechunked)

+            expected_chunksizes = {
+                dim: chunks for dim, chunks in zip(self.lazy_var.dims, expected)
+            }
+            assert rechunked.chunksizes == expected_chunksizes
+
     def test_indexing(self):
         u = self.eager_var
         v = self.lazy_var
@@ -330,6 +335,38 @@ def setUp(self):
             self.data, coords={"x": range(4)}, dims=("x", "y"), name="foo"
         )

+    def test_chunk(self):
+        for chunks, expected in [
+            ({}, ((2, 2), (2, 2, 2))),
+            (3, ((3, 1), (3, 3))),
+            ({"x": 3, "y": 3}, ((3, 1), (3, 3))),
+            ({"x": 3}, ((3, 1), (2, 2, 2))),
+            ({"x": (3, 1)}, ((3, 1), (2, 2, 2))),
+        ]:
+            # Test DataArray
+            rechunked = self.lazy_array.chunk(chunks)
+            assert rechunked.chunks == expected
+            self.assertLazyAndIdentical(self.eager_array, rechunked)
+
+            expected_chunksizes = {
+                dim: chunks for dim, chunks in zip(self.lazy_array.dims, expected)
+            }
+            assert rechunked.chunksizes == expected_chunksizes
+
+            # Test Dataset
+            lazy_dataset = self.lazy_array.to_dataset()
+            eager_dataset = self.eager_array.to_dataset()
+            expected_chunksizes = {
+                dim: chunks for dim, chunks in zip(lazy_dataset.dims, expected)
+            }
+            rechunked = lazy_dataset.chunk(chunks)
+
+            # Dataset.chunks has a different return type to DataArray.chunks - see issue #5843
+            assert rechunked.chunks == expected_chunksizes
+            self.assertLazyAndIdentical(eager_dataset, rechunked)
+
+            assert rechunked.chunksizes == expected_chunksizes
+
     def test_rechunk(self):
         chunked = self.eager_array.chunk({"x": 2}).chunk({"y": 2})
         assert chunked.chunks == ((2,) * 2, (2,) * 3)

From 867646fa75cb00413f1c21ca152d07a9bc4eb444 Mon Sep 17 00:00:00 2001
From: Tom Nicholas <35968931+TomNicholas@users.noreply.github.com>
Date: Fri, 29 Oct 2021 15:57:35 -0400
Subject: [PATCH 8/8] Combine by coords dataarray bugfix (#5834)

* fixed bug
* added tests + reorganised slightly
* clarified logic for dealing with mixed sets of objects
* removed commented out old code
* recorded bugfix in whatsnew
* Update doc/whats-new.rst

Co-authored-by: Mathias Hauser

* Update xarray/core/combine.py

Co-authored-by: Mathias Hauser

* removed pointless renaming
* update tests to look for capitalized error message
* clarified return type in docstring
* added test for combining two dataarrays with the same name
* Update xarray/tests/test_combine.py

Co-authored-by: Deepak Cherian

* Update doc/whats-new.rst

Co-authored-by: Deepak Cherian

* added examples to docstrings
* correct docstring example
* re-trigger CI
* Update xarray/core/combine.py

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>

Co-authored-by: Mathias Hauser
Co-authored-by: Deepak Cherian
Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
---
 doc/whats-new.rst            |   2 +
 xarray/core/combine.py       | 104 ++++++++++++++++++++++++++---------
 xarray/tests/test_combine.py |  71 +++++++++++++++++++-----
 3 files changed, 139 insertions(+), 38 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index cd0d4461aa2..08d6d23aeab 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -80,6 +80,8 @@ Bug fixes
 - Fixed performance bug where ``cftime`` import attempted within various core
   operations if ``cftime`` not installed (:pull:`5640`).
   By `Luke Sewell `_
+- Fixed bug when combining named DataArrays using :py:func:`combine_by_coords` (:pull:`5834`).
+  By `Tom Nicholas `_.
 - When a custom engine was used in :py:func:`~xarray.open_dataset` the engine
   wasn't initialized properly, causing missing argument errors or inconsistent
   method signatures. (:pull:`5684`)

diff --git a/xarray/core/combine.py b/xarray/core/combine.py
index 56956a57e02..081b53391ba 100644
--- a/xarray/core/combine.py
+++ b/xarray/core/combine.py
@@ -673,7 +673,7 @@ def combine_by_coords(
     Attempt to auto-magically combine the given datasets (or data arrays)
     into one by using dimension coordinates.

-    This method attempts to combine a group of datasets along any number of
+    This function attempts to combine a group of datasets along any number of
     dimensions into a single entity by inspecting coords and metadata and using
     a combination of concat and merge.

@@ -765,6 +765,8 @@ def combine_by_coords(
     Returns
     -------
     combined : xarray.Dataset or xarray.DataArray
+        Will return a Dataset unless all the inputs are unnamed DataArrays, in which case a
+        DataArray will be returned.

     See also
     --------
@@ -870,6 +872,50 @@ def combine_by_coords(
     Data variables:
         temperature    (y, x) float64 10.98 14.3 12.06 nan ... 18.89 10.44 8.293
         precipitation  (y, x) float64 0.4376 0.8918 0.9637 ... 0.5684 0.01879 0.6176
+
+    You can also combine DataArray objects, but the behaviour will differ depending on
+    whether or not the DataArrays are named. If all DataArrays are named then they will
+    be promoted to Datasets before combining, and then the resultant Dataset will be
+    returned, e.g.
+
+    >>> named_da1 = xr.DataArray(
+    ...     name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"
+    ... )
+    >>> named_da1
+    <xarray.DataArray 'a' (x: 2)>
+    array([1., 2.])
+    Coordinates:
+      * x        (x) int64 0 1
+
+    >>> named_da2 = xr.DataArray(
+    ...     name="a", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x"
+    ... )
+    >>> named_da2
+    <xarray.DataArray 'a' (x: 2)>
+    array([3., 4.])
+    Coordinates:
+      * x        (x) int64 2 3
+
+    >>> xr.combine_by_coords([named_da1, named_da2])
+    <xarray.Dataset>
+    Dimensions:  (x: 4)
+    Coordinates:
+      * x        (x) int64 0 1 2 3
+    Data variables:
+        a        (x) float64 1.0 2.0 3.0 4.0
+
+    If all the DataArrays are unnamed, a single DataArray will be returned, e.g.
+
+    >>> unnamed_da1 = xr.DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+    >>> unnamed_da2 = xr.DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
+    >>> xr.combine_by_coords([unnamed_da1, unnamed_da2])
+    <xarray.DataArray (x: 4)>
+    array([1., 2., 3., 4.])
+    Coordinates:
+      * x        (x) int64 0 1 2 3
+
+    Finally, if you attempt to combine a mix of unnamed DataArrays with either named
+    DataArrays or Datasets, a ValueError will be raised (as this is an ambiguous operation).
     """

     # TODO remove after version 0.21, see PR4696

@@ -883,33 +929,41 @@ def combine_by_coords(
     if not data_objects:
         return Dataset()

-    mixed_arrays_and_datasets = any(
+    objs_are_unnamed_dataarrays = [
         isinstance(data_object, DataArray) and data_object.name is None
         for data_object in data_objects
-    ) and any(isinstance(data_object, Dataset) for data_object in data_objects)
-    if mixed_arrays_and_datasets:
-        raise ValueError("Can't automatically combine datasets with unnamed arrays.")
-
-    all_unnamed_data_arrays = all(
-        isinstance(data_object, DataArray) and data_object.name is None
-        for data_object in data_objects
-    )
-    if all_unnamed_data_arrays:
-        unnamed_arrays = data_objects
-        temp_datasets = [data_array._to_temp_dataset() for data_array in unnamed_arrays]
-
-        combined_temp_dataset = _combine_single_variable_hypercube(
-            temp_datasets,
-            fill_value=fill_value,
-            data_vars=data_vars,
-            coords=coords,
-            compat=compat,
-            join=join,
-            combine_attrs=combine_attrs,
-        )
-        return DataArray()._from_temp_dataset(combined_temp_dataset)
-
+    ]
+    if any(objs_are_unnamed_dataarrays):
+        if all(objs_are_unnamed_dataarrays):
+            # Combine into a single larger DataArray
+            temp_datasets = [
+                unnamed_dataarray._to_temp_dataset()
+                for unnamed_dataarray in data_objects
+            ]
+
+            combined_temp_dataset = _combine_single_variable_hypercube(
+                temp_datasets,
+                fill_value=fill_value,
+                data_vars=data_vars,
+                coords=coords,
+                compat=compat,
+                join=join,
+                combine_attrs=combine_attrs,
+            )
+            return DataArray()._from_temp_dataset(combined_temp_dataset)
+        else:
+            # Must be a mix of unnamed DataArrays with named DataArrays or Datasets;
+            # can't combine these as we wouldn't know whether to merge or concatenate the arrays
+            raise ValueError(
+                "Can't automatically combine unnamed DataArrays with either named DataArrays or Datasets."
+            )
     else:
+        # Promote any named DataArrays to single-variable Datasets to simplify combining
+        data_objects = [
+            obj.to_dataset() if isinstance(obj, DataArray) else obj
+            for obj in data_objects
+        ]
+
         # Group by data vars
         sorted_datasets = sorted(data_objects, key=vars_as_keys)
         grouped_by_vars = itertools.groupby(sorted_datasets, key=vars_as_keys)

diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py
index cbe09aab815..8d0c09eacec 100644
--- a/xarray/tests/test_combine.py
+++ b/xarray/tests/test_combine.py
@@ -12,6 +12,7 @@
     combine_by_coords,
     combine_nested,
     concat,
+    merge,
 )
 from xarray.core import dtypes
 from xarray.core.combine import (
@@ -688,7 +689,7 @@ def test_nested_combine_mixed_datasets_arrays(self):
             combine_nested(objs, "x")


-class TestCombineAuto:
+class TestCombineDatasetsbyCoords:
    def test_combine_by_coords(self):
         objs = [Dataset({"x": [0]}), Dataset({"x": [1]})]
         actual = combine_by_coords(objs)
@@ -730,17 +731,6 @@ def test_combine_by_coords(self):
     def test_empty_input(self):
         assert_identical(Dataset(), combine_by_coords([]))

-    def test_combine_coords_mixed_datasets_arrays(self):
-        objs = [
-            DataArray([0, 1], dims=("x"), coords=({"x": [0, 1]})),
-            Dataset({"x": [2, 3]}),
-        ]
-        with pytest.raises(
-            ValueError,
-            match=r"Can't automatically combine datasets with unnamed arrays.",
-        ):
-            combine_by_coords(objs)
-
     @pytest.mark.parametrize(
         "join, expected",
         [
@@ -1044,7 +1034,35 @@ def test_combine_by_coords_incomplete_hypercube(self):
         with pytest.raises(ValueError):
             combine_by_coords([x1, x2, x3], fill_value=None)

-    def test_combine_by_coords_unnamed_arrays(self):
+
+class TestCombineMixedObjectsbyCoords:
+    def test_combine_by_coords_mixed_unnamed_dataarrays(self):
+        named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+        unnamed_da = DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
+
+        with pytest.raises(
+            ValueError, match="Can't automatically combine unnamed DataArrays with"
+        ):
+            combine_by_coords([named_da, unnamed_da])
+
+        da = DataArray([0, 1], dims="x", coords=({"x": [0, 1]}))
+        ds = Dataset({"x": [2, 3]})
+        with pytest.raises(
+            ValueError,
+            match="Can't automatically combine unnamed DataArrays with",
+        ):
+            combine_by_coords([da, ds])
+
+    def test_combine_coords_mixed_datasets_named_dataarrays(self):
+        da = DataArray(name="a", data=[4, 5], dims="x", coords=({"x": [0, 1]}))
+        ds = Dataset({"b": ("x", [2, 3])})
+        actual = combine_by_coords([da, ds])
+        expected = Dataset(
+            {"a": ("x", [4, 5]), "b": ("x", [2, 3])}, coords={"x": ("x", [0, 1])}
+        )
+        assert_identical(expected, actual)
+
+    def test_combine_by_coords_all_unnamed_dataarrays(self):
         unnamed_array = DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")

         actual = combine_by_coords([unnamed_array])
@@ -1060,6 +1078,33 @@ def test_combine_by_coords_unnamed_arrays(self):
         )
         assert_identical(expected, actual)

+    def test_combine_by_coords_all_named_dataarrays(self):
+        named_da = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+
+        actual = combine_by_coords([named_da])
+        expected = named_da.to_dataset()
+        assert_identical(expected, actual)
+
+        named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+        named_da2 = DataArray(name="b", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
+
+        actual = combine_by_coords([named_da1, named_da2])
+        expected = Dataset(
+            {
+                "a": DataArray(data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x"),
+                "b": DataArray(data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x"),
+            }
+        )
+        assert_identical(expected, actual)
+
+    def test_combine_by_coords_all_dataarrays_with_the_same_name(self):
+        named_da1 = DataArray(name="a", data=[1.0, 2.0], coords={"x": [0, 1]}, dims="x")
+        named_da2 = DataArray(name="a", data=[3.0, 4.0], coords={"x": [2, 3]}, dims="x")
+
+        actual = combine_by_coords([named_da1, named_da2])
+        expected = merge([named_da1, named_da2])
+        assert_identical(expected, actual)
+

 @requires_cftime
 def test_combine_by_coords_distant_cftime_dates():
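[Editor's note: illustration only, not part of the patches. A short usage sketch of the .chunksizes property added in patch 7, assuming dask is installed and a names-invented-for-the-example DataArray: DataArray.chunks stays positional, while .chunksizes keys chunk lengths by dimension name and gives the same shape of answer on DataArray, Dataset, and Variable.]

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.zeros((4, 6)), dims=["x", "y"], name="foo").chunk(
        {"x": 2, "y": 3}
    )

    # .chunks is positional: a tuple of per-dimension chunk tuples.
    assert da.chunks == ((2, 2), (3, 3))

    # .chunksizes maps dimension names to chunk lengths, identically across
    # DataArray, Dataset, and Variable (Dataset.chunks already behaved this way).
    assert dict(da.chunksizes) == {"x": (2, 2), "y": (3, 3)}
    assert dict(da.to_dataset().chunksizes) == {"x": (2, 2), "y": (3, 3)}
    assert dict(da.variable.chunksizes) == {"x": (2, 2), "y": (3, 3)}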