From 6ffaf6c523bc552f9387c4a8367542f83d4560c0 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Thu, 30 Nov 2023 10:50:48 -0800 Subject: [PATCH 01/12] Use numbagg for `rolling` methods A couple of tests are failing for the multi-dimensional case, which I'll fix before merge. --- xarray/core/rolling.py | 82 +++++++++++++++++++++++++++++++++--- xarray/tests/test_rolling.py | 51 ++++++++++++---------- 2 files changed, 104 insertions(+), 29 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 8f21fe37072..5151636f040 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -14,7 +14,7 @@ from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.pycompat import is_duck_dask_array from xarray.core.types import CoarsenBoundaryOptions, SideOptions, T_Xarray -from xarray.core.utils import either_dict_or_kwargs +from xarray.core.utils import either_dict_or_kwargs, module_available try: import bottleneck @@ -153,14 +153,21 @@ def _reduce_method( # type: ignore[misc] array_agg_func = getattr(duck_array_ops, name) bottleneck_move_func = getattr(bottleneck, "move_" + name, None) + if module_available("numbagg"): + import numbagg + + numbagg_move_func = getattr(numbagg, "move_" + name, None) + else: + numbagg_move_func = None def method(self, keep_attrs=None, **kwargs): keep_attrs = self._get_keep_attrs(keep_attrs) return self._numpy_or_bottleneck_reduce( - array_agg_func, - bottleneck_move_func, - rolling_agg_func, + array_agg_func=array_agg_func, + bottleneck_move_func=bottleneck_move_func, + numbagg_move_func=numbagg_move_func, + rolling_agg_func=rolling_agg_func, keep_attrs=keep_attrs, fillna=fillna, **kwargs, @@ -510,9 +517,47 @@ def _counts(self, keep_attrs: bool | None) -> DataArray: ) return counts - def _bottleneck_reduce(self, func, keep_attrs, **kwargs): - from xarray.core.dataarray import DataArray + def _numbagg_reduce(self, func, keep_attrs, **kwargs): + # Some of this is copied from `_bottleneck_reduce`, we could reduce this as part + # of a wider refactor. + + axis = self.obj.get_axis_num(self.dim[0]) + + padded = self.obj.variable + if self.center[0]: + if is_duck_dask_array(padded.data): + # workaround to make the padded chunk size larger than + # self.window - 1 + shift = -(self.window[0] + 1) // 2 + offset = (self.window[0] - 1) // 2 + valid = (slice(None),) * axis + ( + slice(offset, offset + self.obj.shape[axis]), + ) + else: + shift = (-self.window[0] // 2) + 1 + valid = (slice(None),) * axis + (slice(-shift, None),) + padded = padded.pad({self.dim[0]: (0, -shift)}, mode="constant") + + if is_duck_dask_array(padded.data): + raise AssertionError("should not be reachable") + else: + values = func( + padded.data, + window=self.window[0], + min_count=self.min_periods, + axis=axis, + ) + + if self.center[0]: + values = values[valid] + attrs = self.obj.attrs if keep_attrs else {} + + return self.obj.__class__( + values, self.obj.coords, attrs=attrs, name=self.obj.name + ) + + def _bottleneck_reduce(self, func, keep_attrs, **kwargs): # bottleneck doesn't allow min_count to be 0, although it should # work the same as if min_count = 1 # Note bottleneck only works with 1d-rolling. @@ -550,12 +595,15 @@ def _bottleneck_reduce(self, func, keep_attrs, **kwargs): attrs = self.obj.attrs if keep_attrs else {} - return DataArray(values, self.obj.coords, attrs=attrs, name=self.obj.name) + return self.obj.__class__( + values, self.obj.coords, attrs=attrs, name=self.obj.name + ) def _numpy_or_bottleneck_reduce( self, array_agg_func, bottleneck_move_func, + numbagg_move_func, rolling_agg_func, keep_attrs, fillna, @@ -571,6 +619,26 @@ def _numpy_or_bottleneck_reduce( ) del kwargs["dim"] + if ( + OPTIONS["use_numbagg"] + and numbagg_move_func is not None + # TODO: can we allow this with numbagg? + and not is_duck_dask_array(self.obj.data) + and self.obj.data.dtype.kind not in "Ob" + ): + import numbagg + + # Numbagg has a default ddof of 1. I (@max-sixty) think we should make + # this the default in xarray too, but until we do, don't use numbagg for + # std and var unless ddof is set to 1. + if ( + numbagg_move_func not in [numbagg.move_std, numbagg.move_var] + or kwargs.get("ddof") == 1 + ): + return self._numbagg_reduce( + numbagg_move_func, keep_attrs=keep_attrs, **kwargs + ) + if ( OPTIONS["use_bottleneck"] and bottleneck_move_func is not None diff --git a/xarray/tests/test_rolling.py b/xarray/tests/test_rolling.py index cb7b723a208..bdaaf269a77 100644 --- a/xarray/tests/test_rolling.py +++ b/xarray/tests/test_rolling.py @@ -98,7 +98,9 @@ def test_rolling_wrapped_bottleneck(self, da, name, center, min_periods) -> None expected = getattr(bn, func_name)( da.values, window=7, axis=1, min_count=min_periods ) - assert_array_equal(actual.values, expected) + + # Using assert_allclose because we get tiny (1e-17) differences in numbagg. + np.testing.assert_allclose(actual.values, expected) with pytest.warns(DeprecationWarning, match="Reductions are applied"): getattr(rolling_obj, name)(dim="time") @@ -280,28 +282,33 @@ def test_rolling_count_correct(self) -> None: @pytest.mark.parametrize("min_periods", (None, 1)) @pytest.mark.parametrize("name", ("sum", "mean", "max")) def test_ndrolling_reduce(self, da, center, min_periods, name) -> None: - rolling_obj = da.rolling(time=3, x=2, center=center, min_periods=min_periods) - - actual = getattr(rolling_obj, name)() - expected = getattr( - getattr( - da.rolling(time=3, center=center, min_periods=min_periods), name - )().rolling(x=2, center=center, min_periods=min_periods), - name, - )() - - assert_allclose(actual, expected) - assert actual.dims == expected.dims - - if name in ["mean"]: - # test our reimplementation of nanmean using np.nanmean - expected = getattr(rolling_obj.construct({"time": "tw", "x": "xw"}), name)( - ["tw", "xw"] + # FIXME + # with xr.set_options(use_numbagg=False): + if True: + rolling_obj = da.rolling( + time=3, x=2, center=center, min_periods=min_periods ) - count = rolling_obj.count() - if min_periods is None: - min_periods = 1 - assert_allclose(actual, expected.where(count >= min_periods)) + + actual = getattr(rolling_obj, name)() + expected = getattr( + getattr( + da.rolling(time=3, center=center, min_periods=min_periods), name + )().rolling(x=2, center=center, min_periods=min_periods), + name, + )() + + assert_allclose(actual, expected) + assert actual.dims == expected.dims + + if name in ["mean"]: + # test our reimplementation of nanmean using np.nanmean + expected = getattr( + rolling_obj.construct({"time": "tw", "x": "xw"}), name + )(["tw", "xw"]) + count = rolling_obj.count() + if min_periods is None: + min_periods = 1 + assert_allclose(actual, expected.where(count >= min_periods)) @pytest.mark.parametrize("center", (True, False, (True, False))) @pytest.mark.parametrize("fill_value", (np.nan, 0.0)) From d8257b4d547fb520f1add8d13c075730beb2cf13 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Thu, 30 Nov 2023 15:25:59 -0800 Subject: [PATCH 02/12] --- xarray/core/rolling.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 5151636f040..e2da28c7307 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -8,8 +8,9 @@ from typing import TYPE_CHECKING, Any, Callable, Generic, TypeVar import numpy as np +from packaging.version import Version -from xarray.core import dtypes, duck_array_ops, utils +from xarray.core import dtypes, duck_array_ops, pycompat, utils from xarray.core.arithmetic import CoarsenArithmetic from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.pycompat import is_duck_dask_array @@ -621,6 +622,8 @@ def _numpy_or_bottleneck_reduce( if ( OPTIONS["use_numbagg"] + and module_available("numbagg") + and pycompat.mod_version("numbagg") >= Version("0.6.3") and numbagg_move_func is not None # TODO: can we allow this with numbagg? and not is_duck_dask_array(self.obj.data) From 112d6e8e937a95cdf582b47e6031b0a79d981fca Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 2 Dec 2023 11:46:36 -0800 Subject: [PATCH 03/12] --- xarray/core/rolling.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index e2da28c7307..099b799d862 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -146,7 +146,7 @@ def _reduce_method( # type: ignore[misc] name: str, fillna: Any, rolling_agg_func: Callable | None = None ) -> Callable[..., T_Xarray]: """Constructs reduction methods built on a numpy reduction function (e.g. sum), - a bottleneck reduction function (e.g. move_sum), or a Rolling reduction (_mean). + a numbagg reduction function (e.g. move_sum), a bottleneck reduction function (e.g. move_sum), or a Rolling reduction (_mean). """ if rolling_agg_func: array_agg_func = None @@ -164,7 +164,7 @@ def _reduce_method( # type: ignore[misc] def method(self, keep_attrs=None, **kwargs): keep_attrs = self._get_keep_attrs(keep_attrs) - return self._numpy_or_bottleneck_reduce( + return self._array_reduce( array_agg_func=array_agg_func, bottleneck_move_func=bottleneck_move_func, numbagg_move_func=numbagg_move_func, @@ -600,7 +600,7 @@ def _bottleneck_reduce(self, func, keep_attrs, **kwargs): values, self.obj.coords, attrs=attrs, name=self.obj.name ) - def _numpy_or_bottleneck_reduce( + def _array_reduce( self, array_agg_func, bottleneck_move_func, @@ -776,7 +776,7 @@ def _counts(self, keep_attrs: bool | None) -> Dataset: DataArrayRolling._counts, keep_attrs=keep_attrs ) - def _numpy_or_bottleneck_reduce( + def _array_reduce( self, array_agg_func, bottleneck_move_func, @@ -786,7 +786,7 @@ def _numpy_or_bottleneck_reduce( ): return self._dataset_implementation( functools.partial( - DataArrayRolling._numpy_or_bottleneck_reduce, + DataArrayRolling._array_reduce, array_agg_func=array_agg_func, bottleneck_move_func=bottleneck_move_func, rolling_agg_func=rolling_agg_func, From c9b79bf55f7dbaffb03977a31c1a1a9a23567f97 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 2 Dec 2023 12:13:44 -0800 Subject: [PATCH 04/12] --- xarray/core/rolling.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 099b799d862..027983b494b 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -146,7 +146,13 @@ def _reduce_method( # type: ignore[misc] name: str, fillna: Any, rolling_agg_func: Callable | None = None ) -> Callable[..., T_Xarray]: """Constructs reduction methods built on a numpy reduction function (e.g. sum), - a numbagg reduction function (e.g. move_sum), a bottleneck reduction function (e.g. move_sum), or a Rolling reduction (_mean). + a numbagg reduction function (e.g. move_sum), a bottleneck reduction function + (e.g. move_sum), or a Rolling reduction (_mean). + + The logic here for which function to run is quite diffuse, across this method & + _array_reduce. Arguably we could refactor this. But one constraint is that we + need context of xarray options, of the functions each library offers, of + the array (e.g. dtype). """ if rolling_agg_func: array_agg_func = None @@ -539,7 +545,7 @@ def _numbagg_reduce(self, func, keep_attrs, **kwargs): valid = (slice(None),) * axis + (slice(-shift, None),) padded = padded.pad({self.dim[0]: (0, -shift)}, mode="constant") - if is_duck_dask_array(padded.data): + if is_duck_dask_array(padded.data) and False: raise AssertionError("should not be reachable") else: values = func( @@ -625,9 +631,16 @@ def _array_reduce( and module_available("numbagg") and pycompat.mod_version("numbagg") >= Version("0.6.3") and numbagg_move_func is not None - # TODO: can we allow this with numbagg? + # TODO: we could at least allow this for the equivalent of `apply_ufunc`'s + # "parallelized". `rolling_exp` does this, as an example (but it otherwise + # much simpler) and not is_duck_dask_array(self.obj.data) + # Numbagg doesn't handle object arrays and generally has dtype consistency, + # so doesn't deal well with bool arrays. and self.obj.data.dtype.kind not in "Ob" + # TODO: we could also allow this, probably as part of a refactoring of this + # module, so we can use the machinery in `self.reduce`. + and self.ndim == 1 ): import numbagg From f8218707ea4e3b11e949960e9594c5d0bb3e2b64 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 2 Dec 2023 12:14:23 -0800 Subject: [PATCH 05/12] --- xarray/core/rolling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index 027983b494b..b3a705f17ee 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -667,8 +667,10 @@ def _array_reduce( return self._bottleneck_reduce( bottleneck_move_func, keep_attrs=keep_attrs, **kwargs ) + if rolling_agg_func: return rolling_agg_func(self, keep_attrs=self._get_keep_attrs(keep_attrs)) + if fillna is not None: if fillna is dtypes.INF: fillna = dtypes.get_pos_infinity(self.obj.dtype, max_for_int=True) From 54657ba064f5a2b2c81c80edf65bec765ce2fe01 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 2 Dec 2023 13:17:34 -0800 Subject: [PATCH 06/12] wip --- xarray/core/dataarray.py | 2 +- xarray/core/rolling.py | 2 +- xarray/tests/test_rolling.py | 50 +++++++++++++++++++++++------------- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index bac4ad36adb..da362756e80 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -6322,7 +6322,7 @@ def curvefit( ... param="time_constant" ... ) # doctest: +NUMBER - array([1.0569203, 1.7354963, 2.9421577]) + array([1.05692035, 1.73549638, 2.9421577 ]) Coordinates: * x (x) int64 0 1 2 param None: @pytest.mark.parametrize("center", (True, False, None)) @pytest.mark.parametrize("min_periods", (1, None)) @pytest.mark.parametrize("backend", ["numpy"], indirect=True) - def test_rolling_wrapped_bottleneck(self, da, name, center, min_periods) -> None: + @pytest.mark.parametrize("compute_backend", ["bottleneck", "numbagg"]) + def test_rolling_wrapped_bottleneck( + self, da, name, center, min_periods, compute_backend + ) -> None: bn = pytest.importorskip("bottleneck", minversion="1.1") + if compute_backend == "bottleneck": + options = dict(use_bottleneck=True, use_numbagg=False) + elif compute_backend == "numbagg": + options = dict(use_bottleneck=False, use_numbagg=True) + else: + raise ValueError - # Test all bottleneck functions - rolling_obj = da.rolling(time=7, min_periods=min_periods) + with xr.set_options(**options): + # Test all bottleneck functions + rolling_obj = da.rolling(time=7, min_periods=min_periods) - func_name = f"move_{name}" - actual = getattr(rolling_obj, name)() - expected = getattr(bn, func_name)( - da.values, window=7, axis=1, min_count=min_periods - ) + func_name = f"move_{name}" + actual = getattr(rolling_obj, name)() + expected = getattr(bn, func_name)( + da.values, window=7, axis=1, min_count=min_periods + ) - # Using assert_allclose because we get tiny (1e-17) differences in numbagg. - np.testing.assert_allclose(actual.values, expected) + # Using assert_allclose because we get tiny (1e-17) differences in numbagg. + np.testing.assert_allclose(actual.values, expected) - with pytest.warns(DeprecationWarning, match="Reductions are applied"): - getattr(rolling_obj, name)(dim="time") + with pytest.warns(DeprecationWarning, match="Reductions are applied"): + getattr(rolling_obj, name)(dim="time") - # Test center - rolling_obj = da.rolling(time=7, center=center) - actual = getattr(rolling_obj, name)()["time"] - assert_equal(actual, da["time"]) + # Test center + rolling_obj = da.rolling(time=7, center=center) + actual = getattr(rolling_obj, name)()["time"] + np.testing.assert_allclose(actual.values, expected) + # Using assert_allclose because we get tiny (1e-17) differences in numbagg. + assert_allclose(actual, da["time"]) @requires_dask @pytest.mark.parametrize("name", ("mean", "count")) @@ -584,12 +598,12 @@ def test_rolling_wrapped_bottleneck( ) else: raise ValueError - assert_array_equal(actual[key].values, expected) + np.testing.assert_allclose(actual[key].values, expected) # Test center rolling_obj = ds.rolling(time=7, center=center) actual = getattr(rolling_obj, name)()["time"] - assert_equal(actual, ds["time"]) + assert_allclose(actual, ds["time"]) @pytest.mark.parametrize("center", (True, False)) @pytest.mark.parametrize("min_periods", (None, 1, 2, 3)) From 3ada3c6729ed1d0e174b0dd2d5616f0bf9bf3a72 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 2 Dec 2023 22:32:48 +0000 Subject: [PATCH 07/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_rolling.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/xarray/tests/test_rolling.py b/xarray/tests/test_rolling.py index 8bae1b8ee60..d0b4709419d 100644 --- a/xarray/tests/test_rolling.py +++ b/xarray/tests/test_rolling.py @@ -1,18 +1,15 @@ from __future__ import annotations -from calendar import c from typing import Any import numpy as np import pandas as pd import pytest -from cycler import V import xarray as xr from xarray import DataArray, Dataset, set_options from xarray.tests import ( assert_allclose, - assert_array_equal, assert_equal, assert_identical, has_dask, From d1b6664e4743f713657ce2171d346ce5d81b2ec0 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 2 Dec 2023 14:39:17 -0800 Subject: [PATCH 08/12] --- xarray/tests/test_rolling.py | 57 +++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/xarray/tests/test_rolling.py b/xarray/tests/test_rolling.py index d0b4709419d..62e3623bb95 100644 --- a/xarray/tests/test_rolling.py +++ b/xarray/tests/test_rolling.py @@ -23,6 +23,19 @@ ] +@pytest.fixture(params=["numbagg", "bottleneck"]) +def compute_backend(request): + if request.param == "bottleneck": + options = dict(use_bottleneck=True, use_numbagg=False) + elif request.param == "numbagg": + options = dict(use_bottleneck=False, use_numbagg=True) + else: + raise ValueError + + with xr.set_options(**options): + yield request.param + + class TestDataArrayRolling: @pytest.mark.parametrize("da", (1, 2), indirect=True) @pytest.mark.parametrize("center", [True, False]) @@ -86,40 +99,30 @@ def test_rolling_properties(self, da) -> None: @pytest.mark.parametrize("center", (True, False, None)) @pytest.mark.parametrize("min_periods", (1, None)) @pytest.mark.parametrize("backend", ["numpy"], indirect=True) - @pytest.mark.parametrize("compute_backend", ["bottleneck", "numbagg"]) def test_rolling_wrapped_bottleneck( self, da, name, center, min_periods, compute_backend ) -> None: bn = pytest.importorskip("bottleneck", minversion="1.1") - if compute_backend == "bottleneck": - options = dict(use_bottleneck=True, use_numbagg=False) - elif compute_backend == "numbagg": - options = dict(use_bottleneck=False, use_numbagg=True) - else: - raise ValueError - - with xr.set_options(**options): - # Test all bottleneck functions - rolling_obj = da.rolling(time=7, min_periods=min_periods) + # Test all bottleneck functions + rolling_obj = da.rolling(time=7, min_periods=min_periods) - func_name = f"move_{name}" - actual = getattr(rolling_obj, name)() - expected = getattr(bn, func_name)( - da.values, window=7, axis=1, min_count=min_periods - ) + func_name = f"move_{name}" + actual = getattr(rolling_obj, name)() + expected = getattr(bn, func_name)( + da.values, window=7, axis=1, min_count=min_periods + ) - # Using assert_allclose because we get tiny (1e-17) differences in numbagg. - np.testing.assert_allclose(actual.values, expected) + # Using assert_allclose because we get tiny (1e-17) differences in numbagg. + np.testing.assert_allclose(actual.values, expected) - with pytest.warns(DeprecationWarning, match="Reductions are applied"): - getattr(rolling_obj, name)(dim="time") + with pytest.warns(DeprecationWarning, match="Reductions are applied"): + getattr(rolling_obj, name)(dim="time") - # Test center - rolling_obj = da.rolling(time=7, center=center) - actual = getattr(rolling_obj, name)()["time"] - np.testing.assert_allclose(actual.values, expected) - # Using assert_allclose because we get tiny (1e-17) differences in numbagg. - assert_allclose(actual, da["time"]) + # Test center + rolling_obj = da.rolling(time=7, center=center) + actual = getattr(rolling_obj, name)()["time"] + # Using assert_allclose because we get tiny (1e-17) differences in numbagg. + assert_allclose(actual, da["time"]) @requires_dask @pytest.mark.parametrize("name", ("mean", "count")) @@ -578,7 +581,7 @@ def test_rolling_properties(self, ds) -> None: @pytest.mark.parametrize("key", ("z1", "z2")) @pytest.mark.parametrize("backend", ["numpy"], indirect=True) def test_rolling_wrapped_bottleneck( - self, ds, name, center, min_periods, key + self, ds, name, center, min_periods, key, compute_backend ) -> None: bn = pytest.importorskip("bottleneck", minversion="1.1") From 06fbcbfe5772f55d3e12ce859eaa0b58844771c7 Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 2 Dec 2023 14:45:27 -0800 Subject: [PATCH 09/12] whatsnew --- doc/whats-new.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index bfe0a1b9be5..71a18351c75 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -23,6 +23,13 @@ v2023.11.1 (unreleased) New Features ~~~~~~~~~~~~ +- :py:meth:`rolling` uses numbagg `_ for + most of its computations by default, which is up to 5x faster than bottleneck + where parallelization is possible. Where parallelization isn't possible — for + example a 1D array — it's about the same speed as bottleneck, and 2-5x faster + than pandas' default functions. (:pull:`8493`). numbagg is an optional + dependency, so requires installing separately. + By `Maximilian Roos `_. - Use a concise format when plotting datetime arrays. (:pull:`8449`). By `Jimmy Westling `_. - Avoid overwriting unchanged existing coordinate variables when appending by setting ``mode='a-'``. @@ -83,7 +90,7 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ -- :py:meth:`DataArray.bfill` & :py:meth:`DataArray.ffill` now use numbagg by +- :py:meth:`DataArray.bfill` & :py:meth:`DataArray.ffill` now use numbagg `_ by default, which is up to 5x faster where parallelization is possible. (:pull:`8339`) By `Maximilian Roos `_. - Update mypy version to 1.7 (:issue:`8448`, :pull:`8501`). From df5702b27c4f53fbbc0debb7017478508b32901f Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 2 Dec 2023 14:45:47 -0800 Subject: [PATCH 10/12] --- xarray/core/dataarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4533ea7db39..1d7e82d3044 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -6328,7 +6328,7 @@ def curvefit( ... param="time_constant" ... ) # doctest: +NUMBER - array([1.05692035, 1.73549638, 2.9421577 ]) + array([1.0569203, 1.7354963, 2.9421577]) Coordinates: * x (x) int64 0 1 2 param Date: Sat, 2 Dec 2023 14:48:45 -0800 Subject: [PATCH 11/12] --- doc/whats-new.rst | 2 +- xarray/core/rolling.py | 4 +-- xarray/tests/test_rolling.py | 65 +++++++++++++++++++----------------- 3 files changed, 37 insertions(+), 34 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 71a18351c75..1ea0246538f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,7 +24,7 @@ New Features ~~~~~~~~~~~~ - :py:meth:`rolling` uses numbagg `_ for - most of its computations by default, which is up to 5x faster than bottleneck + most of its computations by default. Numbagg is up to 5x faster than bottleneck where parallelization is possible. Where parallelization isn't possible — for example a 1D array — it's about the same speed as bottleneck, and 2-5x faster than pandas' default functions. (:pull:`8493`). numbagg is an optional diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index e90ca6992dc..819c31642d0 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -632,11 +632,11 @@ def _array_reduce( and pycompat.mod_version("numbagg") >= Version("0.6.3") and numbagg_move_func is not None # TODO: we could at least allow this for the equivalent of `apply_ufunc`'s - # "parallelized". `rolling_exp` does this, as an example (but it otherwise + # "parallelized". `rolling_exp` does this, as an example (but rolling_exp is # much simpler) and not is_duck_dask_array(self.obj.data) # Numbagg doesn't handle object arrays and generally has dtype consistency, - # so doesn't deal well with bool arrays. + # so doesn't deal well with bool arrays which are expected to change type. and self.obj.data.dtype.kind not in "ObMm" # TODO: we could also allow this, probably as part of a refactoring of this # module, so we can use the machinery in `self.reduce`. diff --git a/xarray/tests/test_rolling.py b/xarray/tests/test_rolling.py index 62e3623bb95..0ea373e3ab0 100644 --- a/xarray/tests/test_rolling.py +++ b/xarray/tests/test_rolling.py @@ -169,7 +169,9 @@ def test_rolling_wrapped_dask_nochunk(self, center) -> None: @pytest.mark.parametrize("center", (True, False)) @pytest.mark.parametrize("min_periods", (None, 1, 2, 3)) @pytest.mark.parametrize("window", (1, 2, 3, 4)) - def test_rolling_pandas_compat(self, center, window, min_periods) -> None: + def test_rolling_pandas_compat( + self, center, window, min_periods, compute_backend + ) -> None: s = pd.Series(np.arange(10)) da = DataArray.from_series(s) @@ -219,7 +221,9 @@ def test_rolling_construct(self, center: bool, window: int) -> None: @pytest.mark.parametrize("min_periods", (None, 1, 2, 3)) @pytest.mark.parametrize("window", (1, 2, 3, 4)) @pytest.mark.parametrize("name", ("sum", "mean", "std", "max")) - def test_rolling_reduce(self, da, center, min_periods, window, name) -> None: + def test_rolling_reduce( + self, da, center, min_periods, window, name, compute_backend + ) -> None: if min_periods is not None and window < min_periods: min_periods = window @@ -239,7 +243,9 @@ def test_rolling_reduce(self, da, center, min_periods, window, name) -> None: @pytest.mark.parametrize("min_periods", (None, 1, 2, 3)) @pytest.mark.parametrize("window", (1, 2, 3, 4)) @pytest.mark.parametrize("name", ("sum", "max")) - def test_rolling_reduce_nonnumeric(self, center, min_periods, window, name) -> None: + def test_rolling_reduce_nonnumeric( + self, center, min_periods, window, name, compute_backend + ) -> None: da = DataArray( [0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims="time" ).isnull() @@ -255,7 +261,7 @@ def test_rolling_reduce_nonnumeric(self, center, min_periods, window, name) -> N assert_allclose(actual, expected) assert actual.dims == expected.dims - def test_rolling_count_correct(self) -> None: + def test_rolling_count_correct(self, compute_backend) -> None: da = DataArray([0, np.nan, 1, 2, np.nan, 3, 4, 5, np.nan, 6, 7], dims="time") kwargs: list[dict[str, Any]] = [ @@ -295,34 +301,31 @@ def test_rolling_count_correct(self) -> None: @pytest.mark.parametrize("center", (True, False)) @pytest.mark.parametrize("min_periods", (None, 1)) @pytest.mark.parametrize("name", ("sum", "mean", "max")) - def test_ndrolling_reduce(self, da, center, min_periods, name) -> None: - # FIXME - # with xr.set_options(use_numbagg=False): - if True: - rolling_obj = da.rolling( - time=3, x=2, center=center, min_periods=min_periods - ) + def test_ndrolling_reduce( + self, da, center, min_periods, name, compute_backend + ) -> None: + rolling_obj = da.rolling(time=3, x=2, center=center, min_periods=min_periods) + + actual = getattr(rolling_obj, name)() + expected = getattr( + getattr( + da.rolling(time=3, center=center, min_periods=min_periods), name + )().rolling(x=2, center=center, min_periods=min_periods), + name, + )() - actual = getattr(rolling_obj, name)() - expected = getattr( - getattr( - da.rolling(time=3, center=center, min_periods=min_periods), name - )().rolling(x=2, center=center, min_periods=min_periods), - name, - )() - - assert_allclose(actual, expected) - assert actual.dims == expected.dims - - if name in ["mean"]: - # test our reimplementation of nanmean using np.nanmean - expected = getattr( - rolling_obj.construct({"time": "tw", "x": "xw"}), name - )(["tw", "xw"]) - count = rolling_obj.count() - if min_periods is None: - min_periods = 1 - assert_allclose(actual, expected.where(count >= min_periods)) + assert_allclose(actual, expected) + assert actual.dims == expected.dims + + if name in ["mean"]: + # test our reimplementation of nanmean using np.nanmean + expected = getattr(rolling_obj.construct({"time": "tw", "x": "xw"}), name)( + ["tw", "xw"] + ) + count = rolling_obj.count() + if min_periods is None: + min_periods = 1 + assert_allclose(actual, expected.where(count >= min_periods)) @pytest.mark.parametrize("center", (True, False, (True, False))) @pytest.mark.parametrize("fill_value", (np.nan, 0.0)) From 1cdacaa1c29f2ff166df212c8cc332553f75a3bd Mon Sep 17 00:00:00 2001 From: Maximilian Roos Date: Sat, 2 Dec 2023 18:14:00 -0800 Subject: [PATCH 12/12] --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1ea0246538f..e05756401f7 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -84,7 +84,7 @@ Documentation - Improved error message when attempting to get a variable which doesn't exist from a Dataset. (:pull:`8474`) By `Maximilian Roos `_. -- Fix default value of ``combine_attrs `` in :py:func:`xarray.combine_by_coords` (:pull:`8471`) +- Fix default value of ``combine_attrs`` in :py:func:`xarray.combine_by_coords` (:pull:`8471`) By `Gregorio L. Trevisan `_. Internal Changes