From 9d97ddf446e2785be051dcdf6934e4de7a92fd0a Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Thu, 7 Sep 2023 00:41:33 +0200
Subject: [PATCH] lowess: n_steps and use_coords

Add ``n_steps`` as an alternative to ``frac`` (exactly one of the two must be
given) and rename ``use_coords_as_x`` to ``use_coords``, which now defaults to
True. Coordinates that cannot be safely cast to float (e.g. datetime) raise a
TypeError that points to ``use_coords=False``.

---
 mesmer/stats/smoothing.py    | 50 ++++++++++++++++++++++++++---
 tests/unit/test_smoothing.py | 62 ++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 5 deletions(-)

diff --git a/mesmer/stats/smoothing.py b/mesmer/stats/smoothing.py
index 844eaf37..b0643f7f 100644
--- a/mesmer/stats/smoothing.py
+++ b/mesmer/stats/smoothing.py
@@ -4,7 +4,7 @@
 from mesmer.core.utils import _check_dataarray_form
 
 
-def lowess(data, dim, *, frac, use_coords_as_x=False, it=0):
+def lowess(data, dim, *, n_steps=None, frac=None, use_coords=True, it=0):
     """LOWESS (Locally Weighted Scatterplot Smoothing) for xarray objects
 
     Parameters
@@ -13,13 +13,27 @@ def lowess(data, dim, *, frac, use_coords_as_x=False, it=0):
         Data to smooth (y-values).
     dim : str
         Dimension along which to smooth (x-dimension)
+    n_steps : int
+        The number of data points used to estimate each y-value, must be between 0 and
+        the length of dim. If given, it is used to calculate ``frac``. Exactly one of
+        ``n_steps`` and ``frac`` must be given.
     frac : float
-        Between 0 and 1. The fraction of the data used when estimating each y-value.
-    use_coords_as_x : boolean, default: False
+        The fraction of the data used when estimating each y-value. Between 0 and 1.
+        Exactly one of ``n_steps`` and ``frac`` must be given.
+    use_coords : boolean, default: True
         If True uses ``data[dim]`` as x-values else uses ``np.arange(data[dim].size)``
         (useful if ``dim`` are time coordinates).
     it : int, default: 0
         The number of residual-based re-weightings to perform.
+
+    Returns
+    -------
+    out : xr.DataArray | xr.Dataset
+        LOWESS smoothed array
+
+    See Also
+    --------
+    statsmodels.nonparametric.smoothers_lowess.lowess
     """
 
     from statsmodels.nonparametric.smoothers_lowess import lowess
@@ -27,11 +41,37 @@ def lowess(data, dim, *, frac, use_coords_as_x=False, it=0):
     if not isinstance(dim, str):
         raise ValueError("Can only pass a single dimension.")
 
+    if (n_steps is None) == (frac is None):
+        raise ValueError("Exactly one of ``n_steps`` and ``frac`` must be given.")
+
     coords = data[dim]
     _check_dataarray_form(coords, name=dim, ndim=1)
 
-    if use_coords_as_x:
-        x = coords
+    n_coords = coords.size
+    if n_steps is not None:
+
+        if n_steps > n_coords:
+            raise ValueError(
+                f"``n_steps`` ({n_steps}) cannot be larger than the length of '{dim}' ({n_coords})"
+            )
+
+        frac = n_steps / n_coords
+
+    # TODO: could instead convert datetime coords to numeric (see e.g. in flox)
+    if use_coords:
+        try:
+            # test if coords can be cast to float (required by statsmodels' lowess)
+            # use safe casting so we don't convert np datetime to float
+            # while this technically works the x values are then too large such that
+            # a missing year is no longer detected...
+            x = coords.astype(float, casting="safe")
+        except TypeError as e:
+            raise TypeError(
+                f"Cannot convert coords ({dim}) of type `{coords.dtype}` to float. "
+                "Set ``use_coords=False`` to use enumerated coords "
+                "(``np.arange(data[dim].size)``) instead."
+            ) from e
+
     else:
         x = xr.ones_like(coords)
         x.data = np.arange(coords.size)
diff --git a/tests/unit/test_smoothing.py b/tests/unit/test_smoothing.py
index cb9bf50b..f7370e49 100644
--- a/tests/unit/test_smoothing.py
+++ b/tests/unit/test_smoothing.py
@@ -16,6 +16,29 @@ def test_lowess_errors():
     with pytest.raises(ValueError, match="data should be 1-dimensional"):
         mesmer.stats.smoothing.lowess(data.to_dataset(), "data", frac=0.3)
 
+    with pytest.raises(ValueError, match="Exactly one of ``n_steps`` and ``frac``"):
+        mesmer.stats.smoothing.lowess(data.to_dataset(), "lat")
+
+    with pytest.raises(ValueError, match="Exactly one of ``n_steps`` and ``frac``"):
+        mesmer.stats.smoothing.lowess(data.to_dataset(), "lat", frac=0.5, n_steps=10)
+
+    with pytest.raises(ValueError, match=r"``n_steps`` \(40\) cannot be larger"):
+        mesmer.stats.smoothing.lowess(data.to_dataset(), "lat", n_steps=40)
+
+    # numpy datetime
+    time = xr.date_range("2000-01-01", periods=30)
+    data = data.assign_coords(time=time)
+
+    with pytest.raises(TypeError, match="Cannot convert coords"):
+        mesmer.stats.smoothing.lowess(data.to_dataset(), "time", frac=0.5)
+
+    # cftime datetime
+    time = xr.date_range("2000-01-01", periods=30, calendar="noleap")
+    data = data.assign_coords(time=time)
+
+    with pytest.raises(TypeError, match="Cannot convert coords"):
+        mesmer.stats.smoothing.lowess(data.to_dataset(), "time", frac=0.5)
+
 
 @pytest.mark.parametrize("it", [0, 3])
 @pytest.mark.parametrize("frac", [0.3, 0.5])
@@ -33,6 +56,45 @@ def test_lowess(it, frac):
     xr.testing.assert_allclose(result, expected)
 
 
+@pytest.mark.parametrize("n_steps", [0, 10, 15, 30])
+def test_lowess_n_steps(n_steps):
+
+    data = trend_data_1D()
+
+    result = mesmer.stats.smoothing.lowess(data, "time", n_steps=n_steps)
+
+    frac = n_steps / 30
+    expected = mesmer.stats.smoothing.lowess(data, "time", frac=frac)
+
+    xr.testing.assert_allclose(result, expected)
+
+
+def test_lowess_use_coords():
+
+    data = trend_data_1D()
+    time = data.time.values.copy()
+    time[-1] = time[-1] + 10
+    data = data.assign_coords(time=time)
+
+
+    result = mesmer.stats.smoothing.lowess(data, "time", frac=0.1)
+
+    # time is not equally spaced: we do NOT want the same result as for use_coords=False
+    not_expected = mesmer.stats.smoothing.lowess(
+        data, "time", frac=0.1, use_coords=False
+    )
+
+    # ensure it makes a difference
+    assert not result.equals(not_expected)
+
+    expected = lowess(
+        data.values, data.time.values, frac=0.1, it=0, return_sorted=False
+    )
+    expected = xr.DataArray(expected, dims="time", coords={"time": data.time})
+
+    xr.testing.assert_allclose(result, expected)
+
+
 def test_lowess_dataset():
 
     data = trend_data_1D()