Skip to content

Commit

Permalink
lowess: n_steps and use_coords
Browse files Browse the repository at this point in the history
  • Loading branch information
mathause committed Sep 6, 2023
1 parent 2cc49a2 commit 9d97ddf
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 5 deletions.
50 changes: 45 additions & 5 deletions mesmer/stats/smoothing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from mesmer.core.utils import _check_dataarray_form


def lowess(data, dim, *, frac, use_coords_as_x=False, it=0):
def lowess(data, dim, *, n_steps=None, frac=None, use_coords=True, it=0):
"""LOWESS (Locally Weighted Scatterplot Smoothing) for xarray objects
Parameters
Expand All @@ -13,25 +13,65 @@ def lowess(data, dim, *, frac, use_coords_as_x=False, it=0):
Data to smooth (y-values).
dim : str
Dimension along which to smooth (x-dimension)
n_steps : int
The number of data points used to estimate each y-value, must be between 0 and
the length of ``dim``. If given, it is used to calculate ``frac`` as
``n_steps / len(data[dim])``. Exactly one of ``n_steps`` and ``frac`` must be given.
frac : float
Between 0 and 1. The fraction of the data used when estimating each y-value.
use_coords_as_x : boolean, default: False
The fraction of the data used when estimating each y-value. Between 0 and 1.
Exactly one of ``n_steps`` and ``frac`` must be given.
use_coords : boolean, default: True
If True uses ``data[dim]`` as x-values else uses ``np.arange(data[dim].size)``
(useful if ``dim`` are time coordinates).
it : int, default: 0
The number of residual-based re-weightings to perform.
Returns
-------
out : xr.DataArray | xr.Dataset
LOWESS smoothed array
See Also
--------
statsmodels.nonparametric.smoothers_lowess.lowess
"""

from statsmodels.nonparametric.smoothers_lowess import lowess

if not isinstance(dim, str):
raise ValueError("Can only pass a single dimension.")

if (n_steps is None and frac is None) or (n_steps is not None and frac is not None):
raise ValueError("Exactly one of ``n_steps`` and ``frac`` must be given.")

coords = data[dim]
_check_dataarray_form(coords, name=dim, ndim=1)

if use_coords_as_x:
x = coords
n_coords = coords.size
if n_steps is not None:

if n_steps > n_coords:
raise ValueError(
f"``n_steps`` ({n_steps}) cannot be be larger than the length of '{dim}' ({n_coords})"
)

frac = n_steps / n_coords

# TODO: could instead convert datetime coords to numeric (see e.g. in flox)
if use_coords:
try:
# test if coords can be cast to float (required by statsmodels..lowess)
# use safe casting so we don't convert np datetime to float
# while this technically works the x values are then too large such that
# a missing year is no longer detected...
x = coords.astype(float, casting="safe")
except TypeError as e:
raise TypeError(
f"Cannot convert coords ({dim}) of type `{coords.dtype}` to float. "
"Set ``use_coords=False`` to use enumerated coords "
"(``np.arange(data[dim].size)``) instead."
) from e

else:
x = xr.ones_like(coords)
x.data = np.arange(coords.size)
Expand Down
62 changes: 62 additions & 0 deletions tests/unit/test_smoothing.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,29 @@ def test_lowess_errors():
with pytest.raises(ValueError, match="data should be 1-dimensional"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "data", frac=0.3)

with pytest.raises(ValueError, match="Exactly one of ``n_steps`` and ``frac``"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "lat")

with pytest.raises(ValueError, match="Exactly one of ``n_steps`` and ``frac``"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "lat", frac=0.5, n_steps=10)

with pytest.raises(ValueError, match=r"``n_steps`` \(40\) cannot be be larger"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "lat", n_steps=40)

# numpy datetime
time = xr.date_range("2000-01-01", periods=30)
data = data.assign_coords(time=time)

with pytest.raises(TypeError, match="Cannot convert coords"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "time", frac=0.5)

# cftime datetime
time = xr.date_range("2000-01-01", periods=30, calendar="noleap")
data = data.assign_coords(time=time)

with pytest.raises(TypeError, match="Cannot convert coords"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "time", frac=0.5)


@pytest.mark.parametrize("it", [0, 3])
@pytest.mark.parametrize("frac", [0.3, 0.5])
Expand All @@ -33,6 +56,45 @@ def test_lowess(it, frac):
xr.testing.assert_allclose(result, expected)


@pytest.mark.parametrize("n_steps", [0, 10, 15, 30])
def test_lowess_n_steps(n_steps):
    """Check that passing ``n_steps`` gives the same result as the matching ``frac``."""

    data = trend_data_1D()

    # Reference: the same smoothing expressed as a fraction of the data.
    # NOTE(review): the divisor 30 presumably equals the length of "time" in
    # ``trend_data_1D()`` - confirm against the fixture.
    equivalent_frac = n_steps / 30
    expected = mesmer.stats.smoothing.lowess(data, "time", frac=equivalent_frac)

    result = mesmer.stats.smoothing.lowess(data, "time", n_steps=n_steps)

    xr.testing.assert_allclose(result, expected)


def test_lowess_use_coords():
    """Check that the default call smooths against the actual coordinate values."""

    data = trend_data_1D()

    # Move the last time step by 10 so the coordinate is no longer equally spaced.
    time = data.time.values
    time[-1] = time[-1] + 10
    data = data.assign_coords(time=time)

    smoothed = mesmer.stats.smoothing.lowess(data, "time", frac=0.1)

    # With ``use_coords=False`` the x-values are enumerated, so the shifted
    # time step is ignored: this result must NOT match the default call.
    enumerated = mesmer.stats.smoothing.lowess(
        data, "time", frac=0.1, use_coords=False
    )
    assert not smoothed.equals(enumerated)

    # Reference computed directly on the raw arrays.
    # NOTE(review): ``lowess`` is presumably the statsmodels function imported
    # at the top of the test module - confirm.
    reference = lowess(
        data.values, data.time.values, frac=0.1, it=0, return_sorted=False
    )
    reference = xr.DataArray(reference, dims="time", coords={"time": data.time})

    xr.testing.assert_allclose(smoothed, reference)


def test_lowess_dataset():

data = trend_data_1D()
Expand Down

0 comments on commit 9d97ddf

Please sign in to comment.