Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lowess: n_steps and use_coords #285

Merged
merged 5 commits into from
Sep 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ New Features

- Other refactorings:
- Extract the LOWESS smoothing for xarray objects: :py:func:`mesmer.stats.smoothing.lowess`.
(`#193 <https://github.com/MESMER-group/mesmer/pull/193>`_ and `#283
<https://github.com/MESMER-group/mesmer/pull/283>`_).
(`#193 <https://github.com/MESMER-group/mesmer/pull/193>`_,
`#283 <https://github.com/MESMER-group/mesmer/pull/283>`_, and
`#285 <https://github.com/MESMER-group/mesmer/pull/285>`_).
By `Mathias Hauser <https://github.com/mathause>`_.

- Added helper functions to process xarray-based model data:
Expand Down
50 changes: 45 additions & 5 deletions mesmer/stats/smoothing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from mesmer.core.utils import _check_dataarray_form


def lowess(data, dim, *, frac, use_coords_as_x=False, it=0):
def lowess(data, dim, *, n_steps=None, frac=None, use_coords=True, it=0):
"""LOWESS (Locally Weighted Scatterplot Smoothing) for xarray objects

Parameters
Expand All @@ -13,25 +13,65 @@ def lowess(data, dim, *, frac, use_coords_as_x=False, it=0):
Data to smooth (y-values).
dim : str
Dimension along which to smooth (x-dimension)
n_steps : int
The number of data points used to estimate each y-value. Must be between 0 and
the length of ``dim``. If given, it is used to calculate ``frac``. Exactly one of
``n_steps`` and ``frac`` must be given.
frac : float
Between 0 and 1. The fraction of the data used when estimating each y-value.
use_coords_as_x : boolean, default: False
The fraction of the data used when estimating each y-value. Between 0 and 1.
Exactly one of ``n_steps`` and ``frac`` must be given.
use_coords : boolean, default: True
If True uses ``data[dim]`` as x-values else uses ``np.arange(data[dim].size)``
(useful if ``dim`` are time coordinates).
it : int, default: 0
The number of residual-based re-weightings to perform.

Returns
-------
out : xr.DataArray | xr.Dataset
LOWESS smoothed array

See Also
--------
statsmodels.nonparametric.smoothers_lowess.lowess
"""

from statsmodels.nonparametric.smoothers_lowess import lowess

if not isinstance(dim, str):
raise ValueError("Can only pass a single dimension.")

if (n_steps is None and frac is None) or (n_steps is not None and frac is not None):
raise ValueError("Exactly one of ``n_steps`` and ``frac`` must be given.")

coords = data[dim]
_check_dataarray_form(coords, name=dim, ndim=1)

if use_coords_as_x:
x = coords
n_coords = coords.size
if n_steps is not None:

if n_steps > n_coords:
raise ValueError(
f"``n_steps`` ({n_steps}) cannot be be larger than the length of '{dim}' ({n_coords})"
)

frac = n_steps / n_coords

# TODO: could instead convert datetime coords to numeric (see e.g. in flox)
if use_coords:
try:
# test if coords can be cast to float (required by statsmodels..lowess)
# use safe casting so we don't convert np datetime to float
# while this technically works the x values are then too large such that
# a missing year is no longer detected...
x = coords.astype(float, casting="safe")
except TypeError as e:
raise TypeError(
f"Cannot convert coords ({dim}) of type `{coords.dtype}` to float. "
"Set ``use_coords=False`` to use enumerated coords "
"(``np.arange(data[dim].size)``) instead."
) from e

else:
x = xr.ones_like(coords)
x.data = np.arange(coords.size)
Expand Down
66 changes: 66 additions & 0 deletions tests/unit/test_smoothing.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pandas as pd
import pytest
import xarray as xr
from packaging.version import Version
from statsmodels.nonparametric.smoothers_lowess import lowess

import mesmer.stats.smoothing
Expand All @@ -16,6 +18,32 @@ def test_lowess_errors():
with pytest.raises(ValueError, match="data should be 1-dimensional"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "data", frac=0.3)

with pytest.raises(ValueError, match="Exactly one of ``n_steps`` and ``frac``"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "lat")

with pytest.raises(ValueError, match="Exactly one of ``n_steps`` and ``frac``"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "lat", frac=0.5, n_steps=10)

with pytest.raises(ValueError, match=r"``n_steps`` \(40\) cannot be be larger"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "lat", n_steps=40)

# numpy datetime
time = pd.date_range("2000-01-01", periods=30)
data = data.assign_coords(time=time)

with pytest.raises(TypeError, match="Cannot convert coords"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "time", frac=0.5)

# TODO: remove check once we drop python 3.7
if Version(xr.__version__) >= Version("21.0"):

# cftime datetime
time = xr.date_range("2000-01-01", periods=30, calendar="noleap")
data = data.assign_coords(time=time)

with pytest.raises(TypeError, match="Cannot convert coords"):
mesmer.stats.smoothing.lowess(data.to_dataset(), "time", frac=0.5)


@pytest.mark.parametrize("it", [0, 3])
@pytest.mark.parametrize("frac", [0.3, 0.5])
Expand All @@ -33,6 +61,44 @@ def test_lowess(it, frac):
xr.testing.assert_allclose(result, expected)


@pytest.mark.parametrize("n_steps", [0, 10, 15, 30])
def test_lowess_n_steps(n_steps):
    """Smoothing via ``n_steps`` must match smoothing via the equivalent ``frac``."""

    series = trend_data_1D()

    # reference: same smoothing requested through ``frac``
    # NOTE(review): 30 is presumably the length of the ``time`` dimension of
    # ``trend_data_1D()`` - confirm against the fixture
    via_frac = mesmer.stats.smoothing.lowess(series, "time", frac=n_steps / 30)

    via_n_steps = mesmer.stats.smoothing.lowess(series, "time", n_steps=n_steps)

    xr.testing.assert_allclose(via_n_steps, via_frac)


def test_lowess_use_coords():
    """With unevenly spaced coords, ``lowess`` must use the coordinate values as x."""

    data = trend_data_1D()

    # edit a copy: ``.values`` may hand back a view, and writing into it could
    # silently modify the coordinate buffer of the original test data
    time = data.time.values.copy()
    time[-1] += 10  # shift the last point so the spacing is no longer uniform
    data = data.assign_coords(time=time)

    result = mesmer.stats.smoothing.lowess(data, "time", frac=0.1)

    # time is not equally spaced: we do NOT want the same result as for use_coords=False
    not_expected = mesmer.stats.smoothing.lowess(
        data, "time", frac=0.1, use_coords=False
    )

    # ensure it makes a difference
    assert not result.equals(not_expected)

    # reference: statsmodels lowess called directly with the coordinate values as x
    expected = lowess(
        data.values, data.time.values, frac=0.1, it=0, return_sorted=False
    )
    expected = xr.DataArray(expected, dims="time", coords={"time": data.time})

    xr.testing.assert_allclose(result, expected)


def test_lowess_dataset():

data = trend_data_1D()
Expand Down