Fix rolling window operations with dask when bottleneck is installed (#3040)

xref GH2940, GH2942

Previously, these operations could silently return incorrect results (with dask
2.0) or use unbounded amounts of memory (with older versions of dask).
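
For context, a minimal sketch of the kind of call that was affected, using an
illustrative dask-backed DataArray (the dimension name, sizes, and chunking are
invented for the example, not taken from the linked issues; requires dask to be
installed):

```python
import numpy as np
import xarray as xr

# A dask-backed DataArray: .rolling(...).mean() on such an object is the code
# path that could previously hand the chunked array to bottleneck's moving
# window functions, silently producing wrong values or loading everything
# into memory at once.
da = xr.DataArray(np.random.rand(1000), dims="time").chunk({"time": 100})
result = da.rolling(time=7, min_periods=1).mean()
print(result.compute())
```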

This requires a fairly large refactoring, because deciding when to use
bottleneck now needs to be done at runtime rather than at import time. These
methods are now defined as regular methods on the class rather than being
injected afterwards, which should also be a much more standard and
understandable design.
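
A rough sketch of what "deciding at runtime" means here; the class and helper
names below are invented for illustration and are not the actual xarray
internals:

```python
# Illustrative toy only, not the xarray implementation.
import numpy as np

try:
    import bottleneck as bn
    has_bottleneck = True
except ImportError:
    has_bottleneck = False


class RollingSketch:
    """Toy rolling object over a 1-D array with a fixed window size."""

    def __init__(self, data, window):
        self.data = data
        self.window = window

    def _use_bottleneck(self):
        # The decision happens per call, based on the actual array type:
        # bottleneck only for in-memory numpy data, never for dask arrays.
        return has_bottleneck and isinstance(self.data, np.ndarray)

    def mean(self):
        # A real method on the class (discoverable and documentable), instead
        # of a function injected onto the class at import time.
        if self._use_bottleneck():
            return bn.move_mean(self.data, self.window)
        # Generic fallback path, e.g. what chunked/dask data would take.
        out = np.full(len(self.data), np.nan)
        for i in range(self.window - 1, len(self.data)):
            out[i] = np.mean(self.data[i - self.window + 1:i + 1])
        return out


print(RollingSketch(np.arange(10.0), window=3).mean())
```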
shoyer authored Jun 28, 2019
1 parent 3b622b0 commit a78e0f6
Showing 4 changed files with 130 additions and 176 deletions.
3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -97,6 +97,9 @@ Enhancements
Bug fixes
~~~~~~~~~

- Rolling operations on xarray objects containing dask arrays could silently
compute the incorrect result or use large amounts of memory (:issue:`2940`).
By `Stephan Hoyer <https://github.com/shoyer>`_.
- Don't set encoding attributes on bounds variables when writing to netCDF.
(:issue:`2921`)
By `Deepak Cherian <https://github.com/dcherian>`_.
75 changes: 0 additions & 75 deletions xarray/core/ops.py
@@ -38,11 +38,6 @@
NAN_REDUCE_METHODS = ['argmax', 'argmin', 'max', 'min', 'mean', 'prod', 'sum',
'std', 'var', 'median']
NAN_CUM_METHODS = ['cumsum', 'cumprod']
BOTTLENECK_ROLLING_METHODS = {'move_sum': 'sum', 'move_mean': 'mean',
'move_std': 'std', 'move_min': 'min',
'move_max': 'max', 'move_var': 'var',
'move_argmin': 'argmin', 'move_argmax': 'argmax',
'move_median': 'median'}
# TODO: wrap take, dot, sort


@@ -103,20 +98,6 @@
If fewer than min_count non-NA values are present the result will
be NA. New in version 0.10.8: Added with the default being None."""

_ROLLING_REDUCE_DOCSTRING_TEMPLATE = """\
Reduce this {da_or_ds}'s data windows by applying `{name}` along its dimension.
Parameters
----------
**kwargs : dict
Additional keyword arguments passed on to `{name}`.
Returns
-------
reduced : {da_or_ds}
New {da_or_ds} object with `{name}` applied along its rolling dimension.
"""

_COARSEN_REDUCE_DOCSTRING_TEMPLATE = """\
Coarsen this object by applying `{name}` along its dimensions.
@@ -236,13 +217,6 @@ def func(self, *args, **kwargs):
return func


def rolling_count(rolling):

rolling_count = rolling._counts()
enough_periods = rolling_count >= rolling._min_periods
return rolling_count.where(enough_periods)


def inject_reduce_methods(cls):
methods = ([(name, getattr(duck_array_ops, 'array_%s' % name), False)
for name in REDUCE_METHODS] +
@@ -340,55 +314,6 @@ def inject_all_ops_and_reduce_methods(cls, priority=50, array_only=True):
inject_cum_methods(cls)


def inject_bottleneck_rolling_methods(cls):
# standard numpy reduce methods
methods = [(name, getattr(duck_array_ops, name))
for name in NAN_REDUCE_METHODS]
for name, f in methods:
func = cls._reduce_method(f)
func.__name__ = name
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
name=func.__name__, da_or_ds='DataArray')
setattr(cls, name, func)

# bottleneck doesn't offer rolling_count, so we construct it ourselves
func = rolling_count
func.__name__ = 'count'
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
name=func.__name__, da_or_ds='DataArray')
setattr(cls, 'count', func)

# bottleneck rolling methods
if not has_bottleneck:
return

for bn_name, method_name in BOTTLENECK_ROLLING_METHODS.items():
f = getattr(bn, bn_name)
func = cls._bottleneck_reduce(f)
func.__name__ = method_name
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
name=func.__name__, da_or_ds='DataArray')
setattr(cls, method_name, func)


def inject_datasetrolling_methods(cls):
# standard numpy reduce methods
methods = [(name, getattr(duck_array_ops, name))
for name in NAN_REDUCE_METHODS]
for name, f in methods:
func = cls._reduce_method(f)
func.__name__ = name
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
name=func.__name__, da_or_ds='Dataset')
setattr(cls, name, func)
# bottleneck doesn't offer rolling_count, so we construct it ourselves
func = rolling_count
func.__name__ = 'count'
func.__doc__ = _ROLLING_REDUCE_DOCSTRING_TEMPLATE.format(
name=func.__name__, da_or_ds='Dataset')
setattr(cls, 'count', func)


def inject_coarsen_methods(cls):
# standard numpy reduce methods
methods = [(name, getattr(duck_array_ops, name))
