Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rank Methods #1733

Merged
merged 8 commits into from
Dec 18, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/api-hidden.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
Dataset.T
Dataset.cumsum
Dataset.cumprod
Dataset.rank

DataArray.ndim
DataArray.shape
Expand Down Expand Up @@ -91,6 +92,7 @@
DataArray.T
DataArray.cumsum
DataArray.cumprod
DataArray.rank

ufuncs.angle
ufuncs.arccos
Expand Down
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ Computation
:py:attr:`~Dataset.real`
:py:attr:`~Dataset.cumsum`
:py:attr:`~Dataset.cumprod`
:py:attr:`~Dataset.rank`

**Grouped operations**:
:py:attr:`~core.groupby.DatasetGroupBy.assign`
Expand Down Expand Up @@ -312,6 +313,7 @@ Computation
:py:attr:`~DataArray.T`
:py:attr:`~DataArray.cumsum`
:py:attr:`~DataArray.cumprod`
:py:attr:`~DataArray.rank`

**Grouped operations**:
:py:attr:`~core.groupby.DataArrayGroupBy.assign_coords`
Expand Down
6 changes: 6 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ Enhancements
in :py:func:`xarray.open_rasterio`.
By `Matti Eskelinen <https://github.com/maaleske>`_.

**New functions/methods**

- New :py:meth:`~xarray.DataArray.rank` on arrays and datasets. Requires
bottleneck (:issue:`1731`).
By `0x0L <https://github.com/0x0L>`_.

Bug fixes
~~~~~~~~~

Expand Down
40 changes: 40 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from .accessors import DatetimeAccessor
from .alignment import align, reindex_like_indexers
from .common import AbstractArray, BaseDataObject
from .computation import apply_ufunc
from .coordinates import (DataArrayCoordinates, LevelCoordinatesSource,
Indexes, assert_coordinate_consistent,
remap_label_indexers)
Expand Down Expand Up @@ -1971,6 +1972,45 @@ def quantile(self, q, dim=None, interpolation='linear', keep_attrs=False):
interpolation=interpolation)
return self._from_temp_dataset(ds)

def rank(self, dim, pct=False, keep_attrs=False):
"""Ranks the data.

Equal values are assigned a rank that is the average of the ranks that
would have been otherwise assigned to all of the values within that set.
Ranks begin at 1, not 0. If pct is True, computes percentage ranks.

NaNs in the input array are returned as NaNs.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a note here that this requires bottleneck.

The `bottleneck` library is required.

Parameters
----------
dim : str
Dimension over which to compute rank.
pct : bool, optional
If True, compute percentage ranks, otherwise compute integer ranks.
keep_attrs : bool, optional
If True, the dataset's attributes (`attrs`) will be copied from
the original object to the new one. If False (default), the new
object will be returned without attributes.

Returns
-------
ranked : DataArray
DataArray with the same coordinates and dtype 'float64'.

Examples
--------

>>> arr = xr.DataArray([5, 6, 7], dims='x')
>>> arr.rank('x')
<xarray.DataArray (x: 3)>
array([ 1., 2., 3.])
Dimensions without coordinates: x
"""
ds = self._to_temp_dataset().rank(dim, pct=pct, keep_attrs=keep_attrs)
return self._from_temp_dataset(ds)


# priority must be higher than Variable to properly work with binary ufuncs
ops.inject_all_ops_and_reduce_methods(DataArray, priority=60)
42 changes: 42 additions & 0 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3222,6 +3222,48 @@ def quantile(self, q, dim=None, interpolation='linear',
new.coords['quantile'] = q
return new

def rank(self, dim, pct=False, keep_attrs=False):
"""Ranks the data.

Equal values are assigned a rank that is the average of the ranks that
would have been otherwise assigned to all of the values within that set.
Ranks begin at 1, not 0. If pct is True, computes percentage ranks.

NaNs in the input array are returned as NaNs.

The `bottleneck` library is required.

Parameters
----------
dim : str
Dimension over which to compute rank.
pct : bool, optional
If True, compute percentage ranks, otherwise compute integer ranks.
keep_attrs : bool, optional
If True, the dataset's attributes (`attrs`) will be copied from
the original object to the new one. If False (default), the new
object will be returned without attributes.

Returns
-------
ranked : Dataset
Variables that do not depend on `dim` are dropped.
"""
if dim not in self.dims:
raise ValueError('Dataset does not contain the dimension: %s' % dim)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add test coverage for this condition


variables = OrderedDict()
for name, var in iteritems(self.variables):
if name in self.data_vars:
if dim in var.dims:
variables[name] = var.rank(dim, pct=pct)
else:
variables[name] = var

coord_names = set(self.coords)
attrs = self.attrs if keep_attrs else None
return self._replace_vars_and_dims(variables, coord_names, attrs=attrs)

@property
def real(self):
return self._unary_op(lambda x: x.real, keep_attrs=True)(self)
Expand Down
42 changes: 41 additions & 1 deletion xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -1348,7 +1348,6 @@ def quantile(self, q, dim=None, interpolation='linear'):
numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
DataArray.quantile
"""

if isinstance(self.data, dask_array_type):
raise TypeError("quantile does not work for arrays stored as dask "
"arrays. Load the data via .compute() or .load() "
Expand Down Expand Up @@ -1379,6 +1378,47 @@ def quantile(self, q, dim=None, interpolation='linear'):
interpolation=interpolation)
return Variable(new_dims, qs)

def rank(self, dim, pct=False):
"""Ranks the data.

Equal values are assigned a rank that is the average of the ranks that
would have been otherwise assigned to all of the values within that set.
Ranks begin at 1, not 0. If pct is True, computes percentage ranks.

NaNs in the input array are returned as NaNs.

The `bottleneck` library is required.

Parameters
----------
dim : str
Dimension over which to compute rank.
pct : bool, optional
If True, compute percentage ranks, otherwise compute integer ranks.

Returns
-------
ranked : Variable

See Also
--------
Dataset.rank, DataArray.rank
"""
import bottleneck as bn

if isinstance(self.data, dask_array_type):
raise TypeError("rank does not work for arrays stored as dask "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add a test that this error is raised, e.g., using pytest.raises or raises_regex

"arrays. Load the data via .compute() or .load() "
"prior to calling this method.")

axis = self.get_axis_num(dim)
func = bn.nanrankdata if self.dtype.kind is 'f' else bn.rankdata
ranked = func(self.data, axis=axis)
if pct:
count = np.sum(~np.isnan(self.data), axis=axis, keepdims=True)
ranked /= count
return Variable(self.dims, ranked)

@property
def real(self):
return type(self)(self.dims, self.data.real, self._attrs)
Expand Down
21 changes: 20 additions & 1 deletion xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from xarray.tests import (
TestCase, ReturnItem, source_ndarray, unittest, requires_dask,
assert_identical, assert_equal, assert_allclose, assert_array_equal,
raises_regex, requires_scipy)
raises_regex, requires_scipy, requires_bottleneck)


class TestDataArray(TestCase):
Expand Down Expand Up @@ -3104,6 +3104,25 @@ def test_sortby(self):
actual = da.sortby(['x', 'y'])
self.assertDataArrayEqual(actual, expected)

@requires_bottleneck
def test_rank(self):
# floats
ar = DataArray([[3, 4, np.nan, 1]])
expect_0 = DataArray([[1, 1, np.nan, 1]])
expect_1 = DataArray([[2, 3, np.nan, 1]])
self.assertDataArrayEqual(ar.rank('dim_0'), expect_0)
self.assertDataArrayEqual(ar.rank('dim_1'), expect_1)
# int
x = DataArray([3,2,1])
self.assertDataArrayEqual(x.rank('dim_0'), x)
# str
y = DataArray(['c', 'b', 'a'])
self.assertDataArrayEqual(y.rank('dim_0'), x)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need to add test coverage for the pct=True option.

x = DataArray([3.0, 1.0, np.nan, 2.0, 4.0], dims=('z',))
y = DataArray([0.75, 0.25, np.nan, 0.5, 1.0], dims=('z',))
# Rank the raw input `x` and compare against the expected percentages `y`.
# Ranking `y` itself only passed because it has the same ordering as `x`.
self.assertDataArrayEqual(x.rank('z', pct=True), y)


@pytest.fixture(params=[1])
def da(request):
Expand Down
20 changes: 19 additions & 1 deletion xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
requires_dask, source_ndarray)

from xarray.tests import (assert_equal, assert_allclose,
assert_array_equal, requires_scipy)
assert_array_equal, requires_bottleneck,
requires_scipy)


def create_test_data(seed=None):
Expand Down Expand Up @@ -3410,6 +3411,23 @@ def test_quantile(self):
assert 'dim3' in ds_quantile.dims
assert all(d not in ds_quantile.dims for d in dim)

@requires_bottleneck
def test_rank(self):
ds = create_test_data(seed=1234)
# only ds.var3 depends on dim3
z = ds.rank('dim3')
self.assertItemsEqual(['var3'], list(z.data_vars))
# same as dataarray version
x = z.var3
y = ds.var3.rank('dim3')
self.assertDataArrayEqual(x, y)
# coordinates stick
self.assertItemsEqual(list(z.coords), list(ds.coords))
self.assertItemsEqual(list(x.coords), list(y.coords))
# invalid dim
with raises_regex(ValueError, 'does not contain'):
x.rank('invalid_dim')

def test_count(self):
ds = Dataset({'x': ('a', [np.nan, 1]), 'y': 0, 'z': np.nan})
expected = Dataset({'x': 1, 'y': 1, 'z': 0})
Expand Down
34 changes: 34 additions & 0 deletions xarray/tests/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@

from . import TestCase, source_ndarray, requires_dask, raises_regex

from xarray.tests import requires_bottleneck


class VariableSubclassTestCases(object):
def test_properties(self):
Expand Down Expand Up @@ -1353,6 +1355,38 @@ def test_quantile_dask_raises(self):
with raises_regex(TypeError, 'arrays stored as dask'):
v.quantile(0.5, dim='x')

@requires_dask
@requires_bottleneck
def test_rank_dask_raises(self):
v = Variable(['x'], [3.0, 1.0, np.nan, 2.0, 4.0]).chunk(2)
with raises_regex(TypeError, 'arrays stored as dask'):
v.rank('x')

@requires_bottleneck
def test_rank(self):
import bottleneck as bn
# floats
v = Variable(['x', 'y'], [[3, 4, np.nan, 1]])
expect_0 = bn.nanrankdata(v.data, axis=0)
expect_1 = bn.nanrankdata(v.data, axis=1)
np.testing.assert_allclose(v.rank('x').values, expect_0)
np.testing.assert_allclose(v.rank('y').values, expect_1)
# int
v = Variable(['x'], [3,2,1])
expect = bn.rankdata(v.data, axis=0)
np.testing.assert_allclose(v.rank('x').values, expect)
Copy link
Collaborator

@max-sixty max-sixty Dec 8, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI for the future, xarray.test has these natively, and assert_equal rather than the older self. methods
(tbc, no need to change)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A shameless copy/paste from the test above :)

# str
v = Variable(['x'], ['c', 'b', 'a'])
expect = bn.rankdata(v.data, axis=0)
np.testing.assert_allclose(v.rank('x').values, expect)
# pct
v = Variable(['x'], [3.0, 1.0, np.nan, 2.0, 4.0])
v_expect = Variable(['x'], [0.75, 0.25, np.nan, 0.5, 1.0])
self.assertVariableEqual(v.rank('x', pct=True), v_expect)
# invalid dim
with raises_regex(ValueError, 'not found'):
v.rank('y')

def test_big_endian_reduce(self):
# regression test for GH489
data = np.ones(5, dtype='>f4')
Expand Down