Skip to content

Commit

Permalink
Correctly concatenate pandas.Index objects (#875)
Browse files Browse the repository at this point in the history
We now have a dedicated method, Coordinate.concat that will correctly
concatenate pandas.Index objects, even if they can't be properly expressed as
NumPy arrays (e.g., PeriodIndex and MultiIndex).

As part of this change, I removed and replaced the internal `interleaved_concat`
routine. It turns out we can do this with an inverse permutation instead, which
results in much simpler and cleaner code. In particular, we no longer need a
special path to support dask.array.

This should help with GH818.
  • Loading branch information
shoyer authored Jun 16, 2016
1 parent 1addb9b commit 918945c
Show file tree
Hide file tree
Showing 10 changed files with 227 additions and 171 deletions.
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ Bug fixes
``keep_attrs=True`` option. By
`Jeremy McGibbon <https://github.com/mcgibbon>`_.

- Concatenating xarray objects along an axis with a MultiIndex or PeriodIndex
preserves the nature of the index (:issue:`875`). By
`Stephan Hoyer <https://github.com/shoyer>`_.

- ``decode_cf_timedelta`` now accepts arrays with ``ndim`` >1 (:issue:`842`).
This fixes issue :issue:`665`.
`Filipe Fernandes <https://github.com/ocefpaf>`_.
Expand Down
4 changes: 2 additions & 2 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from . import utils
from .pycompat import iteritems, reduce, OrderedDict, basestring
from .variable import Variable, as_variable, Coordinate
from .variable import Variable, as_variable, Coordinate, concat as concat_vars


def concat(objs, dim=None, data_vars='all', coords='different',
Expand Down Expand Up @@ -265,7 +265,7 @@ def ensure_common_dims(vars):
# stack up each variable to fill-out the dataset
for k in concat_over:
vars = ensure_common_dims([ds.variables[k] for ds in datasets])
combined = Variable.concat(vars, dim, positions)
combined = concat_vars(vars, dim, positions)
insert_result_variable(k, combined)

result = Dataset(result_vars, attrs=result_attrs)
Expand Down
74 changes: 67 additions & 7 deletions xarray/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import numpy as np
import pandas as pd

from . import nputils
from . import ops
from .combine import concat
from .common import (
Expand Down Expand Up @@ -66,6 +67,52 @@ def _dummy_copy(xarray_obj):
raise AssertionError
return res

def _is_one_or_none(obj):
return obj == 1 or obj is None


def _consolidate_slices(slices):
    """Consolidate adjacent slices in a list of slices.

    Two consecutive slices are merged when the first one's ``stop`` equals
    the second one's ``start`` and both have a step of 1 or None, e.g.
    ``[slice(0, 3), slice(3, 5)]`` -> ``[slice(0, 5)]``.

    Parameters
    ----------
    slices : list of slice
        Slices to consolidate.

    Returns
    -------
    list of slice

    Raises
    ------
    ValueError
        If any element of *slices* is not a slice object.
    """
    result = []
    # Sentinel so last_slice is always bound; the `result and` guard below
    # means it is never actually compared on the first iteration (previously
    # the variable was assigned only inside the loop, which static analysis
    # flags as possibly-unbound).
    last_slice = slice(None)
    for slice_ in slices:
        if not isinstance(slice_, slice):
            raise ValueError('list element is not a slice: %r' % slice_)
        if (result and last_slice.stop == slice_.start
                and _is_one_or_none(last_slice.step)
                and _is_one_or_none(slice_.step)):
            # extend the most recently emitted slice in place
            last_slice = slice(last_slice.start, slice_.stop, slice_.step)
            result[-1] = last_slice
        else:
            result.append(slice_)
            last_slice = slice_
    return result


def _inverse_permutation_indices(positions):
    """Like inverse_permutation, but also handles slices.

    Parameters
    ----------
    positions : list of np.ndarray or slice objects.
        If slice objects, all are assumed to be slices.

    Returns
    -------
    np.ndarray of indices or None, if no permutation is necessary.
    """
    if not positions:
        return None

    if isinstance(positions[0], slice):
        positions = _consolidate_slices(positions)
        # _consolidate_slices returns a *list*, so the no-op check must
        # compare against [slice(None)]; comparing a list to a bare slice
        # is always False, which made this fast path unreachable and let a
        # lone slice(None) fall through to np.arange(None, None, None).
        if positions == [slice(None)]:
            return None
        positions = [np.arange(sl.start, sl.stop, sl.step) for sl in positions]

    indices = nputils.inverse_permutation(np.concatenate(positions))
    return indices


class GroupBy(object):
"""A object that implements the split-apply-combine pattern.
Expand Down Expand Up @@ -302,6 +349,16 @@ def assign_coords(self, **kwargs):
return self.apply(lambda ds: ds.assign_coords(**kwargs))


def _maybe_reorder(xarray_obj, concat_dim, positions):
    """Reorder *xarray_obj* along *concat_dim* to undo the group split, if needed."""
    order = _inverse_permutation_indices(positions)
    if order is None:
        # groups were already contiguous and in order; nothing to undo
        return xarray_obj
    dim, = concat_dim.dims
    return xarray_obj[{dim: order}]


class DataArrayGroupBy(GroupBy, ImplementsArrayReduce):
"""GroupBy object specialized to grouping DataArray objects
"""
Expand All @@ -313,14 +370,14 @@ def _iter_grouped_shortcut(self):
for indices in self.group_indices:
yield var[{self.group_dim: indices}]

def _concat_shortcut(self, applied, concat_dim, positions):
def _concat_shortcut(self, applied, concat_dim, positions=None):
# nb. don't worry too much about maintaining this method -- it does
# speed things up, but it's not very interpretable and there are much
# faster alternatives (e.g., doing the grouped aggregation in a
# compiled language)
stacked = Variable.concat(
applied, concat_dim, positions, shortcut=True)
result = self.obj._replace_maybe_drop_dims(stacked)
stacked = Variable.concat(applied, concat_dim, shortcut=True)
reordered = _maybe_reorder(stacked, concat_dim, positions)
result = self.obj._replace_maybe_drop_dims(reordered)
result._coords[concat_dim.name] = as_variable(concat_dim, copy=True)
return result

Expand Down Expand Up @@ -391,7 +448,8 @@ def _concat(self, applied, shortcut=False):
if shortcut:
combined = self._concat_shortcut(applied, concat_dim, positions)
else:
combined = concat(applied, concat_dim, positions=positions)
combined = concat(applied, concat_dim)
combined = _maybe_reorder(combined, concat_dim, positions)

if isinstance(combined, type(self.obj)):
combined = self._restore_dim_order(combined)
Expand Down Expand Up @@ -472,8 +530,10 @@ def apply(self, func, **kwargs):
def _concat(self, applied):
applied_example, applied = peek_at(applied)
concat_dim, positions = self._infer_concat_args(applied_example)
combined = concat(applied, concat_dim, positions=positions)
return combined

combined = concat(applied, concat_dim)
reordered = _maybe_reorder(combined, concat_dim, positions)
return reordered

def reduce(self, func, dim=None, keep_attrs=False, **kwargs):
"""Reduce the items in this group by applying `func` along some
Expand Down
36 changes: 18 additions & 18 deletions xarray/core/nputils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,24 +34,24 @@ def nanlast(values, axis):
return _select_along_axis(values, idx_last, axis)


def _calc_concat_shape(arrays, axis=0):
first_shape = arrays[0].shape
length = builtins.sum(a.shape[axis] for a in arrays)
result_shape = first_shape[:axis] + (length,) + first_shape[(axis + 1):]
return result_shape


def interleaved_concat(arrays, indices, axis=0):
    """Concatenate *arrays* along *axis*, scattering each array into the
    output positions given by the matching entry of *indices*.

    Parameters
    ----------
    arrays : sequence of array_like
        Arrays to interleave; dtypes are promoted with np.promote_types.
    indices : sequence of array_like or slice
        ``indices[i]`` gives the positions along *axis* at which
        ``arrays[i]`` is placed in the result.
    axis : int, optional

    Returns
    -------
    np.ndarray
    """
    arrays = [np.asarray(a) for a in arrays]
    axis = _validate_axis(arrays[0], axis)
    result_shape = _calc_concat_shape(arrays, axis=axis)
    dtype = reduce(np.promote_types, [a.dtype for a in arrays])
    result = np.empty(result_shape, dtype)
    key = [slice(None)] * result.ndim
    for a, ind in zip(arrays, indices):
        key[axis] = ind
        # NumPy requires a tuple (not a list) for multidimensional
        # indexing; indexing with a list of slices is deprecated/removed.
        result[tuple(key)] = a
    return result
def inverse_permutation(indices):
    """Return indices for an inverse permutation.

    Parameters
    ----------
    indices : 1D np.ndarray with dtype=int
        Integer positions to assign elements to.

    Returns
    -------
    inverse_permutation : 1D np.ndarray with dtype=int
        Integer indices to take from the original array to create the
        permutation.
    """
    n = len(indices)
    # use intp instead of int64 because of windows :(
    inverse = np.empty(n, dtype=np.intp)
    # scatter 0..n-1 into the slots named by `indices`
    inverse[indices] = np.arange(n, dtype=np.intp)
    return inverse


def _ensure_bool_is_ndarray(result, *args):
Expand Down
66 changes: 1 addition & 65 deletions xarray/core/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,7 @@

from . import npcompat
from .pycompat import PY3, dask_array_type
from .nputils import (
nanfirst, nanlast, interleaved_concat as _interleaved_concat_numpy,
array_eq, array_ne, _validate_axis, _calc_concat_shape
)
from .nputils import nanfirst, nanlast, array_eq, array_ne


try:
Expand Down Expand Up @@ -100,67 +97,6 @@ def _fail_on_dask_array_input(values, msg=None, func_name=None):
tensordot = _dask_or_eager_func('tensordot', n_array_args=2)


def _interleaved_indices_required(indices):
"""With dask, we care about data locality and would rather avoid splitting
splitting up each arrays into single elements. This routine checks to see
if we really need the "interleaved" part of interleaved_concat.
We don't use for the pure numpy version of interleaved_concat, because it's
just as fast or faster to directly do the interleaved concatenate rather
than check if we could simply it.
"""
next_expected = 0
for ind in indices:
if isinstance(ind, slice):
if ((ind.start or 0) != next_expected or
ind.step not in (1, None)):
return True
next_expected = ind.stop
else:
ind = np.asarray(ind)
expected = np.arange(next_expected, next_expected + ind.size)
if (ind != expected).any():
return True
next_expected = ind[-1] + 1
return False


def _interleaved_concat_slow(arrays, indices, axis=0):
    """A slow version of interleaved_concat that also works on dask arrays.

    Builds the result by slicing out one element at a time in output order
    and concatenating the single-element slices.
    """
    axis = _validate_axis(arrays[0], axis)
    result_shape = _calc_concat_shape(arrays, axis=axis)
    length = result_shape[axis]

    # for every output position, record which source array it comes from
    # and which element of that array it is
    array_lookup = np.empty(length, dtype=int)
    element_lookup = np.empty(length, dtype=int)
    for array_num, ind in enumerate(indices):
        if isinstance(ind, slice):
            ind = np.arange(*ind.indices(length))
        for elem_num, out_pos in enumerate(ind):
            array_lookup[out_pos] = array_num
            element_lookup[out_pos] = elem_num

    leading = (slice(None),) * axis
    split_arrays = [arrays[a][leading + (slice(e, e + 1),)]
                    for a, e in zip(array_lookup, element_lookup)]
    return concatenate(split_arrays, axis)


def interleaved_concat(arrays, indices, axis=0):
    """Concatenate each array along the given axis, but also assign each array
    element into the location given by indices. This operation is used for
    groupby.transform.
    """
    if not (has_dask and isinstance(arrays[0], da.Array)):
        # pure numpy: the direct interleaved concatenate is fastest
        return _interleaved_concat_numpy(arrays, indices, axis)
    if _interleaved_indices_required(indices):
        # general (slow) dask path
        return _interleaved_concat_slow(arrays, indices, axis)
    # indices are already in order: a plain concatenate preserves locality
    return da.concatenate(arrays, axis)


def asarray(data):
    """Coerce *data* to np.ndarray, but pass dask arrays through unchanged."""
    if isinstance(data, dask_array_type):
        return data
    return np.asarray(data)

Expand Down
Loading

0 comments on commit 918945c

Please sign in to comment.