Correctly concatenate pandas.Index objects #875

Merged · 7 commits · Jun 16, 2016
Changes from 4 commits
4 changes: 4 additions & 0 deletions doc/whats-new.rst
@@ -59,6 +59,10 @@ Bug fixes
  ``keep_attrs=True`` option. By
  `Jeremy McGibbon <https://github.com/mcgibbon>`_.

- Concatenating xarray objects along an axis with a MultiIndex or PeriodIndex
  preserves the nature of the index (:issue:`875`). By
  `Stephan Hoyer <https://github.com/shoyer>`_.

- ``decode_cf_timedelta`` now accepts arrays with ``ndim`` >1 (:issue:`842`).
  This fixes issue :issue:`665`.
  `Filipe Fernandes <https://github.com/ocefpaf>`_.
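
As a quick illustration of the :issue:`875` entry above, a minimal sketch assuming xarray's public concat API (the data here is hypothetical, and exact types may vary by version):

import pandas as pd
import xarray as xr

# Hypothetical data: two DataArrays over consecutive monthly periods.
a = xr.DataArray([1, 2], [('time', pd.period_range('2000-01', periods=2, freq='M'))])
b = xr.DataArray([3, 4], [('time', pd.period_range('2000-03', periods=2, freq='M'))])

combined = xr.concat([a, b], dim='time')
# With this fix, the concatenated coordinate remains a PeriodIndex instead
# of degrading to a plain object-dtype Index.
print(type(combined['time'].to_index()))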
4 changes: 2 additions & 2 deletions xarray/core/combine.py
@@ -4,7 +4,7 @@

 from . import utils
 from .pycompat import iteritems, reduce, OrderedDict, basestring
-from .variable import Variable, as_variable, Coordinate
+from .variable import Variable, as_variable, Coordinate, concat as concat_vars


def concat(objs, dim=None, data_vars='all', coords='different',
@@ -265,7 +265,7 @@ def ensure_common_dims(vars):
     # stack up each variable to fill-out the dataset
     for k in concat_over:
         vars = ensure_common_dims([ds.variables[k] for ds in datasets])
-        combined = Variable.concat(vars, dim, positions)
+        combined = concat_vars(vars, dim, positions)
         insert_result_variable(k, combined)

     result = Dataset(result_vars, attrs=result_attrs)
74 changes: 67 additions & 7 deletions xarray/core/groupby.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pandas as pd

+from . import nputils
 from . import ops
 from .combine import concat
 from .common import (
@@ -66,6 +67,52 @@ def _dummy_copy(xarray_obj):
        raise AssertionError
    return res


def _is_one_or_none(obj):
    return obj == 1 or obj is None
Member:
do you want to treat True as 1? If not, this should be obj is 1 or obj is None.

Collaborator:
Interesting! I never knew that.
I always thought ints & strings should use ==, but for -1 to 256 is is fine: http://stackoverflow.com/questions/2239737/is-it-better-to-use-is-or-for-number-comparison-in-python

Member:
But this depends on the interpreter, then. CPython might work, but others might not I guess?

Member (author):
These slice objects are generated internally by groupby, so I think it's pretty far-fetched to get step is True. Using == feels more idiomatic, anyways.
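
For reference, a quick illustration of the semantics under discussion (small-integer identity is a CPython implementation detail, not a language guarantee):

# True compares equal to 1, so == also matches a boolean step:
print(True == 1)    # True
print(True is 1)    # False

# CPython interns small integers (-5 through 256), which is why `is`
# happens to work for them there; other interpreters need not do this.
a, b = 256, 256
print(a is b)       # True on CPython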



def _consolidate_slices(slices):
    """Consolidate adjacent slices in a list of slices.
    """
    result = []
    for slice_ in slices:
        if not isinstance(slice_, slice):
            raise ValueError('list element is not a slice: %r' % slice_)
        if (result and last_slice.stop == slice_.start
                and _is_one_or_none(last_slice.step)
                and _is_one_or_none(slice_.step)):
            last_slice = slice(last_slice.start, slice_.stop, slice_.step)
            result[-1] = last_slice
        else:
            result.append(slice_)
            last_slice = slice_
    return result
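
A quick sketch of the consolidation behavior, assuming the helper as defined above:

# Adjacent slices with unit (or None) steps merge into one:
_consolidate_slices([slice(0, 3), slice(3, 5)])
# -> [slice(0, 5, None)]

# A gap or a non-unit step blocks merging:
_consolidate_slices([slice(0, 3), slice(4, 6)])
# -> [slice(0, 3, None), slice(4, 6, None)]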


def _inverse_permutation_indices(positions):
    """Like inverse_permutation, but also handles slices.

    Parameters
    ----------
    positions : list of np.ndarray or slice objects.
        If slice objects, all are assumed to be slices.

    Returns
    -------
    np.ndarray of indices or None, if no permutation is necessary.
    """
    if not positions:
        return None

    if isinstance(positions[0], slice):
        positions = _consolidate_slices(positions)
        if positions == [slice(None)]:
            return None
        positions = [np.arange(sl.start, sl.stop, sl.step) for sl in positions]

    indices = nputils.inverse_permutation(np.concatenate(positions))
    return indices
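
For example, a sketch with out-of-order groups (using the helpers above):

import numpy as np

# Groups were taken out of order; positions record where each group's
# elements sat in the original array.
positions = [np.array([2, 3]), np.array([0, 1])]
_inverse_permutation_indices(positions)
# -> array([2, 3, 0, 1]); indexing the concatenated result with this
#    restores the original element order.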


class GroupBy(object):
    """An object that implements the split-apply-combine pattern.
@@ -302,6 +349,16 @@ def assign_coords(self, **kwargs):
        return self.apply(lambda ds: ds.assign_coords(**kwargs))


def _maybe_reorder(xarray_obj, concat_dim, positions):
    order = _inverse_permutation_indices(positions)

    if order is None:
        return xarray_obj
    else:
        dim, = concat_dim.dims
        return xarray_obj[{dim: order}]


class DataArrayGroupBy(GroupBy, ImplementsArrayReduce):
    """GroupBy object specialized to grouping DataArray objects
    """
@@ -313,14 +370,14 @@ def _iter_grouped_shortcut(self):
        for indices in self.group_indices:
            yield var[{self.group_dim: indices}]

-    def _concat_shortcut(self, applied, concat_dim, positions):
+    def _concat_shortcut(self, applied, concat_dim, positions=None):
         # nb. don't worry too much about maintaining this method -- it does
         # speed things up, but it's not very interpretable and there are much
         # faster alternatives (e.g., doing the grouped aggregation in a
         # compiled language)
-        stacked = Variable.concat(
-            applied, concat_dim, positions, shortcut=True)
-        result = self.obj._replace_maybe_drop_dims(stacked)
+        stacked = Variable.concat(applied, concat_dim, shortcut=True)
+        reordered = _maybe_reorder(stacked, concat_dim, positions)
+        result = self.obj._replace_maybe_drop_dims(reordered)
         result._coords[concat_dim.name] = as_variable(concat_dim, copy=True)
         return result

@@ -391,7 +448,8 @@ def _concat(self, applied, shortcut=False):
         if shortcut:
             combined = self._concat_shortcut(applied, concat_dim, positions)
         else:
-            combined = concat(applied, concat_dim, positions=positions)
+            combined = concat(applied, concat_dim)
+            combined = _maybe_reorder(combined, concat_dim, positions)

        if isinstance(combined, type(self.obj)):
            combined = self._restore_dim_order(combined)
@@ -472,8 +530,10 @@ def apply(self, func, **kwargs):
     def _concat(self, applied):
         applied_example, applied = peek_at(applied)
         concat_dim, positions = self._infer_concat_args(applied_example)
-        combined = concat(applied, concat_dim, positions=positions)
-        return combined
+        combined = concat(applied, concat_dim)
+        reordered = _maybe_reorder(combined, concat_dim, positions)
+        return reordered
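
Both _concat paths now follow the same pattern: concatenate the applied pieces in group order, then invert the permutation so elements return to their original positions. A standalone numpy sketch of that pattern, with hypothetical data and no xarray dependencies:

import numpy as np

x = np.array([10, 11, 12, 13])
group_positions = [np.array([1, 3]), np.array([0, 2])]   # e.g. odd/even labels

# Apply: transform each group, then stack the results in group order.
applied = [x[pos] * 100 for pos in group_positions]
stacked = np.concatenate(applied)        # [1100, 1300, 1000, 1200]

# Reorder: scatter arange through the concatenated positions to invert.
order = np.empty(len(x), dtype=np.int64)
order[np.concatenate(group_positions)] = np.arange(len(x))
print(stacked[order])                    # [1000 1100 1200 1300]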

    def reduce(self, func, dim=None, keep_attrs=False, **kwargs):
        """Reduce the items in this group by applying `func` along some
35 changes: 17 additions & 18 deletions xarray/core/nputils.py
@@ -34,24 +34,23 @@ def nanlast(values, axis):
    return _select_along_axis(values, idx_last, axis)


-def _calc_concat_shape(arrays, axis=0):
-    first_shape = arrays[0].shape
-    length = builtins.sum(a.shape[axis] for a in arrays)
-    result_shape = first_shape[:axis] + (length,) + first_shape[(axis + 1):]
-    return result_shape
-
-
-def interleaved_concat(arrays, indices, axis=0):
-    arrays = [np.asarray(a) for a in arrays]
-    axis = _validate_axis(arrays[0], axis)
-    result_shape = _calc_concat_shape(arrays, axis=axis)
-    dtype = reduce(np.promote_types, [a.dtype for a in arrays])
-    result = np.empty(result_shape, dtype)
-    key = [slice(None)] * result.ndim
-    for a, ind in zip(arrays, indices):
-        key[axis] = ind
-        result[key] = a
-    return result
+def inverse_permutation(indices):
+    """Return indices for an inverse permutation.
+
+    Parameters
+    ----------
+    indices : 1D np.ndarray with dtype=int
+        Integer positions to assign elements to.
+
+    Returns
+    -------
+    inverse_permutation : 1D np.ndarray with dtype=int
+        Integer indices to take from the original array to create the
+        permutation.
+    """
+    inverse_permutation = np.empty(len(indices), dtype=np.int64)
+    inverse_permutation[indices] = np.arange(len(indices))
+    return inverse_permutation
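
A short check of the round trip this provides, assuming the function above:

import numpy as np

indices = np.array([2, 0, 3, 1])
inv = inverse_permutation(indices)       # array([1, 3, 0, 2])

x = np.array(['a', 'b', 'c', 'd'])
print(x[indices][inv])                   # ['a' 'b' 'c' 'd'] -- original order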


def _ensure_bool_is_ndarray(result, *args):
66 changes: 1 addition & 65 deletions xarray/core/ops.py
@@ -9,10 +9,7 @@

from . import npcompat
from .pycompat import PY3, dask_array_type
from .nputils import (
nanfirst, nanlast, interleaved_concat as _interleaved_concat_numpy,
array_eq, array_ne, _validate_axis, _calc_concat_shape
)
from .nputils import nanfirst, nanlast, array_eq, array_ne


try:
@@ -100,67 +97,6 @@ def _fail_on_dask_array_input(values, msg=None, func_name=None):
tensordot = _dask_or_eager_func('tensordot', n_array_args=2)


-def _interleaved_indices_required(indices):
-    """With dask, we care about data locality and would rather avoid
-    splitting up each array into single elements. This routine checks to see
-    if we really need the "interleaved" part of interleaved_concat.
-
-    We don't use this for the pure numpy version of interleaved_concat,
-    because it's just as fast or faster to directly do the interleaved
-    concatenate rather than check if we could simplify it.
-    """
-    next_expected = 0
-    for ind in indices:
-        if isinstance(ind, slice):
-            if ((ind.start or 0) != next_expected or
-                    ind.step not in (1, None)):
-                return True
-            next_expected = ind.stop
-        else:
-            ind = np.asarray(ind)
-            expected = np.arange(next_expected, next_expected + ind.size)
-            if (ind != expected).any():
-                return True
-            next_expected = ind[-1] + 1
-    return False
-
-
-def _interleaved_concat_slow(arrays, indices, axis=0):
-    """A slow version of interleaved_concat that also works on dask arrays
-    """
-    axis = _validate_axis(arrays[0], axis)
-
-    result_shape = _calc_concat_shape(arrays, axis=axis)
-    length = result_shape[axis]
-    array_lookup = np.empty(length, dtype=int)
-    element_lookup = np.empty(length, dtype=int)
-
-    for n, ind in enumerate(indices):
-        if isinstance(ind, slice):
-            ind = np.arange(*ind.indices(length))
-        for m, i in enumerate(ind):
-            array_lookup[i] = n
-            element_lookup[i] = m
-
-    split_arrays = [arrays[n][(slice(None),) * axis + (slice(m, m + 1),)]
-                    for (n, m) in zip(array_lookup, element_lookup)]
-    return concatenate(split_arrays, axis)
-
-
-def interleaved_concat(arrays, indices, axis=0):
-    """Concatenate each array along the given axis, but also assign each array
-    element into the location given by indices. This operation is used for
-    groupby.transform.
-    """
-    if has_dask and isinstance(arrays[0], da.Array):
-        if not _interleaved_indices_required(indices):
-            return da.concatenate(arrays, axis)
-        else:
-            return _interleaved_concat_slow(arrays, indices, axis)
-    else:
-        return _interleaved_concat_numpy(arrays, indices, axis)


def asarray(data):
    return data if isinstance(data, dask_array_type) else np.asarray(data)
