Skip to content

Commit

Permalink
Flexible indexes: add Index base class and xindexes properties (#5102)
Browse files Browse the repository at this point in the history
* add IndexAdapter class + move PandasIndexAdapter

* wip: xarray_obj.indexes -> IndexAdapter objects

* fix more broken tests

* fix merge glitch

* fix group bins tests

* add xindexes property

Use it internally instead of indexes

* rename IndexAdapter -> Index

* rename _to_index_adpater (typo) -> _to_xindex

* add Index.to_pandas_index() method

Also improve xarray_obj.indexes property implementation

* rename PandasIndexAdapter -> PandasIndex

* update index type in tests

* ensure .indexes only returns pd.Index objects

* PandasIndex: normalize other index in cmp funcs

* fix merge lint errors

* fix PandasIndex union/intersection

* [skip-ci] add TODO comment about index sizes

* address more PR comments

* [skip-ci] update what's new

* fix coord_names normalization

* move what's new entry to unreleased section
  • Loading branch information
benbovy authored May 11, 2021
1 parent 234b40a commit 6e14df6
Show file tree
Hide file tree
Showing 22 changed files with 534 additions and 311 deletions.
5 changes: 4 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ Documentation
Internal Changes
~~~~~~~~~~~~~~~~

- Explicit indexes refactor: add an ``xarray.Index`` base class and
``Dataset.xindexes`` / ``DataArray.xindexes`` properties. Also rename
``PandasIndexAdapter`` to ``PandasIndex``, which now inherits from
``xarray.Index`` (:pull:`5102`). By `Benoit Bovy <https://github.com/benbovy>`_.

.. _whats-new.0.18.0:

Expand Down Expand Up @@ -268,7 +272,6 @@ Internal Changes
(:pull:`5188`), (:pull:`5191`).
By `Maximilian Roos <https://github.com/max-sixty>`_.


.. _whats-new.0.17.0:

v0.17.0 (24 Feb 2021)
Expand Down
39 changes: 24 additions & 15 deletions xarray/core/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
import numpy as np
import pandas as pd

from . import dtypes, utils
from . import dtypes
from .indexes import Index, PandasIndex
from .indexing import get_indexer_nd
from .utils import is_dict_like, is_full_slice, maybe_coerce_to_str
from .utils import is_dict_like, is_full_slice, maybe_coerce_to_str, safe_cast_to_index
from .variable import IndexVariable, Variable

if TYPE_CHECKING:
Expand All @@ -30,11 +31,11 @@
DataAlignable = TypeVar("DataAlignable", bound=DataWithCoords)


def _get_joiner(join):
def _get_joiner(join, index_cls):
if join == "outer":
return functools.partial(functools.reduce, pd.Index.union)
return functools.partial(functools.reduce, index_cls.union)
elif join == "inner":
return functools.partial(functools.reduce, pd.Index.intersection)
return functools.partial(functools.reduce, index_cls.intersection)
elif join == "left":
return operator.itemgetter(0)
elif join == "right":
Expand Down Expand Up @@ -63,7 +64,7 @@ def _override_indexes(objects, all_indexes, exclude):
objects = list(objects)
for idx, obj in enumerate(objects[1:]):
new_indexes = {}
for dim in obj.indexes:
for dim in obj.xindexes:
if dim not in exclude:
new_indexes[dim] = all_indexes[dim][0]
objects[idx + 1] = obj._overwrite_indexes(new_indexes)
Expand Down Expand Up @@ -284,7 +285,7 @@ def align(
if dim not in exclude:
all_coords[dim].append(obj.coords[dim])
try:
index = obj.indexes[dim]
index = obj.xindexes[dim]
except KeyError:
unlabeled_dim_sizes[dim].add(obj.sizes[dim])
else:
Expand All @@ -298,16 +299,19 @@ def align(
# - It ensures it's possible to do operations that don't require alignment
# on indexes with duplicate values (which cannot be reindexed with
# pandas). This is useful, e.g., for overwriting such duplicate indexes.
joiner = _get_joiner(join)
joined_indexes = {}
for dim, matching_indexes in all_indexes.items():
if dim in indexes:
index = utils.safe_cast_to_index(indexes[dim])
# TODO: benbovy - flexible indexes. maybe move this logic in util func
if isinstance(indexes[dim], Index):
index = indexes[dim]
else:
index = PandasIndex(safe_cast_to_index(indexes[dim]))
if (
any(not index.equals(other) for other in matching_indexes)
or dim in unlabeled_dim_sizes
):
joined_indexes[dim] = indexes[dim]
joined_indexes[dim] = index
else:
if (
any(
Expand All @@ -318,6 +322,7 @@ def align(
):
if join == "exact":
raise ValueError(f"indexes along dimension {dim!r} are not equal")
joiner = _get_joiner(join, type(matching_indexes[0]))
index = joiner(matching_indexes)
# make sure str coords are not cast to object
index = maybe_coerce_to_str(index, all_coords[dim])
Expand All @@ -327,6 +332,9 @@ def align(

if dim in unlabeled_dim_sizes:
unlabeled_sizes = unlabeled_dim_sizes[dim]
# TODO: benbovy - flexible indexes: expose a size property for xarray.Index?
# Some indexes may not have a defined size (e.g., built from multiple coords of
# different sizes)
labeled_size = index.size
if len(unlabeled_sizes | {labeled_size}) > 1:
raise ValueError(
Expand Down Expand Up @@ -469,7 +477,7 @@ def reindex_like_indexers(
ValueError
If any dimensions without labels have different sizes.
"""
indexers = {k: v for k, v in other.indexes.items() if k in target.dims}
indexers = {k: v for k, v in other.xindexes.items() if k in target.dims}

for dim in other.dims:
if dim not in indexers and dim in target.dims:
Expand All @@ -487,14 +495,14 @@ def reindex_like_indexers(
def reindex_variables(
variables: Mapping[Any, Variable],
sizes: Mapping[Any, int],
indexes: Mapping[Any, pd.Index],
indexes: Mapping[Any, Index],
indexers: Mapping,
method: Optional[str] = None,
tolerance: Any = None,
copy: bool = True,
fill_value: Optional[Any] = dtypes.NA,
sparse: bool = False,
) -> Tuple[Dict[Hashable, Variable], Dict[Hashable, pd.Index]]:
) -> Tuple[Dict[Hashable, Variable], Dict[Hashable, Index]]:
"""Conform a dictionary of aligned variables onto a new set of variables,
filling in missing values with NaN.
Expand Down Expand Up @@ -559,10 +567,11 @@ def reindex_variables(
"from that to be indexed along {:s}".format(str(indexer.dims), dim)
)

target = new_indexes[dim] = utils.safe_cast_to_index(indexers[dim])
target = new_indexes[dim] = PandasIndex(safe_cast_to_index(indexers[dim]))

if dim in indexes:
index = indexes[dim]
# TODO (benbovy - flexible indexes): support other indexes than pd.Index?
index = indexes[dim].to_pandas_index()

if not index.is_unique:
raise ValueError(
Expand Down
14 changes: 11 additions & 3 deletions xarray/core/combine.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,17 @@ def _infer_concat_order_from_coords(datasets):
if dim in ds0:

# Need to read coordinate values to do ordering
indexes = [ds.indexes.get(dim) for ds in datasets]
indexes = [ds.xindexes.get(dim) for ds in datasets]
if any(index is None for index in indexes):
raise ValueError(
"Every dimension needs a coordinate for "
"inferring concatenation order"
)

# TODO (benbovy, flexible indexes): all indexes should be pandas.Index
# get pd.Index objects from Index objects
indexes = [index.array for index in indexes]

# If dimension coordinate values are same on every dataset then
# should be leaving this dimension alone (it's just a "bystander")
if not all(index.equals(indexes[0]) for index in indexes[1:]):
Expand Down Expand Up @@ -801,9 +805,13 @@ def combine_by_coords(
)

# Check the overall coordinates are monotonically increasing
# TODO (benbovy - flexible indexes): only with pandas.Index?
for dim in concat_dims:
indexes = concatenated.indexes.get(dim)
if not (indexes.is_monotonic_increasing or indexes.is_monotonic_decreasing):
indexes = concatenated.xindexes.get(dim)
if not (
indexes.array.is_monotonic_increasing
or indexes.array.is_monotonic_decreasing
):
raise ValueError(
"Resulting object does not have monotonic"
" global indexes along dimension {}".format(dim)
Expand Down
5 changes: 3 additions & 2 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,7 +406,7 @@ def get_index(self, key: Hashable) -> pd.Index:
raise KeyError(key)

try:
return self.indexes[key]
return self.xindexes[key].to_pandas_index()
except KeyError:
return pd.Index(range(self.sizes[key]), name=key)

Expand Down Expand Up @@ -1162,7 +1162,8 @@ def resample(
category=FutureWarning,
)

if isinstance(self.indexes[dim_name], CFTimeIndex):
# TODO (benbovy - flexible indexes): update when CFTimeIndex is an xarray Index subclass
if isinstance(self.xindexes[dim_name].to_pandas_index(), CFTimeIndex):
from .resample_cftime import CFTimeGrouper

grouper = CFTimeGrouper(freq, closed, label, base, loffset)
Expand Down
22 changes: 14 additions & 8 deletions xarray/core/coordinates.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
import pandas as pd

from . import formatting, indexing
from .indexes import Indexes
from .indexes import Index, Indexes
from .merge import merge_coordinates_without_align, merge_coords
from .utils import Frozen, ReprObject, either_dict_or_kwargs
from .variable import Variable
Expand Down Expand Up @@ -52,6 +52,10 @@ def dims(self) -> Union[Mapping[Hashable, int], Tuple[Hashable, ...]]:
def indexes(self) -> Indexes:
return self._data.indexes # type: ignore[attr-defined]

@property
def xindexes(self) -> Indexes:
return self._data.xindexes # type: ignore[attr-defined]

@property
def variables(self):
raise NotImplementedError()
Expand Down Expand Up @@ -157,15 +161,15 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index:
def update(self, other: Mapping[Hashable, Any]) -> None:
other_vars = getattr(other, "variables", other)
coords, indexes = merge_coords(
[self.variables, other_vars], priority_arg=1, indexes=self.indexes
[self.variables, other_vars], priority_arg=1, indexes=self.xindexes
)
self._update_coords(coords, indexes)

def _merge_raw(self, other, reflexive):
"""For use with binary arithmetic."""
if other is None:
variables = dict(self.variables)
indexes = dict(self.indexes)
indexes = dict(self.xindexes)
else:
coord_list = [self, other] if not reflexive else [other, self]
variables, indexes = merge_coordinates_without_align(coord_list)
Expand All @@ -180,7 +184,9 @@ def _merge_inplace(self, other):
# don't include indexes in prioritized, because we didn't align
# first and we want indexes to be checked
prioritized = {
k: (v, None) for k, v in self.variables.items() if k not in self.indexes
k: (v, None)
for k, v in self.variables.items()
if k not in self.xindexes
}
variables, indexes = merge_coordinates_without_align(
[self, other], prioritized
Expand Down Expand Up @@ -265,7 +271,7 @@ def to_dataset(self) -> "Dataset":
return self._data._copy_listed(names)

def _update_coords(
self, coords: Dict[Hashable, Variable], indexes: Mapping[Hashable, pd.Index]
self, coords: Dict[Hashable, Variable], indexes: Mapping[Hashable, Index]
) -> None:
from .dataset import calculate_dimensions

Expand All @@ -285,7 +291,7 @@ def _update_coords(

# TODO(shoyer): once ._indexes is always populated by a dict, modify
# it to update inplace instead.
original_indexes = dict(self._data.indexes)
original_indexes = dict(self._data.xindexes)
original_indexes.update(indexes)
self._data._indexes = original_indexes

Expand Down Expand Up @@ -328,7 +334,7 @@ def __getitem__(self, key: Hashable) -> "DataArray":
return self._data._getitem_coord(key)

def _update_coords(
self, coords: Dict[Hashable, Variable], indexes: Mapping[Hashable, pd.Index]
self, coords: Dict[Hashable, Variable], indexes: Mapping[Hashable, Index]
) -> None:
from .dataset import calculate_dimensions

Expand All @@ -343,7 +349,7 @@ def _update_coords(

# TODO(shoyer): once ._indexes is always populated by a dict, modify
# it to update inplace instead.
original_indexes = dict(self._data.indexes)
original_indexes = dict(self._data.xindexes)
original_indexes.update(indexes)
self._data._indexes = original_indexes

Expand Down
35 changes: 29 additions & 6 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
)
from .dataset import Dataset, split_indexes
from .formatting import format_item
from .indexes import Indexes, default_indexes, propagate_indexes
from .indexes import Index, Indexes, PandasIndex, default_indexes, propagate_indexes
from .indexing import is_fancy_indexer
from .merge import PANDAS_TYPES, MergeError, _extract_indexes_from_coords
from .options import OPTIONS, _get_keep_attrs
Expand Down Expand Up @@ -345,7 +345,7 @@ class DataArray(AbstractArray, DataWithCoords, DataArrayArithmetic):
_cache: Dict[str, Any]
_coords: Dict[Any, Variable]
_close: Optional[Callable[[], None]]
_indexes: Optional[Dict[Hashable, pd.Index]]
_indexes: Optional[Dict[Hashable, Index]]
_name: Optional[Hashable]
_variable: Variable

Expand Down Expand Up @@ -478,7 +478,9 @@ def _overwrite_indexes(self, indexes: Mapping[Hashable, Any]) -> "DataArray":
# switch from dimension to level names, if necessary
dim_names: Dict[Any, str] = {}
for dim, idx in indexes.items():
if not isinstance(idx, pd.MultiIndex) and idx.name != dim:
# TODO: benbovy - flexible indexes: update when MultiIndex has its own class
pd_idx = idx.array
if not isinstance(pd_idx, pd.MultiIndex) and pd_idx.name != dim:
dim_names[dim] = idx.name
if dim_names:
obj = obj.rename(dim_names)
Expand Down Expand Up @@ -772,7 +774,21 @@ def encoding(self, value: Mapping[Hashable, Any]) -> None:

@property
def indexes(self) -> Indexes:
"""Mapping of pandas.Index objects used for label based indexing"""
"""Mapping of pandas.Index objects used for label based indexing.
Raises an error if this DataArray has indexes that cannot be coerced
to pandas.Index objects.
See Also
--------
DataArray.xindexes
"""
return Indexes({k: idx.to_pandas_index() for k, idx in self.xindexes.items()})

@property
def xindexes(self) -> Indexes:
"""Mapping of xarray Index objects used for label based indexing."""
if self._indexes is None:
self._indexes = default_indexes(self._coords, self.dims)
return Indexes(self._indexes)
Expand Down Expand Up @@ -990,7 +1006,12 @@ def copy(self, deep: bool = True, data: Any = None) -> "DataArray":
if self._indexes is None:
indexes = self._indexes
else:
indexes = {k: v.copy(deep=deep) for k, v in self._indexes.items()}
# TODO: benbovy: flexible indexes: support all xarray indexes (not just pandas.Index)
# xarray Index needs a copy method.
indexes = {
k: PandasIndex(v.to_pandas_index().copy(deep=deep))
for k, v in self._indexes.items()
}
return self._replace(variable, coords, indexes=indexes)

def __copy__(self) -> "DataArray":
Expand Down Expand Up @@ -2169,7 +2190,9 @@ def to_unstacked_dataset(self, dim, level=0):
Dataset.to_stacked_array
"""

idx = self.indexes[dim]
# TODO: benbovy - flexible indexes: update when MultiIndex has its own
# class inheriting from xarray.Index
idx = self.xindexes[dim].to_pandas_index()
if not isinstance(idx, pd.MultiIndex):
raise ValueError(f"'{dim}' is not a stacked coordinate")

Expand Down
Loading

0 comments on commit 6e14df6

Please sign in to comment.