Flexible indexes: add Index base class and xindexes properties (#5102)

* add IndexAdapter class + move PandasIndexAdapter * wip: xarray_obj.indexes -> IndexAdapter objects * fix more broken tests * fix merge glitch * fix group bins tests * add xindexes property Use it internally instead of indexes * rename IndexAdapter -> Index * rename _to_index_adpater (typo) -> _to_xindex * add Index.to_pandas_index() method Also improve xarray_obj.indexes property implementation * rename PandasIndexAdpater -> PandasIndex * update index type in tests * ensure .indexes only returns pd.Index objects * PandasIndex: normalize other index in cmp funcs * fix merge lint errors * fix PandasIndex union/intersection * [skip-ci] add TODO comment about index sizes * address more PR comments * [skip-ci] update what's new * fix coord_names normalization * move what's new entry to unreleased section
pydata · May 11, 2021 · 6e14df6 · 6e14df6
1 parent 234b40a
commit 6e14df6
Show file tree

Hide file tree

Showing 22 changed files with 534 additions and 311 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -42,6 +42,10 @@ Documentation
 Internal Changes
 ~~~~~~~~~~~~~~~~
 
+- Explicit indexes refactor: add an ``xarray.Index`` base class and
+  ``Dataset.xindexes`` / ``DataArray.xindexes`` properties. Also rename
+  ``PandasIndexAdapter`` to ``PandasIndex``, which now inherits from
+  ``xarray.Index`` (:pull:`5102`). By `Benoit Bovy <https://github.com/benbovy>`_.
 
 .. _whats-new.0.18.0:
 
@@ -268,7 +272,6 @@ Internal Changes
   (:pull:`5188`), (:pull:`5191`).
   By `Maximilian Roos <https://github.com/max-sixty>`_.
 
-
 .. _whats-new.0.17.0:
 
 v0.17.0 (24 Feb 2021)

diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py
@@ -17,9 +17,10 @@
 import numpy as np
 import pandas as pd
 
-from . import dtypes, utils
+from . import dtypes
+from .indexes import Index, PandasIndex
 from .indexing import get_indexer_nd
-from .utils import is_dict_like, is_full_slice, maybe_coerce_to_str
+from .utils import is_dict_like, is_full_slice, maybe_coerce_to_str, safe_cast_to_index
 from .variable import IndexVariable, Variable
 
 if TYPE_CHECKING:
@@ -30,11 +31,11 @@
     DataAlignable = TypeVar("DataAlignable", bound=DataWithCoords)
 
 
-def _get_joiner(join):
+def _get_joiner(join, index_cls):
     if join == "outer":
-        return functools.partial(functools.reduce, pd.Index.union)
+        return functools.partial(functools.reduce, index_cls.union)
     elif join == "inner":
-        return functools.partial(functools.reduce, pd.Index.intersection)
+        return functools.partial(functools.reduce, index_cls.intersection)
     elif join == "left":
         return operator.itemgetter(0)
     elif join == "right":
@@ -63,7 +64,7 @@ def _override_indexes(objects, all_indexes, exclude):
     objects = list(objects)
     for idx, obj in enumerate(objects[1:]):
         new_indexes = {}
-        for dim in obj.indexes:
+        for dim in obj.xindexes:
             if dim not in exclude:
                 new_indexes[dim] = all_indexes[dim][0]
         objects[idx + 1] = obj._overwrite_indexes(new_indexes)
@@ -284,7 +285,7 @@ def align(
             if dim not in exclude:
                 all_coords[dim].append(obj.coords[dim])
                 try:
-                    index = obj.indexes[dim]
+                    index = obj.xindexes[dim]
                 except KeyError:
                     unlabeled_dim_sizes[dim].add(obj.sizes[dim])
                 else:
@@ -298,16 +299,19 @@ def align(
     # - It ensures it's possible to do operations that don't require alignment
     #   on indexes with duplicate values (which cannot be reindexed with
     #   pandas). This is useful, e.g., for overwriting such duplicate indexes.
-    joiner = _get_joiner(join)
     joined_indexes = {}
     for dim, matching_indexes in all_indexes.items():
         if dim in indexes:
-            index = utils.safe_cast_to_index(indexes[dim])
+            # TODO: benbovy - flexible indexes. maybe move this logic in util func
+            if isinstance(indexes[dim], Index):
+                index = indexes[dim]
+            else:
+                index = PandasIndex(safe_cast_to_index(indexes[dim]))
             if (
                 any(not index.equals(other) for other in matching_indexes)
                 or dim in unlabeled_dim_sizes
             ):
-                joined_indexes[dim] = indexes[dim]
+                joined_indexes[dim] = index
         else:
             if (
                 any(
@@ -318,6 +322,7 @@ def align(
             ):
                 if join == "exact":
                     raise ValueError(f"indexes along dimension {dim!r} are not equal")
+                joiner = _get_joiner(join, type(matching_indexes[0]))
                 index = joiner(matching_indexes)
                 # make sure str coords are not cast to object
                 index = maybe_coerce_to_str(index, all_coords[dim])
@@ -327,6 +332,9 @@ def align(
 
         if dim in unlabeled_dim_sizes:
             unlabeled_sizes = unlabeled_dim_sizes[dim]
+            # TODO: benbovy - flexible indexes: expose a size property for xarray.Index?
+            # Some indexes may not have a defined size (e.g., built from multiple coords of
+            # different sizes)
             labeled_size = index.size
             if len(unlabeled_sizes | {labeled_size}) > 1:
                 raise ValueError(
@@ -469,7 +477,7 @@ def reindex_like_indexers(
     ValueError
         If any dimensions without labels have different sizes.
     """
-    indexers = {k: v for k, v in other.indexes.items() if k in target.dims}
+    indexers = {k: v for k, v in other.xindexes.items() if k in target.dims}
 
     for dim in other.dims:
         if dim not in indexers and dim in target.dims:
@@ -487,14 +495,14 @@ def reindex_like_indexers(
 def reindex_variables(
     variables: Mapping[Any, Variable],
     sizes: Mapping[Any, int],
-    indexes: Mapping[Any, pd.Index],
+    indexes: Mapping[Any, Index],
     indexers: Mapping,
     method: Optional[str] = None,
     tolerance: Any = None,
     copy: bool = True,
     fill_value: Optional[Any] = dtypes.NA,
     sparse: bool = False,
-) -> Tuple[Dict[Hashable, Variable], Dict[Hashable, pd.Index]]:
+) -> Tuple[Dict[Hashable, Variable], Dict[Hashable, Index]]:
     """Conform a dictionary of aligned variables onto a new set of variables,
     filling in missing values with NaN.
 
@@ -559,10 +567,11 @@ def reindex_variables(
                 "from that to be indexed along {:s}".format(str(indexer.dims), dim)
             )
 
-        target = new_indexes[dim] = utils.safe_cast_to_index(indexers[dim])
+        target = new_indexes[dim] = PandasIndex(safe_cast_to_index(indexers[dim]))
 
         if dim in indexes:
-            index = indexes[dim]
+            # TODO (benbovy - flexible indexes): support other indexes than pd.Index?
+            index = indexes[dim].to_pandas_index()
 
             if not index.is_unique:
                 raise ValueError(

diff --git a/xarray/core/combine.py b/xarray/core/combine.py
@@ -69,13 +69,17 @@ def _infer_concat_order_from_coords(datasets):
         if dim in ds0:
 
             # Need to read coordinate values to do ordering
-            indexes = [ds.indexes.get(dim) for ds in datasets]
+            indexes = [ds.xindexes.get(dim) for ds in datasets]
             if any(index is None for index in indexes):
                 raise ValueError(
                     "Every dimension needs a coordinate for "
                     "inferring concatenation order"
                 )
 
+            # TODO (benbovy, flexible indexes): all indexes should be Pandas.Index
+            # get pd.Index objects from Index objects
+            indexes = [index.array for index in indexes]
+
             # If dimension coordinate values are same on every dataset then
             # should be leaving this dimension alone (it's just a "bystander")
             if not all(index.equals(indexes[0]) for index in indexes[1:]):
@@ -801,9 +805,13 @@ def combine_by_coords(
         )
 
         # Check the overall coordinates are monotonically increasing
+        # TODO (benbovy - flexible indexes): only with pandas.Index?
         for dim in concat_dims:
-            indexes = concatenated.indexes.get(dim)
-            if not (indexes.is_monotonic_increasing or indexes.is_monotonic_decreasing):
+            indexes = concatenated.xindexes.get(dim)
+            if not (
+                indexes.array.is_monotonic_increasing
+                or indexes.array.is_monotonic_decreasing
+            ):
                 raise ValueError(
                     "Resulting object does not have monotonic"
                     " global indexes along dimension {}".format(dim)

diff --git a/xarray/core/common.py b/xarray/core/common.py
@@ -406,7 +406,7 @@ def get_index(self, key: Hashable) -> pd.Index:
             raise KeyError(key)
 
         try:
-            return self.indexes[key]
+            return self.xindexes[key].to_pandas_index()
         except KeyError:
             return pd.Index(range(self.sizes[key]), name=key)
 
@@ -1162,7 +1162,8 @@ def resample(
                 category=FutureWarning,
             )
 
-            if isinstance(self.indexes[dim_name], CFTimeIndex):
+            # TODO (benbovy - flexible indexes): update when CFTimeIndex is an xarray Index subclass
+            if isinstance(self.xindexes[dim_name].to_pandas_index(), CFTimeIndex):
                 from .resample_cftime import CFTimeGrouper
 
                 grouper = CFTimeGrouper(freq, closed, label, base, loffset)

diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py
@@ -17,7 +17,7 @@
 import pandas as pd
 
 from . import formatting, indexing
-from .indexes import Indexes
+from .indexes import Index, Indexes
 from .merge import merge_coordinates_without_align, merge_coords
 from .utils import Frozen, ReprObject, either_dict_or_kwargs
 from .variable import Variable
@@ -52,6 +52,10 @@ def dims(self) -> Union[Mapping[Hashable, int], Tuple[Hashable, ...]]:
     def indexes(self) -> Indexes:
         return self._data.indexes  # type: ignore[attr-defined]
 
+    @property
+    def xindexes(self) -> Indexes:
+        return self._data.xindexes  # type: ignore[attr-defined]
+
     @property
     def variables(self):
         raise NotImplementedError()
@@ -157,15 +161,15 @@ def to_index(self, ordered_dims: Sequence[Hashable] = None) -> pd.Index:
     def update(self, other: Mapping[Hashable, Any]) -> None:
         other_vars = getattr(other, "variables", other)
         coords, indexes = merge_coords(
-            [self.variables, other_vars], priority_arg=1, indexes=self.indexes
+            [self.variables, other_vars], priority_arg=1, indexes=self.xindexes
         )
         self._update_coords(coords, indexes)
 
     def _merge_raw(self, other, reflexive):
         """For use with binary arithmetic."""
         if other is None:
             variables = dict(self.variables)
-            indexes = dict(self.indexes)
+            indexes = dict(self.xindexes)
         else:
             coord_list = [self, other] if not reflexive else [other, self]
             variables, indexes = merge_coordinates_without_align(coord_list)
@@ -180,7 +184,9 @@ def _merge_inplace(self, other):
             # don't include indexes in prioritized, because we didn't align
             # first and we want indexes to be checked
             prioritized = {
-                k: (v, None) for k, v in self.variables.items() if k not in self.indexes
+                k: (v, None)
+                for k, v in self.variables.items()
+                if k not in self.xindexes
             }
             variables, indexes = merge_coordinates_without_align(
                 [self, other], prioritized
@@ -265,7 +271,7 @@ def to_dataset(self) -> "Dataset":
         return self._data._copy_listed(names)
 
     def _update_coords(
-        self, coords: Dict[Hashable, Variable], indexes: Mapping[Hashable, pd.Index]
+        self, coords: Dict[Hashable, Variable], indexes: Mapping[Hashable, Index]
     ) -> None:
         from .dataset import calculate_dimensions
 
@@ -285,7 +291,7 @@ def _update_coords(
 
         # TODO(shoyer): once ._indexes is always populated by a dict, modify
         # it to update inplace instead.
-        original_indexes = dict(self._data.indexes)
+        original_indexes = dict(self._data.xindexes)
         original_indexes.update(indexes)
         self._data._indexes = original_indexes
 
@@ -328,7 +334,7 @@ def __getitem__(self, key: Hashable) -> "DataArray":
         return self._data._getitem_coord(key)
 
     def _update_coords(
-        self, coords: Dict[Hashable, Variable], indexes: Mapping[Hashable, pd.Index]
+        self, coords: Dict[Hashable, Variable], indexes: Mapping[Hashable, Index]
     ) -> None:
         from .dataset import calculate_dimensions
 
@@ -343,7 +349,7 @@ def _update_coords(
 
         # TODO(shoyer): once ._indexes is always populated by a dict, modify
         # it to update inplace instead.
-        original_indexes = dict(self._data.indexes)
+        original_indexes = dict(self._data.xindexes)
         original_indexes.update(indexes)
         self._data._indexes = original_indexes
 

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -51,7 +51,7 @@
 )
 from .dataset import Dataset, split_indexes
 from .formatting import format_item
-from .indexes import Indexes, default_indexes, propagate_indexes
+from .indexes import Index, Indexes, PandasIndex, default_indexes, propagate_indexes
 from .indexing import is_fancy_indexer
 from .merge import PANDAS_TYPES, MergeError, _extract_indexes_from_coords
 from .options import OPTIONS, _get_keep_attrs
@@ -345,7 +345,7 @@ class DataArray(AbstractArray, DataWithCoords, DataArrayArithmetic):
     _cache: Dict[str, Any]
     _coords: Dict[Any, Variable]
     _close: Optional[Callable[[], None]]
-    _indexes: Optional[Dict[Hashable, pd.Index]]
+    _indexes: Optional[Dict[Hashable, Index]]
     _name: Optional[Hashable]
     _variable: Variable
 
@@ -478,7 +478,9 @@ def _overwrite_indexes(self, indexes: Mapping[Hashable, Any]) -> "DataArray":
         # switch from dimension to level names, if necessary
         dim_names: Dict[Any, str] = {}
         for dim, idx in indexes.items():
-            if not isinstance(idx, pd.MultiIndex) and idx.name != dim:
+            # TODO: benbovy - flexible indexes: update when MultiIndex has its own class
+            pd_idx = idx.array
+            if not isinstance(pd_idx, pd.MultiIndex) and pd_idx.name != dim:
                 dim_names[dim] = idx.name
         if dim_names:
             obj = obj.rename(dim_names)
@@ -772,7 +774,21 @@ def encoding(self, value: Mapping[Hashable, Any]) -> None:
 
     @property
     def indexes(self) -> Indexes:
-        """Mapping of pandas.Index objects used for label based indexing"""
+        """Mapping of pandas.Index objects used for label based indexing.
+
+        Raises an error if this Dataset has indexes that cannot be coerced
+        to pandas.Index objects.
+
+        See Also
+        --------
+        DataArray.xindexes
+
+        """
+        return Indexes({k: idx.to_pandas_index() for k, idx in self.xindexes.items()})
+
+    @property
+    def xindexes(self) -> Indexes:
+        """Mapping of xarray Index objects used for label based indexing."""
         if self._indexes is None:
             self._indexes = default_indexes(self._coords, self.dims)
         return Indexes(self._indexes)
@@ -990,7 +1006,12 @@ def copy(self, deep: bool = True, data: Any = None) -> "DataArray":
         if self._indexes is None:
             indexes = self._indexes
         else:
-            indexes = {k: v.copy(deep=deep) for k, v in self._indexes.items()}
+            # TODO: benbovy: flexible indexes: support all xarray indexes (not just pandas.Index)
+            # xarray Index needs a copy method.
+            indexes = {
+                k: PandasIndex(v.to_pandas_index().copy(deep=deep))
+                for k, v in self._indexes.items()
+            }
         return self._replace(variable, coords, indexes=indexes)
 
     def __copy__(self) -> "DataArray":
@@ -2169,7 +2190,9 @@ def to_unstacked_dataset(self, dim, level=0):
         Dataset.to_stacked_array
         """
 
-        idx = self.indexes[dim]
+        # TODO: benbovy - flexible indexes: update when MultIndex has its own
+        # class inheriting from xarray.Index
+        idx = self.xindexes[dim].to_pandas_index()
         if not isinstance(idx, pd.MultiIndex):
             raise ValueError(f"'{dim}' is not a stacked coordinate")