CLN/COMPAT: IntervalIndex

pandas-dev · Feb 15, 2017 · 067375c · 067375c
1 parent b67b098
commit 067375c
Show file tree

Hide file tree

Showing 43 changed files with 1,763 additions and 2,283 deletions.
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -11,6 +11,7 @@ Highlights include:
 
 - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)
 - The ``.ix`` indexer has been deprecated, see :ref:`here <whatsnew_0200.api_breaking.deprecate_ix>`
+- Addition of an ``IntervalIndex`` and ``Interval`` scalar type, see :ref:`here <whatsnew_0200.enhancements.intervalindex>`
 - Switched the test framework to `pytest`_ (:issue:`13097`)
 
 .. _pytest: http://doc.pytest.org/en/latest/
@@ -120,6 +121,36 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
 - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
 - Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)
 
+.. _whatsnew_0200.enhancements.intervalindex:
+
+IntervalIndex
+^^^^^^^^^^^^^
+
+pandas has gained an ``IntervalIndex`` with its own dtype, ``interval``, as well as the ``Interval`` scalar type. These allow first-class support for interval
+notation, specifically as a return type for ``pd.cut`` and ``pd.qcut``. (:issue:`7640`, :issue:`8625`)
+
+**Previous behavior**:
+
+.. code-block:: ipython
+
+   In [2]: pd.cut(range(3), 2)
+   Out[2]:
+   [(-0.002, 1], (-0.002, 1], (1, 2]]
+   Categories (2, object): [(-0.002, 1] < (1, 2]]
+
+   # the returned categories are strings, representing Intervals
+   In [3]: pd.cut(range(3), 2).categories
+   Out[3]: Index(['(-0.002, 1]', '(1, 2]'], dtype='object')
+
+**New behavior**:
+
+.. ipython:: python
+
+   c = pd.cut(range(3), 2)
+   c
+   c.categories
+   pd.api.types.is_interval_dtype(c.categories)
+
 .. _whatsnew_0200.enhancements.other:
 
 Other enhancements

diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -15,6 +15,7 @@
                                  is_int64_dtype,
                                  is_categorical_dtype,
                                  is_extension_type,
+                                 is_interval_dtype,
                                  is_datetimetz,
                                  is_period_dtype,
                                  is_period_arraylike,
@@ -401,31 +402,40 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
     if bins is not None:
         try:
             from pandas.tools.tile import cut
-            values = Series(values).values
-            cat, bins = cut(values, bins, retbins=True)
+            values = Series(values)
+            ii = cut(values, bins, include_lowest=True)
         except TypeError:
             raise TypeError("bins argument only works with numeric data.")
 
-    if is_extension_type(values) and not is_datetimetz(values):
-        # handle Categorical and sparse,
-        # datetime tz can be handeled in ndarray path
-        result = Series(values).values.value_counts(dropna=dropna)
-        result.name = name
-        counts = result.values
+        # count, remove nulls (from the index), and convert the bins to an interval index
+        result = ii.value_counts(dropna=dropna)
+        result = result[result.index.notnull()]
+        result.index = result.index.astype('interval')
+        result = result.sort_index()
+
+        # if we are dropna and we have NO values
+        if dropna and (result.values == 0).all():
+            result = result.iloc[0:0]
+
+        # normalizing is by len of all (regardless of dropna)
+        counts = np.array([len(ii)])
+
     else:
-        # ndarray path. pass original to handle DatetimeTzBlock
-        keys, counts = _value_counts_arraylike(values, dropna=dropna)
 
-        from pandas import Index, Series
-        if not isinstance(keys, Index):
-            keys = Index(keys)
-        result = Series(counts, index=keys, name=name)
+        if is_extension_type(values) and not is_datetimetz(values):
+            # handle Categorical and sparse,
+            # datetime tz can be handled in ndarray path
+            result = Series(values).values.value_counts(dropna=dropna)
+            result.name = name
+            counts = result.values
+        else:
+            # ndarray path. pass original to handle DatetimeTzBlock
+            keys, counts = _value_counts_arraylike(values, dropna=dropna)
 
-    if bins is not None:
-        # TODO: This next line should be more efficient
-        result = result.reindex(np.arange(len(cat.categories)),
-                                fill_value=0)
-        result.index = bins[:-1]
+            from pandas import Index, Series
+            if not isinstance(keys, Index):
+                keys = Index(keys)
+            result = Series(counts, index=keys, name=name)
 
     if sort:
         result = result.sort_values(ascending=ascending)
@@ -1244,6 +1254,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
                            allow_fill=allow_fill)
     elif is_datetimetz(arr):
         return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
+    elif is_interval_dtype(arr):
+        return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
 
     if indexer is None:
         indexer = np.arange(arr.shape[axis], dtype=np.int64)

diff --git a/pandas/core/api.py b/pandas/core/api.py
@@ -11,8 +11,8 @@
 from pandas.formats.format import set_eng_float_format
 from pandas.core.index import (Index, CategoricalIndex, Int64Index,
                                UInt64Index, RangeIndex, Float64Index,
-                               MultiIndex)
-from pandas.core.interval import Interval, IntervalIndex
+                               MultiIndex, IntervalIndex)
+from pandas.indexes.interval import Interval, interval_range
 
 from pandas.core.series import Series, TimeSeries
 from pandas.core.frame import DataFrame

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -30,6 +30,7 @@
                                _possibly_downcast_to_dtype,
                                _invalidate_string_dtypes,
                                _coerce_to_dtypes,
+                               _coerce_extension_to_embed,
                                _maybe_upcast_putmask,
                                _find_common_type)
 from pandas.types.common import (is_categorical_dtype,
@@ -2648,7 +2649,7 @@ def reindexer(value):
 
         # return internal types directly
         if is_extension_type(value):
-            return value
+            return _coerce_extension_to_embed(value)
 
         # broadcast across multiple columns if necessary
         if broadcast and key in self.columns and value.ndim == 1:

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -17,6 +17,7 @@
 from pandas.types.common import (is_numeric_dtype,
                                  is_timedelta64_dtype, is_datetime64_dtype,
                                  is_categorical_dtype,
+                                 is_interval_dtype,
                                  is_datetimelike,
                                  is_datetime64_any_dtype,
                                  is_bool, is_integer_dtype,
@@ -39,10 +40,11 @@
 
 from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
                               DataError, SpecificationError)
+from pandas.core.index import (Index, MultiIndex,
+                               CategoricalIndex, _ensure_index)
 from pandas.core.categorical import Categorical
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame
-from pandas.core.interval import IntervalIndex
 from pandas.core.internals import BlockManager, make_block
 from pandas.core.series import Series
 from pandas.core.panel import Panel
@@ -2592,7 +2594,7 @@ def _convert_grouper(axis, grouper):
             return grouper.reindex(axis)._values
     elif isinstance(grouper, (list, Series, Index, np.ndarray)):
         if len(grouper) != len(axis):
-            raise AssertionError('Grouper and axis must be same length')
+            raise ValueError('Grouper and axis must be same length')
         return grouper
     else:
         return grouper
@@ -3084,36 +3086,37 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
 
         if bins is None:
             lab, lev = algos.factorize(val, sort=True)
+            llab = lambda lab, inc: lab[inc]
         else:
-            raise NotImplementedError('this is broken')
-            lab, bins = cut(val, bins, retbins=True)
-            # bins[:-1] for backward compat;
-            # o.w. cat.categories could be better
-            # cat = Categorical(cat)
-            # lab, lev, dropna = cat.codes, bins[:-1], False
-
-        if (lab.dtype == object
-                and lib.is_interval_array_fixed_closed(lab[notnull(lab)])):
-            lab_index = Index(lab)
-            assert isinstance(lab, IntervalIndex)
-            sorter = np.lexsort((lab_index.left, lab_index.right, ids))
+
+            # lab is a Categorical with categories an IntervalIndex
+            lab = cut(Series(val), bins, include_lowest=True)
+            lev = lab.cat.categories
+            lab = lev.take(lab.cat.codes)
+            llab = lambda lab, inc: lab[inc]._multiindex.labels[-1]
+
+        if is_interval_dtype(lab):
+            # TODO: should we do this inside II?
+            sorter = np.lexsort((lab.left, lab.right, ids))
         else:
             sorter = np.lexsort((lab, ids))
+
         ids, lab = ids[sorter], lab[sorter]
 
         # group boundaries are where group ids change
         idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
 
         # new values are where sorted labels change
-        inc = np.r_[True, lab[1:] != lab[:-1]]
+        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
+        inc = np.r_[True, lchanges]
         inc[idx] = True  # group boundaries are also new values
         out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts
 
         # num. of times each group should be repeated
         rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
 
         # multi-index components
-        labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
+        labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)]
         levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
         names = self.grouper.names + [self.name]
 
@@ -3139,13 +3142,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
                 acc = rep(d)
             out /= acc
 
-        if sort:  # and bins is None:
+        if sort and bins is None:
             cat = ids[inc][mask] if dropna else ids[inc]
             sorter = np.lexsort((out if ascending else -out, cat))
             out, labels[-1] = out[sorter], labels[-1][sorter]
 
-        # if bins is None:
-        if True:
+        if bins is None:
             mi = MultiIndex(levels=levels, labels=labels, names=names,
                             verify_integrity=False)
 

diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -39,6 +39,8 @@ PyDateTime_IMPORT
 cdef extern from "Python.h":
     int PySlice_Check(object)
 
+cdef size_t _INIT_VEC_CAP = 128
+
 include "hashtable_class_helper.pxi"
 include "hashtable_func_helper.pxi"
 

diff --git a/pandas/indexes/api.py b/pandas/indexes/api.py
@@ -3,6 +3,7 @@
                                  InvalidIndexError)
 from pandas.indexes.category import CategoricalIndex  # noqa
 from pandas.indexes.multi import MultiIndex  # noqa
+from pandas.indexes.interval import IntervalIndex  # noqa
 from pandas.indexes.numeric import (NumericIndex, Float64Index,  # noqa
                                     Int64Index, UInt64Index)
 from pandas.indexes.range import RangeIndex  # noqa
@@ -13,7 +14,7 @@
 # TODO: there are many places that rely on these private methods existing in
 # pandas.core.index
 __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
-           'CategoricalIndex', 'RangeIndex', 'UInt64Index',
+           'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index',
            'InvalidIndexError',
            '_new_Index',
            '_ensure_index', '_get_na_value', '_get_combined_index',

diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py
@@ -26,6 +26,7 @@
                                  is_dtype_equal,
                                  is_object_dtype,
                                  is_categorical_dtype,
+                                 is_interval_dtype,
                                  is_bool_dtype,
                                  is_signed_integer_dtype,
                                  is_unsigned_integer_dtype,
@@ -164,6 +165,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
             from .category import CategoricalIndex
             return CategoricalIndex(data, copy=copy, name=name, **kwargs)
 
+        # interval
+        if is_interval_dtype(data):
+            from .interval import IntervalIndex
+            return IntervalIndex.from_intervals(data, name=name,
+                                                copy=copy)
+
         # index-like
         elif isinstance(data, (np.ndarray, Index, ABCSeries)):
 
@@ -268,6 +275,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                 elif inferred in ['floating', 'mixed-integer-float']:
                     from .numeric import Float64Index
                     return Float64Index(subarr, copy=copy, name=name)
+                elif inferred == 'interval':
+                    from .interval import IntervalIndex
+                    return IntervalIndex.from_intervals(subarr, name=name,
+                                                        copy=copy)
                 elif inferred == 'boolean':
                     # don't support boolean explicity ATM
                     pass
@@ -1180,6 +1191,9 @@ def is_object(self):
     def is_categorical(self):
         return self.inferred_type in ['categorical']
 
+    def is_interval(self):
+        return self.inferred_type in ['interval']
+
     def is_mixed(self):
         return self.inferred_type in ['mixed']
 
@@ -3235,6 +3249,13 @@ def _searchsorted_monotonic(self, label, side='left'):
 
         raise ValueError('index must be monotonic increasing or decreasing')
 
+    def _get_loc_only_exact_matches(self, key):
+        """
+        This is overridden on subclasses (namely, IntervalIndex) to control
+        get_slice_bound.
+        """
+        return self.get_loc(key)
+
     def get_slice_bound(self, label, side, kind):
         """
         Calculate slice bound that corresponds to given label.
@@ -3264,7 +3285,7 @@ def get_slice_bound(self, label, side, kind):
 
         # we need to look up the label
         try:
-            slc = self.get_loc(label)
+            slc = self._get_loc_only_exact_matches(label)
         except KeyError as err:
             try:
                 return self._searchsorted_monotonic(label, side)
@@ -3504,7 +3525,9 @@ def _evaluate_compare(self, other):
                 if needs_i8_conversion(self) and needs_i8_conversion(other):
                     return self._evaluate_compare(other, op)
 
-                if is_object_dtype(self) and self.nlevels == 1:
+                if (is_object_dtype(self) and
+                        self.nlevels == 1):
+
                     # don't pass MultiIndex
                     with np.errstate(all='ignore'):
                         result = _comp_method_OBJECT_ARRAY(
@@ -3816,6 +3839,9 @@ def _ensure_index(index_like, copy=False):
 
 
 def _get_na_value(dtype):
+    if is_datetime64_any_dtype(dtype) or is_timedelta64_dtype(dtype):
+        return tslib.NaT
+
     return {np.datetime64: tslib.NaT,
             np.timedelta64: tslib.NaT}.get(dtype, np.nan)
 

diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py
@@ -7,6 +7,7 @@
 from pandas.types.common import (is_categorical_dtype,
                                  _ensure_platform_int,
                                  is_list_like,
+                                 is_interval_dtype,
                                  is_scalar)
 from pandas.types.missing import array_equivalent
 
@@ -266,6 +267,13 @@ def __array__(self, dtype=None):
         """ the array interface, return my values """
         return np.array(self._data, dtype=dtype)
 
+    @Appender(_index_shared_docs['astype'])
+    def astype(self, dtype, copy=True):
+        if is_interval_dtype(dtype):
+            from pandas import IntervalIndex
+            return IntervalIndex.from_intervals(np.array(self))
+        return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
+
     @cache_readonly
     def _isnan(self):
         """ return if each value is nan"""
@@ -508,6 +516,8 @@ def take(self, indices, axis=0, allow_fill=True,
                                            na_value=-1)
         return self._create_from_codes(taken)
 
+    take_nd = take
+
     def map(self, mapper):
         """Apply mapper function to its categories (not codes).