Skip to content

Commit

Permalink
CLN/COMPAT: IntervalIndex
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Feb 15, 2017
1 parent b67b098 commit 067375c
Show file tree
Hide file tree
Showing 43 changed files with 1,763 additions and 2,283 deletions.
31 changes: 31 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Highlights include:

- Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`)
- The ``.ix`` indexer has been deprecated, see :ref:`here <whatsnew_0200.api_breaking.deprecate_ix>`
- Addition of an ``IntervalIndex`` and ``Interval`` scalar type, see :ref:`here <whatsnew_0200.enhancements.intervalindex>`
- Switched the test framework to `pytest`_ (:issue:`13097`)

.. _pytest: http://doc.pytest.org/en/latest/
Expand Down Expand Up @@ -120,6 +121,36 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937
- Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`)
- Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`)

.. _whatsnew_0200.enhancements.intervalindex:

IntervalIndex
^^^^^^^^^^^^^

pandas has gained an ``IntervalIndex`` with its own dtype, ``interval``, as well as the ``Interval`` scalar type. These allow first-class support for interval
notation, specifically as the type of the categories returned by ``pd.cut`` and ``pd.qcut``. (:issue:`7640`, :issue:`8625`)

**Previous behavior**:

.. code-block:: ipython

In [2]: pd.cut(range(3), 2)
Out[2]:
[(-0.002, 1], (-0.002, 1], (1, 2]]
Categories (2, object): [(-0.002, 1] < (1, 2]]

# the returned categories are strings, representing Intervals
In [3]: pd.cut(range(3), 2).categories
Out[3]: Index(['(-0.002, 1]', '(1, 2]'], dtype='object')

**New behavior**:

.. ipython:: python

c = pd.cut(range(3), 2)
c
c.categories
pd.api.types.is_interval_dtype(c.categories)

.. _whatsnew_0200.enhancements.other:

Other enhancements
Expand Down
50 changes: 31 additions & 19 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
is_int64_dtype,
is_categorical_dtype,
is_extension_type,
is_interval_dtype,
is_datetimetz,
is_period_dtype,
is_period_arraylike,
Expand Down Expand Up @@ -401,31 +402,40 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
if bins is not None:
try:
from pandas.tools.tile import cut
values = Series(values).values
cat, bins = cut(values, bins, retbins=True)
values = Series(values)
ii = cut(values, bins, include_lowest=True)
except TypeError:
raise TypeError("bins argument only works with numeric data.")

if is_extension_type(values) and not is_datetimetz(values):
# handle Categorical and sparse,
# datetime tz can be handled in ndarray path
result = Series(values).values.value_counts(dropna=dropna)
result.name = name
counts = result.values
# count, drop nulls from the index, then convert the index to intervals and sort by bin
result = ii.value_counts(dropna=dropna)
result = result[result.index.notnull()]
result.index = result.index.astype('interval')
result = result.sort_index()

# if we are dropna and we have NO values
if dropna and (result.values == 0).all():
result = result.iloc[0:0]

# normalizing is by len of all (regardless of dropna)
counts = np.array([len(ii)])

else:
# ndarray path. pass original to handle DatetimeTzBlock
keys, counts = _value_counts_arraylike(values, dropna=dropna)

from pandas import Index, Series
if not isinstance(keys, Index):
keys = Index(keys)
result = Series(counts, index=keys, name=name)
if is_extension_type(values) and not is_datetimetz(values):
# handle Categorical and sparse,
# datetime tz can be handled in ndarray path
result = Series(values).values.value_counts(dropna=dropna)
result.name = name
counts = result.values
else:
# ndarray path. pass original to handle DatetimeTzBlock
keys, counts = _value_counts_arraylike(values, dropna=dropna)

if bins is not None:
# TODO: This next line should be more efficient
result = result.reindex(np.arange(len(cat.categories)),
fill_value=0)
result.index = bins[:-1]
from pandas import Index, Series
if not isinstance(keys, Index):
keys = Index(keys)
result = Series(counts, index=keys, name=name)

if sort:
result = result.sort_values(ascending=ascending)
Expand Down Expand Up @@ -1244,6 +1254,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None,
allow_fill=allow_fill)
elif is_datetimetz(arr):
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)
elif is_interval_dtype(arr):
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill)

if indexer is None:
indexer = np.arange(arr.shape[axis], dtype=np.int64)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
from pandas.formats.format import set_eng_float_format
from pandas.core.index import (Index, CategoricalIndex, Int64Index,
UInt64Index, RangeIndex, Float64Index,
MultiIndex)
from pandas.core.interval import Interval, IntervalIndex
MultiIndex, IntervalIndex)
from pandas.indexes.interval import Interval, interval_range

from pandas.core.series import Series, TimeSeries
from pandas.core.frame import DataFrame
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
_possibly_downcast_to_dtype,
_invalidate_string_dtypes,
_coerce_to_dtypes,
_coerce_extension_to_embed,
_maybe_upcast_putmask,
_find_common_type)
from pandas.types.common import (is_categorical_dtype,
Expand Down Expand Up @@ -2648,7 +2649,7 @@ def reindexer(value):

# return internal types directly
if is_extension_type(value):
return value
return _coerce_extension_to_embed(value)

# broadcast across multiple columns if necessary
if broadcast and key in self.columns and value.ndim == 1:
Expand Down
40 changes: 21 additions & 19 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from pandas.types.common import (is_numeric_dtype,
is_timedelta64_dtype, is_datetime64_dtype,
is_categorical_dtype,
is_interval_dtype,
is_datetimelike,
is_datetime64_any_dtype,
is_bool, is_integer_dtype,
Expand All @@ -39,10 +40,11 @@

from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
DataError, SpecificationError)
from pandas.core.index import (Index, MultiIndex,
CategoricalIndex, _ensure_index)
from pandas.core.categorical import Categorical
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.interval import IntervalIndex
from pandas.core.internals import BlockManager, make_block
from pandas.core.series import Series
from pandas.core.panel import Panel
Expand Down Expand Up @@ -2592,7 +2594,7 @@ def _convert_grouper(axis, grouper):
return grouper.reindex(axis)._values
elif isinstance(grouper, (list, Series, Index, np.ndarray)):
if len(grouper) != len(axis):
raise AssertionError('Grouper and axis must be same length')
raise ValueError('Grouper and axis must be same length')
return grouper
else:
return grouper
Expand Down Expand Up @@ -3084,36 +3086,37 @@ def value_counts(self, normalize=False, sort=True, ascending=False,

if bins is None:
lab, lev = algos.factorize(val, sort=True)
llab = lambda lab, inc: lab[inc]
else:
raise NotImplementedError('this is broken')
lab, bins = cut(val, bins, retbins=True)
# bins[:-1] for backward compat;
# o.w. cat.categories could be better
# cat = Categorical(cat)
# lab, lev, dropna = cat.codes, bins[:-1], False

if (lab.dtype == object
and lib.is_interval_array_fixed_closed(lab[notnull(lab)])):
lab_index = Index(lab)
assert isinstance(lab, IntervalIndex)
sorter = np.lexsort((lab_index.left, lab_index.right, ids))

# lab is a Categorical with categories an IntervalIndex
lab = cut(Series(val), bins, include_lowest=True)
lev = lab.cat.categories
lab = lev.take(lab.cat.codes)
llab = lambda lab, inc: lab[inc]._multiindex.labels[-1]

if is_interval_dtype(lab):
# TODO: should we do this inside II?
sorter = np.lexsort((lab.left, lab.right, ids))
else:
sorter = np.lexsort((lab, ids))

ids, lab = ids[sorter], lab[sorter]

# group boundaries are where group ids change
idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]

# new values are where sorted labels change
inc = np.r_[True, lab[1:] != lab[:-1]]
lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
inc = np.r_[True, lchanges]
inc[idx] = True # group boundaries are also new values
out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts

# num. of times each group should be repeated
rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

# multi-index components
labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]]
labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)]
levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
names = self.grouper.names + [self.name]

Expand All @@ -3139,13 +3142,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
acc = rep(d)
out /= acc

if sort: # and bins is None:
if sort and bins is None:
cat = ids[inc][mask] if dropna else ids[inc]
sorter = np.lexsort((out if ascending else -out, cat))
out, labels[-1] = out[sorter], labels[-1][sorter]

# if bins is None:
if True:
if bins is None:
mi = MultiIndex(levels=levels, labels=labels, names=names,
verify_integrity=False)

Expand Down
2 changes: 2 additions & 0 deletions pandas/hashtable.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ PyDateTime_IMPORT
cdef extern from "Python.h":
int PySlice_Check(object)

cdef size_t _INIT_VEC_CAP = 128

include "hashtable_class_helper.pxi"
include "hashtable_func_helper.pxi"

Expand Down
3 changes: 2 additions & 1 deletion pandas/indexes/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
InvalidIndexError)
from pandas.indexes.category import CategoricalIndex # noqa
from pandas.indexes.multi import MultiIndex # noqa
from pandas.indexes.interval import IntervalIndex # noqa
from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa
Int64Index, UInt64Index)
from pandas.indexes.range import RangeIndex # noqa
Expand All @@ -13,7 +14,7 @@
# TODO: there are many places that rely on these private methods existing in
# pandas.core.index
__all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
'CategoricalIndex', 'RangeIndex', 'UInt64Index',
'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index',
'InvalidIndexError',
'_new_Index',
'_ensure_index', '_get_na_value', '_get_combined_index',
Expand Down
30 changes: 28 additions & 2 deletions pandas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
is_dtype_equal,
is_object_dtype,
is_categorical_dtype,
is_interval_dtype,
is_bool_dtype,
is_signed_integer_dtype,
is_unsigned_integer_dtype,
Expand Down Expand Up @@ -164,6 +165,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
from .category import CategoricalIndex
return CategoricalIndex(data, copy=copy, name=name, **kwargs)

# interval
if is_interval_dtype(data):
from .interval import IntervalIndex
return IntervalIndex.from_intervals(data, name=name,
copy=copy)

# index-like
elif isinstance(data, (np.ndarray, Index, ABCSeries)):

Expand Down Expand Up @@ -268,6 +275,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
elif inferred in ['floating', 'mixed-integer-float']:
from .numeric import Float64Index
return Float64Index(subarr, copy=copy, name=name)
elif inferred == 'interval':
from .interval import IntervalIndex
return IntervalIndex.from_intervals(subarr, name=name,
copy=copy)
elif inferred == 'boolean':
# don't support boolean explicitly ATM
pass
Expand Down Expand Up @@ -1180,6 +1191,9 @@ def is_object(self):
def is_categorical(self):
    """Return True when ``self.inferred_type`` is ``'categorical'``."""
    matching_kinds = ('categorical',)
    return self.inferred_type in matching_kinds

def is_interval(self):
    """Return True when ``self.inferred_type`` is ``'interval'``."""
    matching_kinds = ('interval',)
    return self.inferred_type in matching_kinds

def is_mixed(self):
    """Return True when ``self.inferred_type`` is ``'mixed'``."""
    matching_kinds = ('mixed',)
    return self.inferred_type in matching_kinds

Expand Down Expand Up @@ -3235,6 +3249,13 @@ def _searchsorted_monotonic(self, label, side='left'):

raise ValueError('index must be monotonic increasing or decreasing')

def _get_loc_only_exact_matches(self, key):
    """
    Look up ``key`` by delegating to ``self.get_loc``.

    This is a hook meant to be overridden in subclasses (namely,
    IntervalIndex) to control the behaviour of get_slice_bound.
    """
    exact_loc = self.get_loc(key)
    return exact_loc

def get_slice_bound(self, label, side, kind):
"""
Calculate slice bound that corresponds to given label.
Expand Down Expand Up @@ -3264,7 +3285,7 @@ def get_slice_bound(self, label, side, kind):

# we need to look up the label
try:
slc = self.get_loc(label)
slc = self._get_loc_only_exact_matches(label)
except KeyError as err:
try:
return self._searchsorted_monotonic(label, side)
Expand Down Expand Up @@ -3504,7 +3525,9 @@ def _evaluate_compare(self, other):
if needs_i8_conversion(self) and needs_i8_conversion(other):
return self._evaluate_compare(other, op)

if is_object_dtype(self) and self.nlevels == 1:
if (is_object_dtype(self) and
self.nlevels == 1):

# don't pass MultiIndex
with np.errstate(all='ignore'):
result = _comp_method_OBJECT_ARRAY(
Expand Down Expand Up @@ -3816,6 +3839,9 @@ def _ensure_index(index_like, copy=False):


def _get_na_value(dtype):
    """
    Return the missing-value marker to use for ``dtype``.

    ``tslib.NaT`` is returned when ``dtype`` is recognised as a datetime64
    (any flavour) or timedelta64 dtype, or when it is the ``np.datetime64`` /
    ``np.timedelta64`` type itself; otherwise ``np.nan`` is returned.
    """
    is_datetimelike = (is_datetime64_any_dtype(dtype) or
                       is_timedelta64_dtype(dtype))
    if is_datetimelike:
        return tslib.NaT

    # fall back to an exact lookup on the numpy scalar types
    na_by_type = {np.datetime64: tslib.NaT,
                  np.timedelta64: tslib.NaT}
    return na_by_type.get(dtype, np.nan)

Expand Down
10 changes: 10 additions & 0 deletions pandas/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pandas.types.common import (is_categorical_dtype,
_ensure_platform_int,
is_list_like,
is_interval_dtype,
is_scalar)
from pandas.types.missing import array_equivalent

Expand Down Expand Up @@ -266,6 +267,13 @@ def __array__(self, dtype=None):
""" the array interface, return my values """
return np.array(self._data, dtype=dtype)

@Appender(_index_shared_docs['astype'])
def astype(self, dtype, copy=True):
    # NOTE(review): documentation presumably comes from the shared 'astype'
    # entry applied by the decorator, so only comments are added here.
    if is_interval_dtype(dtype):
        # local import, as in the original
        from pandas import IntervalIndex

        # Materialise the values as an ndarray and rebuild them as an
        # IntervalIndex. Carry the index name over so that the conversion
        # does not silently drop it (Index.__new__ likewise forwards
        # ``name`` to ``from_intervals``).
        return IntervalIndex.from_intervals(np.array(self), name=self.name)

    # every other dtype is handled by the parent implementation
    return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)

@cache_readonly
def _isnan(self):
""" return if each value is nan"""
Expand Down Expand Up @@ -508,6 +516,8 @@ def take(self, indices, axis=0, allow_fill=True,
na_value=-1)
return self._create_from_codes(taken)

take_nd = take

def map(self, mapper):
"""Apply mapper function to its categories (not codes).
Expand Down
Loading

0 comments on commit 067375c

Please sign in to comment.