From 067375c4576399198167fc17912ff171bbb20c6c Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 2 Feb 2017 09:24:09 -0500 Subject: [PATCH] CLN/COMPAT: IntervalIndex --- doc/source/whatsnew/v0.20.0.txt | 31 + pandas/core/algorithms.py | 50 +- pandas/core/api.py | 4 +- pandas/core/frame.py | 3 +- pandas/core/groupby.py | 40 +- pandas/hashtable.pyx | 2 + pandas/indexes/api.py | 3 +- pandas/indexes/base.py | 30 +- pandas/indexes/category.py | 10 + pandas/{core => indexes}/interval.py | 416 +++-- pandas/lib.pyx | 6 +- pandas/src/inference.pyx | 30 +- pandas/src/interval.pyx | 57 +- ...te_intervaltree.py => intervaltree.pxi.in} | 198 ++- pandas/src/intervaltree.pyx | 1444 ----------------- pandas/tests/api/test_api.py | 5 +- pandas/tests/frame/test_alter_axes.py | 47 +- pandas/tests/groupby/test_categorical.py | 5 +- pandas/tests/groupby/test_groupby.py | 49 +- pandas/tests/indexes/common.py | 25 +- pandas/tests/indexes/test_base.py | 4 +- pandas/tests/indexes/test_category.py | 16 +- pandas/tests/{ => indexes}/test_interval.py | 587 ++++--- pandas/tests/indexing/test_interval.py | 67 + pandas/tests/scalar/test_interval.py | 116 ++ pandas/tests/series/test_constructors.py | 14 +- pandas/tests/test_algos.py | 30 +- pandas/tests/test_base.py | 33 +- pandas/tests/test_categorical.py | 13 +- pandas/tests/tools/test_tile.py | 183 ++- pandas/tests/types/test_dtypes.py | 118 +- pandas/tests/types/test_missing.py | 8 + pandas/tools/tile.py | 182 ++- pandas/tseries/interval.py | 38 - pandas/types/api.py | 4 + pandas/types/cast.py | 17 +- pandas/types/common.py | 23 + pandas/types/dtypes.py | 109 ++ pandas/types/generic.py | 4 +- pandas/types/inference.py | 2 + pandas/types/missing.py | 5 +- pandas/util/testing.py | 11 + setup.py | 7 +- 43 files changed, 1763 insertions(+), 2283 deletions(-) rename pandas/{core => indexes}/interval.py (56%) rename pandas/src/{generate_intervaltree.py => intervaltree.pxi.in} (68%) delete mode 100644 pandas/src/intervaltree.pyx rename 
pandas/tests/{ => indexes}/test_interval.py (52%) create mode 100644 pandas/tests/indexing/test_interval.py create mode 100644 pandas/tests/scalar/test_interval.py delete mode 100644 pandas/tseries/interval.py diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 4708abe4d592e..0cfe50219ab1e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -11,6 +11,7 @@ Highlights include: - Building pandas for development now requires ``cython >= 0.23`` (:issue:`14831`) - The ``.ix`` indexer has been deprecated, see :ref:`here ` +- Addition of an ``IntervalIndex`` and ``Interval`` scalar type, see :ref:`here ` - Switched the test framework to `pytest`_ (:issue:`13097`) .. _pytest: http://doc.pytest.org/en/latest/ @@ -120,6 +121,36 @@ Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937 - Bug in ``pd.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14915`) - Bug in ``pd.value_counts()`` in which unsigned 64-bit integers were being erroneously truncated in the output (:issue:`14934`) +.. _whatsnew_0200.enhancements.intervalindex: + +IntervalIndex +^^^^^^^^^^^^^ + +pandas has gain an ``IntervalIndex`` with its own dtype, ``interval`` as well as the ``Interval`` scalar type. These allow first-class support for interval +notation, specifically as return type for ``pd.cut`` and ``pd.qcut``. (:issue:`7640`, :issue:`8625`) + +**Previous behavior**: + +.. code-block:: ipython + + In [2]: pd.cut(range(3), 2) + Out[2]: + [(-0.002, 1], (-0.002, 1], (1, 2]] + Categories (2, object): [(-0.002, 1] < (1, 2]] + + # the returned categories are strings, representing Intervals + In [3]: pd.cut(range(3), 2).categories + Out[3]: Index(['(-0.002, 1]', '(1, 2]'], dtype='object') + +**New behavior**: + +.. ipython:: python + + c = pd.cut(range(3), 2) + c + c.categories + pd.api.types.is_interval_dtype(c.categories) + .. 
_whatsnew_0200.enhancements.other: Other enhancements diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8c3dce0e8072a..462cfe5c3f943 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -15,6 +15,7 @@ is_int64_dtype, is_categorical_dtype, is_extension_type, + is_interval_dtype, is_datetimetz, is_period_dtype, is_period_arraylike, @@ -401,31 +402,40 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if bins is not None: try: from pandas.tools.tile import cut - values = Series(values).values - cat, bins = cut(values, bins, retbins=True) + values = Series(values) + ii = cut(values, bins, include_lowest=True) except TypeError: raise TypeError("bins argument only works with numeric data.") - if is_extension_type(values) and not is_datetimetz(values): - # handle Categorical and sparse, - # datetime tz can be handeled in ndarray path - result = Series(values).values.value_counts(dropna=dropna) - result.name = name - counts = result.values + # count, remove nulls (from the index), and but the bins + result = ii.value_counts(dropna=dropna) + result = result[result.index.notnull()] + result.index = result.index.astype('interval') + result = result.sort_index() + + # if we are dropna and we have NO values + if dropna and (result.values == 0).all(): + result = result.iloc[0:0] + + # normalizing is by len of all (regarless of dropna) + counts = np.array([len(ii)]) + else: - # ndarray path. pass original to handle DatetimeTzBlock - keys, counts = _value_counts_arraylike(values, dropna=dropna) - from pandas import Index, Series - if not isinstance(keys, Index): - keys = Index(keys) - result = Series(counts, index=keys, name=name) + if is_extension_type(values) and not is_datetimetz(values): + # handle Categorical and sparse, + # datetime tz can be handeled in ndarray path + result = Series(values).values.value_counts(dropna=dropna) + result.name = name + counts = result.values + else: + # ndarray path. 
pass original to handle DatetimeTzBlock + keys, counts = _value_counts_arraylike(values, dropna=dropna) - if bins is not None: - # TODO: This next line should be more efficient - result = result.reindex(np.arange(len(cat.categories)), - fill_value=0) - result.index = bins[:-1] + from pandas import Index, Series + if not isinstance(keys, Index): + keys = Index(keys) + result = Series(counts, index=keys, name=name) if sort: result = result.sort_values(ascending=ascending) @@ -1244,6 +1254,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=allow_fill) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + elif is_interval_dtype(arr): + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) diff --git a/pandas/core/api.py b/pandas/core/api.py index eb2b3f32d58d0..fe589928f09dc 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -11,8 +11,8 @@ from pandas.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex, Float64Index, - MultiIndex) -from pandas.core.interval import Interval, IntervalIndex + MultiIndex, IntervalIndex) +from pandas.indexes.interval import Interval, interval_range from pandas.core.series import Series, TimeSeries from pandas.core.frame import DataFrame diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9c66f6dbb273e..424aff3fa5218 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -30,6 +30,7 @@ _possibly_downcast_to_dtype, _invalidate_string_dtypes, _coerce_to_dtypes, + _coerce_extension_to_embed, _maybe_upcast_putmask, _find_common_type) from pandas.types.common import (is_categorical_dtype, @@ -2648,7 +2649,7 @@ def reindexer(value): # return internal types directly if is_extension_type(value): - return value + return _coerce_extension_to_embed(value) # broadcast 
across multiple columns if necessary if broadcast and key in self.columns and value.ndim == 1: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7b3ac3d112a80..4b269c76e7f0e 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -17,6 +17,7 @@ from pandas.types.common import (is_numeric_dtype, is_timedelta64_dtype, is_datetime64_dtype, is_categorical_dtype, + is_interval_dtype, is_datetimelike, is_datetime64_any_dtype, is_bool, is_integer_dtype, @@ -39,10 +40,11 @@ from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) +from pandas.core.index import (Index, MultiIndex, + CategoricalIndex, _ensure_index) from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.interval import IntervalIndex from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel @@ -2592,7 +2594,7 @@ def _convert_grouper(axis, grouper): return grouper.reindex(axis)._values elif isinstance(grouper, (list, Series, Index, np.ndarray)): if len(grouper) != len(axis): - raise AssertionError('Grouper and axis must be same length') + raise ValueError('Grouper and axis must be same length') return grouper else: return grouper @@ -3084,28 +3086,29 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if bins is None: lab, lev = algos.factorize(val, sort=True) + llab = lambda lab, inc: lab[inc] else: - raise NotImplementedError('this is broken') - lab, bins = cut(val, bins, retbins=True) - # bins[:-1] for backward compat; - # o.w. 
cat.categories could be better - # cat = Categorical(cat) - # lab, lev, dropna = cat.codes, bins[:-1], False - - if (lab.dtype == object - and lib.is_interval_array_fixed_closed(lab[notnull(lab)])): - lab_index = Index(lab) - assert isinstance(lab, IntervalIndex) - sorter = np.lexsort((lab_index.left, lab_index.right, ids)) + + # lab is a Categorical with categories an IntervalIndex + lab = cut(Series(val), bins, include_lowest=True) + lev = lab.cat.categories + lab = lev.take(lab.cat.codes) + llab = lambda lab, inc: lab[inc]._multiindex.labels[-1] + + if is_interval_dtype(lab): + # TODO: should we do this inside II? + sorter = np.lexsort((lab.left, lab.right, ids)) else: sorter = np.lexsort((lab, ids)) + ids, lab = ids[sorter], lab[sorter] # group boundaries are where group ids change idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] # new values are where sorted labels change - inc = np.r_[True, lab[1:] != lab[:-1]] + lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) + inc = np.r_[True, lchanges] inc[idx] = True # group boundaries are also new values out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts @@ -3113,7 +3116,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]] + labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self.name] @@ -3139,13 +3142,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False, acc = rep(d) out /= acc - if sort: # and bins is None: + if sort and bins is None: cat = ids[inc][mask] if dropna else ids[inc] sorter = np.lexsort((out if ascending else -out, cat)) out, labels[-1] = out[sorter], labels[-1][sorter] - # if bins is None: - if True: + if bins is None: mi = MultiIndex(levels=levels, 
labels=labels, names=names, verify_integrity=False) diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index fa805bcfbd5b0..f77fef85d3c2f 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -39,6 +39,8 @@ PyDateTime_IMPORT cdef extern from "Python.h": int PySlice_Check(object) +cdef size_t _INIT_VEC_CAP = 128 + include "hashtable_class_helper.pxi" include "hashtable_func_helper.pxi" diff --git a/pandas/indexes/api.py b/pandas/indexes/api.py index 64992e46613e5..3a49c99f12a7f 100644 --- a/pandas/indexes/api.py +++ b/pandas/indexes/api.py @@ -3,6 +3,7 @@ InvalidIndexError) from pandas.indexes.category import CategoricalIndex # noqa from pandas.indexes.multi import MultiIndex # noqa +from pandas.indexes.interval import IntervalIndex # noqa from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa Int64Index, UInt64Index) from pandas.indexes.range import RangeIndex # noqa @@ -13,7 +14,7 @@ # TODO: there are many places that rely on these private methods existing in # pandas.core.index __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', - 'CategoricalIndex', 'RangeIndex', 'UInt64Index', + 'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index', 'InvalidIndexError', '_new_Index', '_ensure_index', '_get_na_value', '_get_combined_index', diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index c483fb0764a4c..f0093c9115c87 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -26,6 +26,7 @@ is_dtype_equal, is_object_dtype, is_categorical_dtype, + is_interval_dtype, is_bool_dtype, is_signed_integer_dtype, is_unsigned_integer_dtype, @@ -164,6 +165,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, from .category import CategoricalIndex return CategoricalIndex(data, copy=copy, name=name, **kwargs) + # interval + if is_interval_dtype(data): + from .interval import IntervalIndex + return IntervalIndex.from_intervals(data, name=name, + copy=copy) + # index-like elif 
isinstance(data, (np.ndarray, Index, ABCSeries)): @@ -268,6 +275,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif inferred in ['floating', 'mixed-integer-float']: from .numeric import Float64Index return Float64Index(subarr, copy=copy, name=name) + elif inferred == 'interval': + from .interval import IntervalIndex + return IntervalIndex.from_intervals(subarr, name=name, + copy=copy) elif inferred == 'boolean': # don't support boolean explicity ATM pass @@ -1180,6 +1191,9 @@ def is_object(self): def is_categorical(self): return self.inferred_type in ['categorical'] + def is_interval(self): + return self.inferred_type in ['interval'] + def is_mixed(self): return self.inferred_type in ['mixed'] @@ -3235,6 +3249,13 @@ def _searchsorted_monotonic(self, label, side='left'): raise ValueError('index must be monotonic increasing or decreasing') + def _get_loc_only_exact_matches(self, key): + """ + This is overriden on subclasses (namely, IntervalIndex) to control + get_slice_bound. + """ + return self.get_loc(key) + def get_slice_bound(self, label, side, kind): """ Calculate slice bound that corresponds to given label. 
@@ -3264,7 +3285,7 @@ def get_slice_bound(self, label, side, kind): # we need to look up the label try: - slc = self.get_loc(label) + slc = self._get_loc_only_exact_matches(label) except KeyError as err: try: return self._searchsorted_monotonic(label, side) @@ -3504,7 +3525,9 @@ def _evaluate_compare(self, other): if needs_i8_conversion(self) and needs_i8_conversion(other): return self._evaluate_compare(other, op) - if is_object_dtype(self) and self.nlevels == 1: + if (is_object_dtype(self) and + self.nlevels == 1): + # don't pass MultiIndex with np.errstate(all='ignore'): result = _comp_method_OBJECT_ARRAY( @@ -3816,6 +3839,9 @@ def _ensure_index(index_like, copy=False): def _get_na_value(dtype): + if is_datetime64_any_dtype(dtype) or is_timedelta64_dtype(dtype): + return tslib.NaT + return {np.datetime64: tslib.NaT, np.timedelta64: tslib.NaT}.get(dtype, np.nan) diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index e2e0fd056b111..06f41ddeed84a 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -7,6 +7,7 @@ from pandas.types.common import (is_categorical_dtype, _ensure_platform_int, is_list_like, + is_interval_dtype, is_scalar) from pandas.types.missing import array_equivalent @@ -266,6 +267,13 @@ def __array__(self, dtype=None): """ the array interface, return my values """ return np.array(self._data, dtype=dtype) + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_interval_dtype(dtype): + from pandas import IntervalIndex + return IntervalIndex.from_intervals(np.array(self)) + return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy) + @cache_readonly def _isnan(self): """ return if each value is nan""" @@ -508,6 +516,8 @@ def take(self, indices, axis=0, allow_fill=True, na_value=-1) return self._create_from_codes(taken) + take_nd = take + def map(self, mapper): """Apply mapper function to its categories (not codes). 
diff --git a/pandas/core/interval.py b/pandas/indexes/interval.py similarity index 56% rename from pandas/core/interval.py rename to pandas/indexes/interval.py index 68e07f21367a0..8db25dcc73773 100644 --- a/pandas/core/interval.py +++ b/pandas/indexes/interval.py @@ -1,19 +1,25 @@ -import operator +""" define the IntervalIndex """ import numpy as np -import pandas as pd -from pandas.core.base import PandasObject, IndexOpsMixin -from pandas.core.common import (_values_from_object, _ensure_platform_int, - notnull, is_datetime_or_timedelta_dtype, - is_integer_dtype, is_float_dtype) -from pandas.core.index import (Index, _ensure_index, default_pprint, - InvalidIndexError, MultiIndex) -from pandas.lib import (Interval, IntervalMixin, IntervalTree, - interval_bounds_to_intervals, - intervals_to_interval_bounds) -from pandas.util.decorators import cache_readonly -import pandas.core.common as com +from pandas.types.missing import notnull, isnull +from pandas.types.dtypes import IntervalDtype +from pandas.types.common import (_ensure_platform_int, + is_datetime_or_timedelta_dtype, + is_integer_dtype, + is_object_dtype, + is_categorical_dtype, + is_float_dtype, + is_interval_dtype) +from pandas.indexes.base import (Index, _ensure_index, + default_pprint, _index_shared_docs) +from pandas.tslib import Timestamp, Timedelta +from pandas.indexes.multi import MultiIndex +from pandas.compat.numpy import function as nv +from pandas.core import common as com +from pandas._interval import (Interval, IntervalMixin, IntervalTree, + intervals_to_interval_bounds) +from pandas.util.decorators import cache_readonly, Appender _VALID_CLOSED = set(['left', 'right', 'both', 'neither']) @@ -21,7 +27,7 @@ def _get_next_label(label): dtype = getattr(label, 'dtype', type(label)) - if isinstance(label, (pd.Timestamp, pd.Timedelta)): + if isinstance(label, (Timestamp, Timedelta)): dtype = 'datetime64' if is_datetime_or_timedelta_dtype(dtype): return label + np.timedelta64(1, 'ns') @@ -36,7 +42,7 @@ 
def _get_next_label(label): def _get_prev_label(label): dtype = getattr(label, 'dtype', type(label)) - if isinstance(label, (pd.Timestamp, pd.Timedelta)): + if isinstance(label, (Timestamp, Timedelta)): dtype = 'datetime64' if is_datetime_or_timedelta_dtype(dtype): return label - np.timedelta64(1, 'ns') @@ -67,7 +73,7 @@ class IntervalIndex(IntervalMixin, Index): Immutable Index implementing an ordered, sliceable set. IntervalIndex represents an Index of intervals that are all closed on the same side. - .. versionadded:: 0.18 + .. versionadded:: 0.20.0 Properties ---------- @@ -78,25 +84,69 @@ class IntervalIndex(IntervalMixin, Index): neither. Defaults to 'right'. name : object, optional Name to be stored in the index. + copy : boolean, default False + Copy the meta-data """ _typ = 'intervalindex' _comparables = ['name'] _attributes = ['name', 'closed'] _allow_index_ops = True - _engine = None # disable it - def __new__(cls, left, right, closed='right', name=None, fastpath=False): - # TODO: validation + def __new__(cls, left, right=None, closed='right', mask=None, + name=None, copy=False, dtype=None, fastpath=False): + + if right is None: + + if not isinstance(left, IntervalIndex): + return Index(left, closed=closed, name=name, copy=copy) + + if copy: + left = left.copy() + if name is None: + name = left.name + else: + + if copy: + + left = left.copy() + right = right.copy() + + return cls._simple_new(left, right, closed, name, + mask, fastpath=fastpath) + + @classmethod + def _simple_new(cls, left, right, closed='right', name=None, mask=None, + fastpath=True, **kwargs): + + if right is None: + + if not isinstance(left, IntervalIndex): + return Index(left, closed=closed, name=name, **kwargs) + + closed = left._closed + left, right, mask = left._left, left._right, left._mask + result = IntervalMixin.__new__(cls) result._left = _ensure_index(left) result._right = _ensure_index(right) result._closed = closed + result._mask = mask result.name = name if not fastpath: 
result._validate() result._reset_identity() return result + @Appender(_index_shared_docs['_shallow_copy']) + def _shallow_copy(self, left=None, right=None, **kwargs): + if left is None: + left = self.left + right = self.right + + attributes = self._get_attributes_dict() + attributes.update(kwargs) + return self._simple_new(left, right, **attributes) + def _validate(self): """Verify that the IntervalIndex is valid. """ @@ -113,19 +163,22 @@ def _validate(self): if not (self.left[left_valid] <= self.right[left_valid]).all(): raise ValueError('left side of interval must be <= right side') - def _simple_new(cls, values, name=None, **kwargs): - # ensure we don't end up here (this is a superclass method) - raise NotImplementedError - - def _cleanup(self): - pass + @cache_readonly + def hasnans(self): + """ return if I have any nans; enables various perf speedups """ + if self._mask is None: + return False + return self._mask.any() - @property - def _engine(self): - raise NotImplementedError + @cache_readonly + def _isnan(self): + """ return if each value is nan""" + if self._mask is None: + self._mask = np.zeros(len(self), dtype=np.bool_) + return self._mask @cache_readonly - def _tree(self): + def _engine(self): return IntervalTree(self.left, self.right, closed=self.closed) @property @@ -155,10 +208,12 @@ def from_breaks(cls, breaks, closed='right', name=None): right=[1, 2, 3], closed='right') """ - return cls(breaks[:-1], breaks[1:], closed, name) + breaks = np.asarray(breaks) + mask = isnull(breaks[:-1]) + return cls(breaks[:-1], breaks[1:], closed, mask=mask, name=name) @classmethod - def from_intervals(cls, data, name=None): + def from_intervals(cls, data, name=None, copy=False): """ Construct an IntervalIndex from a 1d array of Interval objects @@ -169,6 +224,8 @@ def from_intervals(cls, data, name=None): sides. name : object, optional Name to be stored in the index. 
+ copy : boolean, default False + by-default copy the data, this is compat only and ignored Examples -------- @@ -187,17 +244,34 @@ def from_intervals(cls, data, name=None): closed='right') """ data = np.asarray(data) - left, right, closed = intervals_to_interval_bounds(data) - return cls(left, right, closed, name) + left, right, closed, mask = intervals_to_interval_bounds(data) + return cls(left, right, closed, mask, name=name, copy=copy) @classmethod def from_tuples(cls, data, closed='right', name=None): left = [] right = [] - for l, r in data: + for d in data: + + if isnull(d): + left.append(np.nan) + right.append(np.nan) + continue + + l, r = d left.append(l) right.append(r) - return cls(np.array(left), np.array(right), closed, name) + + left = Index(left) + right = Index(right) + mask = isnull(left) + + # TODO + # if we have nulls and we previous had *only* + # integer data, then we have changed the dtype + + return cls(Index(left), Index(right), closed, mask=mask, + name=name) def to_tuples(self): return Index(com._asarray_tuplesafe(zip(self.left, self.right))) @@ -224,12 +298,22 @@ def __len__(self): @cache_readonly def values(self): - """Returns the IntervalIndex's data as a numpy array of Interval + """ + Returns the IntervalIndex's data as a numpy array of Interval objects (with dtype='object') """ - left = np.asarray(self.left) - right = np.asarray(self.right) - return interval_bounds_to_intervals(left, right, self.closed) + left = self.left + right = self.right + mask = self._mask + closed = self._closed + + result = np.empty(len(left), dtype=object) + for i in range(len(left)): + if mask is not None and mask[i]: + result[i] = np.nan + else: + result[i] = Interval(left[i], right[i], closed) + return result def __array__(self, result=None): """ the array interface, return my values """ @@ -245,23 +329,40 @@ def _array_values(self): def __reduce__(self): return self.__class__, (self.left, self.right, self.closed, self.name) - def _shallow_copy(self, 
values=None, name=None): - name = name if name is not None else self.name - if values is not None: - return type(self).from_intervals(values, name=name) - else: - return self.copy(name=name) - + @Appender(_index_shared_docs['copy']) def copy(self, deep=False, name=None): left = self.left.copy(deep=True) if deep else self.left right = self.right.copy(deep=True) if deep else self.right name = name if name is not None else self.name - return type(self)(left, right, closed=self.closed, name=name, - fastpath=True) + return self._shallow_copy(left, right, name=name) + + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_interval_dtype(dtype): + if copy: + self = self.copy() + return self + elif is_object_dtype(dtype): + return Index(self.values, dtype=object) + elif is_categorical_dtype(dtype): + from pandas import Categorical + return Categorical(self, ordered=True) + raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype) @cache_readonly def dtype(self): - return np.dtype('O') + return IntervalDtype.construct_from_string(str(self.left.dtype)) + + @property + def inferred_type(self): + return 'interval' + + @Appender(Index.memory_usage.__doc__) + def memory_usage(self, deep=False): + # we don't use an explict engine + # so return the bytes here + return (self.left.memory_usage(deep=deep) + + self.right.memory_usage(deep=deep)) @cache_readonly def mid(self): @@ -274,6 +375,10 @@ def mid(self): delta = self.right.values - self.left.values return Index(self.left.values + 0.5 * delta) + @cache_readonly + def is_monotonic(self): + return self._multiindex.is_monotonic + @cache_readonly def is_monotonic_increasing(self): return self._multiindex.is_monotonic_increasing @@ -303,19 +408,24 @@ def _maybe_cast_slice_bound(self, label, side, kind): def _convert_list_indexer(self, keyarr, kind=None): """ we are passed a list indexer. 
- Return our indexer or raise if all of the values are not included in the categories + Return our indexer or raise if all of the values are not + included in the categories """ locs = self.get_indexer(keyarr) - # TODO: handle keyarr if it includes intervals - if (locs == -1).any(): - raise KeyError("a list-indexer must only include existing intervals") - + check = locs == -1 + locs = locs[~check] return locs def _check_method(self, method): - if method is not None: + if method is None: + return + + if method in ['bfill', 'backfill', 'pad', 'ffill', 'nearest']: raise NotImplementedError( - 'method %r not yet implemented for IntervalIndex' % method) + 'method {} not yet implemented for ' + 'IntervalIndex'.format(method)) + + raise ValueError("Invalid fill method") def _searchsorted_monotonic(self, label, side, exclude_label=False): if not self.is_non_overlapping_monotonic: @@ -339,7 +449,9 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): return sub_idx._searchsorted_monotonic(label, side) def _get_loc_only_exact_matches(self, key): - return self._multiindex._tuple_index.get_loc(key) + # TODO: this expands to a tuple index, see if we can + # do better + return Index(self._multiindex.values).get_loc(key) def _find_non_overlapping_monotonic_bounds(self, key): if isinstance(key, IntervalMixin): @@ -379,9 +491,9 @@ def get_loc(self, key, method=None): # use the interval tree if isinstance(key, Interval): left, right = _get_interval_closed_bounds(key) - return self._tree.get_loc_interval(left, right) + return self._engine.get_loc_interval(left, right) else: - return self._tree.get_loc(key) + return self._engine.get_loc(key) def get_value(self, series, key): # this method seems necessary for Series.__getitem__ but I have no idea @@ -401,19 +513,45 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise ValueError('indexer corresponds to non-unique elements') return np.where(start_plus_one == stop, start, -1) + if 
isinstance(target, IntervalIndex): + raise NotImplementedError( + 'have not yet implemented get_indexer ' + 'for IntervalIndex indexers') + + if not self.is_unique: + indexer, missing = self._engine.get_indexer_non_unique( + target.values) else: - if isinstance(target, IntervalIndex): - raise NotImplementedError( - 'have not yet implemented get_indexer ' - 'for IntervalIndex indexers') - else: - return self._tree.get_indexer(target) + indexer = self._engine.get_indexer(target.values) + return indexer + + def sort_values(self, return_indexer=False, ascending=True): + """ + Return sorted copy of Index + """ + mask = self._mask + + # nans are sorted to the highest values + _as = self.argsort() + _as[mask] = -1 + + if not ascending: + _as = _as[::-1] + + sorted_index = self.take(_as) + + if return_indexer: + return sorted_index, _as + else: + return sorted_index + + def where(self, cond, other=None): + raise NotImplementedError def delete(self, loc): new_left = self.left.delete(loc) new_right = self.right.delete(loc) - return type(self)(new_left, new_right, self.closed, self.name, - fastpath=True) + return self._shallow_copy(new_left, new_right) def insert(self, loc, item): if not isinstance(item, Interval): @@ -424,8 +562,7 @@ def insert(self, loc, item): 'as the index') new_left = self.left.insert(loc, item.left) new_right = self.right.insert(loc, item.right) - return type(self)(new_left, new_right, self.closed, self.name, - fastpath=True) + return self._shallow_copy(new_left, new_right) def _as_like_interval_index(self, other, error_msg): self._assert_can_do_setop(other) @@ -435,25 +572,47 @@ def _as_like_interval_index(self, other, error_msg): raise ValueError(error_msg) return other - def append(self, other): - msg = ('can only append two IntervalIndex objects that are closed on ' - 'the same side') - other = self._as_like_interval_index(other, msg) - new_left = self.left.append(other.left) - new_right = self.right.append(other.right) - if other.name is not None 
and other.name != self.name: - name = None - else: - name = self.name - return type(self)(new_left, new_right, self.closed, name, - fastpath=True) + def _append_same_dtype(self, to_concat, name): + """ + assert that we all have the same .closed + we allow a 0-len index here as well + """ + if not len(set([i.closed for i in to_concat if len(i)])) == 1: + msg = ('can only append two IntervalIndex objects ' + 'that are closed on the same side') + raise ValueError(msg) + return super(IntervalIndex, self)._append_same_dtype(to_concat, name) + + @Appender(_index_shared_docs['take']) + def take(self, indices, axis=0, allow_fill=True, + fill_value=None, **kwargs): + nv.validate_take(tuple(), kwargs) + indices = _ensure_platform_int(indices) + left, right = self.left, self.right + + if fill_value is None: + fill_value = self._na_value + mask = indices == -1 + + if not mask.any(): + # we won't change dtype here in this case + # if we don't need + allow_fill = False + + taker = lambda x: x.take(indices, allow_fill=allow_fill, + fill_value=fill_value) + + try: + new_left = taker(left) + new_right = taker(right) + except ValueError: + + # we need to coerce; migth have NA's in an + # interger dtype + new_left = taker(left.astype(float)) + new_right = taker(right.astype(float)) - def take(self, indexer, axis=0): - indexer = com._ensure_platform_int(indexer) - new_left = self.left.take(indexer) - new_right = self.right.take(indexer) - return type(self)(new_left, new_right, self.closed, self.name, - fastpath=True) + return self._shallow_copy(new_left, new_right, mask=mask) def __contains__(self, key): try: @@ -465,10 +624,15 @@ def __contains__(self, key): def __getitem__(self, value): left = self.left[value] right = self.right[value] + mask = None + if self._mask is not None: + mask = self._mask[value] + + # scalar if not isinstance(left, Index): return Interval(left, right, self.closed) - else: - return type(self)(left, right, self.closed, self.name) + + return 
self._shallow_copy(left, right, mask=mask) # __repr__ associated methods are based on MultiIndex @@ -478,6 +642,7 @@ def _format_attrs(self): ('closed', repr(self.closed))] if self.name is not None: attrs.append(('name', default_pprint(self.name))) + attrs.append(('dtype', "'%s'" % self.dtype)) return attrs def _format_space(self): @@ -490,14 +655,20 @@ def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) def equals(self, other): + if self.is_(other): return True - try: - return (self.left.equals(other.left) - and self.right.equals(other.right) - and self.closed == other.closed) - except AttributeError: - return False + + # if we can coerce to an II + # then we can compare + if not isinstance(other, IntervalIndex): + if not is_interval_dtype(other): + return False + other = Index(getattr(other, '.values', other)) + + return (self.left.equals(other.left) and + self.right.equals(other.right) and + self.closed == other.closed) def _setop(op_name): def func(self, other): @@ -513,9 +684,62 @@ def func(self, other): union = _setop('union') intersection = _setop('intersection') difference = _setop('difference') - sym_diff = _setop('sym_diff') + symmetric_differnce = _setop('symmetric_difference') # TODO: arithmetic operations IntervalIndex._add_logical_methods_disabled() + + +def interval_range(start=None, end=None, freq=None, periods=None, + name=None, closed='right', **kwargs): + """ + Return a fixed frequency IntervalIndex + + Parameters + ---------- + start : string or datetime-like, default None + Left bound for generating data + end : string or datetime-like, default None + Right bound for generating data + freq : interger, string or DateOffset, default 1 + periods : interger, default None + name : str, default None + Name of the resulting index + closed : string, default 'right' + options are: 'left', 'right', 'both', 'neither' + + Notes + ----- + 2 of start, end, or periods must be specified + + Returns + ------- + rng : IntervalIndex + 
""" + + if freq is None: + freq = 1 + + if start is None: + if periods is None or end is None: + raise ValueError("must specify 2 of start, end, periods") + start = end - periods * freq + elif end is None: + if periods is None or start is None: + raise ValueError("must specify 2 of start, end, periods") + end = start + periods * freq + elif periods is None: + if start is None or end is None: + raise ValueError("must specify 2 of start, end, periods") + pass + + # must all be same units or None + arr = np.array([start, end, freq]) + if is_object_dtype(arr): + raise ValueError("start, end, freq need to be the same type") + + return IntervalIndex.from_breaks(np.arange(start, end, freq), + name=name, + closed=closed) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index d1a2a41846976..a07829a02f15e 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -59,6 +59,8 @@ from tslib cimport (convert_to_tsobject, convert_to_timedelta64, _check_all_nulls) import tslib from tslib import NaT, Timestamp, Timedelta +import _interval +from _interval import Interval cdef int64_t NPY_NAT = util.get_nat() @@ -328,7 +330,7 @@ cpdef bint isscalar(object val): or PyDelta_Check(val) or PyTime_Check(val) or util.is_period_object(val) - or is_decimal(val), + or is_decimal(val) or is_interval(val)) @@ -1967,6 +1969,4 @@ cdef class BlockPlacement: include "reduce.pyx" include "properties.pyx" -include "interval.pyx" -include "intervaltree.pyx" include "inference.pyx" diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 858a7a29ad868..59c789a0ab9fa 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -33,6 +33,10 @@ cpdef bint is_decimal(object obj): return isinstance(obj, Decimal) +cpdef bint is_interval(object obj): + return isinstance(obj, Interval) + + cpdef bint is_period(object val): """ Return a boolean if this is a Period object """ return util.is_period_object(val) @@ -746,22 +750,28 @@ cpdef bint is_period_array(ndarray[object] values): return False 
return null_count != n -cdef inline bint is_interval(object o): - return isinstance(o, Interval) -def is_interval_array_fixed_closed(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) - cdef str closed +cpdef bint is_interval_array_fixed_closed(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values), null_count = 0 + object closed = None, v + if n == 0: return False for i in range(n): - if not is_interval(values[i]): + v = values[i] + if util._checknull(v): + null_count += 1 + continue + if not is_interval(v): return False - if i == 0: - closed = values[0].closed - elif closed != values[i].closed: + if closed is None: + if not is_interval(values[i]): + return False + closed = values[i].closed + elif closed != v.closed: return False - return True + return null_count != n cdef extern from "parse_helper.h": diff --git a/pandas/src/interval.pyx b/pandas/src/interval.pyx index 495730e0fd6a1..bc2a02e8bc37a 100644 --- a/pandas/src/interval.pyx +++ b/pandas/src/interval.pyx @@ -2,8 +2,10 @@ cimport numpy as np import numpy as np import pandas as pd +cimport util cimport cython import cython +from numpy cimport * from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, PyObject_RichCompare) @@ -44,6 +46,20 @@ cdef _interval_like(other): cdef class Interval(IntervalMixin): + """ + Immutable object implementing an Interval, a bounded slice-like interval. + + .. versionadded:: 0.20.0 + + Properties + ---------- + left, right : values + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'} + Whether the interval is closed on the left-side, right-side, both or + neither. Defaults to 'right'. 
+ """ + cdef readonly object left, right cdef readonly str closed @@ -84,8 +100,9 @@ cdef class Interval(IntervalMixin): return NotImplemented else: op_str = {Py_LT: '<', Py_LE: '<=', Py_GT: '>', Py_GE: '>='}[op] - raise TypeError('unorderable types: %s() %s %s()' % - (type(self).__name__, op_str, type(other).__name__)) + raise TypeError( + 'unorderable types: %s() %s %s()' % + (type(self).__name__, op_str, type(other).__name__)) def __reduce__(self): args = (self.left, self.right, self.closed) @@ -143,29 +160,35 @@ cdef class Interval(IntervalMixin): @cython.wraparound(False) @cython.boundscheck(False) -cpdef interval_bounds_to_intervals(np.ndarray left, np.ndarray right, - str closed): - result = np.empty(len(left), dtype=object) - nulls = pd.isnull(left) | pd.isnull(right) - result[nulls] = np.nan - for i in np.flatnonzero(~nulls): - result[i] = Interval(left[i], right[i], closed) - return result +cpdef intervals_to_interval_bounds(ndarray intervals): + cdef: + object closed = None, interval + int64_t n = len(intervals) + ndarray left, right + ndarray[uint8_t] mask + left = np.empty(n, dtype=object) + right = np.empty(n, dtype=object) + mask = np.zeros(n, dtype=np.uint8) -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef intervals_to_interval_bounds(np.ndarray intervals): - left = np.empty(len(intervals), dtype=object) - right = np.empty(len(intervals), dtype=object) - cdef str closed = None for i in range(len(intervals)): interval = intervals[i] + if util._checknull(interval): + mask[i] = 1 + left[i] = np.nan + right[i] = np.nan + continue + + if not isinstance(interval, Interval): + raise TypeError("{} is not an interval".format(interval)) + left[i] = interval.left right[i] = interval.right if closed is None: closed = interval.closed elif closed != interval.closed: raise ValueError('intervals must all be closed on the same side') - return left, right, closed + return left, right, closed, mask.view(np.bool_) + +include "intervaltree.pxi" diff --git 
a/pandas/src/generate_intervaltree.py b/pandas/src/intervaltree.pxi.in similarity index 68% rename from pandas/src/generate_intervaltree.py rename to pandas/src/intervaltree.pxi.in index 275a0d40e2433..a15a89302994c 100644 --- a/pandas/src/generate_intervaltree.py +++ b/pandas/src/intervaltree.pxi.in @@ -1,22 +1,9 @@ """ -This file generates `intervaltree.pyx` which is then included in `../lib.pyx` -during building. To regenerate `intervaltree.pyx`, just run: +Template for intervaltree - `python generate_intervaltree.py`. -""" -from __future__ import print_function -import os -from pandas.compat import StringIO -import numpy as np - - -warning_to_new_contributors = """ -# DO NOT EDIT THIS FILE: This file was autogenerated from -# generate_intervaltree.py, so please edit that file and then run -# `python2 generate_intervaltree.py` to re-generate this file. +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -header = r''' from numpy cimport int64_t, float64_t from numpy cimport ndarray, PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take import numpy as np @@ -28,22 +15,27 @@ from hashtable cimport Int64Vector, Int64VectorData -ctypedef fused scalar64_t: +ctypedef fused scalar_t: float64_t + float32_t int64_t + int32_t -NODE_CLASSES = {} - +#---------------------------------------------------------------------- +# IntervalTree +#---------------------------------------------------------------------- cdef class IntervalTree(IntervalMixin): """A centered interval tree Based off the algorithm described on Wikipedia: http://en.wikipedia.org/wiki/Interval_tree + + we are emulating the IndexEngine interface """ cdef: - readonly object left, right, root + readonly object left, right, root, dtype readonly str closed object _left_sorter, _right_sorter @@ -67,15 +59,15 @@ def __init__(self, left, right, closed='right', leaf_size=100): left = np.asarray(left) right = np.asarray(right) - dtype = np.result_type(left, right) - self.left = np.asarray(left, 
dtype=dtype) - self.right = np.asarray(right, dtype=dtype) + self.dtype = np.result_type(left, right) + self.left = np.asarray(left, dtype=self.dtype) + self.right = np.asarray(right, dtype=self.dtype) indices = np.arange(len(left), dtype='int64') self.closed = closed - node_cls = NODE_CLASSES[str(dtype), closed] + node_cls = NODE_CLASSES[str(self.dtype), closed] self.root = node_cls(self.left, self.right, indices, leaf_size) @property @@ -94,7 +86,7 @@ def right_sorter(self): self._right_sorter = np.argsort(self.right) return self._right_sorter - def get_loc(self, scalar64_t key): + def get_loc(self, scalar_t key): """Return all positions corresponding to intervals that overlap with the given scalar key """ @@ -131,13 +123,14 @@ def get_loc_interval(self, key_left, key_right): uniques = pd.unique(combined) return uniques - def get_indexer(self, scalar64_t[:] target): + def get_indexer(self, scalar_t[:] target): """Return the positions corresponding to unique intervals that overlap with the given array of scalar targets. """ # TODO: write get_indexer_intervals cdef: - int64_t old_len, i + size_t old_len + Py_ssize_t i Int64Vector result result = Int64Vector() @@ -152,12 +145,13 @@ def get_indexer(self, scalar64_t[:] target): old_len = result.data.n return result.to_array() - def get_indexer_non_unique(self, scalar64_t[:] target): + def get_indexer_non_unique(self, scalar_t[:] target): """Return the positions corresponding to intervals that overlap with the given array of scalar targets. Non-unique positions are repeated. 
""" cdef: - int64_t old_len, i + size_t old_len + Py_ssize_t i Int64Vector result, missing result = Int64Vector() @@ -172,8 +166,14 @@ def get_indexer_non_unique(self, scalar64_t[:] target): return result.to_array(), missing.to_array() def __repr__(self): - return ('' - % self.root.n_elements) + return (''.format( + dtype=self.dtype, closed=self.closed, + n_elements=self.root.n_elements)) + + # compat with IndexEngine interface + def clear_mapping(self): + pass cdef take(ndarray source, ndarray indices): @@ -189,32 +189,57 @@ def __repr__(self): sorted_values = take(values, sorter) sorted_indices = take(indices, sorter) return sorted_values, sorted_indices -''' + +#---------------------------------------------------------------------- +# Nodes +#---------------------------------------------------------------------- # we need specialized nodes and leaves to optimize for different dtype and # closed values -# unfortunately, fused dtypes can't parameterize attributes on extension types, -# so we're stuck using template generation. -node_template = r''' -cdef class {dtype_title}Closed{closed_title}IntervalNode: +{{py: + +nodes = [] +for dtype in ['float32', 'float64', 'int32', 'int64']: + for closed, cmp_left, cmp_right in [ + ('left', '<=', '<'), + ('right', '<', '<='), + ('both', '<=', '<='), + ('neither', '<', '<')]: + cmp_left_converse = '<' if cmp_left == '<=' else '<=' + cmp_right_converse = '<' if cmp_right == '<=' else '<=' + nodes.append((dtype, dtype.title(), + closed, closed.title(), + cmp_left, + cmp_right, + cmp_left_converse, + cmp_right_converse)) + +}} + +NODE_CLASSES = {} + +{{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right, + cmp_left_converse, cmp_right_converse in nodes}} + +cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: """Non-terminal node for an IntervalTree Categorizes intervals by those that fall to the left, those that fall to the right, and those that overlap with the pivot. 
""" cdef: - {dtype_title}Closed{closed_title}IntervalNode left_node, right_node - {dtype}_t[:] center_left_values, center_right_values, left, right + {{dtype_title}}Closed{{closed_title}}IntervalNode left_node, right_node + {{dtype}}_t[:] center_left_values, center_right_values, left, right int64_t[:] center_left_indices, center_right_indices, indices - {dtype}_t min_left, max_right - readonly {dtype}_t pivot + {{dtype}}_t min_left, max_right + readonly {{dtype}}_t pivot readonly int64_t n_elements, n_center, leaf_size readonly bint is_leaf_node def __init__(self, - ndarray[{dtype}_t, ndim=1] left, - ndarray[{dtype}_t, ndim=1] right, + ndarray[{{dtype}}_t, ndim=1] left, + ndarray[{{dtype}}_t, ndim=1] right, ndarray[int64_t, ndim=1] indices, int64_t leaf_size): @@ -233,15 +258,18 @@ def __init__(self, self.left = left self.right = right self.indices = indices - self.n_center + self.n_center = 0 else: # calculate a pivot so we can create child nodes self.is_leaf_node = False self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) + left_set, right_set, center_set = self.classify_intervals( + left, right) - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) + self.left_node = self.new_child_node(left, right, + indices, left_set) + self.right_node = self.new_child_node(left, right, + indices, right_set) self.center_left_values, self.center_left_indices = \ sort_values_and_indices(left, indices, center_set) @@ -251,7 +279,7 @@ def __init__(self, @cython.wraparound(False) @cython.boundscheck(False) - cdef classify_intervals(self, {dtype}_t[:] left, {dtype}_t[:] right): + cdef classify_intervals(self, {{dtype}}_t[:] left, {{dtype}}_t[:] right): """Classify the given intervals based upon whether they fall to the left, right, or overlap with this node's pivot. 
""" @@ -264,9 +292,9 @@ def __init__(self, overlapping_ind = Int64Vector() for i in range(self.n_elements): - if right[i] {cmp_right_converse} self.pivot: + if right[i] {{cmp_right_converse}} self.pivot: left_ind.append(i) - elif self.pivot {cmp_left_converse} left[i]: + elif self.pivot {{cmp_left_converse}} left[i]: right_ind.append(i) else: overlapping_ind.append(i) @@ -276,8 +304,8 @@ def __init__(self, overlapping_ind.to_array()) cdef new_child_node(self, - ndarray[{dtype}_t, ndim=1] left, - ndarray[{dtype}_t, ndim=1] right, + ndarray[{{dtype}}_t, ndim=1] left, + ndarray[{{dtype}}_t, ndim=1] right, ndarray[int64_t, ndim=1] indices, ndarray[int64_t, ndim=1] subset): """Create a new child node. @@ -285,19 +313,19 @@ def __init__(self, left = take(left, subset) right = take(right, subset) indices = take(indices, subset) - return {dtype_title}Closed{closed_title}IntervalNode( + return {{dtype_title}}Closed{{closed_title}}IntervalNode( left, right, indices, self.leaf_size) @cython.wraparound(False) @cython.boundscheck(False) @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): + cpdef query(self, Int64Vector result, scalar_t point): """Recursively query this node and its sub-nodes for intervals that overlap with the query point. """ cdef: int64_t[:] indices - {dtype}_t[:] values + {{dtype}}_t[:] values Py_ssize_t i if self.is_leaf_node: @@ -305,7 +333,7 @@ def __init__(self, # continue the binary tree structure. Instead, we use linear # search. for i in range(self.n_elements): - if self.left[i] {cmp_left} point {cmp_right} self.right[i]: + if self.left[i] {{cmp_left}} point {{cmp_right}} self.right[i]: result.append(self.indices[i]) else: # There are child nodes. 
Based on comparing our query to the pivot, @@ -314,34 +342,35 @@ def __init__(self, values = self.center_left_values indices = self.center_left_indices for i in range(self.n_center): - if not values[i] {cmp_left} point: + if not values[i] {{cmp_left}} point: break result.append(indices[i]) - if point {cmp_right} self.left_node.max_right: + if point {{cmp_right}} self.left_node.max_right: self.left_node.query(result, point) elif point > self.pivot: values = self.center_right_values indices = self.center_right_indices for i in range(self.n_center - 1, -1, -1): - if not point {cmp_right} values[i]: + if not point {{cmp_right}} values[i]: break result.append(indices[i]) - if self.right_node.min_left {cmp_left} point: + if self.right_node.min_left {{cmp_left}} point: self.right_node.query(result, point) else: result.extend(self.center_left_indices) def __repr__(self): if self.is_leaf_node: - return ('<{dtype_title}Closed{closed_title}IntervalNode: ' + return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: ' '%s elements (terminal)>' % self.n_elements) else: n_left = self.left_node.n_elements n_right = self.right_node.n_elements n_center = self.n_elements - n_left - n_right - return ('<{dtype_title}Closed{closed_title}IntervalNode: pivot %s, ' - '%s elements (%s left, %s right, %s overlapping)>' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) + return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: ' + 'pivot %s, %s elements (%s left, %s right, %s ' + 'overlapping)>' % (self.pivot, self.n_elements, + n_left, n_right, n_center)) def counts(self): if self.is_leaf_node: @@ -352,44 +381,7 @@ def counts(self): r = self.right_node.counts() return (m, (l, r)) -NODE_CLASSES['{dtype}', '{closed}'] = {dtype_title}Closed{closed_title}IntervalNode -''' - - -def generate_node_template(): - output = StringIO() - for dtype in ['float64', 'int64']: - for closed, cmp_left, cmp_right in [ - ('left', '<=', '<'), - ('right', '<', '<='), - ('both', '<=', '<='), - 
('neither', '<', '<')]: - cmp_left_converse = '<' if cmp_left == '<=' else '<=' - cmp_right_converse = '<' if cmp_right == '<=' else '<=' - classes = node_template.format(dtype=dtype, - dtype_title=dtype.title(), - closed=closed, - closed_title=closed.title(), - cmp_left=cmp_left, - cmp_right=cmp_right, - cmp_left_converse=cmp_left_converse, - cmp_right_converse=cmp_right_converse) - output.write(classes) - output.write("\n") - return output.getvalue() - - -def generate_cython_file(): - # Put `intervaltree.pyx` in the same directory as this file - directory = os.path.dirname(os.path.realpath(__file__)) - filename = 'intervaltree.pyx' - path = os.path.join(directory, filename) - - with open(path, 'w') as f: - print(warning_to_new_contributors, file=f) - print(header, file=f) - print(generate_node_template(), file=f) - - -if __name__ == '__main__': - generate_cython_file() +NODE_CLASSES['{{dtype}}', + '{{closed}}'] = {{dtype_title}}Closed{{closed_title}}IntervalNode + +{{endfor}} diff --git a/pandas/src/intervaltree.pyx b/pandas/src/intervaltree.pyx deleted file mode 100644 index 55782c930d4f8..0000000000000 --- a/pandas/src/intervaltree.pyx +++ /dev/null @@ -1,1444 +0,0 @@ - -# DO NOT EDIT THIS FILE: This file was autogenerated from -# generate_intervaltree.py, so please edit that file and then run -# `python2 generate_intervaltree.py` to re-generate this file. 
- - -from numpy cimport int64_t, float64_t -from numpy cimport ndarray, PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take -import numpy as np - -cimport cython -cimport numpy as cnp -cnp.import_array() - -from hashtable cimport Int64Vector, Int64VectorData - - -ctypedef fused scalar64_t: - float64_t - int64_t - - -NODE_CLASSES = {} - - -cdef class IntervalTree(IntervalMixin): - """A centered interval tree - - Based off the algorithm described on Wikipedia: - http://en.wikipedia.org/wiki/Interval_tree - """ - cdef: - readonly object left, right, root - readonly str closed - object _left_sorter, _right_sorter - - def __init__(self, left, right, closed='right', leaf_size=100): - """ - Parameters - ---------- - left, right : np.ndarray[ndim=1] - Left and right bounds for each interval. Assumed to contain no - NaNs. - closed : {'left', 'right', 'both', 'neither'}, optional - Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. - leaf_size : int, optional - Parameter that controls when the tree switches from creating nodes - to brute-force search. Tune this parameter to optimize query - performance. 
- """ - if closed not in ['left', 'right', 'both', 'neither']: - raise ValueError("invalid option for 'closed': %s" % closed) - - left = np.asarray(left) - right = np.asarray(right) - dtype = np.result_type(left, right) - self.left = np.asarray(left, dtype=dtype) - self.right = np.asarray(right, dtype=dtype) - - indices = np.arange(len(left), dtype='int64') - - self.closed = closed - - node_cls = NODE_CLASSES[str(dtype), closed] - self.root = node_cls(self.left, self.right, indices, leaf_size) - - @property - def left_sorter(self): - """How to sort the left labels; this is used for binary search - """ - if self._left_sorter is None: - self._left_sorter = np.argsort(self.left) - return self._left_sorter - - @property - def right_sorter(self): - """How to sort the right labels - """ - if self._right_sorter is None: - self._right_sorter = np.argsort(self.right) - return self._right_sorter - - def get_loc(self, scalar64_t key): - """Return all positions corresponding to intervals that overlap with - the given scalar key - """ - result = Int64Vector() - self.root.query(result, key) - if not result.data.n: - raise KeyError(key) - return result.to_array() - - def _get_partial_overlap(self, key_left, key_right, side): - """Return all positions corresponding to intervals with the given side - falling between the left and right bounds of an interval query - """ - if side == 'left': - values = self.left - sorter = self.left_sorter - else: - values = self.right - sorter = self.right_sorter - key = [key_left, key_right] - i, j = values.searchsorted(key, sorter=sorter) - return sorter[i:j] - - def get_loc_interval(self, key_left, key_right): - """Lookup the intervals enclosed in the given interval bounds - - The given interval is presumed to have closed bounds. 
- """ - import pandas as pd - left_overlap = self._get_partial_overlap(key_left, key_right, 'left') - right_overlap = self._get_partial_overlap(key_left, key_right, 'right') - enclosing = self.get_loc(0.5 * (key_left + key_right)) - combined = np.concatenate([left_overlap, right_overlap, enclosing]) - uniques = pd.unique(combined) - return uniques - - def get_indexer(self, scalar64_t[:] target): - """Return the positions corresponding to unique intervals that overlap - with the given array of scalar targets. - """ - # TODO: write get_indexer_intervals - cdef: - int64_t old_len, i - Int64Vector result - - result = Int64Vector() - old_len = 0 - for i in range(len(target)): - self.root.query(result, target[i]) - if result.data.n == old_len: - result.append(-1) - elif result.data.n > old_len + 1: - raise KeyError( - 'indexer does not intersect a unique set of intervals') - old_len = result.data.n - return result.to_array() - - def get_indexer_non_unique(self, scalar64_t[:] target): - """Return the positions corresponding to intervals that overlap with - the given array of scalar targets. Non-unique positions are repeated. 
- """ - cdef: - int64_t old_len, i - Int64Vector result, missing - - result = Int64Vector() - missing = Int64Vector() - old_len = 0 - for i in range(len(target)): - self.root.query(result, target[i]) - if result.data.n == old_len: - result.append(-1) - missing.append(i) - old_len = result.data.n - return result.to_array(), missing.to_array() - - def __repr__(self): - return ('' - % self.root.n_elements) - - -cdef take(ndarray source, ndarray indices): - """Take the given positions from a 1D ndarray - """ - return PyArray_Take(source, indices, 0) - - -cdef sort_values_and_indices(all_values, all_indices, subset): - indices = take(all_indices, subset) - values = take(all_values, subset) - sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT) - sorted_values = take(values, sorter) - sorted_indices = take(indices, sorter) - return sorted_values, sorted_indices - - -cdef class Float64ClosedLeftIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Float64ClosedLeftIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedLeftIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'left'] = Float64ClosedLeftIntervalNode - - -cdef class Float64ClosedRightIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Float64ClosedRightIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedRightIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'right'] = Float64ClosedRightIntervalNode - - -cdef class Float64ClosedBothIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Float64ClosedBothIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedBothIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'both'] = Float64ClosedBothIntervalNode - - -cdef class Float64ClosedNeitherIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Float64ClosedNeitherIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedNeitherIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'neither'] = Float64ClosedNeitherIntervalNode - - -cdef class Int64ClosedLeftIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Int64ClosedLeftIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedLeftIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'left'] = Int64ClosedLeftIntervalNode - - -cdef class Int64ClosedRightIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Int64ClosedRightIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedRightIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'right'] = Int64ClosedRightIntervalNode - - -cdef class Int64ClosedBothIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Int64ClosedBothIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedBothIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'both'] = Int64ClosedBothIntervalNode - - -cdef class Int64ClosedNeitherIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Int64ClosedNeitherIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedNeitherIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'neither'] = Int64ClosedNeitherIntervalNode - - diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 90a0c1d5c9347..ee3605fb3d861 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -54,7 +54,7 @@ class TestPDApi(Base, tm.TestCase): 'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index', 'Series', 'SparseArray', 'SparseDataFrame', 'SparseSeries', 'TimeGrouper', 'Timedelta', - 'TimedeltaIndex', 'Timestamp'] + 'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex'] # these are already deprecated; awaiting removal deprecated_classes = ['TimeSeries', 'WidePanel', @@ -72,7 +72,7 @@ class TestPDApi(Base, tm.TestCase): # top-level functions funcs = ['bdate_range', 'concat', 'crosstab', 'cut', - 'date_range', 'eval', + 'date_range', 'interval_range', 'eval', 'factorize', 'get_dummies', 'get_store', 'infer_freq', 'isnull', 
'lreshape', 'match', 'melt', 'notnull', 'offsets', @@ -154,6 +154,7 @@ class TestTypes(Base, tm.TestCase): 'is_string_dtype', 'is_signed_integer_dtype', 'is_timedelta64_dtype', 'is_timedelta64_ns_dtype', 'is_unsigned_integer_dtype', 'is_period', + 'is_interval', 'is_interval_dtype', 'is_period_dtype', 'is_re', 'is_re_compilable', 'is_dict_like', 'is_iterator', 'is_list_like', 'is_hashable', diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index e84bb6407fafc..e5940fa1f015b 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -8,7 +8,7 @@ from pandas.compat import lrange from pandas import (DataFrame, Series, Index, MultiIndex, - RangeIndex, date_range) + RangeIndex, date_range, IntervalIndex) import pandas as pd from pandas.util.testing import (assert_series_equal, @@ -295,6 +295,17 @@ def test_set_index_dst(self): exp = pd.DataFrame({'b': [3, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp) + def test_reset_index_with_intervals(self): + idx = pd.IntervalIndex.from_breaks(np.arange(11), name='x') + original = pd.DataFrame({'x': idx, 'y': np.arange(10)})[['x', 'y']] + + result = original.set_index('x') + expected = pd.DataFrame({'y': np.arange(10)}, index=idx) + assert_frame_equal(result, expected) + + result2 = result.reset_index() + assert_frame_equal(result2, original) + def test_set_index_multiindexcolumns(self): columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)]) df = DataFrame(np.random.randn(3, 3), columns=columns) @@ -703,3 +714,37 @@ def test_set_index_preserve_categorical_dtype(self): result = df.set_index(cols).reset_index() result = result.reindex(columns=df.columns) tm.assert_frame_equal(result, df) + + +class TestIntervalIndex(tm.TestCase): + + def test_setitem(self): + + df = DataFrame({'A': range(10)}) + s = pd.cut(df.A, 5) + self.assertIsInstance(s.cat.categories, IntervalIndex) + + # these should end up the same, namely + # an object 
array of Intervals + df['B'] = s + df['C'] = np.array(s) + df['D'] = s.values + df['E'] = np.array(s.values) + + self.assertTrue(df['B'].dtype == 'object') + self.assertTrue(df['C'].dtype == 'object') + self.assertTrue(df['D'].dtype == 'object') + self.assertTrue(df['E'].dtype == 'object') + + tm.assert_series_equal(df['B'], df['C'], check_names=False) + tm.assert_series_equal(df['B'], df['D'], check_names=False) + tm.assert_series_equal(df['B'], df['E'], check_names=False) + + def test_set_reset_index(self): + + df = DataFrame({'A': range(10)}) + s = pd.cut(df.A, 5) + df['B'] = s + df = df.set_index('B') + + df = df.reset_index() diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index eebd0e0f490c1..bb1764f41e174 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import (Index, MultiIndex, CategoricalIndex, - DataFrame, Categorical, Series) + DataFrame, Categorical, Series, Interval) from pandas.util.testing import assert_frame_equal, assert_series_equal import pandas.util.testing as tm from .common import MixIn @@ -495,7 +495,8 @@ def test_groupby_categorical_two_columns(self): res = groups_double_key.agg('mean') nan = np.nan idx = MultiIndex.from_product( - [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), + [Categorical([Interval(1, 2), Interval(2, 3), + Interval(3, 6)], ordered=True), [1, 2, 3, 4]], names=["cat", "C2"]) exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 51b0b17b65243..e6a75ae0acab3 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -858,11 +858,13 @@ def test_get_group_empty_bins(self): bins = [0, 5, 10, 15] g = d.groupby(pd.cut(d[0], bins)) - result = g.get_group('(0, 5]') + # TODO: should prob allow a str of Interval work as well + # IOW '(0, 5]' + 
result = g.get_group(pd.Interval(0, 5)) expected = DataFrame([3, 1], index=[0, 1]) assert_frame_equal(result, expected) - self.assertRaises(KeyError, lambda: g.get_group('(10, 15]')) + self.assertRaises(KeyError, lambda: g.get_group(pd.Interval(10, 15))) def test_get_group_grouped_by_tuple(self): # GH 8121 @@ -4074,49 +4076,6 @@ def test_transform_doesnt_clobber_ints(self): expected = gb2.transform('mean') tm.assert_frame_equal(result, expected) - def test_groupby_categorical_two_columns(self): - - # https://github.com/pydata/pandas/issues/8138 - d = {'cat': pd.Categorical(["a","b","a","b"], categories=["a", "b", "c"], ordered=True), - 'ints': [1, 1, 2, 2],'val': [10, 20, 30, 40]} - test = pd.DataFrame(d) - - # Grouping on a single column - groups_single_key = test.groupby("cat") - res = groups_single_key.agg('mean') - exp = DataFrame({"ints":[1.5,1.5,np.nan], "val":[20,30,np.nan]}, - index=pd.CategoricalIndex(["a", "b", "c"], name="cat")) - tm.assert_frame_equal(res, exp) - - # Grouping on two columns - groups_double_key = test.groupby(["cat","ints"]) - res = groups_double_key.agg('mean') - exp = DataFrame({"val":[10,30,20,40,np.nan,np.nan], - "cat": ["a","a","b","b","c","c"], - "ints": [1,2,1,2,1,2]}).set_index(["cat","ints"]) - tm.assert_frame_equal(res, exp) - - # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: - c, i = key - result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] - assert_frame_equal(result, expected) - - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 3, 6], labels=pd.Categorical(['a', 'b', 'c'])) - values.name = "cat" - groups_double_key = test.groupby([values,'C2']) - - res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product([['a', 'b', 'c'], [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1":[nan,nan,nan,nan, 3, 3,nan,nan, nan,nan, 4, 5], - "C3":[nan,nan,nan,nan, 
10,100,nan,nan, nan,nan,200,34]}, index=idx) - tm.assert_frame_equal(res, exp) - def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 81ad0524807f3..8ecf0f3629f8c 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -7,7 +7,8 @@ from pandas import (Series, Index, Float64Index, Int64Index, UInt64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, - TimedeltaIndex, PeriodIndex, notnull, isnull) + TimedeltaIndex, PeriodIndex, IntervalIndex, + notnull, isnull) from pandas.types.common import needs_i8_conversion from pandas.util.testing import assertRaisesRegexp @@ -246,18 +247,21 @@ def test_ensure_copied_data(self): tm.assert_numpy_array_equal(index.values, result.values, check_same='copy') - if not isinstance(index, PeriodIndex): - result = index_type(index.values, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(index.values, result.values, - check_same='same') - tm.assert_numpy_array_equal(index._values, result._values, - check_same='same') - else: + if isinstance(index, PeriodIndex): # .values an object array of Period, thus copied result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index._values, result._values, check_same='same') + elif isinstance(index, IntervalIndex): + # checked in test_interval.py + pass + else: + result = index_type(index.values, copy=False, **init_kwargs) + tm.assert_numpy_array_equal(index.values, result.values, + check_same='same') + tm.assert_numpy_array_equal(index._values, result._values, + check_same='same') def test_copy_and_deepcopy(self): from copy import copy, deepcopy @@ -374,8 +378,9 @@ def test_memory_usage(self): result2 = index.memory_usage() result3 = index.memory_usage(deep=True) - # RangeIndex doesn't use a hashtable engine - if not isinstance(index, RangeIndex): + # 
RangeIndex, IntervalIndex + # don't have engines + if not isinstance(index, (RangeIndex, IntervalIndex)): self.assertTrue(result2 > result) if index.inferred_type == 'object': diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2f5b98d145e57..59af490eb5db1 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -16,7 +16,7 @@ from pandas import (period_range, date_range, Series, DataFrame, Float64Index, Int64Index, CategoricalIndex, DatetimeIndex, TimedeltaIndex, - PeriodIndex) + PeriodIndex, isnull) from pandas.core.index import _get_combined_index from pandas.util.testing import assert_almost_equal from pandas.compat.numpy import np_datetime64_compat @@ -498,7 +498,7 @@ def test_is_(self): def test_asof(self): d = self.dateIndex[0] self.assertEqual(self.dateIndex.asof(d), d) - self.assertTrue(np.isnan(self.dateIndex.asof(d - timedelta(1)))) + self.assertTrue(isnull(self.dateIndex.asof(d - timedelta(1)))) d = self.dateIndex[-1] self.assertEqual(self.dateIndex.asof(d + timedelta(1)), d) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 6b6885c082533..3643b355cde6a 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -11,7 +11,7 @@ import numpy as np -from pandas import Categorical, compat, notnull +from pandas import Categorical, IntervalIndex, compat, notnull from pandas.util.testing import assert_almost_equal import pandas.core.config as cf import pandas as pd @@ -339,6 +339,20 @@ def test_astype(self): self.assertIsInstance(result, Index) self.assertNotIsInstance(result, CategoricalIndex) + # interval + ii = IntervalIndex(left=[-0.001, 2.0], + right=[2, 4], + closed='right') + + ci = CategoricalIndex(Categorical.from_codes([0, 1, -1], categories=ii, ordered=True)) + + result = ci.astype('interval') + expected = ii.take([0, 1, -1]) + tm.assert_index_equal(result, expected) + + result = 
IntervalIndex.from_intervals(result.values) + tm.assert_index_equal(result, expected) + def test_reindex_base(self): # determined by cat ordering diff --git a/pandas/tests/test_interval.py b/pandas/tests/indexes/test_interval.py similarity index 52% rename from pandas/tests/test_interval.py rename to pandas/tests/indexes/test_interval.py index 1b52e2629b38c..3b72a476c5cb5 100644 --- a/pandas/tests/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -1,193 +1,25 @@ from __future__ import division import numpy as np -from pandas.core.interval import Interval, IntervalIndex -from pandas.core.index import Index -from pandas.lib import IntervalTree - +from pandas import (Interval, IntervalIndex, Index, + interval_range, Timestamp, Timedelta) +from pandas._interval import IntervalTree +from pandas.tests.indexes.common import Base import pandas.util.testing as tm import pandas as pd -class TestInterval(tm.TestCase): - def setUp(self): - self.interval = Interval(0, 1) - - def test_properties(self): - self.assertEqual(self.interval.closed, 'right') - self.assertEqual(self.interval.left, 0) - self.assertEqual(self.interval.right, 1) - self.assertEqual(self.interval.mid, 0.5) - - def test_repr(self): - self.assertEqual(repr(self.interval), - "Interval(0, 1, closed='right')") - self.assertEqual(str(self.interval), "(0, 1]") - - interval_left = Interval(0, 1, closed='left') - self.assertEqual(repr(interval_left), - "Interval(0, 1, closed='left')") - self.assertEqual(str(interval_left), "[0, 1)") - - def test_contains(self): - self.assertIn(0.5, self.interval) - self.assertIn(1, self.interval) - self.assertNotIn(0, self.interval) - self.assertRaises(TypeError, lambda: self.interval in self.interval) - - interval = Interval(0, 1, closed='both') - self.assertIn(0, interval) - self.assertIn(1, interval) - - interval = Interval(0, 1, closed='neither') - self.assertNotIn(0, interval) - self.assertIn(0.5, interval) - self.assertNotIn(1, interval) - - def test_equal(self): 
- self.assertEqual(Interval(0, 1), Interval(0, 1, closed='right')) - self.assertNotEqual(Interval(0, 1), Interval(0, 1, closed='left')) - self.assertNotEqual(Interval(0, 1), 0) - - def test_comparison(self): - with self.assertRaisesRegexp(TypeError, 'unorderable types'): - Interval(0, 1) < 2 - - self.assertTrue(Interval(0, 1) < Interval(1, 2)) - self.assertTrue(Interval(0, 1) < Interval(0, 2)) - self.assertTrue(Interval(0, 1) < Interval(0.5, 1.5)) - self.assertTrue(Interval(0, 1) <= Interval(0, 1)) - self.assertTrue(Interval(0, 1) > Interval(-1, 2)) - self.assertTrue(Interval(0, 1) >= Interval(0, 1)) - - def test_hash(self): - # should not raise - hash(self.interval) - - def test_math_add(self): - expected = Interval(1, 2) - actual = self.interval + 1 - self.assertEqual(expected, actual) - - expected = Interval(1, 2) - actual = 1 + self.interval - self.assertEqual(expected, actual) - - actual = self.interval - actual += 1 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval + Interval(1, 2) - - def test_math_sub(self): - expected = Interval(-1, 0) - actual = self.interval - 1 - self.assertEqual(expected, actual) - - actual = self.interval - actual -= 1 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval - Interval(1, 2) - - def test_math_mult(self): - expected = Interval(0, 2) - actual = self.interval * 2 - self.assertEqual(expected, actual) - - expected = Interval(0, 2) - actual = 2 * self.interval - self.assertEqual(expected, actual) - - actual = self.interval - actual *= 2 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval * Interval(1, 2) - - def test_math_div(self): - expected = Interval(0, 0.5) - actual = self.interval / 2.0 - self.assertEqual(expected, actual) - - actual = self.interval - actual /= 2.0 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval / Interval(1, 2) - - -class 
TestIntervalTree(tm.TestCase): - def setUp(self): - self.tree = IntervalTree(np.arange(5), np.arange(5) + 2) - - def test_get_loc(self): - self.assert_numpy_array_equal(self.tree.get_loc(1), [0]) - self.assert_numpy_array_equal(np.sort(self.tree.get_loc(2)), [0, 1]) - with self.assertRaises(KeyError): - self.tree.get_loc(-1) - - def test_get_indexer(self): - self.assert_numpy_array_equal( - self.tree.get_indexer(np.array([1.0, 5.5, 6.5])), [0, 4, -1]) - with self.assertRaises(KeyError): - self.tree.get_indexer(np.array([3.0])) - - def test_get_indexer_non_unique(self): - indexer, missing = self.tree.get_indexer_non_unique( - np.array([1.0, 2.0, 6.5])) - self.assert_numpy_array_equal(indexer[:1], [0]) - self.assert_numpy_array_equal(np.sort(indexer[1:3]), [0, 1]) - self.assert_numpy_array_equal(np.sort(indexer[3:]), [-1]) - self.assert_numpy_array_equal(missing, [2]) - - def test_duplicates(self): - tree = IntervalTree([0, 0, 0], [1, 1, 1]) - self.assert_numpy_array_equal(np.sort(tree.get_loc(0.5)), [0, 1, 2]) - - with self.assertRaises(KeyError): - tree.get_indexer(np.array([0.5])) - - indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) - self.assert_numpy_array_equal(np.sort(indexer), [0, 1, 2]) - self.assert_numpy_array_equal(missing, []) - - def test_get_loc_closed(self): - for closed in ['left', 'right', 'both', 'neither']: - tree = IntervalTree([0], [1], closed=closed) - for p, errors in [(0, tree.open_left), - (1, tree.open_right)]: - if errors: - with self.assertRaises(KeyError): - tree.get_loc(p) - else: - self.assert_numpy_array_equal(tree.get_loc(p), - np.array([0])) - - def test_get_indexer_closed(self): - x = np.arange(1000) - found = x - not_found = -np.ones(1000) - for leaf_size in [1, 10, 100, 10000]: - for closed in ['left', 'right', 'both', 'neither']: - tree = IntervalTree(x, x + 0.5, closed=closed, - leaf_size=leaf_size) - self.assert_numpy_array_equal(found, tree.get_indexer(x + 0.25)) - - expected = found if tree.closed_left else 
not_found - self.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.0)) - - expected = found if tree.closed_right else not_found - self.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5)) - +class TestIntervalIndex(Base, tm.TestCase): + _holder = IntervalIndex -class TestIntervalIndex(tm.TestCase): def setUp(self): self.index = IntervalIndex([0, 1], [1, 2]) + self.index_with_nan = IntervalIndex.from_tuples( + [(0, 1), np.nan, (1, 2)]) + self.indices = dict(intervalIndex=tm.makeIntervalIndex(10)) + + def create_index(self): + return IntervalIndex.from_breaks(np.arange(10)) def test_constructors(self): expected = self.index @@ -200,7 +32,8 @@ def test_constructors(self): actual = IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) self.assertTrue(expected.equals(actual)) - self.assertRaises(ValueError, IntervalIndex, [0], [1], closed='invalid') + self.assertRaises(ValueError, IntervalIndex, [0], [1], + closed='invalid') # TODO: fix all these commented out tests (here and below) @@ -223,24 +56,74 @@ def test_constructors(self): # self.assertRaises(ValueError, IntervalIndex.from_breaks, # pd.period_range('2000-01-01', periods=3)) - def test_properties(self): - self.assertEqual(len(self.index), 2) - self.assertEqual(self.index.size, 2) + def test_constructors_datetimelike(self): + + # DTI / TDI + for idx in [pd.date_range('20130101', periods=5), + pd.timedelta_range('1 day', periods=5)]: + result = IntervalIndex.from_breaks(idx) + expected = IntervalIndex.from_breaks(idx.values) + tm.assert_index_equal(result, expected) + + expected_scalar_type = type(idx[0]) + i = result[0] + self.assertTrue(isinstance(i.left, expected_scalar_type)) + self.assertTrue(isinstance(i.right, expected_scalar_type)) + + def test_constructors_error(self): - self.assert_numpy_array_equal(self.index.left, [0, 1]) - self.assertIsInstance(self.index.left, Index) + # non-intervals + def f(): + IntervalIndex.from_intervals([0.997, 4.0]) + self.assertRaises(TypeError, f) - 
self.assert_numpy_array_equal(self.index.right, [1, 2]) - self.assertIsInstance(self.index.right, Index) + def test_properties(self): + index = self.index + self.assertEqual(len(index), 2) + self.assertEqual(index.size, 2) + self.assertEqual(index.shape, (2, )) - self.assert_numpy_array_equal(self.index.mid, [0.5, 1.5]) - self.assertIsInstance(self.index.mid, Index) + self.assert_index_equal(index.left, Index([0, 1])) + self.assert_index_equal(index.right, Index([1, 2])) + self.assert_index_equal(index.mid, Index([0.5, 1.5])) - self.assertEqual(self.index.closed, 'right') + self.assertEqual(index.closed, 'right') expected = np.array([Interval(0, 1), Interval(1, 2)], dtype=object) - self.assert_numpy_array_equal(np.asarray(self.index), expected) - self.assert_numpy_array_equal(self.index.values, expected) + self.assert_numpy_array_equal(np.asarray(index), expected) + self.assert_numpy_array_equal(index.values, expected) + + # with nans + index = self.index_with_nan + self.assertEqual(len(index), 3) + self.assertEqual(index.size, 3) + self.assertEqual(index.shape, (3, )) + + self.assert_index_equal(index.left, Index([0, np.nan, 1])) + self.assert_index_equal(index.right, Index([1, np.nan, 2])) + self.assert_index_equal(index.mid, Index([0.5, np.nan, 1.5])) + + self.assertEqual(index.closed, 'right') + + expected = np.array([Interval(0, 1), np.nan, + Interval(1, 2)], dtype=object) + self.assert_numpy_array_equal(np.asarray(index), expected) + self.assert_numpy_array_equal(index.values, expected) + + def test_with_nans(self): + index = self.index + self.assertFalse(index.hasnans) + self.assert_numpy_array_equal(index.isnull(), + np.array([False, False])) + self.assert_numpy_array_equal(index.notnull(), + np.array([True, True])) + + index = self.index_with_nan + self.assertTrue(index.hasnans) + self.assert_numpy_array_equal(index.notnull(), + np.array([True, False, True])) + self.assert_numpy_array_equal(index.isnull(), + np.array([False, True, False])) def 
test_copy(self): actual = self.index.copy() @@ -250,6 +133,66 @@ def test_copy(self): self.assertTrue(actual.equals(self.index)) self.assertIsNot(actual.left, self.index.left) + def test_ensure_copied_data(self): + # exercise the copy flag in the constructor + + # not copying + index = self.index + result = IntervalIndex(index, copy=False) + tm.assert_numpy_array_equal(index.left.values, result.left.values, + check_same='same') + tm.assert_numpy_array_equal(index.right.values, result.right.values, + check_same='same') + + # by-definition make a copy + result = IntervalIndex(index.values, copy=False) + tm.assert_numpy_array_equal(index.left.values, result.left.values, + check_same='copy') + tm.assert_numpy_array_equal(index.right.values, result.right.values, + check_same='copy') + + def test_equals(self): + + idx = self.index + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + + self.assertFalse(idx.equals(idx.astype(object))) + self.assertFalse(idx.equals(np.array(idx))) + self.assertFalse(idx.equals(list(idx))) + + self.assertFalse(idx.equals([1, 2])) + self.assertFalse(idx.equals(np.array([1, 2]))) + self.assertFalse(idx.equals( + pd.date_range('20130101', periods=2))) + + def test_astype(self): + + idx = self.index + + for dtype in [np.int64, np.float64, 'datetime64[ns]', + 'datetime64[ns, US/Eastern]', 'timedelta64', + 'period[M]']: + self.assertRaises(ValueError, idx.astype, dtype) + + result = idx.astype(object) + tm.assert_index_equal(result, Index(idx.values, dtype='object')) + self.assertFalse(idx.equals(result)) + self.assertTrue(idx.equals(IntervalIndex.from_intervals(result))) + + result = idx.astype('interval') + tm.assert_index_equal(result, idx) + self.assertTrue(result.equals(idx)) + + result = idx.astype('category') + expected = pd.Categorical(idx, ordered=True) + tm.assert_categorical_equal(result, expected) + + def test_where(self): + self.assertRaises(NotImplementedError, + self.index.where, + self.index.notnull()) + 
def test_delete(self): expected = IntervalIndex.from_breaks([1, 2]) actual = self.index.delete(0) @@ -289,8 +232,10 @@ def test_monotonic_and_unique(self): self.assertTrue(idx.is_monotonic) def test_repr(self): - expected = ("IntervalIndex(left=[0, 1],\n right=[1, 2]," - "\n closed='right')") + expected = ("IntervalIndex(left=[0, 1]," + "\n right=[1, 2]," + "\n closed='right'," + "\n dtype='interval[int64]')") IntervalIndex((0, 1), (1, 2), closed='right') self.assertEqual(repr(self.index), expected) @@ -306,8 +251,10 @@ def test_get_loc_value(self): idx = IntervalIndex.from_tuples([(0, 2), (1, 3)]) self.assertEqual(idx.get_loc(0.5), 0) self.assertEqual(idx.get_loc(1), 0) - self.assert_numpy_array_equal(idx.get_loc(1.5), [0, 1]) - self.assert_numpy_array_equal(np.sort(idx.get_loc(2)), [0, 1]) + self.assert_numpy_array_equal(idx.get_loc(1.5), + np.array([0, 1], dtype='int64')) + self.assert_numpy_array_equal(np.sort(idx.get_loc(2)), + np.array([0, 1], dtype='int64')) self.assertEqual(idx.get_loc(3), 1) self.assertRaises(KeyError, idx.get_loc, 3.5) @@ -374,24 +321,25 @@ def test_get_loc_interval(self): self.assertEqual(self.index.get_loc(Interval(0, 0.5)), 0) self.assertEqual(self.index.get_loc(Interval(0, 1, 'left')), 0) self.assertRaises(KeyError, self.index.get_loc, Interval(2, 3)) - self.assertRaises(KeyError, self.index.get_loc, Interval(-1, 0, 'left')) + self.assertRaises(KeyError, self.index.get_loc, + Interval(-1, 0, 'left')) def test_get_indexer(self): actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) - expected = [-1, -1, 0, 0, 1, 1, -1] + expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='int64') self.assert_numpy_array_equal(actual, expected) actual = self.index.get_indexer(self.index) - expected = [0, 1] + expected = np.array([0, 1], dtype='int64') self.assert_numpy_array_equal(actual, expected) index = IntervalIndex.from_breaks([0, 1, 2], closed='left') actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) - expected = [-1, 0, 0, 1, 1, -1, 
-1] + expected = np.array([-1, 0, 0, 1, 1, -1, -1], dtype='int64') self.assert_numpy_array_equal(actual, expected) actual = self.index.get_indexer(index[:1]) - expected = [0] + expected = np.array([0], dtype='int64') self.assert_numpy_array_equal(actual, expected) self.assertRaises(ValueError, self.index.get_indexer, index) @@ -400,19 +348,19 @@ def test_get_indexer_subintervals(self): # return indexers for wholly contained subintervals target = IntervalIndex.from_breaks(np.linspace(0, 2, 5)) actual = self.index.get_indexer(target) - expected = [0, 0, 1, 1] + expected = np.array([0, 0, 1, 1], dtype='int64') self.assert_numpy_array_equal(actual, expected) target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2]) self.assertRaises(ValueError, self.index.get_indexer, target) actual = self.index.get_indexer(target[[0, -1]]) - expected = [0, 1] + expected = np.array([0, 1], dtype='int64') self.assert_numpy_array_equal(actual, expected) target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left') actual = self.index.get_indexer(target) - expected = [0, 0, 0] + expected = np.array([0, 0, 0], dtype='int64') self.assert_numpy_array_equal(actual, expected) def test_contains(self): @@ -430,7 +378,7 @@ def test_non_contiguous(self): index = IntervalIndex.from_tuples([(0, 1), (2, 3)]) target = [0.5, 1.5, 2.5] actual = index.get_indexer(target) - expected = [0, -1, 1] + expected = np.array([0, -1, 1], dtype='int64') self.assert_numpy_array_equal(actual, expected) self.assertNotIn(1.5, index) @@ -444,9 +392,9 @@ def test_union(self): actual = other.union(self.index) self.assertTrue(expected.equals(actual)) - self.assert_numpy_array_equal(self.index.union(self.index), self.index) - self.assert_numpy_array_equal(self.index.union(self.index[:1]), - self.index) + tm.assert_index_equal(self.index.union(self.index), self.index) + tm.assert_index_equal(self.index.union(self.index[:1]), + self.index) def test_intersection(self): other = IntervalIndex.from_breaks([1, 2, 3]) @@ 
-454,16 +402,17 @@ def test_intersection(self): actual = self.index.intersection(other) self.assertTrue(expected.equals(actual)) - self.assert_numpy_array_equal(self.index.intersection(self.index), - self.index) + tm.assert_index_equal(self.index.intersection(self.index), + self.index) def test_difference(self): - self.assert_numpy_array_equal(self.index.difference(self.index[:1]), - self.index[1:]) + tm.assert_index_equal(self.index.difference(self.index[:1]), + self.index[1:]) - def test_sym_diff(self): - self.assert_numpy_array_equal(self.index[:1].sym_diff(self.index[1:]), - self.index) + def test_symmetric_difference(self): + result = self.index[:1].symmetric_difference(self.index[1:]) + expected = self.index + tm.assert_index_equal(result, expected) def test_set_operation_errors(self): self.assertRaises(ValueError, self.index.union, self.index.left) @@ -473,24 +422,24 @@ def test_set_operation_errors(self): def test_isin(self): actual = self.index.isin(self.index) - self.assert_numpy_array_equal([True, True], actual) + self.assert_numpy_array_equal(np.array([True, True]), actual) actual = self.index.isin(self.index[:1]) - self.assert_numpy_array_equal([True, False], actual) + self.assert_numpy_array_equal(np.array([True, False]), actual) def test_comparison(self): actual = Interval(0, 1) < self.index - expected = [False, True] + expected = np.array([False, True]) self.assert_numpy_array_equal(actual, expected) actual = Interval(0.5, 1.5) < self.index - expected = [False, True] + expected = np.array([False, True]) self.assert_numpy_array_equal(actual, expected) actual = self.index > Interval(0.5, 1.5) self.assert_numpy_array_equal(actual, expected) actual = self.index == self.index - expected = [True, True] + expected = np.array([True, True]) self.assert_numpy_array_equal(actual, expected) actual = self.index <= self.index self.assert_numpy_array_equal(actual, expected) @@ -498,7 +447,7 @@ def test_comparison(self): self.assert_numpy_array_equal(actual, 
expected) actual = self.index < self.index - expected = [False, False] + expected = np.array([False, False]) self.assert_numpy_array_equal(actual, expected) actual = self.index > self.index self.assert_numpy_array_equal(actual, expected) @@ -507,23 +456,23 @@ def test_comparison(self): self.assert_numpy_array_equal(actual, expected) actual = self.index == self.index.values - self.assert_numpy_array_equal(actual, [True, True]) + self.assert_numpy_array_equal(actual, np.array([True, True])) actual = self.index.values == self.index - self.assert_numpy_array_equal(actual, [True, True]) + self.assert_numpy_array_equal(actual, np.array([True, True])) actual = self.index <= self.index.values - self.assert_numpy_array_equal(actual, [True, True]) + self.assert_numpy_array_equal(actual, np.array([True, True])) actual = self.index != self.index.values - self.assert_numpy_array_equal(actual, [False, False]) + self.assert_numpy_array_equal(actual, np.array([False, False])) actual = self.index > self.index.values - self.assert_numpy_array_equal(actual, [False, False]) + self.assert_numpy_array_equal(actual, np.array([False, False])) actual = self.index.values > self.index - self.assert_numpy_array_equal(actual, [False, False]) + self.assert_numpy_array_equal(actual, np.array([False, False])) # invalid comparisons actual = self.index == 0 - self.assert_numpy_array_equal(actual, [False, False]) + self.assert_numpy_array_equal(actual, np.array([False, False])) actual = self.index == self.index.left - self.assert_numpy_array_equal(actual, [False, False]) + self.assert_numpy_array_equal(actual, np.array([False, False])) with self.assertRaisesRegexp(TypeError, 'unorderable types'): self.index > 0 @@ -540,52 +489,176 @@ def test_missing_values(self): assert idx.equals(idx2) with tm.assertRaisesRegexp(ValueError, 'both left and right sides'): - pd.IntervalIndex([np.nan, 0, 1], [0, 1, 2]) + pd.IntervalIndex([np.nan, 0, 1], np.array([0, 1, 2])) - 
self.assert_numpy_array_equal(pd.isnull(idx), [True, False, False]) + self.assert_numpy_array_equal(pd.isnull(idx), + np.array([True, False, False])) - def test_order(self): + def test_sort_values(self): expected = IntervalIndex.from_breaks([1, 2, 3, 4]) - actual = IntervalIndex.from_tuples([(3, 4), (1, 2), (2, 3)]).order() - self.assert_numpy_array_equal(expected, actual) + actual = IntervalIndex.from_tuples([(3, 4), (1, 2), + (2, 3)]).sort_values() + tm.assert_index_equal(expected, actual) + + # nan + idx = self.index_with_nan + mask = idx.isnull() + self.assert_numpy_array_equal(mask, np.array([False, True, False])) + + result = idx.sort_values() + mask = result.isnull() + self.assert_numpy_array_equal(mask, np.array([False, True, False])) def test_datetime(self): dates = pd.date_range('2000', periods=3) idx = IntervalIndex.from_breaks(dates) - self.assert_numpy_array_equal(idx.left, dates[:2]) - self.assert_numpy_array_equal(idx.right, dates[-2:]) + tm.assert_index_equal(idx.left, dates[:2]) + tm.assert_index_equal(idx.right, dates[-2:]) expected = pd.date_range('2000-01-01T12:00', periods=2) - self.assert_numpy_array_equal(idx.mid, expected) + tm.assert_index_equal(idx.mid, expected) self.assertIn('2000-01-01T12', idx) target = pd.date_range('1999-12-31T12:00', periods=7, freq='12H') actual = idx.get_indexer(target) - expected = [-1, -1, 0, 0, 1, 1, -1] + expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='int64') self.assert_numpy_array_equal(actual, expected) - # def test_math(self): - # # add, subtract, multiply, divide with scalars should be OK - # actual = 2 * self.index + 1 - # expected = IntervalIndex.from_breaks((2 * np.arange(3) + 1)) - # self.assertTrue(expected.equals(actual)) + def test_append(self): + + index1 = IntervalIndex([0, 1], [1, 2]) + index2 = IntervalIndex([1, 2], [2, 3]) + + result = index1.append(index2) + expected = IntervalIndex([0, 1, 1, 2], [1, 2, 2, 3]) + tm.assert_index_equal(result, expected) + + result = index1.append([index1, 
index2]) + expected = IntervalIndex([0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3]) + tm.assert_index_equal(result, expected) + + def f(): + index1.append(IntervalIndex([0, 1], [1, 2], closed='both')) + + self.assertRaises(ValueError, f) + + +class TestIntervalRange(tm.TestCase): + + def test_construction(self): + result = interval_range(0, 5, name='foo', closed='both') + expected = IntervalIndex.from_breaks( + np.arange(0, 5), name='foo', closed='both') + tm.assert_index_equal(result, expected) + + def test_errors(self): + + # not enough params + def f(): + interval_range(0) + + self.assertRaises(ValueError, f) + + def f(): + interval_range(periods=2) + + self.assertRaises(ValueError, f) + + def f(): + interval_range() + + self.assertRaises(ValueError, f) + + # mixed units + def f(): + interval_range(0, Timestamp('20130101'), freq=2) - # actual = self.index / 2.0 - 1 - # expected = IntervalIndex.from_breaks((np.arange(3) / 2.0 - 1)) - # self.assertTrue(expected.equals(actual)) + self.assertRaises(ValueError, f) - # with self.assertRaises(TypeError): - # # doesn't make sense to add two IntervalIndex objects - # self.index + self.index + def f(): + interval_range(0, 10, freq=Timedelta('1day')) - # def test_datetime_math(self): + self.assertRaises(ValueError, f) - # expected = IntervalIndex(pd.date_range('2000-01-02', periods=3)) - # actual = idx + pd.to_timedelta(1, unit='D') - # self.assertTrue(expected.equals(actual)) - # TODO: other set operations (left join, right join, intersection), - # set operations with conflicting IntervalIndex objects or other dtypes, - # groupby, cut, reset_index... 
+class TestIntervalTree(tm.TestCase): + def setUp(self): + gentree = lambda dtype: IntervalTree(np.arange(5, dtype=dtype), + np.arange(5, dtype=dtype) + 2) + self.tree = gentree('int64') + self.trees = {dtype: gentree(dtype) + for dtype in ['int32', 'int64', 'float32', 'float64']} + + def test_get_loc(self): + for dtype, tree in self.trees.items(): + self.assert_numpy_array_equal(tree.get_loc(1), + np.array([0], dtype='int64')) + self.assert_numpy_array_equal(np.sort(tree.get_loc(2)), + np.array([0, 1], dtype='int64')) + with self.assertRaises(KeyError): + tree.get_loc(-1) + + def test_get_indexer(self): + for dtype, tree in self.trees.items(): + self.assert_numpy_array_equal( + tree.get_indexer(np.array([1.0, 5.5, 6.5])), + np.array([0, 4, -1], dtype='int64')) + with self.assertRaises(KeyError): + tree.get_indexer(np.array([3.0])) + + def test_get_indexer_non_unique(self): + indexer, missing = self.tree.get_indexer_non_unique( + np.array([1.0, 2.0, 6.5])) + self.assert_numpy_array_equal(indexer[:1], + np.array([0], dtype='int64')) + self.assert_numpy_array_equal(np.sort(indexer[1:3]), + np.array([0, 1], dtype='int64')) + self.assert_numpy_array_equal(np.sort(indexer[3:]), + np.array([-1], dtype='int64')) + self.assert_numpy_array_equal(missing, np.array([2], dtype='int64')) + + def test_duplicates(self): + tree = IntervalTree([0, 0, 0], [1, 1, 1]) + self.assert_numpy_array_equal(np.sort(tree.get_loc(0.5)), + np.array([0, 1, 2], dtype='int64')) + + with self.assertRaises(KeyError): + tree.get_indexer(np.array([0.5])) + + indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) + self.assert_numpy_array_equal(np.sort(indexer), + np.array([0, 1, 2], dtype='int64')) + self.assert_numpy_array_equal(missing, np.array([], dtype='int64')) + + def test_get_loc_closed(self): + for closed in ['left', 'right', 'both', 'neither']: + tree = IntervalTree([0], [1], closed=closed) + for p, errors in [(0, tree.open_left), + (1, tree.open_right)]: + if errors: + with 
self.assertRaises(KeyError): + tree.get_loc(p) + else: + self.assert_numpy_array_equal(tree.get_loc(p), + np.array([0], dtype='int64')) + + def test_get_indexer_closed(self): + x = np.arange(1000, dtype='int64') + found = x + not_found = (-1 * np.ones(1000)).astype('int64') + for leaf_size in [1, 10, 100, 10000]: + for closed in ['left', 'right', 'both', 'neither']: + tree = IntervalTree(x, x + 0.5, closed=closed, + leaf_size=leaf_size) + self.assert_numpy_array_equal(found, + tree.get_indexer(x + 0.25)) + + expected = found if tree.closed_left else not_found + self.assert_numpy_array_equal(expected, + tree.get_indexer(x + 0.0)) + + expected = found if tree.closed_right else not_found + self.assert_numpy_array_equal(expected, + tree.get_indexer(x + 0.5)) diff --git a/pandas/tests/indexing/test_interval.py b/pandas/tests/indexing/test_interval.py new file mode 100644 index 0000000000000..df5c412f0aaad --- /dev/null +++ b/pandas/tests/indexing/test_interval.py @@ -0,0 +1,67 @@ +import numpy as np +import pandas as pd + +from pandas import Series, DataFrame, IntervalIndex, Interval +import pandas.util.testing as tm + + +class TestIntervalIndex(tm.TestCase): + + def setUp(self): + self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) + + def test_loc_getitem_series(self): + + s = self.s + expected = 0 + self.assertEqual(expected, s.loc[0.5]) + self.assertEqual(expected, s.loc[1]) + self.assertEqual(expected, s.loc[Interval(0, 1)]) + self.assertRaises(KeyError, s.loc.__getitem__, 0) + + expected = s.iloc[:3] + tm.assert_series_equal(expected, s.loc[:3]) + tm.assert_series_equal(expected, s.loc[:2.5]) + tm.assert_series_equal(expected, s.loc[0.1:2.5]) + tm.assert_series_equal(expected, s.loc[-1:3]) + + expected = s.iloc[1:4] + tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]]) + tm.assert_series_equal(expected, s.loc[[2, 3, 4]]) + tm.assert_series_equal(expected, s.loc[[1.5, 3, 4]]) + + def test_loc_getitem_frame(self): + + df = DataFrame({'A': 
range(10)}) + s = pd.cut(df.A, 5) + df['B'] = s + df = df.set_index('B') + + result = df.loc[4] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) + + def f(): + df.loc[10] + + self.assertRaises(KeyError, f) + + # single list-like + result = df.loc[[4]] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) + + # non-unique + result = df.loc[[4, 5]] + expected = df.take([4, 5, 4, 5]) + tm.assert_frame_equal(result, expected) + + def f(): + df.loc[[10]] + + self.assertRaises(KeyError, f) + + # partial missing + result = df.loc[[10, 4]] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/test_interval.py b/pandas/tests/scalar/test_interval.py new file mode 100644 index 0000000000000..5f93c1ed79496 --- /dev/null +++ b/pandas/tests/scalar/test_interval.py @@ -0,0 +1,116 @@ +from __future__ import division + +from pandas import Interval +import pandas.util.testing as tm + + +class TestInterval(tm.TestCase): + def setUp(self): + self.interval = Interval(0, 1) + + def test_properties(self): + self.assertEqual(self.interval.closed, 'right') + self.assertEqual(self.interval.left, 0) + self.assertEqual(self.interval.right, 1) + self.assertEqual(self.interval.mid, 0.5) + + def test_repr(self): + self.assertEqual(repr(self.interval), + "Interval(0, 1, closed='right')") + self.assertEqual(str(self.interval), "(0, 1]") + + interval_left = Interval(0, 1, closed='left') + self.assertEqual(repr(interval_left), + "Interval(0, 1, closed='left')") + self.assertEqual(str(interval_left), "[0, 1)") + + def test_contains(self): + self.assertIn(0.5, self.interval) + self.assertIn(1, self.interval) + self.assertNotIn(0, self.interval) + self.assertRaises(TypeError, lambda: self.interval in self.interval) + + interval = Interval(0, 1, closed='both') + self.assertIn(0, interval) + self.assertIn(1, interval) + + interval = Interval(0, 1, closed='neither') + self.assertNotIn(0, interval) + self.assertIn(0.5, 
interval) + self.assertNotIn(1, interval) + + def test_equal(self): + self.assertEqual(Interval(0, 1), Interval(0, 1, closed='right')) + self.assertNotEqual(Interval(0, 1), Interval(0, 1, closed='left')) + self.assertNotEqual(Interval(0, 1), 0) + + def test_comparison(self): + with self.assertRaisesRegexp(TypeError, 'unorderable types'): + Interval(0, 1) < 2 + + self.assertTrue(Interval(0, 1) < Interval(1, 2)) + self.assertTrue(Interval(0, 1) < Interval(0, 2)) + self.assertTrue(Interval(0, 1) < Interval(0.5, 1.5)) + self.assertTrue(Interval(0, 1) <= Interval(0, 1)) + self.assertTrue(Interval(0, 1) > Interval(-1, 2)) + self.assertTrue(Interval(0, 1) >= Interval(0, 1)) + + def test_hash(self): + # should not raise + hash(self.interval) + + def test_math_add(self): + expected = Interval(1, 2) + actual = self.interval + 1 + self.assertEqual(expected, actual) + + expected = Interval(1, 2) + actual = 1 + self.interval + self.assertEqual(expected, actual) + + actual = self.interval + actual += 1 + self.assertEqual(expected, actual) + + with self.assertRaises(TypeError): + self.interval + Interval(1, 2) + + def test_math_sub(self): + expected = Interval(-1, 0) + actual = self.interval - 1 + self.assertEqual(expected, actual) + + actual = self.interval + actual -= 1 + self.assertEqual(expected, actual) + + with self.assertRaises(TypeError): + self.interval - Interval(1, 2) + + def test_math_mult(self): + expected = Interval(0, 2) + actual = self.interval * 2 + self.assertEqual(expected, actual) + + expected = Interval(0, 2) + actual = 2 * self.interval + self.assertEqual(expected, actual) + + actual = self.interval + actual *= 2 + self.assertEqual(expected, actual) + + with self.assertRaises(TypeError): + self.interval * Interval(1, 2) + + def test_math_div(self): + expected = Interval(0, 0.5) + actual = self.interval / 2.0 + self.assertEqual(expected, actual) + + actual = self.interval + actual /= 2.0 + self.assertEqual(expected, actual) + + with 
self.assertRaises(TypeError): + self.interval / Interval(1, 2) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index aef4c9269bc62..c94c74a41afb5 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -10,8 +10,7 @@ from pandas.types.common import is_categorical_dtype, is_datetime64tz_dtype from pandas import (Index, Series, isnull, date_range, - period_range, NaT) -from pandas.core.index import MultiIndex + NaT, period_range, MultiIndex, IntervalIndex) from pandas.tseries.index import Timestamp, DatetimeIndex from pandas import lib, tslib @@ -550,6 +549,17 @@ def test_constructor_with_datetime_tz(self): expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) assert_series_equal(s, expected) + def test_construction_interval(self): + # construction from interval & array of intervals + index = IntervalIndex.from_breaks(np.arange(3), closed='right') + result = Series(index) + repr(result) + str(result) + tm.assert_index_equal(Index(result.values), index) + + result = Series(index.values) + tm.assert_index_equal(Index(result.values), index) + def test_construction_consistency(self): # make sure that we are not re-localizing upon construction diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6d2a5b3ae8131..6ff994da7f7f9 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -6,7 +6,8 @@ from numpy import nan from datetime import datetime from itertools import permutations -from pandas import Series, Categorical, CategoricalIndex, Index +from pandas import (Series, Categorical, CategoricalIndex, + Index, IntervalIndex) import pandas as pd from pandas import compat @@ -477,8 +478,9 @@ def test_value_counts(self): # tm.assertIsInstance(factor, n) result = algos.value_counts(factor) - breaks = [-1.192, -0.535, 0.121, 0.777, 1.433] - expected_index = pd.IntervalIndex.from_breaks(breaks) + breaks = [-1.194, -0.535, 0.121, 
0.777, 1.433] + expected_index = pd.IntervalIndex.from_breaks( + breaks).astype('category') expected = Series([1, 1, 1, 1], index=expected_index) tm.assert_series_equal(result.sort_index(), expected.sort_index()) @@ -486,13 +488,15 @@ def test_value_counts(self): def test_value_counts_bins(self): s = [1, 2, 3, 4] result = algos.value_counts(s, bins=1) - self.assertEqual(result.tolist(), [4]) - self.assertEqual(result.index[0], pd.Interval(0.999, 4.0)) + expected = Series([4], + index=IntervalIndex.from_tuples([(0.996, 4.0)])) + tm.assert_series_equal(result, expected) result = algos.value_counts(s, bins=2, sort=False) - self.assertEqual(result.tolist(), [2, 2]) - self.assertEqual(result.index.min(), pd.Interval(0.999, 2.5)) - self.assertEqual(result.index.max(), pd.Interval(2.5, 4.0)) + expected = Series([2, 2], + index=IntervalIndex.from_tuples([(0.996, 2.5), + (2.5, 4.0)])) + tm.assert_series_equal(result, expected) def test_value_counts_dtypes(self): result = algos.value_counts([1, 1.]) @@ -543,7 +547,7 @@ def test_categorical(self): s = Series(pd.Categorical(list('aaabbc'))) result = s.value_counts() expected = pd.Series([3, 2, 1], - index=pd.CategoricalIndex(['a', 'b', 'c'])) + index=CategoricalIndex(['a', 'b', 'c'])) tm.assert_series_equal(result, expected, check_index_type=True) # preserve order? 
@@ -556,13 +560,13 @@ def test_categorical_nans(self): s = Series(pd.Categorical(list('aaaaabbbcc'))) # 4,3,2,1 (nan) s.iloc[1] = np.nan result = s.value_counts() - expected = pd.Series([4, 3, 2], index=pd.CategoricalIndex( + expected = pd.Series([4, 3, 2], index=CategoricalIndex( ['a', 'b', 'c'], categories=['a', 'b', 'c'])) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) expected = pd.Series([ 4, 3, 2, 1 - ], index=pd.CategoricalIndex(['a', 'b', 'c', np.nan])) + ], index=CategoricalIndex(['a', 'b', 'c', np.nan])) tm.assert_series_equal(result, expected, check_index_type=True) # out of order @@ -570,12 +574,12 @@ def test_categorical_nans(self): list('aaaaabbbcc'), ordered=True, categories=['b', 'a', 'c'])) s.iloc[1] = np.nan result = s.value_counts() - expected = pd.Series([4, 3, 2], index=pd.CategoricalIndex( + expected = pd.Series([4, 3, 2], index=CategoricalIndex( ['a', 'b', 'c'], categories=['b', 'a', 'c'], ordered=True)) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) - expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex( + expected = pd.Series([4, 3, 2, 1], index=CategoricalIndex( ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True)) tm.assert_series_equal(result, expected, check_index_type=True) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 4902e46319eb8..8fe27ccad82e3 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -31,7 +31,6 @@ def test_string_methods_dont_fail(self): unicode(self.container) # noqa def test_tricky_container(self): - import nose if not hasattr(self, 'unicode_container'): pytest.skip('Need unicode_container to test with this') repr(self.unicode_container) @@ -639,10 +638,10 @@ def test_value_counts_bins(self): s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.999, 3.0): 4}) + exp1 = Series({Interval(0.997, 3.0): 4}) 
tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.999, 3.0): 1.0}) + exp1n = Series({Interval(0.997, 3.0): 1.0}) tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): @@ -653,26 +652,20 @@ def test_value_counts_bins(self): self.assertEqual(s1.nunique(), 3) - res4 = s1.value_counts(bins=4) -<<<<<<< 845208055845b0db58d2bfee7ba39f6862ce141c - exp4 = Series({0.998: 2, - 1.5: 1, - 2.0: 0, - 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0]) + # these return the same + res4 = s1.value_counts(bins=4, dropna=True) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4, exp4) - res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series( - {0.998: 0.5, - 1.5: 0.25, - 2.0: 0.0, - 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0]) -======= - intervals = IntervalIndex.from_breaks([0.999, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1], index=intervals.take([0, 3, 1])) + + res4 = s1.value_counts(bins=4, dropna=False) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4, exp4) + res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25], index=intervals.take([0, 3, 1])) ->>>>>>> API/ENH: IntervalIndex + exp4n = Series([0.5, 0.25, 0.25, 0], + index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4n, exp4n) # handle NA's properly diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 0b1fea2097dc3..847acc37ead86 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -17,7 +17,8 @@ import pandas.compat as compat import pandas.util.testing as tm from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex, - Timestamp, CategoricalIndex, Interval, isnull) + Timestamp, CategoricalIndex, Interval, IntervalIndex, + 
isnull) from pandas.compat import range, lrange, u, PY3 from pandas.core.config import option_context @@ -120,6 +121,16 @@ def test_constructor_unsortable(self): self.assertRaises( TypeError, lambda: Categorical(arr, ordered=True)) + def test_constructor_interval(self): + result = Categorical([Interval(1, 2), Interval(2, 3), Interval(3, 6)], + ordered=True) + ii = IntervalIndex.from_intervals([Interval(1, 2), + Interval(2, 3), + Interval(3, 6)]) + exp = Categorical(ii, ordered=True) + self.assert_categorical_equal(result, exp) + tm.assert_index_equal(result.categories, ii) + def test_is_equal_dtype(self): # test dtype comparisons between cats diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/tools/test_tile.py index 46b3560523888..e5ceaf6450a72 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/tools/test_tile.py @@ -3,56 +3,60 @@ import numpy as np from pandas.compat import zip -from pandas import DataFrame, Series, Index, unique, isnull +from pandas import (Series, Index, isnull, + to_datetime, DatetimeIndex, Timestamp, + Interval, IntervalIndex, Categorical, + cut, qcut, date_range) import pandas.util.testing as tm -from pandas.util.testing import assertRaisesRegexp -import pandas.core.common as com from pandas.core.algorithms import quantile -from pandas.core.categorical import Categorical -from pandas.core.interval import Interval, IntervalIndex -from pandas.tools.tile import cut, qcut import pandas.tools.tile as tmod -from pandas import to_datetime, DatetimeIndex, Timestamp class TestCut(tm.TestCase): def test_simple(self): - data = np.ones(5) + data = np.ones(5, dtype='int64') result = cut(data, 4, labels=False) - desired = np.array([1, 1, 1, 1, 1]) - tm.assert_numpy_array_equal(result, desired, - check_dtype=False) + desired = np.array([1, 1, 1, 1, 1], dtype='int64') + tm.assert_numpy_array_equal(result, desired) def test_bins(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) result, bins = cut(data, 3, retbins=True) intervals = 
IntervalIndex.from_breaks(bins.round(3)) - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 1, 2, 0])) - tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + expected = intervals.take([0, 0, 0, 1, 2, 0]).astype('category') + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, + 6.53333333, 9.7])) def test_right(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 2, 3, 0, 0])) - tm.assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7]) + expected = intervals.astype('category').take([0, 0, 0, 2, 3, 0, 0]) + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, + 7.325, 9.7])) def test_noright(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=False, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3), closed='left') - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 2, 3, 0, 1])) - tm.assert_almost_equal(bins, [0.2, 2.575, 4.95, 7.325, 9.7095]) + expected = intervals.take([0, 0, 0, 2, 3, 0, 1]).astype('category') + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, + 7.325, 9.7095])) def test_arraylike(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 1, 2, 0])) - tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + expected = intervals.take([0, 0, 0, 1, 2, 0]).astype('category') + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, + 6.53333333, 9.7])) def test_bins_not_monotonic(self): data = [.2, 1.4, 2.5, 
6.2, 9.7, 2.1] @@ -81,12 +85,12 @@ def test_labels(self): result, bins = cut(arr, 4, retbins=True) ex_levels = IntervalIndex.from_breaks([-1e-3, 0.25, 0.5, 0.75, 1]) - self.assert_numpy_array_equal(unique(result), ex_levels) + tm.assert_index_equal(result.categories, ex_levels) result, bins = cut(arr, 4, retbins=True, right=False) ex_levels = IntervalIndex.from_breaks([0, 0.25, 0.5, 0.75, 1 + 1e-3], closed='left') - self.assert_numpy_array_equal(unique(result), ex_levels) + tm.assert_index_equal(result.categories, ex_levels) def test_cut_pass_series_name_to_factor(self): s = Series(np.random.randn(100), name='foo') @@ -98,8 +102,9 @@ def test_label_precision(self): arr = np.arange(0, 0.73, 0.01) result = cut(arr, 4, precision=2) - ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72]) - self.assert_numpy_array_equal(unique(result), ex_levels) + ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, + 0.54, 0.72]) + tm.assert_index_equal(result.categories, ex_levels) def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) @@ -109,12 +114,12 @@ def test_na_handling(self): result_arr = np.asarray(result) - ex_arr = np.where(com.isnull(arr), np.nan, result_arr) + ex_arr = np.where(isnull(arr), np.nan, result_arr) tm.assert_almost_equal(result_arr, ex_arr) result = cut(arr, 4, labels=False) - ex_result = np.where(com.isnull(arr), np.nan, result) + ex_result = np.where(isnull(arr), np.nan, result) tm.assert_almost_equal(result, ex_result) def test_inf_handling(self): @@ -125,8 +130,8 @@ def test_inf_handling(self): result = cut(data, bins) result_ser = cut(data_ser, bins) - ex_uniques = IntervalIndex.from_breaks(bins).values - tm.assert_numpy_array_equal(unique(result), ex_uniques) + ex_uniques = IntervalIndex.from_breaks(bins) + tm.assert_index_equal(result.categories, ex_uniques) self.assertEqual(result[5], Interval(4, np.inf)) self.assertEqual(result[0], Interval(-np.inf, 2)) self.assertEqual(result_ser[5], Interval(4, np.inf)) @@ -135,12 
+140,17 @@ def test_inf_handling(self): def test_qcut(self): arr = np.random.randn(1000) + # we store the bins as Index that have been rounded + # so comparisons are a bit tricky labels, bins = qcut(arr, 4, retbins=True) ex_bins = quantile(arr, [0, .25, .5, .75, 1.]) - tm.assert_almost_equal(bins, ex_bins) + result = labels.categories.left.values + self.assertTrue(np.allclose(result, ex_bins[:-1], atol=1e-2)) + result = labels.categories.right.values + self.assertTrue(np.allclose(result, ex_bins[1:], atol=1e-2)) ex_levels = cut(arr, ex_bins, include_lowest=True) - self.assert_categorical_equal(labels, ex_levels) + tm.assert_categorical_equal(labels, ex_levels) def test_qcut_bounds(self): arr = np.random.randn(1000) @@ -153,11 +163,11 @@ def test_qcut_specify_quantiles(self): factor = qcut(arr, [0, .25, .5, .75, 1.]) expected = qcut(arr, 4) - self.assert_numpy_array_equal(factor, expected) + tm.assert_categorical_equal(factor, expected) def test_qcut_all_bins_same(self): - assertRaisesRegexp(ValueError, "edges.*unique", qcut, - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3) + tm.assertRaisesRegexp(ValueError, "edges.*unique", qcut, + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3) def test_cut_out_of_bounds(self): arr = np.random.randn(100) @@ -174,36 +184,39 @@ def test_cut_pass_labels(self): labels = ['Small', 'Medium', 'Large'] result = cut(arr, bins, labels=labels) - exp = ['Medium'] + 4 * ['Small'] + ['Medium', 'Large'] - self.assert_numpy_array_equal(result, exp) + exp = Categorical(['Medium'] + 4 * ['Small'] + ['Medium', 'Large'], + ordered=True) + self.assert_categorical_equal(result, exp) - result = cut(arr, bins, labels=Categorical.from_codes([0, 1, 2], labels)) + result = cut(arr, bins, labels=Categorical.from_codes([0, 1, 2], + labels)) exp = Categorical.from_codes([1] + 4 * [0] + [1, 2], labels) - self.assertTrue(result.equals(exp)) + self.assert_categorical_equal(result, exp) def test_qcut_include_lowest(self): values = np.arange(10) - cats = qcut(values, 4) + ii = 
qcut(values, 4) - ex_levels = [Interval(0, 2.25, closed='both'), Interval(2.25, 4.5), - Interval(4.5, 6.75), Interval(6.75, 9)] - self.assert_numpy_array_equal(unique(cats), ex_levels) + ex_levels = IntervalIndex.from_intervals( + [Interval(-0.001, 2.25), + Interval(2.25, 4.5), + Interval(4.5, 6.75), + Interval(6.75, 9)]) + tm.assert_index_equal(ii.categories, ex_levels) def test_qcut_nas(self): arr = np.random.randn(100) arr[:20] = np.nan result = qcut(arr, 4) - self.assertTrue(com.isnull(result[:20]).all()) + self.assertTrue(isnull(result[:20]).all()) def test_qcut_index(self): - # the result is closed on a different side for the first interval, but - # we should still be able to make an index result = qcut([0, 2], 2) - index = Index(result) - expected = Index([Interval(0, 1, closed='both'), Interval(1, 2)]) - self.assert_numpy_array_equal(index, expected) + expected = Index([Interval(-0.001, 1), Interval(1, 2)]).astype( + 'category') + self.assert_categorical_equal(result, expected) def test_round_frac(self): # it works @@ -247,41 +260,46 @@ def test_qcut_binning_issues(self): self.assertTrue(ep <= sn) def test_cut_return_intervals(self): - s = Series([0,1,2,3,4,5,6,7,8]) - res = cut(s,3) + s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + res = cut(s, 3) exp_bins = np.linspace(0, 8, num=4).round(3) exp_bins[0] -= 0.008 - exp = Series(IntervalIndex.from_breaks(exp_bins).take([0,0,0,1,1,1,2,2,2])) + exp = Series(IntervalIndex.from_breaks(exp_bins, closed='right').take( + [0, 0, 0, 1, 1, 1, 2, 2, 2])).astype('category', ordered=True) tm.assert_series_equal(res, exp) def test_qcut_return_intervals(self): - s = Series([0,1,2,3,4,5,6,7,8]) - res = qcut(s,[0,0.333,0.666,1]) - exp_levels = np.array([Interval(0, 2.664, closed='both'), + s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + res = qcut(s, [0, 0.333, 0.666, 1]) + exp_levels = np.array([Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]) - exp = Series(exp_levels.take([0,0,0,1,1,1,2,2,2])) + exp = 
Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype( + 'category', ordered=True) tm.assert_series_equal(res, exp) def test_series_retbins(self): # GH 8589 s = Series(np.arange(4)) result, bins = cut(s, 2, retbins=True) - tm.assert_numpy_array_equal(result.cat.codes.values, - np.array([0, 0, 1, 1], dtype=np.int8)) - tm.assert_numpy_array_equal(bins, np.array([-0.003, 1.5, 3])) + expected = Series(IntervalIndex.from_breaks( + [-0.003, 1.5, 3], closed='right').repeat(2)).astype('category', + ordered=True) + tm.assert_series_equal(result, expected) result, bins = qcut(s, 2, retbins=True) - tm.assert_numpy_array_equal(result.cat.codes.values, - np.array([0, 0, 1, 1], dtype=np.int8)) - tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) + expected = Series(IntervalIndex.from_breaks( + [-0.001, 1.5, 3], closed='right').repeat(2)).astype('category', + ordered=True) + tm.assert_series_equal(result, expected) def test_qcut_duplicates_bin(self): # GH 7751 values = [0, 0, 0, 0, 1, 2, 3] - result_levels = ['[0, 1]', '(1, 3]'] + expected = IntervalIndex.from_intervals([Interval(-0.001, 1), + Interval(1, 3)]) - cats = qcut(values, 3, duplicates='drop') - self.assertTrue((cats.categories == result_levels).all()) + result = qcut(values, 3, duplicates='drop') + tm.assert_index_equal(result.categories, expected) self.assertRaises(ValueError, qcut, values, 3) self.assertRaises(ValueError, qcut, values, 3, duplicates='raise') @@ -305,11 +323,18 @@ def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) + result, bins = cut(data, 3, retbins=True) - expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', - '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', - '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], - ).astype("category", ordered=True) + expected = ( + Series(IntervalIndex.from_intervals([ + Interval(Timestamp('2012-12-31 23:57:07.200000'), + 
Timestamp('2013-01-01 16:00:00')), + Interval(Timestamp('2013-01-01 16:00:00'), + Timestamp('2013-01-02 08:00:00')), + Interval(Timestamp('2013-01-02 08:00:00'), + Timestamp('2013-01-03 00:00:00'))])) + .astype('category', ordered=True)) + tm.assert_series_equal(result, expected) # testing for time data to be present as list @@ -333,9 +358,11 @@ def test_datetime_cut(self): def test_datetime_bin(self): data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')] bin_data = ['2012-12-12', '2012-12-14', '2012-12-16'] - expected = Series(['(2012-12-12 00:00:00, 2012-12-14 00:00:00]', - '(2012-12-14 00:00:00, 2012-12-16 00:00:00]'], - ).astype("category", ordered=True) + expected = ( + Series(IntervalIndex.from_intervals([ + Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])), + Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))])) + .astype('category', ordered=True)) for conv in [Timestamp, Timestamp, np.datetime64]: bins = [conv(v) for v in bin_data] @@ -350,13 +377,19 @@ def test_datetime_bin(self): result = cut(data, bins=bin_pydatetime) tm.assert_series_equal(Series(result), expected) - result, bins = cut(s, 2, retbins=True, labels=[0, 1]) - tm.assert_numpy_array_equal(result, [0, 0, 1, 1]) - tm.assert_almost_equal(bins, [-0.003, 1.5, 3]) + def test_datetime_nan(self): + + def f(): + cut(date_range('20130101', periods=3), bins=[0, 2, 4]) + self.assertRaises(ValueError, f) - result, bins = qcut(s, 2, retbins=True, labels=[0, 1]) - tm.assert_numpy_array_equal(result, [0, 0, 1, 1]) - tm.assert_almost_equal(bins, [0, 1.5, 3]) + result = cut(date_range('20130102', periods=5), + bins=date_range('20130101', periods=2)) + mask = result.categories.isnull() + self.assert_numpy_array_equal(mask, np.array([False])) + mask = result.isnull() + self.assert_numpy_array_equal( + mask, np.array([False, True, True, True, True])) def curpath(): diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index 8ef2868ae324f..7c41b01090d82 100644 
--- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -3,14 +3,15 @@ import numpy as np import pandas as pd -from pandas import Series, Categorical, date_range +from pandas import Series, Categorical, IntervalIndex, date_range -from pandas.types.dtypes import DatetimeTZDtype, PeriodDtype, CategoricalDtype +from pandas.types.dtypes import (DatetimeTZDtype, PeriodDtype, + IntervalDtype, CategoricalDtype) from pandas.types.common import (is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, - is_datetime64_dtype, + is_datetime64_dtype, is_interval_dtype, is_datetime64_any_dtype, is_string_dtype, _coerce_to_dtype) import pandas.util.testing as tm @@ -350,3 +351,114 @@ def test_empty(self): def test_not_string(self): # though PeriodDtype has object kind, it cannot be string self.assertFalse(is_string_dtype(PeriodDtype('D'))) + + +class TestIntervalDtype(Base, tm.TestCase): + + # TODO: placeholder + def setUp(self): + self.dtype = IntervalDtype('int64') + + def test_construction(self): + with tm.assertRaises(ValueError): + IntervalDtype('xx') + + for s in ['interval[int64]', 'Interval[int64]', 'int64']: + i = IntervalDtype(s) + self.assertEqual(i.subtype, np.dtype('int64')) + self.assertTrue(is_interval_dtype(i)) + + def test_construction_generic(self): + # generic + i = IntervalDtype('interval') + self.assertIs(i.subtype, None) + self.assertTrue(is_interval_dtype(i)) + self.assertTrue(str(i) == 'interval') + + i = IntervalDtype() + self.assertIs(i.subtype, None) + self.assertTrue(is_interval_dtype(i)) + self.assertTrue(str(i) == 'interval') + + def test_subclass(self): + a = IntervalDtype('interval[int64]') + b = IntervalDtype('interval[int64]') + + self.assertTrue(issubclass(type(a), type(a))) + self.assertTrue(issubclass(type(a), type(b))) + + def test_is_dtype(self): + self.assertTrue(IntervalDtype.is_dtype(self.dtype)) + 
self.assertTrue(IntervalDtype.is_dtype('interval')) + self.assertTrue(IntervalDtype.is_dtype(IntervalDtype('float64'))) + self.assertTrue(IntervalDtype.is_dtype(IntervalDtype('int64'))) + self.assertTrue(IntervalDtype.is_dtype(IntervalDtype(np.int64))) + + self.assertFalse(IntervalDtype.is_dtype('D')) + self.assertFalse(IntervalDtype.is_dtype('3D')) + self.assertFalse(IntervalDtype.is_dtype('U')) + self.assertFalse(IntervalDtype.is_dtype('S')) + self.assertFalse(IntervalDtype.is_dtype('foo')) + self.assertFalse(IntervalDtype.is_dtype(np.object_)) + self.assertFalse(IntervalDtype.is_dtype(np.int64)) + self.assertFalse(IntervalDtype.is_dtype(np.float64)) + + def test_identity(self): + self.assertEqual(IntervalDtype('interval[int64]'), + IntervalDtype('interval[int64]')) + + def test_coerce_to_dtype(self): + self.assertEqual(_coerce_to_dtype('interval[int64]'), + IntervalDtype('interval[int64]')) + + def test_construction_from_string(self): + result = IntervalDtype('interval[int64]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + result = IntervalDtype.construct_from_string('interval[int64]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + with tm.assertRaises(TypeError): + IntervalDtype.construct_from_string('foo') + with tm.assertRaises(TypeError): + IntervalDtype.construct_from_string('interval[foo]') + with tm.assertRaises(TypeError): + IntervalDtype.construct_from_string('foo[int64]') + + def test_equality(self): + self.assertTrue(is_dtype_equal(self.dtype, 'interval[int64]')) + self.assertTrue(is_dtype_equal(self.dtype, IntervalDtype('int64'))) + self.assertTrue(is_dtype_equal(self.dtype, IntervalDtype('int64'))) + self.assertTrue(is_dtype_equal(IntervalDtype('int64'), + IntervalDtype('int64'))) + + self.assertFalse(is_dtype_equal(self.dtype, 'int64')) + self.assertFalse(is_dtype_equal(IntervalDtype('int64'), + IntervalDtype('float64'))) + + def test_basic(self): + self.assertTrue(is_interval_dtype(self.dtype)) + + ii = 
IntervalIndex.from_breaks(range(3)) + + self.assertTrue(is_interval_dtype(ii.dtype)) + self.assertTrue(is_interval_dtype(ii)) + + s = Series(ii, name='A') + + # dtypes + # series results in object dtype currently, + self.assertFalse(is_interval_dtype(s.dtype)) + self.assertFalse(is_interval_dtype(s)) + + def test_basic_dtype(self): + self.assertTrue(is_interval_dtype('interval[int64]')) + self.assertTrue(is_interval_dtype(IntervalIndex.from_tuples([(0, 1)]))) + self.assertTrue(is_interval_dtype + (IntervalIndex.from_breaks(np.arange(4)))) + self.assertTrue(is_interval_dtype( + IntervalIndex.from_breaks(date_range('20130101', periods=3)))) + self.assertFalse(is_interval_dtype('U')) + self.assertFalse(is_interval_dtype('S')) + self.assertFalse(is_interval_dtype('foo')) + self.assertFalse(is_interval_dtype(np.object_)) + self.assertFalse(is_interval_dtype(np.int64)) + self.assertFalse(is_interval_dtype(np.float64)) diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py index cab44f1122ae1..87ec1a08c4785 100644 --- a/pandas/tests/types/test_missing.py +++ b/pandas/tests/types/test_missing.py @@ -54,6 +54,14 @@ def test_0d_array(self): self.assertFalse(isnull(np.array(0.0, dtype=object))) self.assertFalse(isnull(np.array(0, dtype=object))) + def test_empty_object(self): + + for shape in [(4, 0), (4,)]: + arr = np.empty(shape=shape, dtype=object) + result = isnull(arr) + expected = np.empty(shape=shape, dtype=bool) + tm.assert_numpy_array_equal(result, expected) + def test_isnull(self): self.assertFalse(isnull(1.)) self.assertTrue(isnull(None)) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index 07dc17b02280d..6ed6d7526dc2d 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -3,24 +3,21 @@ """ from pandas.types.missing import isnull -from pandas.types.common import (is_float, is_integer, - is_scalar) +from pandas.types.common import (is_integer, + is_scalar, + is_categorical_dtype, + is_datetime64_dtype, + 
is_timedelta64_dtype) -from pandas.core.api import Series -from pandas.core.categorical import Categorical -from pandas.core.index import _ensure_index -from pandas.core.interval import IntervalIndex, Interval import pandas.core.algorithms as algos import pandas.core.nanops as nanops -from pandas.compat import zip -from pandas import to_timedelta, to_datetime -from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype +from pandas import (to_timedelta, to_datetime, + Categorical, Timestamp, Timedelta, + Series, Interval, IntervalIndex) from pandas.lib import infer_dtype import numpy as np -import warnings - def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): @@ -97,7 +94,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") - # TODO: IntervalIndex try: # for array-like sz = x.size except AttributeError: @@ -124,13 +120,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, else: bins = np.asarray(bins) - bins = _convert_bin_to_numeric_type(bins) + bins = _convert_bin_to_numeric_type(bins, dtype) if (np.diff(bins) < 0).any(): raise ValueError('bins must increase monotonically.') fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, precision=precision, - include_lowest=include_lowest, dtype=dtype) + include_lowest=include_lowest, + dtype=dtype) return _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name) @@ -154,8 +151,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): the resulting bins. If False, return only integer indicators of the bins. retbins : bool, optional - Whether to return the bins or not. Can be useful if bins is given - as a scalar. + Whether to return the (bins, labels) or not. Can be useful if bins + is given as a scalar. 
precision : int, optional The precision at which to store and display the bins labels duplicates : {default 'raise', 'drop'}, optional @@ -232,42 +229,18 @@ def _bins_to_cuts(x, bins, right=True, labels=None, if labels is not False: if labels is None: - - # TODO: IntervalIndex - increases = 0 - while True: - try: - levels = _format_levels(bins, precision, right=right, - include_lowest=include_lowest, - dtype=dtype) - except ValueError: - increases += 1 - precision += 1 - if increases >= 20: - raise - else: - break - - # - #closed = 'right' if right else 'left' - #precision = _infer_precision(precision, bins) - #breaks = [_round_frac(b, precision) for b in bins] - #labels = IntervalIndex.from_breaks(breaks, closed=closed).values - - #if right and include_lowest: - # labels[0] = Interval(labels[0].left, labels[0].right, - # closed='both') - + labels = _format_labels(bins, precision, right=right, + include_lowest=include_lowest, + dtype=dtype) else: if len(labels) != len(bins) - 1: raise ValueError('Bin labels must be one fewer than ' 'the number of bin edges') - - if not com.is_categorical(labels): - labels = np.asarray(labels) + if not is_categorical_dtype(labels): + labels = Categorical(labels, ordered=True) np.putmask(ids, na_mask, 0) - result = com.take_nd(labels, ids - 1) + result = algos.take_nd(labels, ids - 1) else: result = ids - 1 @@ -277,42 +250,6 @@ def _bins_to_cuts(x, bins, right=True, labels=None, return result, bins -def _format_levels(bins, prec, right=True, - include_lowest=False, dtype=None): - fmt = lambda v: _format_label(v, precision=prec, dtype=dtype) - if right: - levels = [] - for a, b in zip(bins, bins[1:]): - fa, fb = fmt(a), fmt(b) - -def _round_frac(x, precision): - """Round the fractional part of the given number - """ - if not np.isfinite(x) or x == 0: - return x - else: - levels = ['[%s, %s)' % (fmt(a), fmt(b)) - for a, b in zip(bins, bins[1:])] - return levels - - -def _format_label(x, precision=3, dtype=None): - fmt_str = '%%.%dg' % 
precision - - if is_datetime64_dtype(dtype): - return to_datetime(x, unit='ns') - if is_timedelta64_dtype(dtype): - return to_timedelta(x, unit='ns') - if np.isinf(x): - return str(x) - elif is_float(x): - frac, whole = np.modf(x) - if whole == 0: - digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision - else: - digits = precision - return np.around(x, digits) - def _trim_zeros(x): while len(x) > 1 and x[-1] == '0': @@ -340,17 +277,65 @@ def _coerce_to_type(x): return x, dtype -def _convert_bin_to_numeric_type(x): +def _convert_bin_to_numeric_type(bins, dtype): """ if the passed bin is of datetime/timedelta type, this method converts it to integer + + Parameters + ---------- + bins : list-like of bins + dtype : dtype of data + + Raises + ------ + ValueError if bins are not of a compatible dtype to dtype """ - dtype = infer_dtype(x) - if dtype == 'timedelta' or dtype == 'timedelta64': - x = to_timedelta(x).view(np.int64) - elif dtype == 'datetime' or dtype == 'datetime64': - x = to_datetime(x).view(np.int64) - return x + bins_dtype = infer_dtype(bins) + if is_timedelta64_dtype(dtype): + if bins_dtype in ['timedelta', 'timedelta64']: + bins = to_timedelta(bins).view(np.int64) + else: + raise ValueError("bins must be of timedelta64 dtype") + elif is_datetime64_dtype(dtype): + if bins_dtype in ['datetime', 'datetime64']: + bins = to_datetime(bins).view(np.int64) + else: + raise ValueError("bins must be of datetime64 dtype") + + return bins + + +def _format_labels(bins, precision, right=True, + include_lowest=False, dtype=None): + """ based on the dtype, return our labels """ + + closed = 'right' if right else 'left' + + if is_datetime64_dtype(dtype): + formatter = Timestamp + adjust = lambda x: x - Timedelta('1ns') + elif is_timedelta64_dtype(dtype): + formatter = Timedelta + adjust = lambda x: x - Timedelta('1ns') + else: + precision = _infer_precision(precision, bins) + formatter = lambda x: _round_frac(x, precision) + adjust = lambda x: x - 10 ** (-precision) + + 
breaks = [formatter(b) for b in bins] + labels = IntervalIndex.from_breaks(breaks, closed=closed) + + if right and include_lowest: + # we will adjust the left hand side by precision to + # account that we are all right closed + v = adjust(labels[0].left) + + i = IntervalIndex.from_intervals( + [Interval(v, labels[0].right, closed='right')]) + labels = i.append(labels[1:]) + + return labels def _preprocess_for_cut(x): @@ -372,7 +357,8 @@ def _preprocess_for_cut(x): return x_is_series, series_index, name, x -def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): +def _postprocess_for_cut(fac, bins, retbins, x_is_series, + series_index, name): """ handles post processing for the cut method where we combine the index information if the originally passed @@ -386,6 +372,22 @@ def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): return fac, bins + +def _round_frac(x, precision): + """ + Round the fractional part of the given number + """ + if not np.isfinite(x) or x == 0: + return x + else: + frac, whole = np.modf(x) + if whole == 0: + digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision + else: + digits = precision + return np.around(x, digits) + + def _infer_precision(base_precision, bins): """Infer an appropriate precision for _round_frac """ diff --git a/pandas/tseries/interval.py b/pandas/tseries/interval.py deleted file mode 100644 index 6698c7e924758..0000000000000 --- a/pandas/tseries/interval.py +++ /dev/null @@ -1,38 +0,0 @@ - -from pandas.core.index import Index - - -class Interval(object): - """ - Represents an interval of time defined by two timestamps - """ - - def __init__(self, start, end): - self.start = start - self.end = end - - -class PeriodInterval(object): - """ - Represents an interval of time defined by two Period objects (time - ordinals) - """ - - def __init__(self, start, end): - self.start = start - self.end = end - - -class IntervalIndex(Index): - """ - - """ - - def __new__(self, 
starts, ends): - pass - - def dtype(self): - return self.values.dtype - -if __name__ == '__main__': - pass diff --git a/pandas/types/api.py b/pandas/types/api.py index c809cb3614a8c..d1c7f2372ca5b 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -10,6 +10,10 @@ is_categorical, is_categorical_dtype, + # interval + is_interval, + is_interval_dtype, + # datetimelike is_datetimetz, is_datetime64_dtype, diff --git a/pandas/types/cast.py b/pandas/types/cast.py index b1a17df64aecf..2b6eb226ef9b1 100644 --- a/pandas/types/cast.py +++ b/pandas/types/cast.py @@ -7,7 +7,7 @@ from pandas.compat import string_types, text_type, PY3 from .common import (_ensure_object, is_bool, is_integer, is_float, is_complex, is_datetimetz, is_categorical_dtype, - is_datetimelike, + is_datetimelike, is_interval_dtype, is_extension_type, is_object_dtype, is_datetime64tz_dtype, is_datetime64_dtype, is_timedelta64_dtype, is_dtype_equal, @@ -485,6 +485,21 @@ def conv(r, dtype): return [conv(r, dtype) for r, dtype in zip(result, dtypes)] +def _coerce_extension_to_embed(value): + """ + we have an extension type, coerce it to a type + suitable for embedding (in a Series/DataFrame) + """ + + # TODO: maybe we should have a method on Categorical + # to actually do this instead + if is_categorical_dtype(value): + if is_interval_dtype(value.categories): + return np.array(value) + + return value + + def _astype_nansafe(arr, dtype, copy=True): """ return a view if copy is False, but need to be very careful as the result shape could change! 
""" diff --git a/pandas/types/common.py b/pandas/types/common.py index e58e0826ea49a..f8ad127c301e8 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -7,6 +7,7 @@ from .dtypes import (CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType, PeriodDtype, PeriodDtypeType, + IntervalDtype, IntervalDtypeType, ExtensionDtype) from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, @@ -97,6 +98,10 @@ def is_period_dtype(arr_or_dtype): return PeriodDtype.is_dtype(arr_or_dtype) +def is_interval_dtype(arr_or_dtype): + return IntervalDtype.is_dtype(arr_or_dtype) + + def is_categorical_dtype(arr_or_dtype): return CategoricalDtype.is_dtype(arr_or_dtype) @@ -339,6 +344,8 @@ def _coerce_to_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype): dtype = PeriodDtype(dtype) + elif is_interval_dtype(dtype): + dtype = IntervalDtype(dtype) else: dtype = np.dtype(dtype) return dtype @@ -355,6 +362,8 @@ def _get_dtype(arr_or_dtype): return arr_or_dtype elif isinstance(arr_or_dtype, PeriodDtype): return arr_or_dtype + elif isinstance(arr_or_dtype, IntervalDtype): + return arr_or_dtype elif isinstance(arr_or_dtype, string_types): if is_categorical_dtype(arr_or_dtype): return CategoricalDtype.construct_from_string(arr_or_dtype) @@ -362,6 +371,8 @@ def _get_dtype(arr_or_dtype): return DatetimeTZDtype.construct_from_string(arr_or_dtype) elif is_period_dtype(arr_or_dtype): return PeriodDtype.construct_from_string(arr_or_dtype) + elif is_interval_dtype(arr_or_dtype): + return IntervalDtype.construct_from_string(arr_or_dtype) if hasattr(arr_or_dtype, 'dtype'): arr_or_dtype = arr_or_dtype.dtype @@ -377,6 +388,8 @@ def _get_dtype_type(arr_or_dtype): return CategoricalDtypeType elif isinstance(arr_or_dtype, DatetimeTZDtype): return DatetimeTZDtypeType + elif isinstance(arr_or_dtype, IntervalDtype): + return IntervalDtypeType elif isinstance(arr_or_dtype, PeriodDtype): return PeriodDtypeType elif 
isinstance(arr_or_dtype, string_types): @@ -386,6 +399,8 @@ def _get_dtype_type(arr_or_dtype): return DatetimeTZDtypeType elif is_period_dtype(arr_or_dtype): return PeriodDtypeType + elif is_interval_dtype(arr_or_dtype): + return IntervalDtypeType return _get_dtype_type(np.dtype(arr_or_dtype)) try: return arr_or_dtype.dtype.type @@ -470,6 +485,8 @@ def pandas_dtype(dtype): return dtype elif isinstance(dtype, CategoricalDtype): return dtype + elif isinstance(dtype, IntervalDtype): + return dtype elif isinstance(dtype, string_types): try: return DatetimeTZDtype.construct_from_string(dtype) @@ -483,6 +500,12 @@ except TypeError: pass + elif dtype.startswith('interval[') or dtype.startswith('Interval['): + try: + return IntervalDtype.construct_from_string(dtype) + except TypeError: + pass + try: return CategoricalDtype.construct_from_string(dtype) except TypeError: diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py index 5b6d7905d4095..457d0c444baa3 100644 --- a/pandas/types/dtypes.py +++ b/pandas/types/dtypes.py @@ -365,3 +365,112 @@ def is_dtype(cls, dtype): else: return False return super(PeriodDtype, cls).is_dtype(dtype) + + +class IntervalDtypeType(type): + """ + the type of IntervalDtype, this metaclass determines subclass ability + """ + pass + + +class IntervalDtype(ExtensionDtype): + __metaclass__ = IntervalDtypeType + """ + An Interval duck-typed class, suitable for holding an interval + + THIS IS NOT A REAL NUMPY DTYPE + """ + type = IntervalDtypeType + kind = None + str = '|O08' + base = np.dtype('O') + num = 103 + _metadata = ['subtype'] + _match = re.compile("(I|i)nterval\[(?P<subtype>.+)\]") + _cache = {} + + def __new__(cls, subtype=None): + """ + Parameters + ---------- + subtype : the dtype of the Interval + """ + + if isinstance(subtype, IntervalDtype): + return subtype + elif subtype is None or (isinstance(subtype, compat.string_types) and + subtype == 'interval'): + subtype = None + else: + if isinstance(subtype, 
compat.string_types): + m = cls._match.search(subtype) + if m is not None: + subtype = m.group('subtype') + + from pandas.types.common import pandas_dtype + try: + subtype = pandas_dtype(subtype) + except TypeError: + raise ValueError("could not construct IntervalDtype") + + try: + return cls._cache[str(subtype)] + except KeyError: + u = object.__new__(cls) + u.subtype = subtype + cls._cache[str(subtype)] = u + return u + + @classmethod + def construct_from_string(cls, string): + """ + attempt to construct this type from a string, raise a TypeError + if it's not possible + """ + if isinstance(string, compat.string_types): + try: + return cls(string) + except ValueError: + pass + raise TypeError("could not construct IntervalDtype") + + def __unicode__(self): + if self.subtype is None: + return "interval" + return "interval[{subtype}]".format(subtype=self.subtype) + + @property + def name(self): + return str(self) + + def __hash__(self): + # make myself hashable + return hash(str(self)) + + def __eq__(self, other): + if isinstance(other, compat.string_types): + return other == self.name or other == self.name.title() + + return (isinstance(other, IntervalDtype) and + self.subtype == other.subtype) + + @classmethod + def is_dtype(cls, dtype): + """ + Return a boolean if the passed type is an actual dtype that we + can match (via string or type) + """ + + if isinstance(dtype, compat.string_types): + if dtype.lower().startswith('interval'): + try: + if cls.construct_from_string(dtype) is not None: + return True + else: + return False + except ValueError: + return False + else: + return False + return super(IntervalDtype, cls).is_dtype(dtype) diff --git a/pandas/types/generic.py b/pandas/types/generic.py index 756fb47596700..a0a089be96b1e 100644 --- a/pandas/types/generic.py +++ b/pandas/types/generic.py @@ -32,12 +32,14 @@ def _check(cls, inst): ("periodindex", )) ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex", )) 
+ABCIntervalIndex = create_pandas_abc_type("ABCIntervalIndex", "_typ", + ("intervalindex", )) ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", "int64index", "rangeindex", "float64index", "uint64index", "multiindex", "datetimeindex", "timedeltaindex", "periodindex", - "categoricalindex")) + "categoricalindex", "intervalindex")) ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", )) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", )) diff --git a/pandas/types/inference.py b/pandas/types/inference.py index d2a2924b27659..31991d7afe17a 100644 --- a/pandas/types/inference.py +++ b/pandas/types/inference.py @@ -20,6 +20,8 @@ is_decimal = lib.is_decimal +is_interval = lib.is_interval + def is_number(obj): return isinstance(obj, (Number, np.number)) diff --git a/pandas/types/missing.py b/pandas/types/missing.py index e6791b79bf3bd..4a72e29b36a74 100644 --- a/pandas/types/missing.py +++ b/pandas/types/missing.py @@ -9,7 +9,7 @@ from .common import (is_string_dtype, is_datetimelike, is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, + is_timedelta64_dtype, is_interval_dtype, is_complex_dtype, is_categorical_dtype, is_string_like_dtype, is_bool_dtype, is_integer_dtype, is_dtype_equal, @@ -127,6 +127,9 @@ def _isnull_ndarraylike(obj): if not isinstance(values, Categorical): values = values.values result = values.isnull() + elif is_interval_dtype(values): + from pandas import IntervalIndex + result = IntervalIndex(obj).isnull() else: # Working around NumPy ticket 1542 diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 6a4fe5d33c090..f09b37a77c08c 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -29,6 +29,7 @@ is_number, is_bool, needs_i8_conversion, is_categorical_dtype, + is_interval_dtype, is_sequence, is_list_like) from pandas.formats.printing import pprint_thing @@ -816,6 +817,9 @@ def _get_ilevel_values(index, 
level): assert_attr_equal('names', left, right, obj=obj) if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): assert_attr_equal('freq', left, right, obj=obj) + if (isinstance(left, pd.IntervalIndex) or + isinstance(right, pd.IntervalIndex)): + assert_attr_equal('closed', left, right, obj=obj) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): @@ -1185,6 +1189,12 @@ def assert_series_equal(left, right, check_dtype=True, else: assert_numpy_array_equal(left.get_values(), right.get_values(), check_dtype=check_dtype) + elif is_interval_dtype(left) or is_interval_dtype(right): + # TODO: big hack here + l = pd.IntervalIndex(left) + r = pd.IntervalIndex(right) + assert_index_equal(l, r, obj='{0}.index'.format(obj)) + else: _testing.assert_almost_equal(left.get_values(), right.get_values(), check_less_precise=check_less_precise, @@ -1576,6 +1586,7 @@ def makeIntervalIndex(k=10, name=None): x = np.linspace(0, 100, num=(k + 1)) return IntervalIndex.from_breaks(x, name=name) + def makeBoolIndex(k=10, name=None): if k == 1: return Index([True], name=name) diff --git a/setup.py b/setup.py index cbcadce459c67..a223d4c735778 100755 --- a/setup.py +++ b/setup.py @@ -117,7 +117,8 @@ def is_platform_mac(): 'hashtable': ['hashtable_class_helper.pxi.in', 'hashtable_func_helper.pxi.in'], 'index': ['index_class_helper.pxi.in'], - '_sparse': ['sparse_op_helper.pxi.in'] + '_sparse': ['sparse_op_helper.pxi.in'], + '_interval': ['intervaltree.pxi.in'] } _pxifiles = [] _pxi_dep = {} @@ -499,6 +500,10 @@ def pxd(name): _join={'pyxfile': 'src/join', 'pxdfiles': ['src/util', 'hashtable'], 'depends': _pxi_dep['_join']}, + _interval={'pyxfile': 'src/interval', + 'pxdfiles': ['hashtable'], + 'depends': ([srcpath('interval', suffix='.pyx')] + + _pxi_dep['_interval'])}, _window={'pyxfile': 'window', 'pxdfiles': ['src/skiplist', 'src/util'], 'depends': ['pandas/src/skiplist.pyx',