From 340c98bdd4cca28d664a906c0390141e86fd310d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 2 Feb 2017 09:24:09 -0500 Subject: [PATCH] CLN/COMPAT: IntervalIndex --- asv_bench/benchmarks/indexing.py | 20 + doc/source/api.rst | 21 + doc/source/whatsnew/v0.20.0.txt | 31 + pandas/_libs/hashtable.pyx | 2 + pandas/{src => _libs}/interval.pyx | 104 +- .../intervaltree.pxi.in} | 207 +-- pandas/_libs/lib.pyx | 6 +- pandas/_libs/src/inference.pyx | 28 +- pandas/_libs/tslib.pyx | 12 + pandas/core/algorithms.py | 22 +- pandas/core/api.py | 4 +- pandas/core/groupby.py | 40 +- pandas/core/indexing.py | 8 +- pandas/core/interval.py | 521 ------ pandas/formats/format.py | 15 + pandas/indexes/api.py | 3 +- pandas/indexes/base.py | 73 +- pandas/indexes/category.py | 70 +- pandas/indexes/interval.py | 983 +++++++++++ pandas/indexes/multi.py | 4 +- pandas/src/intervaltree.pyx | 1444 ----------------- pandas/tests/api/test_api.py | 4 +- pandas/tests/frame/test_alter_axes.py | 66 +- pandas/tests/groupby/test_categorical.py | 5 +- pandas/tests/groupby/test_groupby.py | 49 +- pandas/tests/indexes/common.py | 25 +- pandas/tests/indexes/test_base.py | 4 +- pandas/tests/indexes/test_category.py | 18 +- pandas/tests/indexes/test_interval.py | 799 +++++++++ pandas/tests/indexing/test_interval.py | 141 ++ pandas/tests/scalar/test_interval.py | 129 ++ pandas/tests/series/test_constructors.py | 14 +- pandas/tests/series/test_missing.py | 11 +- pandas/tests/test_algos.py | 25 +- pandas/tests/test_base.py | 21 +- pandas/tests/test_categorical.py | 12 +- pandas/tests/test_interval.py | 591 ------- pandas/tests/tools/test_tile.py | 224 +-- pandas/tests/types/test_dtypes.py | 118 +- pandas/tests/types/test_missing.py | 8 + pandas/tools/tile.py | 183 ++- pandas/tseries/base.py | 10 +- pandas/tseries/interval.py | 35 - pandas/tseries/period.py | 3 + pandas/types/api.py | 4 + pandas/types/common.py | 23 + pandas/types/dtypes.py | 109 ++ pandas/types/generic.py | 4 +- pandas/types/inference.py | 2 + pandas/types/missing.py | 5 +- pandas/util/testing.py | 11 + setup.py | 5 + 52 files changed, 3211 insertions(+), 3065 deletions(-) rename pandas/{src => _libs}/interval.pyx (68%) rename pandas/{src/generate_intervaltree.py => _libs/intervaltree.pxi.in} (68%) delete mode 100644 pandas/core/interval.py create mode 100644 pandas/indexes/interval.py delete mode 100644 pandas/src/intervaltree.pyx create mode 100644 pandas/tests/indexes/test_interval.py create mode 100644 pandas/tests/indexing/test_interval.py create mode 100644 pandas/tests/scalar/test_interval.py delete mode 100644 pandas/tests/test_interval.py delete mode 100644 pandas/tseries/interval.py diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index d938cc6a6dc4d..a32c9f25a0f09 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -226,6 +226,26 @@ def time_is_monotonic(self): self.miint.is_monotonic +class IntervalIndexing(object): + goal_time = 0.2 + + def setup(self): + self.monotonic = Series(np.arange(1000000), + index=IntervalIndex.from_breaks(np.arange(1000001))) + + def time_getitem_scalar(self): + self.monotonic[80000] + + def time_loc_scalar(self): + self.monotonic.loc[80000] + + def time_getitem_list(self): + self.monotonic[80000:] + + def time_loc_list(self): + self.monotonic.loc[80000:] + + class PanelIndexing(object): goal_time = 0.2 diff --git a/doc/source/api.rst b/doc/source/api.rst index bf9d521e2a12a..6ba8c2b8ead67 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1405,6 
+1405,27 @@ Categorical Components CategoricalIndex.as_ordered CategoricalIndex.as_unordered +.. _api.intervalindex: + +IntervalIndex +------------- + +.. autosummary:: + :toctree: generated/ + + IntervalIndex + +IntervalIndex Components +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + IntervalIndex.from_arrays + IntervalIndex.from_tuples + IntervalIndex.from_breaks + IntervalIndex.from_intervals + .. _api.multiindex: MultiIndex diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a105a6801fb61..6daeb29a6e67e 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -13,6 +13,7 @@ Highlights include: - ``Panel`` has been deprecated, see :ref:`here ` - Improved user API when accessing levels in ``.groupby()``, see :ref:`here ` - Improved support for UInt64 dtypes, see :ref:`here ` +- Addition of an ``IntervalIndex`` and ``Interval`` scalar type, see :ref:`here ` - A new orient for JSON serialization, ``orient='table'``, that uses the Table Schema spec, see :ref:`here ` - Window Binary Corr/Cov operations return a MultiIndexed ``DataFrame`` rather than a ``Panel``, as ``Panel`` is now deprecated, see :ref:`here ` - Support for S3 handling now uses ``s3fs``, see :ref:`here ` @@ -314,6 +315,36 @@ To convert a ``SparseDataFrame`` back to sparse SciPy matrix in COO format, you sdf.to_coo() +.. _whatsnew_0200.enhancements.intervalindex: + +IntervalIndex +^^^^^^^^^^^^^ + +pandas has gained an ``IntervalIndex`` with its own dtype, ``interval``, as well as the ``Interval`` scalar type. These allow first-class support for interval +notation, specifically as the return type for ``pd.cut`` and ``pd.qcut``. (:issue:`7640`, :issue:`8625`) + +**Previous behavior**: + +.. code-block:: ipython + + In [2]: pd.cut(range(3), 2) + Out[2]: + [(-0.002, 1], (-0.002, 1], (1, 2]] + Categories (2, object): [(-0.002, 1] < (1, 2]] + + # the returned categories are strings, representing Intervals + In [3]: pd.cut(range(3), 2).categories + Out[3]: Index(['(-0.002, 1]', '(1, 2]'], dtype='object') + +**New behavior**: + +.. ipython:: python + + c = pd.cut(range(3), 2) + c + c.categories + pd.api.types.is_interval_dtype(c.categories) + .. _whatsnew_0200.enhancements.other: Other Enhancements diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index a4e5bee9a8746..c8aedcef77502 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -39,6 +39,8 @@ PyDateTime_IMPORT cdef extern from "Python.h": int PySlice_Check(object) +cdef size_t _INIT_VEC_CAP = 128 + include "hashtable_class_helper.pxi" include "hashtable_func_helper.pxi" diff --git a/pandas/src/interval.pyx b/pandas/_libs/interval.pyx similarity index 68% rename from pandas/src/interval.pyx rename to pandas/_libs/interval.pyx index 495730e0fd6a1..60a34aff16e9d 100644 --- a/pandas/src/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -2,8 +2,11 @@ cimport numpy as np import numpy as np import pandas as pd +cimport util cimport cython import cython +from numpy cimport * +from tslib import Timestamp from cpython.object cimport (Py_EQ, Py_NE, Py_GT, Py_LT, Py_GE, Py_LE, PyObject_RichCompare) @@ -44,6 +47,20 @@ cdef _interval_like(other): cdef class Interval(IntervalMixin): + """ + Immutable object implementing an Interval, a bounded slice-like interval. + + .. versionadded:: 0.20.0 + + Properties + ---------- + left, right : values + Left and right bounds for each interval.
+ closed : {'left', 'right', 'both', 'neither'} + Whether the interval is closed on the left-side, right-side, both or + neither. Defaults to 'right'. + """ + cdef readonly object left, right cdef readonly str closed @@ -84,88 +101,115 @@ cdef class Interval(IntervalMixin): return NotImplemented else: op_str = {Py_LT: '<', Py_LE: '<=', Py_GT: '>', Py_GE: '>='}[op] - raise TypeError('unorderable types: %s() %s %s()' % - (type(self).__name__, op_str, type(other).__name__)) + raise TypeError( + 'unorderable types: %s() %s %s()' % + (type(self).__name__, op_str, type(other).__name__)) def __reduce__(self): args = (self.left, self.right, self.closed) return (type(self), args) + def _repr_base(self): + left = self.left + right = self.right + + # TODO: need more general formatting methodology here + if isinstance(left, Timestamp) and isinstance(right, Timestamp): + left = left._short_repr + right = right._short_repr + + return left, right + def __repr__(self): + + left, right = self._repr_base() return ('%s(%r, %r, closed=%r)' % - (type(self).__name__, self.left, self.right, self.closed)) + (type(self).__name__, left, right, self.closed)) def __str__(self): + + left, right = self._repr_base() start_symbol = '[' if self.closed_left else '(' end_symbol = ']' if self.closed_right else ')' - return '%s%s, %s%s' % (start_symbol, self.left, self.right, end_symbol) + return '%s%s, %s%s' % (start_symbol, left, right, end_symbol) def __add__(self, y): if isinstance(y, numbers.Number): return Interval(self.left + y, self.right + y) elif isinstance(y, Interval) and isinstance(self, numbers.Number): return Interval(y.left + self, y.right + self) - else: - raise NotImplemented + return NotImplemented def __sub__(self, y): if isinstance(y, numbers.Number): return Interval(self.left - y, self.right - y) - else: - raise NotImplemented + return NotImplemented def __mul__(self, y): if isinstance(y, numbers.Number): return Interval(self.left * y, self.right * y) elif isinstance(y, Interval) and isinstance(self, numbers.Number): return Interval(y.left * self, y.right * self) - else: - return NotImplemented + return NotImplemented def __div__(self, y): if isinstance(y, numbers.Number): return Interval(self.left / y, self.right / y) - else: - return NotImplemented + return NotImplemented def __truediv__(self, y): if isinstance(y, numbers.Number): return Interval(self.left / y, self.right / y) - else: - return NotImplemented + return NotImplemented def __floordiv__(self, y): if isinstance(y, numbers.Number): return Interval(self.left // y, self.right // y) - else: - return NotImplemented + return NotImplemented @cython.wraparound(False) @cython.boundscheck(False) -cpdef interval_bounds_to_intervals(np.ndarray left, np.ndarray right, - str closed): - result = np.empty(len(left), dtype=object) - nulls = pd.isnull(left) | pd.isnull(right) - result[nulls] = np.nan - for i in np.flatnonzero(~nulls): - result[i] = Interval(left[i], right[i], closed) - return result +cpdef intervals_to_interval_bounds(ndarray intervals): + """ + Parameters + ---------- + intervals: ndarray object array of Intervals / nulls + Returns + ------- + tuples (left: ndarray object array, + right: ndarray object array, + closed: str) + + """ + + cdef: + object closed = None, interval + int64_t n = len(intervals) + ndarray left, right + + left = np.empty(n, dtype=object) + right = np.empty(n, dtype=object) -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef intervals_to_interval_bounds(np.ndarray intervals): - left = np.empty(len(intervals), 
dtype=object) - right = np.empty(len(intervals), dtype=object) - cdef str closed = None for i in range(len(intervals)): interval = intervals[i] + if util._checknull(interval): + left[i] = np.nan + right[i] = np.nan + continue + + if not isinstance(interval, Interval): + raise TypeError("type {} with value {} is not an interval".format( + type(interval), interval)) + left[i] = interval.left right[i] = interval.right if closed is None: closed = interval.closed elif closed != interval.closed: raise ValueError('intervals must all be closed on the same side') + return left, right, closed +include "intervaltree.pxi" diff --git a/pandas/src/generate_intervaltree.py b/pandas/_libs/intervaltree.pxi.in similarity index 68% rename from pandas/src/generate_intervaltree.py rename to pandas/_libs/intervaltree.pxi.in index 275a0d40e2433..4fa0d6d156fa2 100644 --- a/pandas/src/generate_intervaltree.py +++ b/pandas/_libs/intervaltree.pxi.in @@ -1,22 +1,9 @@ """ -This file generates `intervaltree.pyx` which is then included in `../lib.pyx` -during building. To regenerate `intervaltree.pyx`, just run: +Template for intervaltree - `python generate_intervaltree.py`. +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ -from __future__ import print_function -import os -from pandas.compat import StringIO -import numpy as np - -warning_to_new_contributors = """ -# DO NOT EDIT THIS FILE: This file was autogenerated from -# generate_intervaltree.py, so please edit that file and then run -# `python2 generate_intervaltree.py` to re-generate this file. -""" - -header = r''' from numpy cimport int64_t, float64_t from numpy cimport ndarray, PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take import numpy as np @@ -28,22 +15,27 @@ from hashtable cimport Int64Vector, Int64VectorData -ctypedef fused scalar64_t: +ctypedef fused scalar_t: float64_t + float32_t int64_t + int32_t -NODE_CLASSES = {} - +#---------------------------------------------------------------------- +# IntervalTree +#---------------------------------------------------------------------- cdef class IntervalTree(IntervalMixin): """A centered interval tree Based off the algorithm described on Wikipedia: http://en.wikipedia.org/wiki/Interval_tree + + we are emulating the IndexEngine interface """ cdef: - readonly object left, right, root + readonly object left, right, root, dtype readonly str closed object _left_sorter, _right_sorter @@ -67,15 +59,15 @@ def __init__(self, left, right, closed='right', leaf_size=100): left = np.asarray(left) right = np.asarray(right) - dtype = np.result_type(left, right) - self.left = np.asarray(left, dtype=dtype) - self.right = np.asarray(right, dtype=dtype) + self.dtype = np.result_type(left, right) + self.left = np.asarray(left, dtype=self.dtype) + self.right = np.asarray(right, dtype=self.dtype) indices = np.arange(len(left), dtype='int64') self.closed = closed - node_cls = NODE_CLASSES[str(dtype), closed] + node_cls = NODE_CLASSES[str(self.dtype), closed] self.root = node_cls(self.left, self.right, indices, leaf_size) @property @@ -94,7 +86,7 @@ def right_sorter(self): self._right_sorter = np.argsort(self.right) return self._right_sorter - def get_loc(self, scalar64_t key): + def get_loc(self, scalar_t key): """Return all positions corresponding to intervals that overlap with the given scalar key """ @@ -131,13 +123,15 @@ def get_loc_interval(self, key_left, key_right): uniques = pd.unique(combined) return uniques - def get_indexer(self, scalar64_t[:] target): + def get_indexer(self, scalar_t[:] target): 
"""Return the positions corresponding to unique intervals that overlap with the given array of scalar targets. """ + # TODO: write get_indexer_intervals cdef: - int64_t old_len, i + size_t old_len + Py_ssize_t i Int64Vector result result = Int64Vector() @@ -152,12 +146,13 @@ def get_indexer(self, scalar64_t[:] target): old_len = result.data.n return result.to_array() - def get_indexer_non_unique(self, scalar64_t[:] target): + def get_indexer_non_unique(self, scalar_t[:] target): """Return the positions corresponding to intervals that overlap with the given array of scalar targets. Non-unique positions are repeated. """ cdef: - int64_t old_len, i + size_t old_len + Py_ssize_t i Int64Vector result, missing result = Int64Vector() @@ -172,8 +167,14 @@ def get_indexer_non_unique(self, scalar64_t[:] target): return result.to_array(), missing.to_array() def __repr__(self): - return ('' - % self.root.n_elements) + return (''.format( + dtype=self.dtype, closed=self.closed, + n_elements=self.root.n_elements)) + + # compat with IndexEngine interface + def clear_mapping(self): + pass cdef take(ndarray source, ndarray indices): @@ -189,37 +190,66 @@ def __repr__(self): sorted_values = take(values, sorter) sorted_indices = take(indices, sorter) return sorted_values, sorted_indices -''' + +#---------------------------------------------------------------------- +# Nodes +#---------------------------------------------------------------------- # we need specialized nodes and leaves to optimize for different dtype and # closed values -# unfortunately, fused dtypes can't parameterize attributes on extension types, -# so we're stuck using template generation. -node_template = r''' -cdef class {dtype_title}Closed{closed_title}IntervalNode: +{{py: + +nodes = [] +for dtype in ['float32', 'float64', 'int32', 'int64']: + for closed, cmp_left, cmp_right in [ + ('left', '<=', '<'), + ('right', '<', '<='), + ('both', '<=', '<='), + ('neither', '<', '<')]: + cmp_left_converse = '<' if cmp_left == '<=' else '<=' + cmp_right_converse = '<' if cmp_right == '<=' else '<=' + nodes.append((dtype, dtype.title(), + closed, closed.title(), + cmp_left, + cmp_right, + cmp_left_converse, + cmp_right_converse)) + +}} + +NODE_CLASSES = {} + +{{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right, + cmp_left_converse, cmp_right_converse in nodes}} + +cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: """Non-terminal node for an IntervalTree Categorizes intervals by those that fall to the left, those that fall to the right, and those that overlap with the pivot. """ cdef: - {dtype_title}Closed{closed_title}IntervalNode left_node, right_node - {dtype}_t[:] center_left_values, center_right_values, left, right + {{dtype_title}}Closed{{closed_title}}IntervalNode left_node, right_node + {{dtype}}_t[:] center_left_values, center_right_values, left, right int64_t[:] center_left_indices, center_right_indices, indices - {dtype}_t min_left, max_right - readonly {dtype}_t pivot + {{dtype}}_t min_left, max_right + readonly {{dtype}}_t pivot readonly int64_t n_elements, n_center, leaf_size readonly bint is_leaf_node def __init__(self, - ndarray[{dtype}_t, ndim=1] left, - ndarray[{dtype}_t, ndim=1] right, + ndarray[{{dtype}}_t, ndim=1] left, + ndarray[{{dtype}}_t, ndim=1] right, ndarray[int64_t, ndim=1] indices, int64_t leaf_size): self.n_elements = len(left) self.leaf_size = leaf_size + + # min_left and min_right are used to speed-up query by skipping + # query on sub-nodes. 
If this node has size 0, query is cheap, + # so these values don't matter. if left.size > 0: self.min_left = left.min() self.max_right = right.max() @@ -233,15 +263,18 @@ def __init__(self, self.left = left self.right = right self.indices = indices - self.n_center + self.n_center = 0 else: # calculate a pivot so we can create child nodes self.is_leaf_node = False self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) + left_set, right_set, center_set = self.classify_intervals( + left, right) - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) + self.left_node = self.new_child_node(left, right, + indices, left_set) + self.right_node = self.new_child_node(left, right, + indices, right_set) self.center_left_values, self.center_left_indices = \ sort_values_and_indices(left, indices, center_set) @@ -251,7 +284,7 @@ def __init__(self, @cython.wraparound(False) @cython.boundscheck(False) - cdef classify_intervals(self, {dtype}_t[:] left, {dtype}_t[:] right): + cdef classify_intervals(self, {{dtype}}_t[:] left, {{dtype}}_t[:] right): """Classify the given intervals based upon whether they fall to the left, right, or overlap with this node's pivot. """ @@ -264,9 +297,9 @@ def __init__(self, overlapping_ind = Int64Vector() for i in range(self.n_elements): - if right[i] {cmp_right_converse} self.pivot: + if right[i] {{cmp_right_converse}} self.pivot: left_ind.append(i) - elif self.pivot {cmp_left_converse} left[i]: + elif self.pivot {{cmp_left_converse}} left[i]: right_ind.append(i) else: overlapping_ind.append(i) @@ -276,8 +309,8 @@ def __init__(self, overlapping_ind.to_array()) cdef new_child_node(self, - ndarray[{dtype}_t, ndim=1] left, - ndarray[{dtype}_t, ndim=1] right, + ndarray[{{dtype}}_t, ndim=1] left, + ndarray[{{dtype}}_t, ndim=1] right, ndarray[int64_t, ndim=1] indices, ndarray[int64_t, ndim=1] subset): """Create a new child node. @@ -285,19 +318,19 @@ def __init__(self, left = take(left, subset) right = take(right, subset) indices = take(indices, subset) - return {dtype_title}Closed{closed_title}IntervalNode( + return {{dtype_title}}Closed{{closed_title}}IntervalNode( left, right, indices, self.leaf_size) @cython.wraparound(False) @cython.boundscheck(False) @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): + cpdef query(self, Int64Vector result, scalar_t point): """Recursively query this node and its sub-nodes for intervals that overlap with the query point. """ cdef: int64_t[:] indices - {dtype}_t[:] values + {{dtype}}_t[:] values Py_ssize_t i if self.is_leaf_node: @@ -305,7 +338,7 @@ def __init__(self, # continue the binary tree structure. Instead, we use linear # search. for i in range(self.n_elements): - if self.left[i] {cmp_left} point {cmp_right} self.right[i]: + if self.left[i] {{cmp_left}} point {{cmp_right}} self.right[i]: result.append(self.indices[i]) else: # There are child nodes. 
Based on comparing our query to the pivot, @@ -314,36 +347,41 @@ def __init__(self, values = self.center_left_values indices = self.center_left_indices for i in range(self.n_center): - if not values[i] {cmp_left} point: + if not values[i] {{cmp_left}} point: break result.append(indices[i]) - if point {cmp_right} self.left_node.max_right: + if point {{cmp_right}} self.left_node.max_right: self.left_node.query(result, point) elif point > self.pivot: values = self.center_right_values indices = self.center_right_indices for i in range(self.n_center - 1, -1, -1): - if not point {cmp_right} values[i]: + if not point {{cmp_right}} values[i]: break result.append(indices[i]) - if self.right_node.min_left {cmp_left} point: + if self.right_node.min_left {{cmp_left}} point: self.right_node.query(result, point) else: result.extend(self.center_left_indices) def __repr__(self): if self.is_leaf_node: - return ('<{dtype_title}Closed{closed_title}IntervalNode: ' + return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: ' '%s elements (terminal)>' % self.n_elements) else: n_left = self.left_node.n_elements n_right = self.right_node.n_elements n_center = self.n_elements - n_left - n_right - return ('<{dtype_title}Closed{closed_title}IntervalNode: pivot %s, ' - '%s elements (%s left, %s right, %s overlapping)>' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) + return ('<{{dtype_title}}Closed{{closed_title}}IntervalNode: ' + 'pivot %s, %s elements (%s left, %s right, %s ' + 'overlapping)>' % (self.pivot, self.n_elements, + n_left, n_right, n_center)) def counts(self): + """ + Inspect counts on this node + useful for debugging purposes + """ if self.is_leaf_node: return self.n_elements else: @@ -352,44 +390,7 @@ def counts(self): r = self.right_node.counts() return (m, (l, r)) -NODE_CLASSES['{dtype}', '{closed}'] = {dtype_title}Closed{closed_title}IntervalNode -''' - - -def generate_node_template(): - output = StringIO() - for dtype in ['float64', 'int64']: - for closed, cmp_left, cmp_right in [ - ('left', '<=', '<'), - ('right', '<', '<='), - ('both', '<=', '<='), - ('neither', '<', '<')]: - cmp_left_converse = '<' if cmp_left == '<=' else '<=' - cmp_right_converse = '<' if cmp_right == '<=' else '<=' - classes = node_template.format(dtype=dtype, - dtype_title=dtype.title(), - closed=closed, - closed_title=closed.title(), - cmp_left=cmp_left, - cmp_right=cmp_right, - cmp_left_converse=cmp_left_converse, - cmp_right_converse=cmp_right_converse) - output.write(classes) - output.write("\n") - return output.getvalue() - - -def generate_cython_file(): - # Put `intervaltree.pyx` in the same directory as this file - directory = os.path.dirname(os.path.realpath(__file__)) - filename = 'intervaltree.pyx' - path = os.path.join(directory, filename) - - with open(path, 'w') as f: - print(warning_to_new_contributors, file=f) - print(header, file=f) - print(generate_node_template(), file=f) - - -if __name__ == '__main__': - generate_cython_file() +NODE_CLASSES['{{dtype}}', + '{{closed}}'] = {{dtype_title}}Closed{{closed_title}}IntervalNode + +{{endfor}} diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index f90fd1e5bb44b..31402c38c770d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -61,6 +61,8 @@ from tslib cimport (convert_to_tsobject, convert_to_timedelta64, _check_all_nulls) import tslib from tslib import NaT, Timestamp, Timedelta +import interval +from interval import Interval cdef int64_t NPY_NAT = util.get_nat() @@ -259,7 +261,7 @@ cpdef bint isscalar(object val): or 
PyDelta_Check(val) or PyTime_Check(val) or util.is_period_object(val) - or is_decimal(val), + or is_decimal(val) or is_interval(val)) @@ -1898,6 +1900,4 @@ cdef class BlockPlacement: include "reduce.pyx" include "properties.pyx" -include "interval.pyx" -include "intervaltree.pyx" include "inference.pyx" diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 0c85f488dd311..f7dbae4ab736e 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -33,6 +33,10 @@ cpdef bint is_decimal(object obj): return isinstance(obj, Decimal) +cpdef bint is_interval(object obj): + return isinstance(obj, Interval) + + cpdef bint is_period(object val): """ Return a boolean if this is a Period object """ return util.is_period_object(val) @@ -430,7 +434,7 @@ def infer_dtype(object value): return 'period' elif is_interval(val): - if is_interval_array_fixed_closed(values): + if is_interval_array(values): return 'interval' for i in range(n): @@ -883,22 +887,22 @@ cpdef bint is_period_array(ndarray[object] values): return False return null_count != n -cdef inline bint is_interval(object o): - return isinstance(o, Interval) -def is_interval_array_fixed_closed(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) - cdef str closed +cpdef bint is_interval_array(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values), null_count = 0 + object v + if n == 0: return False for i in range(n): - if not is_interval(values[i]): - return False - if i == 0: - closed = values[0].closed - elif closed != values[i].closed: + v = values[i] + if util._checknull(v): + null_count += 1 + continue + if not is_interval(v): return False - return True + return null_count != n cdef extern from "parse_helper.h": diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index ed0bb263ed6cf..47679966e3d5c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -1296,6 +1296,18 @@ cdef class _Timestamp(datetime): return result + property _short_repr: + def __get__(self): + # format a Timestamp with only _date_repr if possible + # otherwise _repr_base + if (self.hour == 0 and + self.minute == 0 and + self.second == 0 and + self.microsecond == 0 and + self.nanosecond == 0): + return self._date_repr + return self._repr_base + property asm8: def __get__(self): return np.datetime64(self.value, 'ns') diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d72ee71570adb..2a2789843207a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -605,8 +605,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if bins is not None: try: from pandas.tools.tile import cut - values = Series(values).values - cat, bins = cut(values, bins, retbins=True) + values = Series(values) + ii = cut(values, bins, include_lowest=True) except TypeError: raise TypeError("bins argument only works with numeric data.") @@ -623,12 +623,18 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if not isinstance(keys, Index): keys = Index(keys) result = Series(counts, index=keys, name=name) + # count, remove nulls (from the index), and but the bins + result = ii.value_counts(dropna=dropna) + result = result[result.index.notnull()] + result.index = result.index.astype('interval') + result = result.sort_index() - if bins is not None: - # TODO: This next line should be more efficient - result = result.reindex(np.arange(len(cat.categories)), - fill_value=0) - result.index = bins[:-1] + # if we are dropna and we have NO values + if 
dropna and (result.values == 0).all(): + result = result.iloc[0:0] + + # normalizing is by len of all (regardless of dropna) + counts = np.array([len(ii)]) if sort: result = result.sort_values(ascending=ascending) @@ -1395,6 +1401,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=allow_fill) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + elif is_interval_dtype(arr): + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) diff --git a/pandas/core/api.py b/pandas/core/api.py index dbb5e22358c18..ea5be17ef3aaf 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -11,8 +11,8 @@ from pandas.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, UInt64Index, RangeIndex, Float64Index, - MultiIndex) -from pandas.core.interval import Interval, IntervalIndex + MultiIndex, IntervalIndex) +from pandas.indexes.interval import Interval, interval_range from pandas.core.series import Series from pandas.core.frame import DataFrame diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 69c90d8cc9efd..45a9577c8d8b2 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -18,6 +18,7 @@ from pandas.types.common import (is_numeric_dtype, is_timedelta64_dtype, is_datetime64_dtype, is_categorical_dtype, + is_interval_dtype, is_datetimelike, is_datetime64_any_dtype, is_bool, is_integer_dtype, @@ -40,10 +41,11 @@ from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) +from pandas.core.index import (Index, MultiIndex, + CategoricalIndex, _ensure_index) from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.interval import IntervalIndex from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series from pandas.core.panel import Panel @@ -2659,7 +2661,7 @@ def _convert_grouper(axis, grouper): return grouper.reindex(axis)._values elif isinstance(grouper, (list, Series, Index, np.ndarray)): if len(grouper) != len(axis): - raise AssertionError('Grouper and axis must be same length') + raise ValueError('Grouper and axis must be same length') return grouper else: return grouper @@ -3144,28 +3146,29 @@ def value_counts(self, normalize=False, sort=True, ascending=False, if bins is None: lab, lev = algorithms.factorize(val, sort=True) + llab = lambda lab, inc: lab[inc] else: - raise NotImplementedError('this is broken') - lab, bins = cut(val, bins, retbins=True) - # bins[:-1] for backward compat; - # o.w. cat.categories could be better - # cat = Categorical(cat) - # lab, lev, dropna = cat.codes, bins[:-1], False - - if (lab.dtype == object - and lib.is_interval_array_fixed_closed(lab[notnull(lab)])): - lab_index = Index(lab) - assert isinstance(lab, IntervalIndex) - sorter = np.lexsort((lab_index.left, lab_index.right, ids)) + + # lab is a Categorical with categories an IntervalIndex + lab = cut(Series(val), bins, include_lowest=True) + lev = lab.cat.categories + lab = lev.take(lab.cat.codes) + llab = lambda lab, inc: lab[inc]._multiindex.labels[-1] + + if is_interval_dtype(lab): + # TODO: should we do this inside II? 
+ sorter = np.lexsort((lab.left, lab.right, ids)) else: sorter = np.lexsort((lab, ids)) + ids, lab = ids[sorter], lab[sorter] # group boundaries are where group ids change idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] # new values are where sorted labels change - inc = np.r_[True, lab[1:] != lab[:-1]] + lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) + inc = np.r_[True, lchanges] inc[idx] = True # group boundaries are also new values out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts @@ -3173,7 +3176,7 @@ def value_counts(self, normalize=False, sort=True, ascending=False, rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx)) # multi-index components - labels = list(map(rep, self.grouper.recons_labels)) + [lab[inc]] + labels = list(map(rep, self.grouper.recons_labels)) + [llab(lab, inc)] levels = [ping.group_index for ping in self.grouper.groupings] + [lev] names = self.grouper.names + [self._selection_name] @@ -3199,13 +3202,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False, acc = rep(d) out /= acc - if sort: # and bins is None: + if sort and bins is None: cat = ids[inc][mask] if dropna else ids[inc] sorter = np.lexsort((out if ascending else -out, cat)) out, labels[-1] = out[sorter], labels[-1][sorter] - # if bins is None: - if True: + if bins is None: mi = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9e22bdd5facc4..c9ff26d135f58 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1087,10 +1087,10 @@ def _getitem_iterable(self, key, axis=0): return self.obj.take(inds, axis=axis, convert=False) else: # Have the index compute an indexer or return None - # if it cannot handle + # if it cannot handle; we only act on all found values indexer, keyarr = labels._convert_listlike_indexer( key, kind=self.name) - if indexer is not None: + if indexer is not None and (indexer != -1).all(): return self.obj.take(indexer, axis=axis) # existing labels are unique and indexer are unique @@ -1429,7 +1429,7 @@ def error(): try: key = self._convert_scalar_indexer(key, axis) - if key not in ax: + if not ax._is_contained_in(key): error() except TypeError as e: @@ -1897,7 +1897,7 @@ def convert_to_index_sliceable(obj, key): elif isinstance(key, compat.string_types): # we are an actual column - if key in obj._data.items: + if obj._data.items._is_contained_in(key): return None # We might have a datetimelike string that we can translate to a diff --git a/pandas/core/interval.py b/pandas/core/interval.py deleted file mode 100644 index 68e07f21367a0..0000000000000 --- a/pandas/core/interval.py +++ /dev/null @@ -1,521 +0,0 @@ -import operator - -import numpy as np -import pandas as pd - -from pandas.core.base import PandasObject, IndexOpsMixin -from pandas.core.common import (_values_from_object, _ensure_platform_int, - notnull, is_datetime_or_timedelta_dtype, - is_integer_dtype, is_float_dtype) -from pandas.core.index import (Index, _ensure_index, default_pprint, - InvalidIndexError, MultiIndex) -from pandas.lib import (Interval, IntervalMixin, IntervalTree, - interval_bounds_to_intervals, - intervals_to_interval_bounds) -from pandas.util.decorators import cache_readonly -import pandas.core.common as com - - -_VALID_CLOSED = set(['left', 'right', 'both', 'neither']) - - -def _get_next_label(label): - dtype = getattr(label, 'dtype', type(label)) - if isinstance(label, (pd.Timestamp, pd.Timedelta)): - dtype = 'datetime64' - if 
is_datetime_or_timedelta_dtype(dtype): - return label + np.timedelta64(1, 'ns') - elif is_integer_dtype(dtype): - return label + 1 - elif is_float_dtype(dtype): - return np.nextafter(label, np.infty) - else: - raise TypeError('cannot determine next label for type %r' - % type(label)) - - -def _get_prev_label(label): - dtype = getattr(label, 'dtype', type(label)) - if isinstance(label, (pd.Timestamp, pd.Timedelta)): - dtype = 'datetime64' - if is_datetime_or_timedelta_dtype(dtype): - return label - np.timedelta64(1, 'ns') - elif is_integer_dtype(dtype): - return label - 1 - elif is_float_dtype(dtype): - return np.nextafter(label, -np.infty) - else: - raise TypeError('cannot determine next label for type %r' - % type(label)) - - -def _get_interval_closed_bounds(interval): - """ - Given an Interval or IntervalIndex, return the corresponding interval with - closed bounds. - """ - left, right = interval.left, interval.right - if interval.open_left: - left = _get_next_label(left) - if interval.open_right: - right = _get_prev_label(right) - return left, right - - -class IntervalIndex(IntervalMixin, Index): - """ - Immutable Index implementing an ordered, sliceable set. IntervalIndex - represents an Index of intervals that are all closed on the same side. - - .. versionadded:: 0.18 - - Properties - ---------- - left, right : array-like (1-dimensional) - Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, optional - Whether the intervals are closed on the left-side, right-side, both or - neither. Defaults to 'right'. - name : object, optional - Name to be stored in the index. - """ - _typ = 'intervalindex' - _comparables = ['name'] - _attributes = ['name', 'closed'] - _allow_index_ops = True - _engine = None # disable it - - def __new__(cls, left, right, closed='right', name=None, fastpath=False): - # TODO: validation - result = IntervalMixin.__new__(cls) - result._left = _ensure_index(left) - result._right = _ensure_index(right) - result._closed = closed - result.name = name - if not fastpath: - result._validate() - result._reset_identity() - return result - - def _validate(self): - """Verify that the IntervalIndex is valid. - """ - # TODO: exclude periods? - if self.closed not in _VALID_CLOSED: - raise ValueError("invalid options for 'closed': %s" % self.closed) - if len(self.left) != len(self.right): - raise ValueError('left and right must have the same length') - left_valid = notnull(self.left) - right_valid = notnull(self.right) - if not (left_valid == right_valid).all(): - raise ValueError('missing values must be missing in the same ' - 'location both left and right sides') - if not (self.left[left_valid] <= self.right[left_valid]).all(): - raise ValueError('left side of interval must be <= right side') - - def _simple_new(cls, values, name=None, **kwargs): - # ensure we don't end up here (this is a superclass method) - raise NotImplementedError - - def _cleanup(self): - pass - - @property - def _engine(self): - raise NotImplementedError - - @cache_readonly - def _tree(self): - return IntervalTree(self.left, self.right, closed=self.closed) - - @property - def _constructor(self): - return type(self).from_intervals - - @classmethod - def from_breaks(cls, breaks, closed='right', name=None): - """ - Construct an IntervalIndex from an array of splits - - Parameters - ---------- - breaks : array-like (1-dimensional) - Left and right bounds for each interval. 
- closed : {'left', 'right', 'both', 'neither'}, optional - Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. - name : object, optional - Name to be stored in the index. - - Examples - -------- - - >>> IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex(left=[0, 1, 2], - right=[1, 2, 3], - closed='right') - """ - return cls(breaks[:-1], breaks[1:], closed, name) - - @classmethod - def from_intervals(cls, data, name=None): - """ - Construct an IntervalIndex from a 1d array of Interval objects - - Parameters - ---------- - data : array-like (1-dimensional) - Array of Interval objects. All intervals must be closed on the same - sides. - name : object, optional - Name to be stored in the index. - - Examples - -------- - - >>> IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) - IntervalIndex(left=[0, 1], - right=[1, 2], - closed='right') - - The generic Index constructor work identically when it infers an array - of all intervals: - - >>> Index([Interval(0, 1), Interval(1, 2)]) - IntervalIndex(left=[0, 1], - right=[1, 2], - closed='right') - """ - data = np.asarray(data) - left, right, closed = intervals_to_interval_bounds(data) - return cls(left, right, closed, name) - - @classmethod - def from_tuples(cls, data, closed='right', name=None): - left = [] - right = [] - for l, r in data: - left.append(l) - right.append(r) - return cls(np.array(left), np.array(right), closed, name) - - def to_tuples(self): - return Index(com._asarray_tuplesafe(zip(self.left, self.right))) - - @cache_readonly - def _multiindex(self): - return MultiIndex.from_arrays([self.left, self.right], - names=['left', 'right']) - - @property - def left(self): - return self._left - - @property - def right(self): - return self._right - - @property - def closed(self): - return self._closed - - def __len__(self): - return len(self.left) - - @cache_readonly - def values(self): - """Returns the IntervalIndex's data as a numpy array of Interval - objects (with dtype='object') - """ - left = np.asarray(self.left) - right = np.asarray(self.right) - return interval_bounds_to_intervals(left, right, self.closed) - - def __array__(self, result=None): - """ the array interface, return my values """ - return self.values - - def __array_wrap__(self, result, context=None): - # we don't want the superclass implementation - return result - - def _array_values(self): - return self.values - - def __reduce__(self): - return self.__class__, (self.left, self.right, self.closed, self.name) - - def _shallow_copy(self, values=None, name=None): - name = name if name is not None else self.name - if values is not None: - return type(self).from_intervals(values, name=name) - else: - return self.copy(name=name) - - def copy(self, deep=False, name=None): - left = self.left.copy(deep=True) if deep else self.left - right = self.right.copy(deep=True) if deep else self.right - name = name if name is not None else self.name - return type(self)(left, right, closed=self.closed, name=name, - fastpath=True) - - @cache_readonly - def dtype(self): - return np.dtype('O') - - @cache_readonly - def mid(self): - """Returns the mid-point of each interval in the index as an array - """ - try: - return Index(0.5 * (self.left.values + self.right.values)) - except TypeError: - # datetime safe version - delta = self.right.values - self.left.values - return Index(self.left.values + 0.5 * delta) - - @cache_readonly - def is_monotonic_increasing(self): - return self._multiindex.is_monotonic_increasing - - 
@cache_readonly - def is_monotonic_decreasing(self): - return self._multiindex.is_monotonic_decreasing - - @cache_readonly - def is_unique(self): - return self._multiindex.is_unique - - @cache_readonly - def is_non_overlapping_monotonic(self): - # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) - # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) - # we already require left <= right - return ((self.right[:-1] <= self.left[1:]).all() or - (self.left[:-1] >= self.right[1:]).all()) - - def _convert_scalar_indexer(self, key, kind=None): - return key - - def _maybe_cast_slice_bound(self, label, side, kind): - return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) - - def _convert_list_indexer(self, keyarr, kind=None): - """ - we are passed a list indexer. - Return our indexer or raise if all of the values are not included in the categories - """ - locs = self.get_indexer(keyarr) - # TODO: handle keyarr if it includes intervals - if (locs == -1).any(): - raise KeyError("a list-indexer must only include existing intervals") - - return locs - - def _check_method(self, method): - if method is not None: - raise NotImplementedError( - 'method %r not yet implemented for IntervalIndex' % method) - - def _searchsorted_monotonic(self, label, side, exclude_label=False): - if not self.is_non_overlapping_monotonic: - raise KeyError('can only get slices from an IntervalIndex if ' - 'bounds are non-overlapping and all monotonic ' - 'increasing or decreasing') - - if isinstance(label, IntervalMixin): - raise NotImplementedError - - if ((side == 'left' and self.left.is_monotonic_increasing) or - (side == 'right' and self.left.is_monotonic_decreasing)): - sub_idx = self.right - if self.open_right or exclude_label: - label = _get_next_label(label) - else: - sub_idx = self.left - if self.open_left or exclude_label: - label = _get_prev_label(label) - - return sub_idx._searchsorted_monotonic(label, side) - - def _get_loc_only_exact_matches(self, key): - return self._multiindex._tuple_index.get_loc(key) - - def _find_non_overlapping_monotonic_bounds(self, key): - if isinstance(key, IntervalMixin): - start = self._searchsorted_monotonic( - key.left, 'left', exclude_label=key.open_left) - stop = self._searchsorted_monotonic( - key.right, 'right', exclude_label=key.open_right) - else: - # scalar - start = self._searchsorted_monotonic(key, 'left') - stop = self._searchsorted_monotonic(key, 'right') - return start, stop - - def get_loc(self, key, method=None): - self._check_method(method) - - original_key = key - - if self.is_non_overlapping_monotonic: - if isinstance(key, Interval): - left = self._maybe_cast_slice_bound(key.left, 'left', None) - right = self._maybe_cast_slice_bound(key.right, 'right', None) - key = Interval(left, right, key.closed) - else: - key = self._maybe_cast_slice_bound(key, 'left', None) - - start, stop = self._find_non_overlapping_monotonic_bounds(key) - - if start + 1 == stop: - return start - elif start < stop: - return slice(start, stop) - else: - raise KeyError(original_key) - - else: - # use the interval tree - if isinstance(key, Interval): - left, right = _get_interval_closed_bounds(key) - return self._tree.get_loc_interval(left, right) - else: - return self._tree.get_loc(key) - - def get_value(self, series, key): - # this method seems necessary for Series.__getitem__ but I have no idea - # what it should actually do here... - loc = self.get_loc(key) # nb. 
this can't handle slice objects - return series.iloc[loc] - - def get_indexer(self, target, method=None, limit=None, tolerance=None): - self._check_method(method) - target = _ensure_index(target) - - if self.is_non_overlapping_monotonic: - start, stop = self._find_non_overlapping_monotonic_bounds(target) - - start_plus_one = start + 1 - if (start_plus_one < stop).any(): - raise ValueError('indexer corresponds to non-unique elements') - return np.where(start_plus_one == stop, start, -1) - - else: - if isinstance(target, IntervalIndex): - raise NotImplementedError( - 'have not yet implemented get_indexer ' - 'for IntervalIndex indexers') - else: - return self._tree.get_indexer(target) - - def delete(self, loc): - new_left = self.left.delete(loc) - new_right = self.right.delete(loc) - return type(self)(new_left, new_right, self.closed, self.name, - fastpath=True) - - def insert(self, loc, item): - if not isinstance(item, Interval): - raise ValueError('can only insert Interval objects into an ' - 'IntervalIndex') - if not item.closed == self.closed: - raise ValueError('inserted item must be closed on the same side ' - 'as the index') - new_left = self.left.insert(loc, item.left) - new_right = self.right.insert(loc, item.right) - return type(self)(new_left, new_right, self.closed, self.name, - fastpath=True) - - def _as_like_interval_index(self, other, error_msg): - self._assert_can_do_setop(other) - other = _ensure_index(other) - if (not isinstance(other, IntervalIndex) or - self.closed != other.closed): - raise ValueError(error_msg) - return other - - def append(self, other): - msg = ('can only append two IntervalIndex objects that are closed on ' - 'the same side') - other = self._as_like_interval_index(other, msg) - new_left = self.left.append(other.left) - new_right = self.right.append(other.right) - if other.name is not None and other.name != self.name: - name = None - else: - name = self.name - return type(self)(new_left, new_right, self.closed, name, - fastpath=True) - - def take(self, indexer, axis=0): - indexer = com._ensure_platform_int(indexer) - new_left = self.left.take(indexer) - new_right = self.right.take(indexer) - return type(self)(new_left, new_right, self.closed, self.name, - fastpath=True) - - def __contains__(self, key): - try: - self.get_loc(key) - return True - except KeyError: - return False - - def __getitem__(self, value): - left = self.left[value] - right = self.right[value] - if not isinstance(left, Index): - return Interval(left, right, self.closed) - else: - return type(self)(left, right, self.closed, self.name) - - # __repr__ associated methods are based on MultiIndex - - def _format_attrs(self): - attrs = [('left', default_pprint(self.left)), - ('right', default_pprint(self.right)), - ('closed', repr(self.closed))] - if self.name is not None: - attrs.append(('name', default_pprint(self.name))) - return attrs - - def _format_space(self): - return "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - - def _format_data(self): - return None - - def argsort(self, *args, **kwargs): - return np.lexsort((self.right, self.left)) - - def equals(self, other): - if self.is_(other): - return True - try: - return (self.left.equals(other.left) - and self.right.equals(other.right) - and self.closed == other.closed) - except AttributeError: - return False - - def _setop(op_name): - def func(self, other): - msg = ('can only do set operations between two IntervalIndex ' - 'objects that are closed on the same side') - other = self._as_like_interval_index(other, msg) - result = 
getattr(self._multiindex, op_name)(other._multiindex) - result_name = self.name if self.name == other.name else None - return type(self).from_tuples(result.values, closed=self.closed, - name=result_name) - return func - - union = _setop('union') - intersection = _setop('intersection') - difference = _setop('difference') - sym_diff = _setop('sym_diff') - - # TODO: arithmetic operations - - -IntervalIndex._add_logical_methods_disabled() diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 66a81aadc4213..907198d98cf5b 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -15,6 +15,7 @@ is_float_dtype, is_period_arraylike, is_integer_dtype, + is_interval_dtype, is_datetimetz, is_integer, is_float, @@ -575,6 +576,7 @@ def to_string(self): pprint_thing(frame.index))) text = info_line else: + strcols = self._to_str_columns() if self.line_width is None: # no need to wrap around just print # the whole frame @@ -2027,6 +2029,8 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', if is_categorical_dtype(values): fmt_klass = CategoricalArrayFormatter + elif is_interval_dtype(values): + fmt_klass = IntervalArrayFormatter elif is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter elif is_period_arraylike(values): @@ -2294,6 +2298,17 @@ def _format_strings(self): return fmt_values.tolist() +class IntervalArrayFormatter(GenericArrayFormatter): + + def __init__(self, values, *args, **kwargs): + GenericArrayFormatter.__init__(self, values, *args, **kwargs) + + def _format_strings(self): + formatter = self.formatter or str + fmt_values = np.array([formatter(x) for x in self.values]) + return fmt_values + + class PeriodArrayFormatter(IntArrayFormatter): def _format_strings(self): diff --git a/pandas/indexes/api.py b/pandas/indexes/api.py index a3cb54ca97071..db076b60ab34e 100644 --- a/pandas/indexes/api.py +++ b/pandas/indexes/api.py @@ -3,6 +3,7 @@ InvalidIndexError) from pandas.indexes.category import CategoricalIndex # noqa from pandas.indexes.multi import MultiIndex # noqa +from pandas.indexes.interval import IntervalIndex # noqa from pandas.indexes.numeric import (NumericIndex, Float64Index, # noqa Int64Index, UInt64Index) from pandas.indexes.range import RangeIndex # noqa @@ -13,7 +14,7 @@ # TODO: there are many places that rely on these private methods existing in # pandas.core.index __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index', - 'CategoricalIndex', 'RangeIndex', 'UInt64Index', + 'CategoricalIndex', 'IntervalIndex', 'RangeIndex', 'UInt64Index', 'InvalidIndexError', '_new_Index', '_ensure_index', '_get_na_value', '_get_combined_index', diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index ab5c01388e652..c0635f07238b5 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -24,6 +24,7 @@ is_dtype_equal, is_object_dtype, is_categorical_dtype, + is_interval_dtype, is_bool_dtype, is_signed_integer_dtype, is_unsigned_integer_dtype, @@ -49,9 +50,9 @@ from pandas.formats.printing import pprint_thing from pandas.core.ops import _comp_method_OBJECT_ARRAY from pandas.core.strings import StringAccessorMixin - from pandas.core.config import get_option + # simplify default_pprint = lambda x, max_seq_items=None: \ pprint_thing(x, escape_chars=('\t', '\r', '\n'), quote_strings=True, @@ -138,6 +139,9 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): _is_numeric_dtype = False _can_hold_na = True + # would we like our indexing holder to defer to us + _defer_to_indexing = False + # 
prioritize current class for _shallow_copy_with_infer, # used to infer integers as datetime-likes _infer_as_myclass = False @@ -167,6 +171,12 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, from .category import CategoricalIndex return CategoricalIndex(data, copy=copy, name=name, **kwargs) + # interval + if is_interval_dtype(data): + from .interval import IntervalIndex + return IntervalIndex.from_intervals(data, name=name, + copy=copy) + # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): @@ -276,6 +286,10 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif inferred in ['floating', 'mixed-integer-float']: from .numeric import Float64Index return Float64Index(subarr, copy=copy, name=name) + elif inferred == 'interval': + from .interval import IntervalIndex + return IntervalIndex.from_intervals(subarr, name=name, + copy=copy) elif inferred == 'boolean': # don't support boolean explicity ATM pass @@ -1210,6 +1224,9 @@ def is_object(self): def is_categorical(self): return self.inferred_type in ['categorical'] + def is_interval(self): + return self.inferred_type in ['interval'] + def is_mixed(self): return self.inferred_type in ['mixed'] @@ -1413,11 +1430,6 @@ def _convert_index_indexer(self, keyarr): @Appender(_index_shared_docs['_convert_list_indexer']) def _convert_list_indexer(self, keyarr, kind=None): - """ - passed a key that is tuplesafe that is integer based - and we have a mixed index (e.g. number/labels). figure out - the indexer. return None if we can't help - """ if (kind in [None, 'iloc', 'ix'] and is_integer_dtype(keyarr) and not self.is_floating() and not isinstance(keyarr, ABCPeriodIndex)): @@ -1553,9 +1565,41 @@ def __nonzero__(self): __bool__ = __nonzero__ + _index_shared_docs['__contains__'] = """ + return a boolean if this key is IN the index + + Parameters + ---------- + key : object + + Returns + ------- + boolean + """ + + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): hash(key) - # work around some kind of odd cython bug + try: + return key in self._engine + except TypeError: + return False + + _index_shared_docs['_is_contained_in'] = """ + return a boolean if this key is IN the index + + Parameters + ---------- + key : object + + Returns + ------- + boolean + """ + + @Appender(_index_shared_docs['_is_contained_in'] % _index_doc_kwargs) + def _is_contained_in(self, key): + hash(key) try: return key in self._engine except TypeError: @@ -3341,6 +3385,13 @@ def _searchsorted_monotonic(self, label, side='left'): raise ValueError('index must be monotonic increasing or decreasing') + def _get_loc_only_exact_matches(self, key): + """ + This is overriden on subclasses (namely, IntervalIndex) to control + get_slice_bound. + """ + return self.get_loc(key) + def get_slice_bound(self, label, side, kind): """ Calculate slice bound that corresponds to given label. 
@@ -3370,7 +3421,7 @@ def get_slice_bound(self, label, side, kind): # we need to look up the label try: - slc = self.get_loc(label) + slc = self._get_loc_only_exact_matches(label) except KeyError as err: try: return self._searchsorted_monotonic(label, side) @@ -3606,7 +3657,9 @@ def _evaluate_compare(self, other): if needs_i8_conversion(self) and needs_i8_conversion(other): return self._evaluate_compare(other, op) - if is_object_dtype(self) and self.nlevels == 1: + if (is_object_dtype(self) and + self.nlevels == 1): + # don't pass MultiIndex with np.errstate(all='ignore'): result = _comp_method_OBJECT_ARRAY( @@ -3918,6 +3971,8 @@ def _ensure_index(index_like, copy=False): def _get_na_value(dtype): + if is_datetime64_any_dtype(dtype) or is_timedelta64_dtype(dtype): + return libts.NaT return {np.datetime64: libts.NaT, np.timedelta64: libts.NaT}.get(dtype, np.nan) diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index 7cfc95de5f538..4800375cd5d38 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -7,7 +7,9 @@ from pandas.types.common import (is_categorical_dtype, _ensure_platform_int, is_list_like, + is_interval_dtype, is_scalar) +from pandas.core.common import _asarray_tuplesafe from pandas.types.missing import array_equivalent @@ -17,7 +19,6 @@ import pandas.core.base as base import pandas.core.missing as missing import pandas.indexes.base as ibase -from pandas.core.common import _asarray_tuplesafe _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass='CategoricalIndex')) @@ -261,14 +262,35 @@ def ordered(self): def _reverse_indexer(self): return self._data._reverse_indexer() + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): hash(key) + + if self.categories._defer_to_indexing: + return key in self.categories + + return key in self.values + + @Appender(_index_shared_docs['_is_contained_in'] % _index_doc_kwargs) + def _is_contained_in(self, key): + hash(key) + + if self.categories._defer_to_indexing: + return self.categories._is_contained_in(key) + return key in self.values def __array__(self, dtype=None): """ the array interface, return my values """ return np.array(self._data, dtype=dtype) + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_interval_dtype(dtype): + from pandas import IntervalIndex + return IntervalIndex.from_intervals(np.array(self)) + return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy) + @cache_readonly def _isnan(self): """ return if each value is nan""" @@ -431,8 +453,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) target = ibase._ensure_index(target) - if isinstance(target, CategoricalIndex): - target = target.categories + if self.equals(target): + return np.arange(len(self), dtype='intp') if method == 'pad' or method == 'backfill': raise NotImplementedError("method='pad' and method='backfill' not " @@ -440,10 +462,17 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " 'for CategoricalIndex') - else: + if (isinstance(target, CategoricalIndex) and + self.values.is_dtype_equal(target)): + # we have the same codes + codes = target.codes + else: + if isinstance(target, CategoricalIndex): + target = target.categories codes = self.categories.get_indexer(target) - indexer, _ = 
self._engine.get_indexer_non_unique(codes) + + indexer, _ = self._engine.get_indexer_non_unique(codes) return _ensure_platform_int(indexer) @@ -457,20 +486,39 @@ def get_indexer_non_unique(self, target): codes = self.categories.get_indexer(target) return self._engine.get_indexer_non_unique(codes) + @Appender(_index_shared_docs['_convert_scalar_indexer']) + def _convert_scalar_indexer(self, key, kind=None): + if self.categories._defer_to_indexing: + return self.categories._convert_scalar_indexer(key, kind=kind) + + return super(CategoricalIndex, self)._convert_scalar_indexer( + key, kind=kind) + @Appender(_index_shared_docs['_convert_list_indexer']) def _convert_list_indexer(self, keyarr, kind=None): # Return our indexer or raise if all of the values are not included in # the categories - codes = self.categories.get_indexer(keyarr) - if (codes == -1).any(): - raise KeyError("a list-indexer must only include values that are " - "in the categories") - return None + if self.categories._defer_to_indexing: + indexer = self.categories._convert_list_indexer(keyarr, kind=kind) + return Index(self.codes).get_indexer_for(indexer) + + indexer = self.categories.get_indexer(keyarr) + if (indexer == -1).any(): + raise KeyError( + "a list-indexer must only " + "include values that are " + "in the categories") + + return self.get_indexer(keyarr) @Appender(_index_shared_docs['_convert_arr_indexer']) def _convert_arr_indexer(self, keyarr): keyarr = _asarray_tuplesafe(keyarr) + + if self.categories._defer_to_indexing: + return keyarr + return self._shallow_copy(keyarr) @Appender(_index_shared_docs['_convert_index_indexer']) @@ -488,6 +536,8 @@ def take(self, indices, axis=0, allow_fill=True, na_value=-1) return self._create_from_codes(taken) + take_nd = take + def map(self, mapper): """Apply mapper function to its categories (not codes). 
diff --git a/pandas/indexes/interval.py b/pandas/indexes/interval.py new file mode 100644 index 0000000000000..127655972e7f2 --- /dev/null +++ b/pandas/indexes/interval.py @@ -0,0 +1,983 @@ +""" define the IntervalIndex """ + +import numpy as np + +from pandas.types.missing import notnull, isnull +from pandas.types.generic import ABCPeriodIndex +from pandas.types.dtypes import IntervalDtype +from pandas.types.common import (_ensure_platform_int, + is_list_like, + is_datetime_or_timedelta_dtype, + is_integer_dtype, + is_object_dtype, + is_categorical_dtype, + is_float_dtype, + is_interval_dtype, + is_scalar, + is_integer) +from pandas.indexes.base import (Index, _ensure_index, + default_pprint, _index_shared_docs) + +from pandas._libs import Timestamp, Timedelta +from pandas._libs.interval import (Interval, IntervalMixin, IntervalTree, + intervals_to_interval_bounds) + +from pandas.indexes.multi import MultiIndex +from pandas.compat.numpy import function as nv +from pandas.core import common as com +from pandas.util.decorators import cache_readonly, Appender +from pandas.core.config import get_option + +import pandas.indexes.base as ibase +_index_doc_kwargs = dict(ibase._index_doc_kwargs) +_index_doc_kwargs.update( + dict(klass='IntervalIndex', + target_klass='IntervalIndex or list of Intervals')) + + +_VALID_CLOSED = set(['left', 'right', 'both', 'neither']) + + +def _get_next_label(label): + dtype = getattr(label, 'dtype', type(label)) + if isinstance(label, (Timestamp, Timedelta)): + dtype = 'datetime64' + if is_datetime_or_timedelta_dtype(dtype): + return label + np.timedelta64(1, 'ns') + elif is_integer_dtype(dtype): + return label + 1 + elif is_float_dtype(dtype): + return np.nextafter(label, np.infty) + else: + raise TypeError('cannot determine next label for type %r' + % type(label)) + + +def _get_prev_label(label): + dtype = getattr(label, 'dtype', type(label)) + if isinstance(label, (Timestamp, Timedelta)): + dtype = 'datetime64' + if is_datetime_or_timedelta_dtype(dtype): + return label - np.timedelta64(1, 'ns') + elif is_integer_dtype(dtype): + return label - 1 + elif is_float_dtype(dtype): + return np.nextafter(label, -np.infty) + else: + raise TypeError('cannot determine next label for type %r' + % type(label)) + + +def _get_interval_closed_bounds(interval): + """ + Given an Interval or IntervalIndex, return the corresponding interval with + closed bounds. + """ + left, right = interval.left, interval.right + if interval.open_left: + left = _get_next_label(left) + if interval.open_right: + right = _get_prev_label(right) + return left, right + + +def _new_IntervalIndex(cls, d): + """ This is called upon unpickling, + rather than the default which doesn't + have arguments and breaks __new__ """ + + return cls.from_arrays(**d) + + +class IntervalIndex(IntervalMixin, Index): + """ + Immutable Index implementing an ordered, sliceable set. IntervalIndex + represents an Index of intervals that are all closed on the same side. + + .. versionadded:: 0.20.0 + + Properties + ---------- + left, right : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both or + neither. Defaults to 'right'. + name : object, optional + Name to be stored in the index. 
+ copy : boolean, default False + Copy the meta-data + """ + _typ = 'intervalindex' + _comparables = ['name'] + _attributes = ['name', 'closed'] + _allow_index_ops = True + + # we would like our indexing holder to defer to us + _defer_to_indexing = True + + _mask = None + + def __new__(cls, data, closed='right', + name=None, copy=False, dtype=None, + fastpath=False, verify_integrity=True): + + if fastpath: + return cls._simple_new(data.left, data.right, closed, name, + copy=copy, verify_integrity=False) + + if name is None and hasattr(data, 'name'): + name = data.name + + if isinstance(data, IntervalIndex): + left = data.left + right = data.right + + else: + + # don't allow scalars + if is_scalar(data): + cls._scalar_data_error(data) + + data = IntervalIndex.from_intervals(data, name=name) + left, right = data.left, data.right + + return cls._simple_new(left, right, closed, name, + copy=copy, verify_integrity=verify_integrity) + + @classmethod + def _simple_new(cls, left, right, closed=None, name=None, + copy=False, verify_integrity=True): + result = IntervalMixin.__new__(cls) + + if closed is None: + closed = 'right' + left = _ensure_index(left, copy=copy) + right = _ensure_index(right, copy=copy) + + # coerce dtypes to match if needed + if is_float_dtype(left) and is_integer_dtype(right): + right = right.astype(left.dtype) + if is_float_dtype(right) and is_integer_dtype(left): + left = left.astype(right.dtype) + + if type(left) != type(right): + raise ValueError("must not have differing left [{}] " + "and right [{}] types".format( + type(left), type(right))) + + if isinstance(left, ABCPeriodIndex): + raise ValueError("Period dtypes are not supported, " + "use a PeriodIndex instead") + + result._left = left + result._right = right + result._closed = closed + result.name = name + if verify_integrity: + result._validate() + result._reset_identity() + return result + + @Appender(_index_shared_docs['_shallow_copy']) + def _shallow_copy(self, left=None, right=None, **kwargs): + if left is None: + + # no values passed + left, right = self.left, self.right + + elif right is None: + + # only single value passed, could be an IntervalIndex + # or array of Intervals + if not isinstance(left, IntervalIndex): + left = type(self).from_intervals(left) + + left, right = left.left, left.right + else: + + # both left and right are values + pass + + attributes = self._get_attributes_dict() + attributes.update(kwargs) + attributes['verify_integrity'] = False + return self._simple_new(left, right, **attributes) + + def _validate(self): + """ + Verify that the IntervalIndex is valid. 
+ """ + if self.closed not in _VALID_CLOSED: + raise ValueError("invalid options for 'closed': %s" % self.closed) + if len(self.left) != len(self.right): + raise ValueError('left and right must have the same length') + left_mask = notnull(self.left) + right_mask = notnull(self.right) + if not (left_mask == right_mask).all(): + raise ValueError('missing values must be missing in the same ' + 'location both left and right sides') + if not (self.left[left_mask] <= self.right[left_mask]).all(): + raise ValueError('left side of interval must be <= right side') + self._mask = ~left_mask + + @cache_readonly + def hasnans(self): + """ return if I have any nans; enables various perf speedups """ + return self._isnan.any() + + @cache_readonly + def _isnan(self): + """ return if each value is nan""" + if self._mask is None: + self._mask = isnull(self.left) + return self._mask + + @cache_readonly + def _engine(self): + return IntervalTree(self.left, self.right, closed=self.closed) + + @property + def _constructor(self): + return type(self).from_intervals + + def __contains__(self, key): + """ + return a boolean if this key is IN the index + We *only* accept an Interval + + Parameters + ---------- + key : Interval + + Returns + ------- + boolean + """ + if not isinstance(key, Interval): + return False + + try: + self.get_loc(key) + return True + except KeyError: + return False + + def _is_contained_in(self, key): + """ + return a boolean if this key is IN the index + + We accept / allow keys to be not *just* actual + objects. + + Parameters + ---------- + key : int, float, Interval + + Returns + ------- + boolean + """ + try: + self.get_loc(key) + return True + except KeyError: + return False + + @classmethod + def from_breaks(cls, breaks, closed='right', name=None, copy=False): + """ + Construct an IntervalIndex from an array of splits + + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. Defaults to 'right'. + name : object, optional + Name to be stored in the index. + copy : boolean, default False + copy the data + + Examples + -------- + + >>> IntervalIndex.from_breaks([0, 1, 2, 3]) + IntervalIndex(left=[0, 1, 2], + right=[1, 2, 3], + closed='right') + """ + breaks = np.asarray(breaks) + return cls.from_arrays(breaks[:-1], breaks[1:], closed, + name=name, copy=copy) + + @classmethod + def from_arrays(cls, left, right, closed='right', name=None, copy=False): + """ + Construct an IntervalIndex from a a left and right array + + Parameters + ---------- + left : array-like (1-dimensional) + Left bounds for each interval. + right : array-like (1-dimensional) + Right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. Defaults to 'right'. + name : object, optional + Name to be stored in the index. 
+        copy : boolean, default False
+            copy the data
+
+        Examples
+        --------
+
+        >>> IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3])
+        IntervalIndex(left=[0, 1, 2],
+                      right=[1, 2, 3],
+                      closed='right')
+        """
+        left = np.asarray(left)
+        right = np.asarray(right)
+        return cls._simple_new(left, right, closed, name=name,
+                               copy=copy, verify_integrity=True)
+
+    @classmethod
+    def from_intervals(cls, data, name=None, copy=False):
+        """
+        Construct an IntervalIndex from a 1d array of Interval objects
+
+        Parameters
+        ----------
+        data : array-like (1-dimensional)
+            Array of Interval objects. All intervals must be closed on the same
+            sides.
+        name : object, optional
+            Name to be stored in the index.
+        copy : boolean, default False
+            by default copy the data; this is compat only and is ignored
+
+        Examples
+        --------
+
+        >>> IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)])
+        IntervalIndex(left=[0, 1],
+                      right=[1, 2],
+                      closed='right')
+
+        The generic Index constructor works identically when it infers an array
+        of all intervals:
+
+        >>> Index([Interval(0, 1), Interval(1, 2)])
+        IntervalIndex(left=[0, 1],
+                      right=[1, 2],
+                      closed='right')
+        """
+        data = np.asarray(data)
+        left, right, closed = intervals_to_interval_bounds(data)
+        return cls.from_arrays(left, right, closed, name=name, copy=False)
+
+    @classmethod
+    def from_tuples(cls, data, closed='right', name=None, copy=False):
+        """
+        Construct an IntervalIndex from a list/array of tuples
+
+        Parameters
+        ----------
+        data : array-like (1-dimensional)
+            Array of tuples
+        closed : {'left', 'right', 'both', 'neither'}, optional
+            Whether the intervals are closed on the left-side, right-side, both
+            or neither. Defaults to 'right'.
+        name : object, optional
+            Name to be stored in the index.
+        copy : boolean, default False
+            by default copy the data; this is compat only and is ignored
+
+        Examples
+        --------
+
+        """
+        left = []
+        right = []
+        for d in data:
+
+            if isnull(d):
+                left.append(np.nan)
+                right.append(np.nan)
+                continue
+
+            l, r = d
+            left.append(l)
+            right.append(r)
+
+        # TODO
+        # if we have nulls and we previously had *only*
+        # integer data, then we have changed the dtype
+
+        return cls.from_arrays(left, right, closed, name=name, copy=False)
+
+    def to_tuples(self):
+        return Index(com._asarray_tuplesafe(zip(self.left, self.right)))
+
+    @cache_readonly
+    def _multiindex(self):
+        return MultiIndex.from_arrays([self.left, self.right],
+                                      names=['left', 'right'])
+
+    @property
+    def left(self):
+        return self._left
+
+    @property
+    def right(self):
+        return self._right
+
+    @property
+    def closed(self):
+        return self._closed
+
+    def __len__(self):
+        return len(self.left)
+
+    @cache_readonly
+    def values(self):
+        """
+        Returns the IntervalIndex's data as a numpy array of Interval
+        objects (with dtype='object')
+        """
+        left = self.left
+        right = self.right
+        mask = self._isnan
+        closed = self._closed
+
+        result = np.empty(len(left), dtype=object)
+        for i in range(len(left)):
+            if mask[i]:
+                result[i] = np.nan
+            else:
+                result[i] = Interval(left[i], right[i], closed)
+        return result
+
+    def __array__(self, result=None):
+        """ the array interface, return my values """
+        return self.values
+
+    def __array_wrap__(self, result, context=None):
+        # we don't want the superclass implementation
+        return result
+
+    def _array_values(self):
+        return self.values
+
+    def __reduce__(self):
+        d = dict(left=self.left,
+                 right=self.right)
+        d.update(self._get_attributes_dict())
+        return _new_IntervalIndex, (self.__class__, d), None
+
+
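As a quick illustration of how the constructors above compose (a sketch, assuming a pandas build that includes this patch), equivalent indexes can be built from breaks, from separate left/right arrays, or from tuples:

    import pandas as pd

    # three equivalent ways to build the intervals (0, 1], (1, 2], (2, 3]
    idx = pd.IntervalIndex.from_breaks([0, 1, 2, 3])
    same = pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3])
    also = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 3)])
    assert idx.equals(same) and idx.equals(also)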
@Appender(_index_shared_docs['copy']) + def copy(self, deep=False, name=None): + left = self.left.copy(deep=True) if deep else self.left + right = self.right.copy(deep=True) if deep else self.right + name = name if name is not None else self.name + return type(self).from_arrays(left, right, name=name) + + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_interval_dtype(dtype): + if copy: + self = self.copy() + return self + elif is_object_dtype(dtype): + return Index(self.values, dtype=object) + elif is_categorical_dtype(dtype): + from pandas import Categorical + return Categorical(self, ordered=True) + raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype) + + @cache_readonly + def dtype(self): + return IntervalDtype.construct_from_string(str(self.left.dtype)) + + @property + def inferred_type(self): + return 'interval' + + @Appender(Index.memory_usage.__doc__) + def memory_usage(self, deep=False): + # we don't use an explict engine + # so return the bytes here + return (self.left.memory_usage(deep=deep) + + self.right.memory_usage(deep=deep)) + + @cache_readonly + def mid(self): + """Returns the mid-point of each interval in the index as an array + """ + try: + return Index(0.5 * (self.left.values + self.right.values)) + except TypeError: + # datetime safe version + delta = self.right.values - self.left.values + return Index(self.left.values + 0.5 * delta) + + @cache_readonly + def is_monotonic(self): + return self._multiindex.is_monotonic + + @cache_readonly + def is_monotonic_increasing(self): + return self._multiindex.is_monotonic_increasing + + @cache_readonly + def is_monotonic_decreasing(self): + return self._multiindex.is_monotonic_decreasing + + @cache_readonly + def is_unique(self): + return self._multiindex.is_unique + + @cache_readonly + def is_non_overlapping_monotonic(self): + # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) + # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) + # we already require left <= right + return ((self.right[:-1] <= self.left[1:]).all() or + (self.left[:-1] >= self.right[1:]).all()) + + @Appender(_index_shared_docs['_convert_scalar_indexer']) + def _convert_scalar_indexer(self, key, kind=None): + if kind == 'iloc': + return super(IntervalIndex, self)._convert_scalar_indexer( + key, kind=kind) + return key + + def _maybe_cast_slice_bound(self, label, side, kind): + return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) + + @Appender(_index_shared_docs['_convert_list_indexer']) + def _convert_list_indexer(self, keyarr, kind=None): + """ + we are passed a list-like indexer. Return the + indexer for matching intervals. 
+ """ + locs = self.get_indexer_for(keyarr) + check = locs == -1 + locs = locs[~check] + return locs + + def _maybe_cast_indexed(self, key): + """ + we need to cast the key, which could be a scalar + or an array-like to the type of our subtype + """ + if is_float_dtype(self.dtype.subtype): + if is_integer(key): + key = float(key) + elif isinstance(key, (np.ndarray, Index)): + key = key.astype('float64') + return key + + def _check_method(self, method): + if method is None: + return + + if method in ['bfill', 'backfill', 'pad', 'ffill', 'nearest']: + raise NotImplementedError( + 'method {} not yet implemented for ' + 'IntervalIndex'.format(method)) + + raise ValueError("Invalid fill method") + + def _searchsorted_monotonic(self, label, side, exclude_label=False): + if not self.is_non_overlapping_monotonic: + raise KeyError('can only get slices from an IntervalIndex if ' + 'bounds are non-overlapping and all monotonic ' + 'increasing or decreasing') + + if isinstance(label, IntervalMixin): + raise NotImplementedError + + if ((side == 'left' and self.left.is_monotonic_increasing) or + (side == 'right' and self.left.is_monotonic_decreasing)): + sub_idx = self.right + if self.open_right or exclude_label: + label = _get_next_label(label) + else: + sub_idx = self.left + if self.open_left or exclude_label: + label = _get_prev_label(label) + + return sub_idx._searchsorted_monotonic(label, side) + + def _get_loc_only_exact_matches(self, key): + if isinstance(key, Interval): + # TODO: this expands to a tuple index, see if we can + # do better + return Index(self._multiindex.values).get_loc(key) + raise KeyError + + def _find_non_overlapping_monotonic_bounds(self, key): + if isinstance(key, IntervalMixin): + start = self._searchsorted_monotonic( + key.left, 'left', exclude_label=key.open_left) + stop = self._searchsorted_monotonic( + key.right, 'right', exclude_label=key.open_right) + elif isinstance(key, slice): + # slice + start, stop = key.start, key.stop + if (key.step or 1) != 1: + raise NotImplementedError("cannot slice with a slice step") + if start is None: + start = 0 + else: + start = self._searchsorted_monotonic(start, 'left') + if stop is None: + stop = len(self) + else: + stop = self._searchsorted_monotonic(stop, 'right') + else: + # scalar or index-like + + start = self._searchsorted_monotonic(key, 'left') + stop = self._searchsorted_monotonic(key, 'right') + return start, stop + + def get_loc(self, key, method=None): + self._check_method(method) + + original_key = key + key = self._maybe_cast_indexed(key) + + if self.is_non_overlapping_monotonic: + if isinstance(key, Interval): + left = self._maybe_cast_slice_bound(key.left, 'left', None) + right = self._maybe_cast_slice_bound(key.right, 'right', None) + key = Interval(left, right, key.closed) + else: + key = self._maybe_cast_slice_bound(key, 'left', None) + + start, stop = self._find_non_overlapping_monotonic_bounds(key) + + if start is None or stop is None: + return slice(start, stop) + elif start + 1 == stop: + return start + elif start < stop: + return slice(start, stop) + else: + raise KeyError(original_key) + + else: + # use the interval tree + if isinstance(key, Interval): + left, right = _get_interval_closed_bounds(key) + return self._engine.get_loc_interval(left, right) + else: + return self._engine.get_loc(key) + + def get_value(self, series, key): + if com.is_bool_indexer(key): + loc = key + elif is_list_like(key): + loc = self.get_indexer(key) + else: + loc = self.get_loc(key) + return series.iloc[loc] + + 
@Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + self._check_method(method) + target = _ensure_index(target) + target = self._maybe_cast_indexed(target) + + if self.equals(target): + return np.arange(len(self), dtype='intp') + + if self.is_non_overlapping_monotonic: + start, stop = self._find_non_overlapping_monotonic_bounds(target) + + start_plus_one = start + 1 + if not ((start_plus_one < stop).any()): + return np.where(start_plus_one == stop, start, -1) + + if not self.is_unique: + raise ValueError("get_indexer cannot handle non-unique indices") + + # find the left and right indexers + lindexer = self._engine.get_indexer(target.left.values) + rindexer = self._engine.get_indexer(target.right.values) + + # we want to return an indexer on the intervals + # however, our keys could provide overlapping of multiple + # intervals, so we iterate thru the indexers and construct + # a set of indexers + + indexer = [] + n = len(self) + + for l, r in zip(lindexer, rindexer): + + # not found + if l == -1 and r == -1: + indexer.append(np.array([-1])) + + elif r == -1: + indexer.append(np.arange(l, n)) + + elif l == -1: + if r == 0: + indexer.append(np.array([-1])) + else: + indexer.append(np.arange(0, r + 1)) + + else: + indexer.append(np.arange(l, r)) + + indexer = np.concatenate(indexer) + + return _ensure_platform_int(indexer) + + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = self._maybe_cast_indexed(_ensure_index(target)) + return super(IntervalIndex, self).get_indexer_non_unique(target) + + @Appender(_index_shared_docs['where']) + def where(self, cond, other=None): + if other is None: + other = self._na_value + values = np.where(cond, self.values, other) + return self._shallow_copy(values) + + def delete(self, loc): + new_left = self.left.delete(loc) + new_right = self.right.delete(loc) + return self._shallow_copy(new_left, new_right) + + def insert(self, loc, item): + if not isinstance(item, Interval): + raise ValueError('can only insert Interval objects into an ' + 'IntervalIndex') + if not item.closed == self.closed: + raise ValueError('inserted item must be closed on the same side ' + 'as the index') + new_left = self.left.insert(loc, item.left) + new_right = self.right.insert(loc, item.right) + return self._shallow_copy(new_left, new_right) + + def _as_like_interval_index(self, other, error_msg): + self._assert_can_do_setop(other) + other = _ensure_index(other) + if (not isinstance(other, IntervalIndex) or + self.closed != other.closed): + raise ValueError(error_msg) + return other + + def _append_same_dtype(self, to_concat, name): + """ + assert that we all have the same .closed + we allow a 0-len index here as well + """ + if not len(set([i.closed for i in to_concat if len(i)])) == 1: + msg = ('can only append two IntervalIndex objects ' + 'that are closed on the same side') + raise ValueError(msg) + return super(IntervalIndex, self)._append_same_dtype(to_concat, name) + + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, + fill_value=None, **kwargs): + nv.validate_take(tuple(), kwargs) + indices = _ensure_platform_int(indices) + left, right = self.left, self.right + + if fill_value is None: + fill_value = self._na_value + mask = indices == -1 + + if not mask.any(): + # we won't change dtype here in this case + # if we don't need + allow_fill = False + + 
taker = lambda x: x.take(indices, allow_fill=allow_fill,
+                                 fill_value=fill_value)
+
+        try:
+            new_left = taker(left)
+            new_right = taker(right)
+        except ValueError:
+
+            # we need to coerce; might have NA's in an
+            # integer dtype
+            new_left = taker(left.astype(float))
+            new_right = taker(right.astype(float))
+
+        return self._shallow_copy(new_left, new_right)
+
+    def __getitem__(self, value):
+        mask = self._isnan[value]
+        if is_scalar(mask) and mask:
+            return self._na_value
+
+        left = self.left[value]
+        right = self.right[value]
+
+        # scalar
+        if not isinstance(left, Index):
+            return Interval(left, right, self.closed)
+
+        return self._shallow_copy(left, right)
+
+    # __repr__ associated methods are based on MultiIndex
+
+    def _format_with_header(self, header, **kwargs):
+        return header + list(self._format_native_types(**kwargs))
+
+    def _format_native_types(self, na_rep='', quoting=None, **kwargs):
+        """ actually format my specific types """
+        from pandas.formats.format import IntervalArrayFormatter
+        return IntervalArrayFormatter(values=self,
+                                      na_rep=na_rep,
+                                      justify='all').get_result()
+
+    def _format_data(self):
+
+        # TODO: integrate with categorical and make generic
+        n = len(self)
+        max_seq_items = min((get_option(
+            'display.max_seq_items') or n) // 10, 10)
+
+        formatter = str
+
+        if n == 0:
+            summary = '[]'
+        elif n == 1:
+            first = formatter(self[0])
+            summary = '[{}]'.format(first)
+        elif n == 2:
+            first = formatter(self[0])
+            last = formatter(self[-1])
+            summary = '[{}, {}]'.format(first, last)
+        else:
+
+            if n > max_seq_items:
+                n = min(max_seq_items // 2, 10)
+                head = [formatter(x) for x in self[:n]]
+                tail = [formatter(x) for x in self[-n:]]
+                summary = '[{} ... {}]'.format(', '.join(head),
+                                               ', '.join(tail))
+            else:
+                head = []
+                tail = [formatter(x) for x in self]
+                summary = '[{}]'.format(', '.join(tail))
+
+        return summary + self._format_space()
+
+    def _format_attrs(self):
+        attrs = [('closed', repr(self.closed))]
+        if self.name is not None:
+            attrs.append(('name', default_pprint(self.name)))
+        attrs.append(('dtype', "'%s'" % self.dtype))
+        return attrs
+
+    def _format_space(self):
+        return "\n%s" % (' ' * (len(self.__class__.__name__) + 1))
+
+    def argsort(self, *args, **kwargs):
+        return np.lexsort((self.right, self.left))
+
+    def equals(self, other):
+
+        if self.is_(other):
+            return True
+
+        # if we can coerce to an II
+        # then we can compare
+        if not isinstance(other, IntervalIndex):
+            if not is_interval_dtype(other):
+                return False
+            other = Index(getattr(other, 'values', other))
+
+        return (self.left.equals(other.left) and
+                self.right.equals(other.right) and
+                self.closed == other.closed)
+
+    def _setop(op_name):
+        def func(self, other):
+            msg = ('can only do set operations between two IntervalIndex '
+                   'objects that are closed on the same side')
+            other = self._as_like_interval_index(other, msg)
+            result = getattr(self._multiindex, op_name)(other._multiindex)
+            result_name = self.name if self.name == other.name else None
+            return type(self).from_tuples(result.values, closed=self.closed,
+                                          name=result_name)
+        return func
+
+    union = _setop('union')
+    intersection = _setop('intersection')
+    difference = _setop('difference')
+    symmetric_difference = _setop('symmetric_difference')
+
+    # TODO: arithmetic operations
+
+
+IntervalIndex._add_logical_methods_disabled()
+
+
+def interval_range(start=None, end=None, freq=None, periods=None,
+                   name=None, closed='right', **kwargs):
+    """
+    Return a fixed frequency IntervalIndex
+
+    Parameters
+    ----------
+    start : string or
datetime-like, default None + Left bound for generating data + end : string or datetime-like, default None + Right bound for generating data + freq : interger, string or DateOffset, default 1 + periods : interger, default None + name : str, default None + Name of the resulting index + closed : string, default 'right' + options are: 'left', 'right', 'both', 'neither' + + Notes + ----- + 2 of start, end, or periods must be specified + + Returns + ------- + rng : IntervalIndex + """ + + if freq is None: + freq = 1 + + if start is None: + if periods is None or end is None: + raise ValueError("must specify 2 of start, end, periods") + start = end - periods * freq + elif end is None: + if periods is None or start is None: + raise ValueError("must specify 2 of start, end, periods") + end = start + periods * freq + elif periods is None: + if start is None or end is None: + raise ValueError("must specify 2 of start, end, periods") + pass + + # must all be same units or None + arr = np.array([start, end, freq]) + if is_object_dtype(arr): + raise ValueError("start, end, freq need to be the same type") + + return IntervalIndex.from_breaks(np.arange(start, end, freq), + name=name, + closed=closed) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 74c45aac8b620..f51ed20379726 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1318,15 +1318,17 @@ def nlevels(self): def levshape(self): return tuple(len(x) for x in self.levels) + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): hash(key) - # work around some kind of odd cython bug try: self.get_loc(key) return True except LookupError: return False + _is_contained_in = __contains__ + def __reduce__(self): """Necessary for making this object picklable""" d = dict(levels=[lev for lev in self.levels], diff --git a/pandas/src/intervaltree.pyx b/pandas/src/intervaltree.pyx deleted file mode 100644 index 55782c930d4f8..0000000000000 --- a/pandas/src/intervaltree.pyx +++ /dev/null @@ -1,1444 +0,0 @@ - -# DO NOT EDIT THIS FILE: This file was autogenerated from -# generate_intervaltree.py, so please edit that file and then run -# `python2 generate_intervaltree.py` to re-generate this file. - - -from numpy cimport int64_t, float64_t -from numpy cimport ndarray, PyArray_ArgSort, NPY_QUICKSORT, PyArray_Take -import numpy as np - -cimport cython -cimport numpy as cnp -cnp.import_array() - -from hashtable cimport Int64Vector, Int64VectorData - - -ctypedef fused scalar64_t: - float64_t - int64_t - - -NODE_CLASSES = {} - - -cdef class IntervalTree(IntervalMixin): - """A centered interval tree - - Based off the algorithm described on Wikipedia: - http://en.wikipedia.org/wiki/Interval_tree - """ - cdef: - readonly object left, right, root - readonly str closed - object _left_sorter, _right_sorter - - def __init__(self, left, right, closed='right', leaf_size=100): - """ - Parameters - ---------- - left, right : np.ndarray[ndim=1] - Left and right bounds for each interval. Assumed to contain no - NaNs. - closed : {'left', 'right', 'both', 'neither'}, optional - Whether the intervals are closed on the left-side, right-side, both - or neither. Defaults to 'right'. - leaf_size : int, optional - Parameter that controls when the tree switches from creating nodes - to brute-force search. Tune this parameter to optimize query - performance. 
- """ - if closed not in ['left', 'right', 'both', 'neither']: - raise ValueError("invalid option for 'closed': %s" % closed) - - left = np.asarray(left) - right = np.asarray(right) - dtype = np.result_type(left, right) - self.left = np.asarray(left, dtype=dtype) - self.right = np.asarray(right, dtype=dtype) - - indices = np.arange(len(left), dtype='int64') - - self.closed = closed - - node_cls = NODE_CLASSES[str(dtype), closed] - self.root = node_cls(self.left, self.right, indices, leaf_size) - - @property - def left_sorter(self): - """How to sort the left labels; this is used for binary search - """ - if self._left_sorter is None: - self._left_sorter = np.argsort(self.left) - return self._left_sorter - - @property - def right_sorter(self): - """How to sort the right labels - """ - if self._right_sorter is None: - self._right_sorter = np.argsort(self.right) - return self._right_sorter - - def get_loc(self, scalar64_t key): - """Return all positions corresponding to intervals that overlap with - the given scalar key - """ - result = Int64Vector() - self.root.query(result, key) - if not result.data.n: - raise KeyError(key) - return result.to_array() - - def _get_partial_overlap(self, key_left, key_right, side): - """Return all positions corresponding to intervals with the given side - falling between the left and right bounds of an interval query - """ - if side == 'left': - values = self.left - sorter = self.left_sorter - else: - values = self.right - sorter = self.right_sorter - key = [key_left, key_right] - i, j = values.searchsorted(key, sorter=sorter) - return sorter[i:j] - - def get_loc_interval(self, key_left, key_right): - """Lookup the intervals enclosed in the given interval bounds - - The given interval is presumed to have closed bounds. - """ - import pandas as pd - left_overlap = self._get_partial_overlap(key_left, key_right, 'left') - right_overlap = self._get_partial_overlap(key_left, key_right, 'right') - enclosing = self.get_loc(0.5 * (key_left + key_right)) - combined = np.concatenate([left_overlap, right_overlap, enclosing]) - uniques = pd.unique(combined) - return uniques - - def get_indexer(self, scalar64_t[:] target): - """Return the positions corresponding to unique intervals that overlap - with the given array of scalar targets. - """ - # TODO: write get_indexer_intervals - cdef: - int64_t old_len, i - Int64Vector result - - result = Int64Vector() - old_len = 0 - for i in range(len(target)): - self.root.query(result, target[i]) - if result.data.n == old_len: - result.append(-1) - elif result.data.n > old_len + 1: - raise KeyError( - 'indexer does not intersect a unique set of intervals') - old_len = result.data.n - return result.to_array() - - def get_indexer_non_unique(self, scalar64_t[:] target): - """Return the positions corresponding to intervals that overlap with - the given array of scalar targets. Non-unique positions are repeated. 
- """ - cdef: - int64_t old_len, i - Int64Vector result, missing - - result = Int64Vector() - missing = Int64Vector() - old_len = 0 - for i in range(len(target)): - self.root.query(result, target[i]) - if result.data.n == old_len: - result.append(-1) - missing.append(i) - old_len = result.data.n - return result.to_array(), missing.to_array() - - def __repr__(self): - return ('' - % self.root.n_elements) - - -cdef take(ndarray source, ndarray indices): - """Take the given positions from a 1D ndarray - """ - return PyArray_Take(source, indices, 0) - - -cdef sort_values_and_indices(all_values, all_indices, subset): - indices = take(all_indices, subset) - values = take(all_values, subset) - sorter = PyArray_ArgSort(values, 0, NPY_QUICKSORT) - sorted_values = take(values, sorter) - sorted_indices = take(indices, sorter) - return sorted_values, sorted_indices - - -cdef class Float64ClosedLeftIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. - """ - cdef: - Float64ClosedLeftIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. - """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. 
- """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedLeftIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. - if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'left'] = Float64ClosedLeftIntervalNode - - -cdef class Float64ClosedRightIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Float64ClosedRightIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. - """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedRightIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'right'] = Float64ClosedRightIntervalNode - - -cdef class Float64ClosedBothIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. - """ - cdef: - Float64ClosedBothIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedBothIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. - if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'both'] = Float64ClosedBothIntervalNode - - -cdef class Float64ClosedNeitherIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Float64ClosedNeitherIntervalNode left_node, right_node - float64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - float64_t min_left, max_right - readonly float64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, float64_t[:] left, float64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. - """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[float64_t, ndim=1] left, - ndarray[float64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Float64ClosedNeitherIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - float64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['float64', 'neither'] = Float64ClosedNeitherIntervalNode - - -cdef class Int64ClosedLeftIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. - """ - cdef: - Int64ClosedLeftIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedLeftIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. - if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point < self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point < values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'left'] = Int64ClosedLeftIntervalNode - - -cdef class Int64ClosedRightIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Int64ClosedRightIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. - """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedRightIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
- if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] < point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left < point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'right'] = Int64ClosedRightIntervalNode - - -cdef class Int64ClosedBothIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. - """ - cdef: - Int64ClosedBothIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. 
- """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] < self.pivot: - left_ind.append(i) - elif self.pivot < left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedBothIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] <= point <= self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. - if point < self.pivot: - values = self.center_left_values - indices = self.center_left_indices - for i in range(self.n_center): - if not values[i] <= point: - break - result.append(indices[i]) - if point <= self.left_node.max_right: - self.left_node.query(result, point) - elif point > self.pivot: - values = self.center_right_values - indices = self.center_right_indices - for i in range(self.n_center - 1, -1, -1): - if not point <= values[i]: - break - result.append(indices[i]) - if self.right_node.min_left <= point: - self.right_node.query(result, point) - else: - result.extend(self.center_left_indices) - - def __repr__(self): - if self.is_leaf_node: - return ('' % self.n_elements) - else: - n_left = self.left_node.n_elements - n_right = self.right_node.n_elements - n_center = self.n_elements - n_left - n_right - return ('' % - (self.pivot, self.n_elements, n_left, n_right, n_center)) - - def counts(self): - if self.is_leaf_node: - return self.n_elements - else: - m = len(self.center_left_values) - l = self.left_node.counts() - r = self.right_node.counts() - return (m, (l, r)) - -NODE_CLASSES['int64', 'both'] = Int64ClosedBothIntervalNode - - -cdef class Int64ClosedNeitherIntervalNode: - """Non-terminal node for an IntervalTree - - Categorizes intervals by those that fall to the left, those that fall to - the right, and those that overlap with the pivot. 
- """ - cdef: - Int64ClosedNeitherIntervalNode left_node, right_node - int64_t[:] center_left_values, center_right_values, left, right - int64_t[:] center_left_indices, center_right_indices, indices - int64_t min_left, max_right - readonly int64_t pivot - readonly int64_t n_elements, n_center, leaf_size - readonly bint is_leaf_node - - def __init__(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - int64_t leaf_size): - - self.n_elements = len(left) - self.leaf_size = leaf_size - if left.size > 0: - self.min_left = left.min() - self.max_right = right.max() - else: - self.min_left = 0 - self.max_right = 0 - - if self.n_elements <= leaf_size: - # make this a terminal (leaf) node - self.is_leaf_node = True - self.left = left - self.right = right - self.indices = indices - self.n_center - else: - # calculate a pivot so we can create child nodes - self.is_leaf_node = False - self.pivot = np.median(left + right) / 2 - left_set, right_set, center_set = self.classify_intervals(left, right) - - self.left_node = self.new_child_node(left, right, indices, left_set) - self.right_node = self.new_child_node(left, right, indices, right_set) - - self.center_left_values, self.center_left_indices = \ - sort_values_and_indices(left, indices, center_set) - self.center_right_values, self.center_right_indices = \ - sort_values_and_indices(right, indices, center_set) - self.n_center = len(self.center_left_indices) - - @cython.wraparound(False) - @cython.boundscheck(False) - cdef classify_intervals(self, int64_t[:] left, int64_t[:] right): - """Classify the given intervals based upon whether they fall to the - left, right, or overlap with this node's pivot. - """ - cdef: - Int64Vector left_ind, right_ind, overlapping_ind - Py_ssize_t i - - left_ind = Int64Vector() - right_ind = Int64Vector() - overlapping_ind = Int64Vector() - - for i in range(self.n_elements): - if right[i] <= self.pivot: - left_ind.append(i) - elif self.pivot <= left[i]: - right_ind.append(i) - else: - overlapping_ind.append(i) - - return (left_ind.to_array(), - right_ind.to_array(), - overlapping_ind.to_array()) - - cdef new_child_node(self, - ndarray[int64_t, ndim=1] left, - ndarray[int64_t, ndim=1] right, - ndarray[int64_t, ndim=1] indices, - ndarray[int64_t, ndim=1] subset): - """Create a new child node. - """ - left = take(left, subset) - right = take(right, subset) - indices = take(indices, subset) - return Int64ClosedNeitherIntervalNode( - left, right, indices, self.leaf_size) - - @cython.wraparound(False) - @cython.boundscheck(False) - @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar64_t point): - """Recursively query this node and its sub-nodes for intervals that - overlap with the query point. - """ - cdef: - int64_t[:] indices - int64_t[:] values - Py_ssize_t i - - if self.is_leaf_node: - # Once we get down to a certain size, it doesn't make sense to - # continue the binary tree structure. Instead, we use linear - # search. - for i in range(self.n_elements): - if self.left[i] < point < self.right[i]: - result.append(self.indices[i]) - else: - # There are child nodes. Based on comparing our query to the pivot, - # look at the center values, then go to the relevant child. 
-            if point < self.pivot:
-                values = self.center_left_values
-                indices = self.center_left_indices
-                for i in range(self.n_center):
-                    if not values[i] < point:
-                        break
-                    result.append(indices[i])
-                if point < self.left_node.max_right:
-                    self.left_node.query(result, point)
-            elif point > self.pivot:
-                values = self.center_right_values
-                indices = self.center_right_indices
-                for i in range(self.n_center - 1, -1, -1):
-                    if not point < values[i]:
-                        break
-                    result.append(indices[i])
-                if self.right_node.min_left < point:
-                    self.right_node.query(result, point)
-            else:
-                result.extend(self.center_left_indices)
-
-    def __repr__(self):
-        if self.is_leaf_node:
-            return ('<Int64ClosedNeitherIntervalNode: '
-                    '%s elements (terminal)>' % self.n_elements)
-        else:
-            n_left = self.left_node.n_elements
-            n_right = self.right_node.n_elements
-            n_center = self.n_elements - n_left - n_right
-            return ('<Int64ClosedNeitherIntervalNode: pivot %s, '
-                    '%s elements (%s left, %s right, %s overlapping)>' %
-                    (self.pivot, self.n_elements, n_left, n_right, n_center))
-
-    def counts(self):
-        if self.is_leaf_node:
-            return self.n_elements
-        else:
-            m = len(self.center_left_values)
-            l = self.left_node.counts()
-            r = self.right_node.counts()
-            return (m, (l, r))
-
-NODE_CLASSES['int64', 'neither'] = Int64ClosedNeitherIntervalNode
-
-
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index 7301c87026114..a15d7cf26cbea 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -49,7 +49,7 @@ class TestPDApi(Base, tm.TestCase):
                'Period', 'PeriodIndex', 'RangeIndex', 'UInt64Index',
                'Series', 'SparseArray', 'SparseDataFrame',
                'SparseSeries', 'TimeGrouper', 'Timedelta',
-               'TimedeltaIndex', 'Timestamp']
+               'TimedeltaIndex', 'Timestamp', 'Interval', 'IntervalIndex']
     # these are already deprecated; awaiting removal
     deprecated_classes = ['WidePanel', 'Panel4D',
@@ -63,7 +63,7 @@ class TestPDApi(Base, tm.TestCase):
     # top-level functions
     funcs = ['bdate_range', 'concat', 'crosstab', 'cut',
-             'date_range', 'eval',
+             'date_range', 'interval_range', 'eval',
              'factorize', 'get_dummies',
              'infer_freq', 'isnull', 'lreshape', 'melt', 'notnull',
              'offsets',
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
index e52bfdbd4f837..f05b6fdd6bc23 100644
--- a/pandas/tests/frame/test_alter_axes.py
+++ b/pandas/tests/frame/test_alter_axes.py
@@ -8,7 +8,10 @@
 from pandas.compat import lrange
 from pandas import (DataFrame, Series, Index, MultiIndex,
-                    RangeIndex, date_range)
+                    RangeIndex, date_range, IntervalIndex)
+from pandas.types.common import (is_object_dtype,
+                                 is_categorical_dtype,
+                                 is_interval_dtype)
 import pandas as pd
 from pandas.util.testing import (assert_series_equal,
@@ -295,6 +298,17 @@ def test_set_index_dst(self):
         exp = pd.DataFrame({'b': [3, 4, 5]}, index=exp_index)
         tm.assert_frame_equal(res, exp)
+    def test_reset_index_with_intervals(self):
+        idx = pd.IntervalIndex.from_breaks(np.arange(11), name='x')
+        original = pd.DataFrame({'x': idx, 'y': np.arange(10)})[['x', 'y']]
+
+        result = original.set_index('x')
+        expected = pd.DataFrame({'y': np.arange(10)}, index=idx)
+        assert_frame_equal(result, expected)
+
+        result2 = result.reset_index()
+        assert_frame_equal(result2, original)
+
     def test_set_index_multiindexcolumns(self):
         columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)])
         df = DataFrame(np.random.randn(3, 3), columns=columns)
@@ -730,3 +744,53 @@ def test_set_index_preserve_categorical_dtype(self):
             result = df.set_index(cols).reset_index()
             result = result.reindex(columns=df.columns)
             tm.assert_frame_equal(result, df)
+
+
+class TestIntervalIndex(tm.TestCase):
+
+    def test_setitem(self):
+
+        df = DataFrame({'A': range(10)})
+        s = pd.cut(df.A, 5)
+        self.assertIsInstance(s.cat.categories, IntervalIndex)
+
+        # B & D end up as Categoricals
+        # the remainder are converted to in-line objects
+        # containing an IntervalIndex.values
+        df['B'] = s
+        df['C'] = np.array(s)
+        df['D'] = s.values
+        df['E'] = np.array(s.values)
+
+        assert is_categorical_dtype(df['B'])
+        assert is_interval_dtype(df['B'].cat.categories)
+        assert is_categorical_dtype(df['D'])
+        assert is_interval_dtype(df['D'].cat.categories)
+
+        assert is_object_dtype(df['C'])
+        assert is_object_dtype(df['E'])
+
+        # they compare equal as Index
+        # when converted to numpy objects
+        c = lambda x: Index(np.array(x))
+        tm.assert_index_equal(c(df.B), c(df.B), check_names=False)
+        tm.assert_index_equal(c(df.B), c(df.C), check_names=False)
+        tm.assert_index_equal(c(df.B), c(df.D), check_names=False)
+        tm.assert_index_equal(c(df.B), c(df.D), check_names=False)
+
+        # B & D are the same Series
+        tm.assert_series_equal(df['B'], df['B'], check_names=False)
+        tm.assert_series_equal(df['B'], df['D'], check_names=False)
+
+        # C & E are the same Series
+        tm.assert_series_equal(df['C'], df['C'], check_names=False)
+        tm.assert_series_equal(df['C'], df['E'], check_names=False)
+
+    def test_set_reset_index(self):
+
+        df = DataFrame({'A': range(10)})
+        s = pd.cut(df.A, 5)
+        df['B'] = s
+        df = df.set_index('B')
+
+        df = df.reset_index()
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index cfcb531bedab8..68bdc0c6d5112 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -7,7 +7,7 @@
 import pandas as pd
 from pandas import (Index, MultiIndex, CategoricalIndex,
-                    DataFrame, Categorical, Series)
+                    DataFrame, Categorical, Series, Interval)
 from pandas.util.testing import assert_frame_equal, assert_series_equal
 import pandas.util.testing as tm
 from .common import MixIn
@@ -519,7 +519,8 @@ def test_groupby_categorical_two_columns(self):
         res = groups_double_key.agg('mean')
         nan = np.nan
         idx = MultiIndex.from_product(
-            [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True),
+            [Categorical([Interval(1, 2), Interval(2, 3),
+                          Interval(3, 6)], ordered=True),
              [1, 2, 3, 4]],
             names=["cat", "C2"])
         exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3,
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 800e2e8aa1cc1..25f89b29021ce 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -864,11 +864,13 @@ def test_get_group_empty_bins(self):
         bins = [0, 5, 10, 15]
         g = d.groupby(pd.cut(d[0], bins))
-        result = g.get_group('(0, 5]')
+        # TODO: should probably allow a str of Interval to work as well
+        # IOW '(0, 5]'
+        result = g.get_group(pd.Interval(0, 5))
         expected = DataFrame([3, 1], index=[0, 1])
         assert_frame_equal(result, expected)
-        self.assertRaises(KeyError, lambda: g.get_group('(10, 15]'))
+        self.assertRaises(KeyError, lambda: g.get_group(pd.Interval(10, 15)))
     def test_get_group_grouped_by_tuple(self):
         # GH 8121
@@ -3866,49 +3868,6 @@ def test_transform_doesnt_clobber_ints(self):
         expected = gb2.transform('mean')
         tm.assert_frame_equal(result, expected)
-    def test_groupby_categorical_two_columns(self):
-
-        # https://github.com/pydata/pandas/issues/8138
-        d = {'cat': pd.Categorical(["a","b","a","b"], categories=["a", "b", "c"], ordered=True),
-             'ints': [1, 1, 2, 2],'val': [10, 20, 30, 40]}
-        test = pd.DataFrame(d)
-
-        # Grouping on a single column
-        groups_single_key = test.groupby("cat")
-        res = 
groups_single_key.agg('mean') - exp = DataFrame({"ints":[1.5,1.5,np.nan], "val":[20,30,np.nan]}, - index=pd.CategoricalIndex(["a", "b", "c"], name="cat")) - tm.assert_frame_equal(res, exp) - - # Grouping on two columns - groups_double_key = test.groupby(["cat","ints"]) - res = groups_double_key.agg('mean') - exp = DataFrame({"val":[10,30,20,40,np.nan,np.nan], - "cat": ["a","a","b","b","c","c"], - "ints": [1,2,1,2,1,2]}).set_index(["cat","ints"]) - tm.assert_frame_equal(res, exp) - - # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: - c, i = key - result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] - assert_frame_equal(result, expected) - - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 3, 6], labels=pd.Categorical(['a', 'b', 'c'])) - values.name = "cat" - groups_double_key = test.groupby([values,'C2']) - - res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product([['a', 'b', 'c'], [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1":[nan,nan,nan,nan, 3, 3,nan,nan, nan,nan, 4, 5], - "C3":[nan,nan,nan,nan, 10,100,nan,nan, nan,nan,200,34]}, index=idx) - tm.assert_frame_equal(res, exp) - def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 08f8f8d48e705..54d47d02c5e8e 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -7,7 +7,8 @@ from pandas import (Series, Index, Float64Index, Int64Index, UInt64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, - TimedeltaIndex, PeriodIndex, notnull, isnull) + TimedeltaIndex, PeriodIndex, IntervalIndex, + notnull, isnull) from pandas.types.common import needs_i8_conversion from pandas.util.testing import assertRaisesRegexp from pandas._libs.tslib import iNaT @@ -255,18 +256,21 @@ def test_ensure_copied_data(self): tm.assert_numpy_array_equal(index.values, result.values, check_same='copy') - if not isinstance(index, PeriodIndex): - result = index_type(index.values, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(index.values, result.values, - check_same='same') - tm.assert_numpy_array_equal(index._values, result._values, - check_same='same') - else: + if isinstance(index, PeriodIndex): # .values an object array of Period, thus copied result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index._values, result._values, check_same='same') + elif isinstance(index, IntervalIndex): + # checked in test_interval.py + pass + else: + result = index_type(index.values, copy=False, **init_kwargs) + tm.assert_numpy_array_equal(index.values, result.values, + check_same='same') + tm.assert_numpy_array_equal(index._values, result._values, + check_same='same') def test_copy_and_deepcopy(self): from copy import copy, deepcopy @@ -377,8 +381,9 @@ def test_memory_usage(self): result2 = index.memory_usage() result3 = index.memory_usage(deep=True) - # RangeIndex doesn't use a hashtable engine - if not isinstance(index, RangeIndex): + # RangeIndex, IntervalIndex + # don't have engines + if not isinstance(index, (RangeIndex, IntervalIndex)): self.assertTrue(result2 > result) if index.inferred_type == 'object': diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index a8197b070b032..cc819ff83b1dd 100644 --- 
a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -14,7 +14,7 @@ from pandas import (period_range, date_range, Series, DataFrame, Float64Index, Int64Index, CategoricalIndex, DatetimeIndex, TimedeltaIndex, - PeriodIndex) + PeriodIndex, isnull) from pandas.core.index import _get_combined_index from pandas.util.testing import assert_almost_equal from pandas.compat.numpy import np_datetime64_compat @@ -504,7 +504,7 @@ def test_is_(self): def test_asof(self): d = self.dateIndex[0] self.assertEqual(self.dateIndex.asof(d), d) - self.assertTrue(np.isnan(self.dateIndex.asof(d - timedelta(1)))) + self.assertTrue(isnull(self.dateIndex.asof(d - timedelta(1)))) d = self.dateIndex[-1] self.assertEqual(self.dateIndex.asof(d + timedelta(1)), d) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 0d75ba5f2bd46..b8c50239efac3 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -8,7 +8,7 @@ import numpy as np -from pandas import Categorical, compat, notnull +from pandas import Categorical, IntervalIndex, compat, notnull from pandas.util.testing import assert_almost_equal import pandas.core.config as cf import pandas as pd @@ -343,11 +343,25 @@ def test_astype(self): self.assertIsInstance(result, Index) self.assertNotIsInstance(result, CategoricalIndex) + # interval + ii = IntervalIndex.from_arrays(left=[-0.001, 2.0], + right=[2, 4], + closed='right') + + ci = CategoricalIndex(Categorical.from_codes([0, 1, -1], categories=ii, ordered=True)) + + result = ci.astype('interval') + expected = ii.take([0, 1, -1]) + tm.assert_index_equal(result, expected) + + result = IntervalIndex.from_intervals(result.values) + tm.assert_index_equal(result, expected) + def test_reindex_base(self): # determined by cat ordering idx = self.create_index() - expected = np.array([4, 0, 1, 5, 2, 3], dtype=np.intp) + expected = np.arange(len(idx), dtype=np.intp) actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py new file mode 100644 index 0000000000000..6771b875c5137 --- /dev/null +++ b/pandas/tests/indexes/test_interval.py @@ -0,0 +1,799 @@ +from __future__ import division + +import pytest +import numpy as np + +from pandas import (Interval, IntervalIndex, Index, isnull, + interval_range, Timestamp, Timedelta) +from pandas._libs.interval import IntervalTree +from pandas.tests.indexes.common import Base +import pandas.util.testing as tm +import pandas as pd + + +class TestIntervalIndex(Base, tm.TestCase): + _holder = IntervalIndex + + def setUp(self): + self.index = IntervalIndex.from_arrays([0, 1], [1, 2]) + self.index_with_nan = IntervalIndex.from_tuples( + [(0, 1), np.nan, (1, 2)]) + self.indices = dict(intervalIndex=tm.makeIntervalIndex(10)) + + def create_index(self): + return IntervalIndex.from_breaks(np.arange(10)) + + def test_constructors(self): + expected = self.index + actual = IntervalIndex.from_breaks(np.arange(3), closed='right') + self.assertTrue(expected.equals(actual)) + + alternate = IntervalIndex.from_breaks(np.arange(3), closed='left') + self.assertFalse(expected.equals(alternate)) + + actual = IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) + self.assertTrue(expected.equals(actual)) + + actual = IntervalIndex([Interval(0, 1), Interval(1, 2)]) + self.assertTrue(expected.equals(actual)) + + actual = IntervalIndex.from_arrays(np.arange(2), np.arange(2) + 1, + closed='right') 
+ self.assertTrue(expected.equals(actual)) + + actual = Index([Interval(0, 1), Interval(1, 2)]) + self.assertIsInstance(actual, IntervalIndex) + self.assertTrue(expected.equals(actual)) + + actual = Index(expected) + self.assertIsInstance(actual, IntervalIndex) + self.assertTrue(expected.equals(actual)) + + def test_constructors_other(self): + + # all-nan + result = IntervalIndex.from_intervals([np.nan]) + expected = np.array([np.nan], dtype=object) + tm.assert_numpy_array_equal(result.values, expected) + + # empty + result = IntervalIndex.from_intervals([]) + expected = np.array([], dtype=object) + tm.assert_numpy_array_equal(result.values, expected) + + def test_constructors_errors(self): + + # scalar + with pytest.raises(TypeError): + IntervalIndex(5) + + # not an interval + with pytest.raises(TypeError): + IntervalIndex([0, 1]) + + with pytest.raises(TypeError): + IntervalIndex.from_intervals([0, 1]) + + # invalid closed + with pytest.raises(ValueError): + IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid') + + # mismatched closed + with pytest.raises(ValueError): + IntervalIndex.from_intervals([Interval(0, 1), + Interval(1, 2, closed='left')]) + + with pytest.raises(ValueError): + IntervalIndex.from_arrays([0, 10], [3, 5]) + + with pytest.raises(ValueError): + Index([Interval(0, 1), Interval(2, 3, closed='left')]) + + # no point in nesting periods in an IntervalIndex + with pytest.raises(ValueError): + IntervalIndex.from_breaks( + pd.period_range('2000-01-01', periods=3)) + + def test_constructors_datetimelike(self): + + # DTI / TDI + for idx in [pd.date_range('20130101', periods=5), + pd.timedelta_range('1 day', periods=5)]: + result = IntervalIndex.from_breaks(idx) + expected = IntervalIndex.from_breaks(idx.values) + tm.assert_index_equal(result, expected) + + expected_scalar_type = type(idx[0]) + i = result[0] + self.assertTrue(isinstance(i.left, expected_scalar_type)) + self.assertTrue(isinstance(i.right, expected_scalar_type)) + + def test_constructors_error(self): + + # non-intervals + def f(): + IntervalIndex.from_intervals([0.997, 4.0]) + self.assertRaises(TypeError, f) + + def test_properties(self): + index = self.index + self.assertEqual(len(index), 2) + self.assertEqual(index.size, 2) + self.assertEqual(index.shape, (2, )) + + self.assert_index_equal(index.left, Index([0, 1])) + self.assert_index_equal(index.right, Index([1, 2])) + self.assert_index_equal(index.mid, Index([0.5, 1.5])) + + self.assertEqual(index.closed, 'right') + + expected = np.array([Interval(0, 1), Interval(1, 2)], dtype=object) + self.assert_numpy_array_equal(np.asarray(index), expected) + self.assert_numpy_array_equal(index.values, expected) + + # with nans + index = self.index_with_nan + self.assertEqual(len(index), 3) + self.assertEqual(index.size, 3) + self.assertEqual(index.shape, (3, )) + + self.assert_index_equal(index.left, Index([0, np.nan, 1])) + self.assert_index_equal(index.right, Index([1, np.nan, 2])) + self.assert_index_equal(index.mid, Index([0.5, np.nan, 1.5])) + + self.assertEqual(index.closed, 'right') + + expected = np.array([Interval(0, 1), np.nan, + Interval(1, 2)], dtype=object) + self.assert_numpy_array_equal(np.asarray(index), expected) + self.assert_numpy_array_equal(index.values, expected) + + def test_with_nans(self): + index = self.index + self.assertFalse(index.hasnans) + self.assert_numpy_array_equal(index.isnull(), + np.array([False, False])) + self.assert_numpy_array_equal(index.notnull(), + np.array([True, True])) + + index = self.index_with_nan + 
self.assertTrue(index.hasnans) + self.assert_numpy_array_equal(index.notnull(), + np.array([True, False, True])) + self.assert_numpy_array_equal(index.isnull(), + np.array([False, True, False])) + + def test_copy(self): + actual = self.index.copy() + self.assertTrue(actual.equals(self.index)) + + actual = self.index.copy(deep=True) + self.assertTrue(actual.equals(self.index)) + self.assertIsNot(actual.left, self.index.left) + + def test_ensure_copied_data(self): + # exercise the copy flag in the constructor + + # not copying + index = self.index + result = IntervalIndex(index, copy=False) + tm.assert_numpy_array_equal(index.left.values, result.left.values, + check_same='same') + tm.assert_numpy_array_equal(index.right.values, result.right.values, + check_same='same') + + # by-definition make a copy + result = IntervalIndex.from_intervals(index.values, copy=False) + tm.assert_numpy_array_equal(index.left.values, result.left.values, + check_same='copy') + tm.assert_numpy_array_equal(index.right.values, result.right.values, + check_same='copy') + + def test_equals(self): + + idx = self.index + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + + self.assertFalse(idx.equals(idx.astype(object))) + self.assertFalse(idx.equals(np.array(idx))) + self.assertFalse(idx.equals(list(idx))) + + self.assertFalse(idx.equals([1, 2])) + self.assertFalse(idx.equals(np.array([1, 2]))) + self.assertFalse(idx.equals( + pd.date_range('20130101', periods=2))) + + def test_astype(self): + + idx = self.index + + for dtype in [np.int64, np.float64, 'datetime64[ns]', + 'datetime64[ns, US/Eastern]', 'timedelta64', + 'period[M]']: + self.assertRaises(ValueError, idx.astype, dtype) + + result = idx.astype(object) + tm.assert_index_equal(result, Index(idx.values, dtype='object')) + self.assertFalse(idx.equals(result)) + self.assertTrue(idx.equals(IntervalIndex.from_intervals(result))) + + result = idx.astype('interval') + tm.assert_index_equal(result, idx) + self.assertTrue(result.equals(idx)) + + result = idx.astype('category') + expected = pd.Categorical(idx, ordered=True) + tm.assert_categorical_equal(result, expected) + + def test_where(self): + expected = self.index + result = self.index.where(self.index.notnull()) + tm.assert_index_equal(result, expected) + + idx = IntervalIndex.from_breaks([1, 2]) + result = idx.where([True, False]) + expected = IntervalIndex.from_intervals( + [Interval(1.0, 2.0, closed='right'), np.nan]) + tm.assert_index_equal(result, expected) + + def test_where_array_like(self): + pass + + def test_delete(self): + expected = IntervalIndex.from_breaks([1, 2]) + actual = self.index.delete(0) + self.assertTrue(expected.equals(actual)) + + def test_insert(self): + expected = IntervalIndex.from_breaks(range(4)) + actual = self.index.insert(2, Interval(2, 3)) + self.assertTrue(expected.equals(actual)) + + self.assertRaises(ValueError, self.index.insert, 0, 1) + self.assertRaises(ValueError, self.index.insert, 0, + Interval(2, 3, closed='left')) + + def test_take(self): + actual = self.index.take([0, 1]) + self.assertTrue(self.index.equals(actual)) + + expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2]) + actual = self.index.take([0, 0, 1]) + self.assertTrue(expected.equals(actual)) + + def test_monotonic_and_unique(self): + self.assertTrue(self.index.is_monotonic) + self.assertTrue(self.index.is_unique) + + idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)]) + self.assertTrue(idx.is_monotonic) + self.assertTrue(idx.is_unique) + + idx = 
IntervalIndex.from_tuples([(0, 1), (2, 3), (1, 2)]) + self.assertFalse(idx.is_monotonic) + self.assertTrue(idx.is_unique) + + idx = IntervalIndex.from_tuples([(0, 2), (0, 2)]) + self.assertFalse(idx.is_unique) + self.assertTrue(idx.is_monotonic) + + @pytest.mark.xfail(reason='not a valid repr as we use interval notation') + def test_repr(self): + i = IntervalIndex.from_tuples([(0, 1), (1, 2)], closed='right') + expected = ("IntervalIndex(left=[0, 1]," + "\n right=[1, 2]," + "\n closed='right'," + "\n dtype='interval[int64]')") + self.assertEqual(repr(i), expected) + + i = IntervalIndex.from_tuples((Timestamp('20130101'), + Timestamp('20130102')), + (Timestamp('20130102'), + Timestamp('20130103')), + closed='right') + expected = ("IntervalIndex(left=['2013-01-01', '2013-01-02']," + "\n right=['2013-01-02', '2013-01-03']," + "\n closed='right'," + "\n dtype='interval[datetime64[ns]]')") + self.assertEqual(repr(i), expected) + + @pytest.mark.xfail(reason='not a valid repr as we use interval notation') + def test_repr_max_seq_item_setting(self): + super(TestIntervalIndex, self).test_repr_max_seq_item_setting() + + @pytest.mark.xfail(reason='not a valid repr as we use interval notation') + def test_repr_roundtrip(self): + super(TestIntervalIndex, self).test_repr_roundtrip() + + def test_get_item(self): + i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), + closed='right') + assert i[0] == Interval(0.0, 1.0) + assert i[1] == Interval(1.0, 2.0) + assert isnull(i[2]) + + result = i[0:1] + expected = IntervalIndex.from_arrays((0.,), (1.,), closed='right') + tm.assert_index_equal(result, expected) + + result = i[0:2] + expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed='right') + tm.assert_index_equal(result, expected) + + result = i[1:3] + expected = IntervalIndex.from_arrays((1., np.nan), (2., np.nan), + closed='right') + tm.assert_index_equal(result, expected) + + def test_get_loc_value(self): + self.assertRaises(KeyError, self.index.get_loc, 0) + self.assertEqual(self.index.get_loc(0.5), 0) + self.assertEqual(self.index.get_loc(1), 0) + self.assertEqual(self.index.get_loc(1.5), 1) + self.assertEqual(self.index.get_loc(2), 1) + self.assertRaises(KeyError, self.index.get_loc, -1) + self.assertRaises(KeyError, self.index.get_loc, 3) + + idx = IntervalIndex.from_tuples([(0, 2), (1, 3)]) + self.assertEqual(idx.get_loc(0.5), 0) + self.assertEqual(idx.get_loc(1), 0) + self.assert_numpy_array_equal(idx.get_loc(1.5), + np.array([0, 1], dtype='int64')) + self.assert_numpy_array_equal(np.sort(idx.get_loc(2)), + np.array([0, 1], dtype='int64')) + self.assertEqual(idx.get_loc(3), 1) + self.assertRaises(KeyError, idx.get_loc, 3.5) + + idx = IntervalIndex.from_arrays([0, 2], [1, 3]) + self.assertRaises(KeyError, idx.get_loc, 1.5) + + def slice_locs_cases(self, breaks): + # TODO: same tests for more index types + index = IntervalIndex.from_breaks([0, 1, 2], closed='right') + self.assertEqual(index.slice_locs(), (0, 2)) + self.assertEqual(index.slice_locs(0, 1), (0, 1)) + self.assertEqual(index.slice_locs(1, 1), (0, 1)) + self.assertEqual(index.slice_locs(0, 2), (0, 2)) + self.assertEqual(index.slice_locs(0.5, 1.5), (0, 2)) + self.assertEqual(index.slice_locs(0, 0.5), (0, 1)) + self.assertEqual(index.slice_locs(start=1), (0, 2)) + self.assertEqual(index.slice_locs(start=1.2), (1, 2)) + self.assertEqual(index.slice_locs(end=1), (0, 1)) + self.assertEqual(index.slice_locs(end=1.1), (0, 2)) + self.assertEqual(index.slice_locs(end=1.0), (0, 1)) + self.assertEqual(*index.slice_locs(-1, -1)) + + 
index = IntervalIndex.from_breaks([0, 1, 2], closed='neither') + self.assertEqual(index.slice_locs(0, 1), (0, 1)) + self.assertEqual(index.slice_locs(0, 2), (0, 2)) + self.assertEqual(index.slice_locs(0.5, 1.5), (0, 2)) + self.assertEqual(index.slice_locs(1, 1), (1, 1)) + self.assertEqual(index.slice_locs(1, 2), (1, 2)) + + index = IntervalIndex.from_breaks([0, 1, 2], closed='both') + self.assertEqual(index.slice_locs(1, 1), (0, 2)) + self.assertEqual(index.slice_locs(1, 2), (0, 2)) + + def test_slice_locs_int64(self): + self.slice_locs_cases([0, 1, 2]) + + def test_slice_locs_float64(self): + self.slice_locs_cases([0.0, 1.0, 2.0]) + + def slice_locs_decreasing_cases(self, tuples): + index = IntervalIndex.from_tuples(tuples) + self.assertEqual(index.slice_locs(1.5, 0.5), (1, 3)) + self.assertEqual(index.slice_locs(2, 0), (1, 3)) + self.assertEqual(index.slice_locs(2, 1), (1, 3)) + self.assertEqual(index.slice_locs(3, 1.1), (0, 3)) + self.assertEqual(index.slice_locs(3, 3), (0, 2)) + self.assertEqual(index.slice_locs(3.5, 3.3), (0, 1)) + self.assertEqual(index.slice_locs(1, -3), (2, 3)) + self.assertEqual(*index.slice_locs(-1, -1)) + + def test_slice_locs_decreasing_int64(self): + self.slice_locs_cases([(2, 4), (1, 3), (0, 2)]) + + def test_slice_locs_decreasing_float64(self): + self.slice_locs_cases([(2., 4.), (1., 3.), (0., 2.)]) + + def test_slice_locs_fails(self): + index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)]) + with self.assertRaises(KeyError): + index.slice_locs(1, 2) + + def test_get_loc_interval(self): + self.assertEqual(self.index.get_loc(Interval(0, 1)), 0) + self.assertEqual(self.index.get_loc(Interval(0, 0.5)), 0) + self.assertEqual(self.index.get_loc(Interval(0, 1, 'left')), 0) + self.assertRaises(KeyError, self.index.get_loc, Interval(2, 3)) + self.assertRaises(KeyError, self.index.get_loc, + Interval(-1, 0, 'left')) + + def test_get_indexer(self): + actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) + expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index.get_indexer(self.index) + expected = np.array([0, 1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + index = IntervalIndex.from_breaks([0, 1, 2], closed='left') + actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) + expected = np.array([-1, 0, 0, 1, 1, -1, -1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index.get_indexer(index[:1]) + expected = np.array([0], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index.get_indexer(index) + expected = np.array([-1, 0], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + @pytest.mark.xfail(reason="what to return for overlaps") + def test_get_indexer_subintervals(self): + # TODO + + # return indexers for wholly contained subintervals + target = IntervalIndex.from_breaks(np.linspace(0, 2, 5)) + actual = self.index.get_indexer(target) + expected = np.array([0, 0, 1, 1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2]) + actual = self.index.get_indexer(target) + expected = np.array([-1, 0, 1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index.get_indexer(target[[0, -1]]) + expected = np.array([0, 1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left') + actual = 
self.index.get_indexer(target) + expected = np.array([0, 0, 0], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + def test_contains(self): + # only endpoints are valid + i = IntervalIndex.from_arrays([0, 1], [1, 2]) + + # invalid + self.assertNotIn(0, i) + self.assertNotIn(1, i) + self.assertNotIn(2, i) + + # valid + self.assertIn(Interval(0, 1), i) + self.assertIn(Interval(0, 2), i) + self.assertIn(Interval(0, 0.5), i) + self.assertNotIn(Interval(3, 5), i) + self.assertNotIn(Interval(-1, 0, closed='left'), i) + + def test_is_contained_in(self): + # can select values that are IN the range of a value + i = IntervalIndex.from_arrays([0, 1], [1, 2]) + + assert i._is_contained_in(0.1) + assert i._is_contained_in(0.5) + assert i._is_contained_in(1) + assert i._is_contained_in(Interval(0, 1)) + assert i._is_contained_in(Interval(0, 2)) + + # these overlaps completely + assert i._is_contained_in(Interval(0, 3)) + assert i._is_contained_in(Interval(1, 3)) + + assert not i._is_contained_in(20) + assert not i._is_contained_in(-20) + + def test_dropna(self): + + expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)]) + + ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan]) + result = ii.dropna() + tm.assert_index_equal(result, expected) + + ii = IntervalIndex.from_arrays([0, 1, np.nan], [1, 2, np.nan]) + result = ii.dropna() + tm.assert_index_equal(result, expected) + + def test_non_contiguous(self): + index = IntervalIndex.from_tuples([(0, 1), (2, 3)]) + target = [0.5, 1.5, 2.5] + actual = index.get_indexer(target) + expected = np.array([0, -1, 1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + self.assertNotIn(1.5, index) + + def test_union(self): + other = IntervalIndex.from_arrays([2], [3]) + expected = IntervalIndex.from_arrays(range(3), range(1, 4)) + actual = self.index.union(other) + self.assertTrue(expected.equals(actual)) + + actual = other.union(self.index) + self.assertTrue(expected.equals(actual)) + + tm.assert_index_equal(self.index.union(self.index), self.index) + tm.assert_index_equal(self.index.union(self.index[:1]), + self.index) + + def test_intersection(self): + other = IntervalIndex.from_breaks([1, 2, 3]) + expected = IntervalIndex.from_breaks([1, 2]) + actual = self.index.intersection(other) + self.assertTrue(expected.equals(actual)) + + tm.assert_index_equal(self.index.intersection(self.index), + self.index) + + def test_difference(self): + tm.assert_index_equal(self.index.difference(self.index[:1]), + self.index[1:]) + + def test_symmetric_difference(self): + result = self.index[:1].symmetric_difference(self.index[1:]) + expected = self.index + tm.assert_index_equal(result, expected) + + def test_set_operation_errors(self): + self.assertRaises(ValueError, self.index.union, self.index.left) + + other = IntervalIndex.from_breaks([0, 1, 2], closed='neither') + self.assertRaises(ValueError, self.index.union, other) + + def test_isin(self): + actual = self.index.isin(self.index) + self.assert_numpy_array_equal(np.array([True, True]), actual) + + actual = self.index.isin(self.index[:1]) + self.assert_numpy_array_equal(np.array([True, False]), actual) + + def test_comparison(self): + actual = Interval(0, 1) < self.index + expected = np.array([False, True]) + self.assert_numpy_array_equal(actual, expected) + + actual = Interval(0.5, 1.5) < self.index + expected = np.array([False, True]) + self.assert_numpy_array_equal(actual, expected) + actual = self.index > Interval(0.5, 1.5) + self.assert_numpy_array_equal(actual, expected) + + 
actual = self.index == self.index + expected = np.array([True, True]) + self.assert_numpy_array_equal(actual, expected) + actual = self.index <= self.index + self.assert_numpy_array_equal(actual, expected) + actual = self.index >= self.index + self.assert_numpy_array_equal(actual, expected) + + actual = self.index < self.index + expected = np.array([False, False]) + self.assert_numpy_array_equal(actual, expected) + actual = self.index > self.index + self.assert_numpy_array_equal(actual, expected) + + actual = self.index == IntervalIndex.from_breaks([0, 1, 2], 'left') + self.assert_numpy_array_equal(actual, expected) + + actual = self.index == self.index.values + self.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index.values == self.index + self.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index <= self.index.values + self.assert_numpy_array_equal(actual, np.array([True, True])) + actual = self.index != self.index.values + self.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index > self.index.values + self.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index.values > self.index + self.assert_numpy_array_equal(actual, np.array([False, False])) + + # invalid comparisons + actual = self.index == 0 + self.assert_numpy_array_equal(actual, np.array([False, False])) + actual = self.index == self.index.left + self.assert_numpy_array_equal(actual, np.array([False, False])) + + with self.assertRaisesRegexp(TypeError, 'unorderable types'): + self.index > 0 + with self.assertRaisesRegexp(TypeError, 'unorderable types'): + self.index <= 0 + with self.assertRaises(TypeError): + self.index > np.arange(2) + with self.assertRaises(ValueError): + self.index > np.arange(3) + + def test_missing_values(self): + idx = pd.Index([np.nan, pd.Interval(0, 1), pd.Interval(1, 2)]) + idx2 = pd.IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2]) + assert idx.equals(idx2) + + with pytest.raises(ValueError): + IntervalIndex.from_arrays([np.nan, 0, 1], np.array([0, 1, 2])) + + self.assert_numpy_array_equal(isnull(idx), + np.array([True, False, False])) + + def test_sort_values(self): + expected = IntervalIndex.from_breaks([1, 2, 3, 4]) + actual = IntervalIndex.from_tuples([(3, 4), (1, 2), + (2, 3)]).sort_values() + tm.assert_index_equal(expected, actual) + + # nan + idx = self.index_with_nan + mask = idx.isnull() + self.assert_numpy_array_equal(mask, np.array([False, True, False])) + + result = idx.sort_values() + mask = result.isnull() + self.assert_numpy_array_equal(mask, np.array([False, False, True])) + + result = idx.sort_values(ascending=False) + mask = result.isnull() + self.assert_numpy_array_equal(mask, np.array([True, False, False])) + + def test_datetime(self): + dates = pd.date_range('2000', periods=3) + idx = IntervalIndex.from_breaks(dates) + + tm.assert_index_equal(idx.left, dates[:2]) + tm.assert_index_equal(idx.right, dates[-2:]) + + expected = pd.date_range('2000-01-01T12:00', periods=2) + tm.assert_index_equal(idx.mid, expected) + + self.assertNotIn(pd.Timestamp('2000-01-01T12'), idx) + self.assertNotIn(pd.Timestamp('2000-01-01T12'), idx) + + target = pd.date_range('1999-12-31T12:00', periods=7, freq='12H') + actual = idx.get_indexer(target) + expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='int64') + self.assert_numpy_array_equal(actual, expected) + + def test_append(self): + + index1 = IntervalIndex.from_arrays([0, 1], [1, 2]) + index2 = IntervalIndex.from_arrays([1, 2], [2, 3]) + + 
result = index1.append(index2) + expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3]) + tm.assert_index_equal(result, expected) + + result = index1.append([index1, index2]) + expected = IntervalIndex.from_arrays([0, 1, 0, 1, 1, 2], + [1, 2, 1, 2, 2, 3]) + tm.assert_index_equal(result, expected) + + def f(): + index1.append(IntervalIndex.from_arrays([0, 1], [1, 2], + closed='both')) + + self.assertRaises(ValueError, f) + + +class TestIntervalRange(tm.TestCase): + + def test_construction(self): + result = interval_range(0, 5, name='foo', closed='both') + expected = IntervalIndex.from_breaks( + np.arange(0, 5), name='foo', closed='both') + tm.assert_index_equal(result, expected) + + def test_errors(self): + + # not enough params + def f(): + interval_range(0) + + self.assertRaises(ValueError, f) + + def f(): + interval_range(periods=2) + + self.assertRaises(ValueError, f) + + def f(): + interval_range() + + self.assertRaises(ValueError, f) + + # mixed units + def f(): + interval_range(0, Timestamp('20130101'), freq=2) + + self.assertRaises(ValueError, f) + + def f(): + interval_range(0, 10, freq=Timedelta('1day')) + + self.assertRaises(ValueError, f) + + +class TestIntervalTree(tm.TestCase): + def setUp(self): + gentree = lambda dtype: IntervalTree(np.arange(5, dtype=dtype), + np.arange(5, dtype=dtype) + 2) + self.tree = gentree('int64') + self.trees = {dtype: gentree(dtype) + for dtype in ['int32', 'int64', 'float32', 'float64']} + + def test_get_loc(self): + for dtype, tree in self.trees.items(): + self.assert_numpy_array_equal(tree.get_loc(1), + np.array([0], dtype='int64')) + self.assert_numpy_array_equal(np.sort(tree.get_loc(2)), + np.array([0, 1], dtype='int64')) + with self.assertRaises(KeyError): + tree.get_loc(-1) + + def test_get_indexer(self): + for dtype, tree in self.trees.items(): + self.assert_numpy_array_equal( + tree.get_indexer(np.array([1.0, 5.5, 6.5])), + np.array([0, 4, -1], dtype='int64')) + with self.assertRaises(KeyError): + tree.get_indexer(np.array([3.0])) + + def test_get_indexer_non_unique(self): + indexer, missing = self.tree.get_indexer_non_unique( + np.array([1.0, 2.0, 6.5])) + self.assert_numpy_array_equal(indexer[:1], + np.array([0], dtype='int64')) + self.assert_numpy_array_equal(np.sort(indexer[1:3]), + np.array([0, 1], dtype='int64')) + self.assert_numpy_array_equal(np.sort(indexer[3:]), + np.array([-1], dtype='int64')) + self.assert_numpy_array_equal(missing, np.array([2], dtype='int64')) + + def test_duplicates(self): + tree = IntervalTree([0, 0, 0], [1, 1, 1]) + self.assert_numpy_array_equal(np.sort(tree.get_loc(0.5)), + np.array([0, 1, 2], dtype='int64')) + + with self.assertRaises(KeyError): + tree.get_indexer(np.array([0.5])) + + indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) + self.assert_numpy_array_equal(np.sort(indexer), + np.array([0, 1, 2], dtype='int64')) + self.assert_numpy_array_equal(missing, np.array([], dtype='int64')) + + def test_get_loc_closed(self): + for closed in ['left', 'right', 'both', 'neither']: + tree = IntervalTree([0], [1], closed=closed) + for p, errors in [(0, tree.open_left), + (1, tree.open_right)]: + if errors: + with self.assertRaises(KeyError): + tree.get_loc(p) + else: + self.assert_numpy_array_equal(tree.get_loc(p), + np.array([0], dtype='int64')) + + def test_get_indexer_closed(self): + x = np.arange(1000, dtype='int64') + found = x + not_found = (-1 * np.ones(1000)).astype('int64') + for leaf_size in [1, 10, 100, 10000]: + for closed in ['left', 'right', 'both', 'neither']: + tree = 
IntervalTree(x, x + 0.5, closed=closed, + leaf_size=leaf_size) + self.assert_numpy_array_equal(found, + tree.get_indexer(x + 0.25)) + + expected = found if tree.closed_left else not_found + self.assert_numpy_array_equal(expected, + tree.get_indexer(x + 0.0)) + + expected = found if tree.closed_right else not_found + self.assert_numpy_array_equal(expected, + tree.get_indexer(x + 0.5)) diff --git a/pandas/tests/indexing/test_interval.py b/pandas/tests/indexing/test_interval.py new file mode 100644 index 0000000000000..a5432be0da466 --- /dev/null +++ b/pandas/tests/indexing/test_interval.py @@ -0,0 +1,141 @@ +import pytest +import numpy as np +import pandas as pd + +from pandas import Series, DataFrame, IntervalIndex, Interval +import pandas.util.testing as tm + + +class TestIntervalIndex(tm.TestCase): + + def setUp(self): + self.s = Series(np.arange(5), IntervalIndex.from_breaks(np.arange(6))) + + def test_loc_getitem_series(self): + + s = self.s + expected = 0 + self.assertEqual(expected, s.loc[0.5]) + self.assertEqual(expected, s.loc[1]) + self.assertEqual(expected, s.loc[Interval(0, 1)]) + self.assertRaises(KeyError, s.loc.__getitem__, 0) + + expected = s.iloc[:3] + tm.assert_series_equal(expected, s.loc[:3]) + tm.assert_series_equal(expected, s.loc[:2.5]) + tm.assert_series_equal(expected, s.loc[0.1:2.5]) + tm.assert_series_equal(expected, s.loc[-1:3]) + + expected = s.iloc[1:4] + tm.assert_series_equal(expected, s.loc[[1.5, 2.5, 3.5]]) + tm.assert_series_equal(expected, s.loc[[2, 3, 4]]) + tm.assert_series_equal(expected, s.loc[[1.5, 3, 4]]) + + expected = s.iloc[2:5] + tm.assert_series_equal(expected, s.loc[s >= 2]) + + expected = s.iloc[2:5] + result = s.loc[[pd.Interval(3, 6)]] + tm.assert_series_equal(expected, result) + + expected = s.iloc[2:4] + result = s.loc[[pd.Interval(3, 5)]] + tm.assert_series_equal(expected, result) + + expected = s.iloc[[2, 3, 4, 2, 3, 4]] + result = s.loc[[pd.Interval(3, 6), pd.Interval(3, 6)]] + tm.assert_series_equal(expected, result) + + # slice of interval + with pytest.raises(NotImplementedError): + result = s.loc[pd.Interval(3, 6):] + + def test_loc_non_matching(self): + s = self.s + + # TODO: We are getting at least 1 matching + # interval so this meets our current semantics + expected = s.iloc[[2, 3, 4]] + result = s.loc[[-1, 3, 4, 5]] + tm.assert_series_equal(expected, result) + + def test_getitem_series(self): + + s = self.s + expected = 0 + self.assertEqual(expected, s[0.5]) + self.assertEqual(expected, s[1]) + self.assertEqual(expected, s[Interval(0, 1)]) + self.assertRaises(KeyError, s.__getitem__, 0) + + expected = s.iloc[:3] + tm.assert_series_equal(expected, s[:3]) + tm.assert_series_equal(expected, s[:2.5]) + tm.assert_series_equal(expected, s[0.1:2.5]) + tm.assert_series_equal(expected, s[-1:3]) + + expected = s.iloc[1:4] + tm.assert_series_equal(expected, s[[1.5, 2.5, 3.5]]) + tm.assert_series_equal(expected, s[[2, 3, 4]]) + tm.assert_series_equal(expected, s[[1.5, 3, 4]]) + + expected = s.iloc[2:5] + tm.assert_series_equal(expected, s[s >= 2]) + + expected = s.iloc[2:5] + result = s[[pd.Interval(3, 6)]] + tm.assert_series_equal(expected, result) + + # slice of interval + with pytest.raises(NotImplementedError): + result = s[pd.Interval(3, 6):] + + # slice of scalar + with pytest.raises(NotImplementedError): + s[0:4:2] + + def test_large_series(self): + s = Series(np.arange(1000000), + index=IntervalIndex.from_breaks(np.arange(1000001))) + + result1 = s.loc[:80000] + result2 = s.loc[0:80000] + result3 = s.loc[0:80000:1] + 
tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) + + def test_loc_getitem_frame(self): + + df = DataFrame({'A': range(10)}) + s = pd.cut(df.A, 5) + df['B'] = s + df = df.set_index('B') + + result = df.loc[4] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) + + def f(): + df.loc[10] + + self.assertRaises(KeyError, f) + + # single list-like + result = df.loc[[4]] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) + + # non-unique + result = df.loc[[4, 5]] + expected = df.take([4, 5, 4, 5]) + tm.assert_frame_equal(result, expected) + + def f(): + df.loc[[10]] + + self.assertRaises(KeyError, f) + + # partial missing + result = df.loc[[10, 4]] + expected = df.iloc[4:6] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/test_interval.py b/pandas/tests/scalar/test_interval.py new file mode 100644 index 0000000000000..63e57fb472861 --- /dev/null +++ b/pandas/tests/scalar/test_interval.py @@ -0,0 +1,129 @@ +from __future__ import division + +import pytest +from pandas import Interval +import pandas.util.testing as tm + + +class TestInterval(tm.TestCase): + def setUp(self): + self.interval = Interval(0, 1) + + def test_properties(self): + self.assertEqual(self.interval.closed, 'right') + self.assertEqual(self.interval.left, 0) + self.assertEqual(self.interval.right, 1) + self.assertEqual(self.interval.mid, 0.5) + + def test_repr(self): + self.assertEqual(repr(self.interval), + "Interval(0, 1, closed='right')") + self.assertEqual(str(self.interval), "(0, 1]") + + interval_left = Interval(0, 1, closed='left') + self.assertEqual(repr(interval_left), + "Interval(0, 1, closed='left')") + self.assertEqual(str(interval_left), "[0, 1)") + + def test_contains(self): + self.assertIn(0.5, self.interval) + self.assertIn(1, self.interval) + self.assertNotIn(0, self.interval) + self.assertRaises(TypeError, lambda: self.interval in self.interval) + + interval = Interval(0, 1, closed='both') + self.assertIn(0, interval) + self.assertIn(1, interval) + + interval = Interval(0, 1, closed='neither') + self.assertNotIn(0, interval) + self.assertIn(0.5, interval) + self.assertNotIn(1, interval) + + def test_equal(self): + self.assertEqual(Interval(0, 1), Interval(0, 1, closed='right')) + self.assertNotEqual(Interval(0, 1), Interval(0, 1, closed='left')) + self.assertNotEqual(Interval(0, 1), 0) + + def test_comparison(self): + with self.assertRaisesRegexp(TypeError, 'unorderable types'): + Interval(0, 1) < 2 + + self.assertTrue(Interval(0, 1) < Interval(1, 2)) + self.assertTrue(Interval(0, 1) < Interval(0, 2)) + self.assertTrue(Interval(0, 1) < Interval(0.5, 1.5)) + self.assertTrue(Interval(0, 1) <= Interval(0, 1)) + self.assertTrue(Interval(0, 1) > Interval(-1, 2)) + self.assertTrue(Interval(0, 1) >= Interval(0, 1)) + + def test_hash(self): + # should not raise + hash(self.interval) + + def test_math_add(self): + expected = Interval(1, 2) + actual = self.interval + 1 + self.assertEqual(expected, actual) + + expected = Interval(1, 2) + actual = 1 + self.interval + self.assertEqual(expected, actual) + + actual = self.interval + actual += 1 + self.assertEqual(expected, actual) + + with pytest.raises(TypeError): + self.interval + Interval(1, 2) + + with pytest.raises(TypeError): + self.interval + 'foo' + + def test_math_sub(self): + expected = Interval(-1, 0) + actual = self.interval - 1 + self.assertEqual(expected, actual) + + actual = self.interval + actual -= 1 + self.assertEqual(expected, actual) + + with 
pytest.raises(TypeError): + self.interval - Interval(1, 2) + + with pytest.raises(TypeError): + self.interval - 'foo' + + def test_math_mult(self): + expected = Interval(0, 2) + actual = self.interval * 2 + self.assertEqual(expected, actual) + + expected = Interval(0, 2) + actual = 2 * self.interval + self.assertEqual(expected, actual) + + actual = self.interval + actual *= 2 + self.assertEqual(expected, actual) + + with pytest.raises(TypeError): + self.interval * Interval(1, 2) + + with pytest.raises(TypeError): + self.interval * 'foo' + + def test_math_div(self): + expected = Interval(0, 0.5) + actual = self.interval / 2.0 + self.assertEqual(expected, actual) + + actual = self.interval + actual /= 2.0 + self.assertEqual(expected, actual) + + with pytest.raises(TypeError): + self.interval / Interval(1, 2) + + with pytest.raises(TypeError): + self.interval / 'foo' diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index dbe2db67359f3..5822489c06a38 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -10,8 +10,7 @@ from pandas.types.common import is_categorical_dtype, is_datetime64tz_dtype from pandas import (Index, Series, isnull, date_range, - period_range, NaT) -from pandas.core.index import MultiIndex + NaT, period_range, MultiIndex, IntervalIndex) from pandas.tseries.index import Timestamp, DatetimeIndex from pandas._libs import lib @@ -543,6 +542,17 @@ def test_constructor_with_datetime_tz(self): expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) assert_series_equal(s, expected) + def test_construction_interval(self): + # construction from interval & array of intervals + index = IntervalIndex.from_breaks(np.arange(3), closed='right') + result = Series(index) + repr(result) + str(result) + tm.assert_index_equal(Index(result.values), index) + + result = Series(index.values) + tm.assert_index_equal(Index(result.values), index) + def test_construction_consistency(self): # make sure that we are not re-localizing upon construction diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index ea49abeee21c5..4a3332c2de6d8 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import (Series, DataFrame, isnull, date_range, - MultiIndex, Index, Timestamp, NaT) + MultiIndex, Index, Timestamp, NaT, IntervalIndex) from pandas.compat import range from pandas._libs.tslib import iNaT from pandas.util.testing import assert_series_equal, assert_frame_equal @@ -556,6 +556,15 @@ def test_dropna_no_nan(self): s2.dropna(inplace=True) self.assert_series_equal(s2, s) + def test_dropna_intervals(self): + s = Series([np.nan, 1, 2, 3], IntervalIndex.from_arrays( + [np.nan, 0, 1, 2], + [np.nan, 1, 2, 3])) + + result = s.dropna() + expected = s.iloc[1:] + assert_series_equal(result, expected) + def test_valid(self): ts = self.ts.copy() ts[::2] = np.NaN diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 135521f287f7c..19c9b69ff1988 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -7,7 +7,9 @@ from datetime import datetime from itertools import permutations from pandas import (Series, Categorical, CategoricalIndex, Index, - Timestamp, DatetimeIndex) +from pandas import (Series, Categorical, CategoricalIndex, + Timestamp, DatetimeIndex, + Index, IntervalIndex) import pandas as pd from pandas import compat @@ -590,8 +592,9 @@ def 
test_value_counts(self): # tm.assertIsInstance(factor, n) result = algos.value_counts(factor) - breaks = [-1.192, -0.535, 0.121, 0.777, 1.433] - expected_index = pd.IntervalIndex.from_breaks(breaks) + breaks = [-1.194, -0.535, 0.121, 0.777, 1.433] + expected_index = pd.IntervalIndex.from_breaks( + breaks).astype('category') expected = Series([1, 1, 1, 1], index=expected_index) tm.assert_series_equal(result.sort_index(), expected.sort_index()) @@ -599,13 +602,15 @@ def test_value_counts(self): def test_value_counts_bins(self): s = [1, 2, 3, 4] result = algos.value_counts(s, bins=1) - self.assertEqual(result.tolist(), [4]) - self.assertEqual(result.index[0], pd.Interval(0.999, 4.0)) + expected = Series([4], + index=IntervalIndex.from_tuples([(0.996, 4.0)])) + tm.assert_series_equal(result, expected) result = algos.value_counts(s, bins=2, sort=False) - self.assertEqual(result.tolist(), [2, 2]) - self.assertEqual(result.index.min(), pd.Interval(0.999, 2.5)) - self.assertEqual(result.index.max(), pd.Interval(2.5, 4.0)) + expected = Series([2, 2], + index=IntervalIndex.from_tuples([(0.996, 2.5), + (2.5, 4.0)])) + tm.assert_series_equal(result, expected) def test_value_counts_dtypes(self): result = algos.value_counts([1, 1.]) @@ -657,6 +662,7 @@ def test_categorical(self): result = s.value_counts() expected = Series([3, 2, 1], index=pd.CategoricalIndex(['a', 'b', 'c'])) + tm.assert_series_equal(result, expected, check_index_type=True) # preserve order? @@ -670,12 +676,13 @@ def test_categorical_nans(self): s.iloc[1] = np.nan result = s.value_counts() expected = Series([4, 3, 2], index=pd.CategoricalIndex( + ['a', 'b', 'c'], categories=['a', 'b', 'c'])) tm.assert_series_equal(result, expected, check_index_type=True) result = s.value_counts(dropna=False) expected = Series([ 4, 3, 2, 1 - ], index=pd.CategoricalIndex(['a', 'b', 'c', np.nan])) + ], index=CategoricalIndex(['a', 'b', 'c', np.nan])) tm.assert_series_equal(result, expected, check_index_type=True) # out of order diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 1fe449fa26aef..4a1cf6314aaed 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -31,7 +31,6 @@ def test_string_methods_dont_fail(self): unicode(self.container) # noqa def test_tricky_container(self): - import nose if not hasattr(self, 'unicode_container'): pytest.skip('Need unicode_container to test with this') repr(self.unicode_container) @@ -576,10 +575,10 @@ def test_value_counts_bins(self): s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) - exp1 = Series({Interval(0.999, 3.0): 4}) + exp1 = Series({Interval(0.997, 3.0): 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) - exp1n = Series({Interval(0.999, 3.0): 1.0}) + exp1n = Series({Interval(0.997, 3.0): 1.0}) tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): @@ -590,12 +589,20 @@ def test_value_counts_bins(self): self.assertEqual(s1.nunique(), 3) - res4 = s1.value_counts(bins=4) - intervals = IntervalIndex.from_breaks([0.999, 1.5, 2.0, 2.5, 3.0]) - exp4 = Series([2, 1, 1], index=intervals.take([0, 3, 1])) + # these return the same + res4 = s1.value_counts(bins=4, dropna=True) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4, exp4) + + res4 = s1.value_counts(bins=4, dropna=False) + intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) + exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) + 
tm.assert_series_equal(res4, exp4) + res4n = s1.value_counts(bins=4, normalize=True) - exp4n = Series([0.5, 0.25, 0.25], index=intervals.take([0, 3, 1])) + exp4n = Series([0.5, 0.25, 0.25, 0], + index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4n, exp4n) # handle NA's properly diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index fe37fa000e687..8600b2d726e49 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -22,7 +22,7 @@ date_range, DatetimeIndex, period_range, PeriodIndex, timedelta_range, TimedeltaIndex, NaT, - Interval) + Interval, IntervalIndex) from pandas.compat import range, lrange, u, PY3 from pandas.core.config import option_context @@ -122,6 +122,16 @@ def test_constructor_unsortable(self): self.assertRaises( TypeError, lambda: Categorical(arr, ordered=True)) + def test_constructor_interval(self): + result = Categorical([Interval(1, 2), Interval(2, 3), Interval(3, 6)], + ordered=True) + ii = IntervalIndex.from_intervals([Interval(1, 2), + Interval(2, 3), + Interval(3, 6)]) + exp = Categorical(ii, ordered=True) + self.assert_categorical_equal(result, exp) + tm.assert_index_equal(result.categories, ii) + def test_is_equal_dtype(self): # test dtype comparisons between cats diff --git a/pandas/tests/test_interval.py b/pandas/tests/test_interval.py deleted file mode 100644 index 1b52e2629b38c..0000000000000 --- a/pandas/tests/test_interval.py +++ /dev/null @@ -1,591 +0,0 @@ -from __future__ import division -import numpy as np - -from pandas.core.interval import Interval, IntervalIndex -from pandas.core.index import Index -from pandas.lib import IntervalTree - -import pandas.util.testing as tm -import pandas as pd - - -class TestInterval(tm.TestCase): - def setUp(self): - self.interval = Interval(0, 1) - - def test_properties(self): - self.assertEqual(self.interval.closed, 'right') - self.assertEqual(self.interval.left, 0) - self.assertEqual(self.interval.right, 1) - self.assertEqual(self.interval.mid, 0.5) - - def test_repr(self): - self.assertEqual(repr(self.interval), - "Interval(0, 1, closed='right')") - self.assertEqual(str(self.interval), "(0, 1]") - - interval_left = Interval(0, 1, closed='left') - self.assertEqual(repr(interval_left), - "Interval(0, 1, closed='left')") - self.assertEqual(str(interval_left), "[0, 1)") - - def test_contains(self): - self.assertIn(0.5, self.interval) - self.assertIn(1, self.interval) - self.assertNotIn(0, self.interval) - self.assertRaises(TypeError, lambda: self.interval in self.interval) - - interval = Interval(0, 1, closed='both') - self.assertIn(0, interval) - self.assertIn(1, interval) - - interval = Interval(0, 1, closed='neither') - self.assertNotIn(0, interval) - self.assertIn(0.5, interval) - self.assertNotIn(1, interval) - - def test_equal(self): - self.assertEqual(Interval(0, 1), Interval(0, 1, closed='right')) - self.assertNotEqual(Interval(0, 1), Interval(0, 1, closed='left')) - self.assertNotEqual(Interval(0, 1), 0) - - def test_comparison(self): - with self.assertRaisesRegexp(TypeError, 'unorderable types'): - Interval(0, 1) < 2 - - self.assertTrue(Interval(0, 1) < Interval(1, 2)) - self.assertTrue(Interval(0, 1) < Interval(0, 2)) - self.assertTrue(Interval(0, 1) < Interval(0.5, 1.5)) - self.assertTrue(Interval(0, 1) <= Interval(0, 1)) - self.assertTrue(Interval(0, 1) > Interval(-1, 2)) - self.assertTrue(Interval(0, 1) >= Interval(0, 1)) - - def test_hash(self): - # should not raise - hash(self.interval) - - def test_math_add(self): - expected = 
Interval(1, 2) - actual = self.interval + 1 - self.assertEqual(expected, actual) - - expected = Interval(1, 2) - actual = 1 + self.interval - self.assertEqual(expected, actual) - - actual = self.interval - actual += 1 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval + Interval(1, 2) - - def test_math_sub(self): - expected = Interval(-1, 0) - actual = self.interval - 1 - self.assertEqual(expected, actual) - - actual = self.interval - actual -= 1 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval - Interval(1, 2) - - def test_math_mult(self): - expected = Interval(0, 2) - actual = self.interval * 2 - self.assertEqual(expected, actual) - - expected = Interval(0, 2) - actual = 2 * self.interval - self.assertEqual(expected, actual) - - actual = self.interval - actual *= 2 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval * Interval(1, 2) - - def test_math_div(self): - expected = Interval(0, 0.5) - actual = self.interval / 2.0 - self.assertEqual(expected, actual) - - actual = self.interval - actual /= 2.0 - self.assertEqual(expected, actual) - - with self.assertRaises(TypeError): - self.interval / Interval(1, 2) - - -class TestIntervalTree(tm.TestCase): - def setUp(self): - self.tree = IntervalTree(np.arange(5), np.arange(5) + 2) - - def test_get_loc(self): - self.assert_numpy_array_equal(self.tree.get_loc(1), [0]) - self.assert_numpy_array_equal(np.sort(self.tree.get_loc(2)), [0, 1]) - with self.assertRaises(KeyError): - self.tree.get_loc(-1) - - def test_get_indexer(self): - self.assert_numpy_array_equal( - self.tree.get_indexer(np.array([1.0, 5.5, 6.5])), [0, 4, -1]) - with self.assertRaises(KeyError): - self.tree.get_indexer(np.array([3.0])) - - def test_get_indexer_non_unique(self): - indexer, missing = self.tree.get_indexer_non_unique( - np.array([1.0, 2.0, 6.5])) - self.assert_numpy_array_equal(indexer[:1], [0]) - self.assert_numpy_array_equal(np.sort(indexer[1:3]), [0, 1]) - self.assert_numpy_array_equal(np.sort(indexer[3:]), [-1]) - self.assert_numpy_array_equal(missing, [2]) - - def test_duplicates(self): - tree = IntervalTree([0, 0, 0], [1, 1, 1]) - self.assert_numpy_array_equal(np.sort(tree.get_loc(0.5)), [0, 1, 2]) - - with self.assertRaises(KeyError): - tree.get_indexer(np.array([0.5])) - - indexer, missing = tree.get_indexer_non_unique(np.array([0.5])) - self.assert_numpy_array_equal(np.sort(indexer), [0, 1, 2]) - self.assert_numpy_array_equal(missing, []) - - def test_get_loc_closed(self): - for closed in ['left', 'right', 'both', 'neither']: - tree = IntervalTree([0], [1], closed=closed) - for p, errors in [(0, tree.open_left), - (1, tree.open_right)]: - if errors: - with self.assertRaises(KeyError): - tree.get_loc(p) - else: - self.assert_numpy_array_equal(tree.get_loc(p), - np.array([0])) - - def test_get_indexer_closed(self): - x = np.arange(1000) - found = x - not_found = -np.ones(1000) - for leaf_size in [1, 10, 100, 10000]: - for closed in ['left', 'right', 'both', 'neither']: - tree = IntervalTree(x, x + 0.5, closed=closed, - leaf_size=leaf_size) - self.assert_numpy_array_equal(found, tree.get_indexer(x + 0.25)) - - expected = found if tree.closed_left else not_found - self.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.0)) - - expected = found if tree.closed_right else not_found - self.assert_numpy_array_equal(expected, tree.get_indexer(x + 0.5)) - - -class TestIntervalIndex(tm.TestCase): - def setUp(self): - self.index = 
IntervalIndex([0, 1], [1, 2]) - - def test_constructors(self): - expected = self.index - actual = IntervalIndex.from_breaks(np.arange(3), closed='right') - self.assertTrue(expected.equals(actual)) - - alternate = IntervalIndex.from_breaks(np.arange(3), closed='left') - self.assertFalse(expected.equals(alternate)) - - actual = IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) - self.assertTrue(expected.equals(actual)) - - self.assertRaises(ValueError, IntervalIndex, [0], [1], closed='invalid') - - # TODO: fix all these commented out tests (here and below) - - intervals = [Interval(0, 1), Interval(1, 2, closed='left')] - with self.assertRaises(ValueError): - IntervalIndex.from_intervals(intervals) - - with self.assertRaises(ValueError): - IntervalIndex([0, 10], [3, 5]) - - actual = Index([Interval(0, 1), Interval(1, 2)]) - self.assertIsInstance(actual, IntervalIndex) - self.assertTrue(expected.equals(actual)) - - actual = Index(expected) - self.assertIsInstance(actual, IntervalIndex) - self.assertTrue(expected.equals(actual)) - - # no point in nesting periods in an IntervalIndex - # self.assertRaises(ValueError, IntervalIndex.from_breaks, - # pd.period_range('2000-01-01', periods=3)) - - def test_properties(self): - self.assertEqual(len(self.index), 2) - self.assertEqual(self.index.size, 2) - - self.assert_numpy_array_equal(self.index.left, [0, 1]) - self.assertIsInstance(self.index.left, Index) - - self.assert_numpy_array_equal(self.index.right, [1, 2]) - self.assertIsInstance(self.index.right, Index) - - self.assert_numpy_array_equal(self.index.mid, [0.5, 1.5]) - self.assertIsInstance(self.index.mid, Index) - - self.assertEqual(self.index.closed, 'right') - - expected = np.array([Interval(0, 1), Interval(1, 2)], dtype=object) - self.assert_numpy_array_equal(np.asarray(self.index), expected) - self.assert_numpy_array_equal(self.index.values, expected) - - def test_copy(self): - actual = self.index.copy() - self.assertTrue(actual.equals(self.index)) - - actual = self.index.copy(deep=True) - self.assertTrue(actual.equals(self.index)) - self.assertIsNot(actual.left, self.index.left) - - def test_delete(self): - expected = IntervalIndex.from_breaks([1, 2]) - actual = self.index.delete(0) - self.assertTrue(expected.equals(actual)) - - def test_insert(self): - expected = IntervalIndex.from_breaks(range(4)) - actual = self.index.insert(2, Interval(2, 3)) - self.assertTrue(expected.equals(actual)) - - self.assertRaises(ValueError, self.index.insert, 0, 1) - self.assertRaises(ValueError, self.index.insert, 0, - Interval(2, 3, closed='left')) - - def test_take(self): - actual = self.index.take([0, 1]) - self.assertTrue(self.index.equals(actual)) - - expected = IntervalIndex([0, 0, 1], [1, 1, 2]) - actual = self.index.take([0, 0, 1]) - self.assertTrue(expected.equals(actual)) - - def test_monotonic_and_unique(self): - self.assertTrue(self.index.is_monotonic) - self.assertTrue(self.index.is_unique) - - idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)]) - self.assertTrue(idx.is_monotonic) - self.assertTrue(idx.is_unique) - - idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (1, 2)]) - self.assertFalse(idx.is_monotonic) - self.assertTrue(idx.is_unique) - - idx = IntervalIndex.from_tuples([(0, 2), (0, 2)]) - self.assertFalse(idx.is_unique) - self.assertTrue(idx.is_monotonic) - - def test_repr(self): - expected = ("IntervalIndex(left=[0, 1],\n right=[1, 2]," - "\n closed='right')") - IntervalIndex((0, 1), (1, 2), closed='right') - self.assertEqual(repr(self.index), expected) - - def 
test_get_loc_value(self): - self.assertRaises(KeyError, self.index.get_loc, 0) - self.assertEqual(self.index.get_loc(0.5), 0) - self.assertEqual(self.index.get_loc(1), 0) - self.assertEqual(self.index.get_loc(1.5), 1) - self.assertEqual(self.index.get_loc(2), 1) - self.assertRaises(KeyError, self.index.get_loc, -1) - self.assertRaises(KeyError, self.index.get_loc, 3) - - idx = IntervalIndex.from_tuples([(0, 2), (1, 3)]) - self.assertEqual(idx.get_loc(0.5), 0) - self.assertEqual(idx.get_loc(1), 0) - self.assert_numpy_array_equal(idx.get_loc(1.5), [0, 1]) - self.assert_numpy_array_equal(np.sort(idx.get_loc(2)), [0, 1]) - self.assertEqual(idx.get_loc(3), 1) - self.assertRaises(KeyError, idx.get_loc, 3.5) - - idx = IntervalIndex([0, 2], [1, 3]) - self.assertRaises(KeyError, idx.get_loc, 1.5) - - def slice_locs_cases(self, breaks): - # TODO: same tests for more index types - index = IntervalIndex.from_breaks([0, 1, 2], closed='right') - self.assertEqual(index.slice_locs(), (0, 2)) - self.assertEqual(index.slice_locs(0, 1), (0, 1)) - self.assertEqual(index.slice_locs(1, 1), (0, 1)) - self.assertEqual(index.slice_locs(0, 2), (0, 2)) - self.assertEqual(index.slice_locs(0.5, 1.5), (0, 2)) - self.assertEqual(index.slice_locs(0, 0.5), (0, 1)) - self.assertEqual(index.slice_locs(start=1), (0, 2)) - self.assertEqual(index.slice_locs(start=1.2), (1, 2)) - self.assertEqual(index.slice_locs(end=1), (0, 1)) - self.assertEqual(index.slice_locs(end=1.1), (0, 2)) - self.assertEqual(index.slice_locs(end=1.0), (0, 1)) - self.assertEqual(*index.slice_locs(-1, -1)) - - index = IntervalIndex.from_breaks([0, 1, 2], closed='neither') - self.assertEqual(index.slice_locs(0, 1), (0, 1)) - self.assertEqual(index.slice_locs(0, 2), (0, 2)) - self.assertEqual(index.slice_locs(0.5, 1.5), (0, 2)) - self.assertEqual(index.slice_locs(1, 1), (1, 1)) - self.assertEqual(index.slice_locs(1, 2), (1, 2)) - - index = IntervalIndex.from_breaks([0, 1, 2], closed='both') - self.assertEqual(index.slice_locs(1, 1), (0, 2)) - self.assertEqual(index.slice_locs(1, 2), (0, 2)) - - def test_slice_locs_int64(self): - self.slice_locs_cases([0, 1, 2]) - - def test_slice_locs_float64(self): - self.slice_locs_cases([0.0, 1.0, 2.0]) - - def slice_locs_decreasing_cases(self, tuples): - index = IntervalIndex.from_tuples(tuples) - self.assertEqual(index.slice_locs(1.5, 0.5), (1, 3)) - self.assertEqual(index.slice_locs(2, 0), (1, 3)) - self.assertEqual(index.slice_locs(2, 1), (1, 3)) - self.assertEqual(index.slice_locs(3, 1.1), (0, 3)) - self.assertEqual(index.slice_locs(3, 3), (0, 2)) - self.assertEqual(index.slice_locs(3.5, 3.3), (0, 1)) - self.assertEqual(index.slice_locs(1, -3), (2, 3)) - self.assertEqual(*index.slice_locs(-1, -1)) - - def test_slice_locs_decreasing_int64(self): - self.slice_locs_cases([(2, 4), (1, 3), (0, 2)]) - - def test_slice_locs_decreasing_float64(self): - self.slice_locs_cases([(2., 4.), (1., 3.), (0., 2.)]) - - def test_slice_locs_fails(self): - index = IntervalIndex.from_tuples([(1, 2), (0, 1), (2, 3)]) - with self.assertRaises(KeyError): - index.slice_locs(1, 2) - - def test_get_loc_interval(self): - self.assertEqual(self.index.get_loc(Interval(0, 1)), 0) - self.assertEqual(self.index.get_loc(Interval(0, 0.5)), 0) - self.assertEqual(self.index.get_loc(Interval(0, 1, 'left')), 0) - self.assertRaises(KeyError, self.index.get_loc, Interval(2, 3)) - self.assertRaises(KeyError, self.index.get_loc, Interval(-1, 0, 'left')) - - def test_get_indexer(self): - actual = self.index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) - expected = 
[-1, -1, 0, 0, 1, 1, -1] - self.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(self.index) - expected = [0, 1] - self.assert_numpy_array_equal(actual, expected) - - index = IntervalIndex.from_breaks([0, 1, 2], closed='left') - actual = index.get_indexer([-1, 0, 0.5, 1, 1.5, 2, 3]) - expected = [-1, 0, 0, 1, 1, -1, -1] - self.assert_numpy_array_equal(actual, expected) - - actual = self.index.get_indexer(index[:1]) - expected = [0] - self.assert_numpy_array_equal(actual, expected) - - self.assertRaises(ValueError, self.index.get_indexer, index) - - def test_get_indexer_subintervals(self): - # return indexers for wholly contained subintervals - target = IntervalIndex.from_breaks(np.linspace(0, 2, 5)) - actual = self.index.get_indexer(target) - expected = [0, 0, 1, 1] - self.assert_numpy_array_equal(actual, expected) - - target = IntervalIndex.from_breaks([0, 0.67, 1.33, 2]) - self.assertRaises(ValueError, self.index.get_indexer, target) - - actual = self.index.get_indexer(target[[0, -1]]) - expected = [0, 1] - self.assert_numpy_array_equal(actual, expected) - - target = IntervalIndex.from_breaks([0, 0.33, 0.67, 1], closed='left') - actual = self.index.get_indexer(target) - expected = [0, 0, 0] - self.assert_numpy_array_equal(actual, expected) - - def test_contains(self): - self.assertNotIn(0, self.index) - self.assertIn(0.5, self.index) - self.assertIn(2, self.index) - - self.assertIn(Interval(0, 1), self.index) - self.assertIn(Interval(0, 2), self.index) - self.assertIn(Interval(0, 0.5), self.index) - self.assertNotIn(Interval(3, 5), self.index) - self.assertNotIn(Interval(-1, 0, closed='left'), self.index) - - def test_non_contiguous(self): - index = IntervalIndex.from_tuples([(0, 1), (2, 3)]) - target = [0.5, 1.5, 2.5] - actual = index.get_indexer(target) - expected = [0, -1, 1] - self.assert_numpy_array_equal(actual, expected) - - self.assertNotIn(1.5, index) - - def test_union(self): - other = IntervalIndex([2], [3]) - expected = IntervalIndex(range(3), range(1, 4)) - actual = self.index.union(other) - self.assertTrue(expected.equals(actual)) - - actual = other.union(self.index) - self.assertTrue(expected.equals(actual)) - - self.assert_numpy_array_equal(self.index.union(self.index), self.index) - self.assert_numpy_array_equal(self.index.union(self.index[:1]), - self.index) - - def test_intersection(self): - other = IntervalIndex.from_breaks([1, 2, 3]) - expected = IntervalIndex.from_breaks([1, 2]) - actual = self.index.intersection(other) - self.assertTrue(expected.equals(actual)) - - self.assert_numpy_array_equal(self.index.intersection(self.index), - self.index) - - def test_difference(self): - self.assert_numpy_array_equal(self.index.difference(self.index[:1]), - self.index[1:]) - - def test_sym_diff(self): - self.assert_numpy_array_equal(self.index[:1].sym_diff(self.index[1:]), - self.index) - - def test_set_operation_errors(self): - self.assertRaises(ValueError, self.index.union, self.index.left) - - other = IntervalIndex.from_breaks([0, 1, 2], closed='neither') - self.assertRaises(ValueError, self.index.union, other) - - def test_isin(self): - actual = self.index.isin(self.index) - self.assert_numpy_array_equal([True, True], actual) - - actual = self.index.isin(self.index[:1]) - self.assert_numpy_array_equal([True, False], actual) - - def test_comparison(self): - actual = Interval(0, 1) < self.index - expected = [False, True] - self.assert_numpy_array_equal(actual, expected) - - actual = Interval(0.5, 1.5) < self.index - expected = [False, True] - 
self.assert_numpy_array_equal(actual, expected) - actual = self.index > Interval(0.5, 1.5) - self.assert_numpy_array_equal(actual, expected) - - actual = self.index == self.index - expected = [True, True] - self.assert_numpy_array_equal(actual, expected) - actual = self.index <= self.index - self.assert_numpy_array_equal(actual, expected) - actual = self.index >= self.index - self.assert_numpy_array_equal(actual, expected) - - actual = self.index < self.index - expected = [False, False] - self.assert_numpy_array_equal(actual, expected) - actual = self.index > self.index - self.assert_numpy_array_equal(actual, expected) - - actual = self.index == IntervalIndex.from_breaks([0, 1, 2], 'left') - self.assert_numpy_array_equal(actual, expected) - - actual = self.index == self.index.values - self.assert_numpy_array_equal(actual, [True, True]) - actual = self.index.values == self.index - self.assert_numpy_array_equal(actual, [True, True]) - actual = self.index <= self.index.values - self.assert_numpy_array_equal(actual, [True, True]) - actual = self.index != self.index.values - self.assert_numpy_array_equal(actual, [False, False]) - actual = self.index > self.index.values - self.assert_numpy_array_equal(actual, [False, False]) - actual = self.index.values > self.index - self.assert_numpy_array_equal(actual, [False, False]) - - # invalid comparisons - actual = self.index == 0 - self.assert_numpy_array_equal(actual, [False, False]) - actual = self.index == self.index.left - self.assert_numpy_array_equal(actual, [False, False]) - - with self.assertRaisesRegexp(TypeError, 'unorderable types'): - self.index > 0 - with self.assertRaisesRegexp(TypeError, 'unorderable types'): - self.index <= 0 - with self.assertRaises(TypeError): - self.index > np.arange(2) - with self.assertRaises(ValueError): - self.index > np.arange(3) - - def test_missing_values(self): - idx = pd.Index([np.nan, pd.Interval(0, 1), pd.Interval(1, 2)]) - idx2 = pd.IntervalIndex([np.nan, 0, 1], [np.nan, 1, 2]) - assert idx.equals(idx2) - - with tm.assertRaisesRegexp(ValueError, 'both left and right sides'): - pd.IntervalIndex([np.nan, 0, 1], [0, 1, 2]) - - self.assert_numpy_array_equal(pd.isnull(idx), [True, False, False]) - - def test_order(self): - expected = IntervalIndex.from_breaks([1, 2, 3, 4]) - actual = IntervalIndex.from_tuples([(3, 4), (1, 2), (2, 3)]).order() - self.assert_numpy_array_equal(expected, actual) - - def test_datetime(self): - dates = pd.date_range('2000', periods=3) - idx = IntervalIndex.from_breaks(dates) - - self.assert_numpy_array_equal(idx.left, dates[:2]) - self.assert_numpy_array_equal(idx.right, dates[-2:]) - - expected = pd.date_range('2000-01-01T12:00', periods=2) - self.assert_numpy_array_equal(idx.mid, expected) - - self.assertIn('2000-01-01T12', idx) - - target = pd.date_range('1999-12-31T12:00', periods=7, freq='12H') - actual = idx.get_indexer(target) - expected = [-1, -1, 0, 0, 1, 1, -1] - self.assert_numpy_array_equal(actual, expected) - - # def test_math(self): - # # add, subtract, multiply, divide with scalars should be OK - # actual = 2 * self.index + 1 - # expected = IntervalIndex.from_breaks((2 * np.arange(3) + 1)) - # self.assertTrue(expected.equals(actual)) - - # actual = self.index / 2.0 - 1 - # expected = IntervalIndex.from_breaks((np.arange(3) / 2.0 - 1)) - # self.assertTrue(expected.equals(actual)) - - # with self.assertRaises(TypeError): - # # doesn't make sense to add two IntervalIndex objects - # self.index + self.index - - # def test_datetime_math(self): - - # expected = 
IntervalIndex(pd.date_range('2000-01-02', periods=3)) - # actual = idx + pd.to_timedelta(1, unit='D') - # self.assertTrue(expected.equals(actual)) - - # TODO: other set operations (left join, right join, intersection), - # set operations with conflicting IntervalIndex objects or other dtypes, - # groupby, cut, reset_index... diff --git a/pandas/tests/tools/test_tile.py b/pandas/tests/tools/test_tile.py index 2d657c14b73a6..e0a625bbf29c2 100644 --- a/pandas/tests/tools/test_tile.py +++ b/pandas/tests/tools/test_tile.py @@ -3,23 +3,20 @@ import numpy as np from pandas.compat import zip -from pandas import DataFrame, Series, Index, unique, isnull, Categorical +from pandas import (Series, Index, isnull, + to_datetime, DatetimeIndex, Timestamp, + Interval, IntervalIndex, Categorical, + cut, qcut, date_range) import pandas.util.testing as tm -from pandas.util.testing import assertRaisesRegexp -import pandas.core.common as com from pandas.core.algorithms import quantile -from pandas.core.categorical import Categorical -from pandas.core.interval import Interval, IntervalIndex -from pandas.tools.tile import cut, qcut import pandas.tools.tile as tmod -from pandas import to_datetime, DatetimeIndex, Timestamp class TestCut(tm.TestCase): def test_simple(self): - data = np.ones(5) + data = np.ones(5, dtype='int64') result = cut(data, 4, labels=False) expected = np.array([1, 1, 1, 1, 1]) tm.assert_numpy_array_equal(result, expected, @@ -30,29 +27,37 @@ def test_bins(self): result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 1, 2, 0])) - tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + expected = intervals.take([0, 0, 0, 1, 2, 0]).astype('category') + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, + 6.53333333, 9.7])) def test_right(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 2, 3, 0, 0])) - tm.assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7]) + expected = intervals.astype('category').take([0, 0, 0, 2, 3, 0, 0]) + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, + 7.325, 9.7])) def test_noright(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=False, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3), closed='left') - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 2, 3, 0, 1])) - tm.assert_almost_equal(bins, [0.2, 2.575, 4.95, 7.325, 9.7095]) + expected = intervals.take([0, 0, 0, 2, 3, 0, 1]).astype('category') + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, + 7.325, 9.7095])) def test_arraylike(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - tm.assert_numpy_array_equal(result, intervals.take([0, 0, 0, 1, 2, 0])) - tm.assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) + expected = intervals.take([0, 0, 0, 1, 2, 0]).astype('category') + tm.assert_categorical_equal(result, expected) + tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, + 6.53333333, 9.7])) def test_bins_not_monotonic(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] @@ 
-81,12 +86,12 @@ def test_labels(self): result, bins = cut(arr, 4, retbins=True) ex_levels = IntervalIndex.from_breaks([-1e-3, 0.25, 0.5, 0.75, 1]) - self.assert_numpy_array_equal(unique(result), ex_levels) + tm.assert_index_equal(result.categories, ex_levels) result, bins = cut(arr, 4, retbins=True, right=False) ex_levels = IntervalIndex.from_breaks([0, 0.25, 0.5, 0.75, 1 + 1e-3], closed='left') - self.assert_numpy_array_equal(unique(result), ex_levels) + tm.assert_index_equal(result.categories, ex_levels) def test_cut_pass_series_name_to_factor(self): s = Series(np.random.randn(100), name='foo') @@ -98,8 +103,9 @@ def test_label_precision(self): arr = np.arange(0, 0.73, 0.01) result = cut(arr, 4, precision=2) - ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72]) - self.assert_numpy_array_equal(unique(result), ex_levels) + ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, + 0.54, 0.72]) + tm.assert_index_equal(result.categories, ex_levels) def test_na_handling(self): arr = np.arange(0, 0.75, 0.01) @@ -109,12 +115,12 @@ def test_na_handling(self): result_arr = np.asarray(result) - ex_arr = np.where(com.isnull(arr), np.nan, result_arr) + ex_arr = np.where(isnull(arr), np.nan, result_arr) tm.assert_almost_equal(result_arr, ex_arr) result = cut(arr, 4, labels=False) - ex_result = np.where(com.isnull(arr), np.nan, result) + ex_result = np.where(isnull(arr), np.nan, result) tm.assert_almost_equal(result, ex_result) def test_inf_handling(self): @@ -125,8 +131,8 @@ def test_inf_handling(self): result = cut(data, bins) result_ser = cut(data_ser, bins) - ex_uniques = IntervalIndex.from_breaks(bins).values - tm.assert_numpy_array_equal(unique(result), ex_uniques) + ex_uniques = IntervalIndex.from_breaks(bins) + tm.assert_index_equal(result.categories, ex_uniques) self.assertEqual(result[5], Interval(4, np.inf)) self.assertEqual(result[0], Interval(-np.inf, 2)) self.assertEqual(result_ser[5], Interval(4, np.inf)) @@ -135,12 +141,17 @@ def test_qcut(self): arr = np.random.randn(1000) + # we store the bins as Index that have been rounded, + # so comparisons are a bit tricky labels, bins = qcut(arr, 4, retbins=True) ex_bins = quantile(arr, [0, .25, .5, .75, 1.]) - tm.assert_almost_equal(bins, ex_bins) + result = labels.categories.left.values + self.assertTrue(np.allclose(result, ex_bins[:-1], atol=1e-2)) + result = labels.categories.right.values + self.assertTrue(np.allclose(result, ex_bins[1:], atol=1e-2)) ex_levels = cut(arr, ex_bins, include_lowest=True) - self.assert_categorical_equal(labels, ex_levels) + tm.assert_categorical_equal(labels, ex_levels) def test_qcut_bounds(self): arr = np.random.randn(1000) @@ -153,11 +164,11 @@ def test_qcut_specify_quantiles(self): factor = qcut(arr, [0, .25, .5, .75, 1.]) expected = qcut(arr, 4) - self.assert_numpy_array_equal(factor, expected) + tm.assert_categorical_equal(factor, expected) def test_qcut_all_bins_same(self): - assertRaisesRegexp(ValueError, "edges.*unique", qcut, - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3) + tm.assertRaisesRegexp(ValueError, "edges.*unique", qcut, + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3) def test_cut_out_of_bounds(self): arr = np.random.randn(100) @@ -174,36 +185,39 @@ def test_cut_pass_labels(self): labels = ['Small', 'Medium', 'Large'] result = cut(arr, bins, labels=labels) - exp = ['Medium'] + 4 * ['Small'] + ['Medium', 'Large'] - self.assert_numpy_array_equal(result, exp) + exp = Categorical(['Medium'] + 4 * ['Small'] + ['Medium', 'Large'], + ordered=True) +
self.assert_categorical_equal(result, exp) - result = cut(arr, bins, labels=Categorical.from_codes([0, 1, 2], labels)) + result = cut(arr, bins, labels=Categorical.from_codes([0, 1, 2], + labels)) exp = Categorical.from_codes([1] + 4 * [0] + [1, 2], labels) - self.assertTrue(result.equals(exp)) + self.assert_categorical_equal(result, exp) def test_qcut_include_lowest(self): values = np.arange(10) - cats = qcut(values, 4) + ii = qcut(values, 4) - ex_levels = [Interval(0, 2.25, closed='both'), Interval(2.25, 4.5), - Interval(4.5, 6.75), Interval(6.75, 9)] - self.assert_numpy_array_equal(unique(cats), ex_levels) + ex_levels = IntervalIndex.from_intervals( + [Interval(-0.001, 2.25), + Interval(2.25, 4.5), + Interval(4.5, 6.75), + Interval(6.75, 9)]) + tm.assert_index_equal(ii.categories, ex_levels) def test_qcut_nas(self): arr = np.random.randn(100) arr[:20] = np.nan result = qcut(arr, 4) - self.assertTrue(com.isnull(result[:20]).all()) + self.assertTrue(isnull(result[:20]).all()) def test_qcut_index(self): - # the result is closed on a different side for the first interval, but - # we should still be able to make an index result = qcut([0, 2], 2) - index = Index(result) - expected = Index([Interval(0, 1, closed='both'), Interval(1, 2)]) - self.assert_numpy_array_equal(index, expected) + expected = Index([Interval(-0.001, 1), Interval(1, 2)]).astype( + 'category') + self.assert_categorical_equal(result, expected) def test_round_frac(self): # it works @@ -247,41 +261,46 @@ def test_qcut_binning_issues(self): self.assertTrue(ep <= sn) def test_cut_return_intervals(self): - s = Series([0,1,2,3,4,5,6,7,8]) - res = cut(s,3) + s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + res = cut(s, 3) exp_bins = np.linspace(0, 8, num=4).round(3) exp_bins[0] -= 0.008 - exp = Series(IntervalIndex.from_breaks(exp_bins).take([0,0,0,1,1,1,2,2,2])) + exp = Series(IntervalIndex.from_breaks(exp_bins, closed='right').take( + [0, 0, 0, 1, 1, 1, 2, 2, 2])).astype('category', ordered=True) tm.assert_series_equal(res, exp) def test_qcut_return_intervals(self): - s = Series([0,1,2,3,4,5,6,7,8]) - res = qcut(s,[0,0.333,0.666,1]) - exp_levels = np.array([Interval(0, 2.664, closed='both'), + s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8]) + res = qcut(s, [0, 0.333, 0.666, 1]) + exp_levels = np.array([Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]) - exp = Series(exp_levels.take([0,0,0,1,1,1,2,2,2])) + exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype( + 'category', ordered=True) tm.assert_series_equal(res, exp) def test_series_retbins(self): # GH 8589 s = Series(np.arange(4)) result, bins = cut(s, 2, retbins=True) - tm.assert_numpy_array_equal(result.cat.codes.values, - np.array([0, 0, 1, 1], dtype=np.int8)) - tm.assert_numpy_array_equal(bins, np.array([-0.003, 1.5, 3])) + expected = Series(IntervalIndex.from_breaks( + [-0.003, 1.5, 3], closed='right').repeat(2)).astype('category', + ordered=True) + tm.assert_series_equal(result, expected) result, bins = qcut(s, 2, retbins=True) - tm.assert_numpy_array_equal(result.cat.codes.values, - np.array([0, 0, 1, 1], dtype=np.int8)) - tm.assert_numpy_array_equal(bins, np.array([0, 1.5, 3])) + expected = Series(IntervalIndex.from_breaks( + [-0.001, 1.5, 3], closed='right').repeat(2)).astype('category', + ordered=True) + tm.assert_series_equal(result, expected) def test_qcut_duplicates_bin(self): # GH 7751 values = [0, 0, 0, 0, 1, 2, 3] - result_levels = ['[0, 1]', '(1, 3]'] + expected = IntervalIndex.from_intervals([Interval(-0.001, 1), + Interval(1, 3)]) - cats = 
qcut(values, 3, duplicates='drop') - self.assertTrue((cats.categories == result_levels).all()) + result = qcut(values, 3, duplicates='drop') + tm.assert_index_equal(result.categories, expected) self.assertRaises(ValueError, qcut, values, 3) self.assertRaises(ValueError, qcut, values, 3, duplicates='raise') @@ -297,51 +316,57 @@ def test_single_quantile(self): result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0, 0], ["[9, 9]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) + intervals = IntervalIndex([Interval(8.999, 9.0), + Interval(8.999, 9.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) s = Series([-9., -9.]) + expected = Series([0, 0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0, 0], ["[-9, -9]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) + intervals = IntervalIndex([Interval(-9.001, -9.0), + Interval(-9.001, -9.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) s = Series([0., 0.]) + expected = Series([0, 0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0, 0], ["[0, 0]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) - - expected = Series([0]) + intervals = IntervalIndex([Interval(-0.001, 0.0), + Interval(-0.001, 0.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) s = Series([9]) + expected = Series([0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0], ["[9, 9]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) + intervals = IntervalIndex([Interval(8.999, 9.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) s = Series([-9]) + expected = Series([0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0], ["[-9, -9]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) + intervals = IntervalIndex([Interval(-9.001, -9.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) s = Series([0]) + expected = Series([0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) - exp_lab = Series(Categorical.from_codes([0], ["[0, 0]"], - ordered=True)) - tm.assert_series_equal(result, exp_lab) + intervals = IntervalIndex([Interval(-0.001, 0.0)], closed='right') + expected = Series(intervals).astype('category', ordered=True) + tm.assert_series_equal(result, expected) def test_single_bin(self): # issue 14652 @@ -382,11 +407,18 @@ def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) + result, bins = cut(data, 3, retbins=True) - expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]', - '(2013-01-01 16:00:00, 2013-01-02 08:00:00]', - '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'], - ).astype("category", ordered=True) + expected = ( + 
Series(IntervalIndex.from_intervals([ + Interval(Timestamp('2012-12-31 23:57:07.200000'), + Timestamp('2013-01-01 16:00:00')), + Interval(Timestamp('2013-01-01 16:00:00'), + Timestamp('2013-01-02 08:00:00')), + Interval(Timestamp('2013-01-02 08:00:00'), + Timestamp('2013-01-03 00:00:00'))])) + .astype('category', ordered=True)) + tm.assert_series_equal(result, expected) # testing for time data to be present as list @@ -410,9 +442,11 @@ def test_datetime_cut(self): def test_datetime_bin(self): data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')] bin_data = ['2012-12-12', '2012-12-14', '2012-12-16'] - expected = Series(['(2012-12-12 00:00:00, 2012-12-14 00:00:00]', - '(2012-12-14 00:00:00, 2012-12-16 00:00:00]'], - ).astype("category", ordered=True) + expected = ( + Series(IntervalIndex.from_intervals([ + Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])), + Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))])) + .astype('category', ordered=True)) for conv in [Timestamp, Timestamp, np.datetime64]: bins = [conv(v) for v in bin_data] @@ -427,13 +461,19 @@ def test_datetime_bin(self): result = cut(data, bins=bin_pydatetime) tm.assert_series_equal(Series(result), expected) - result, bins = cut(s, 2, retbins=True, labels=[0, 1]) - tm.assert_numpy_array_equal(result, [0, 0, 1, 1]) - tm.assert_almost_equal(bins, [-0.003, 1.5, 3]) + def test_datetime_nan(self): + + def f(): + cut(date_range('20130101', periods=3), bins=[0, 2, 4]) + self.assertRaises(ValueError, f) - result, bins = qcut(s, 2, retbins=True, labels=[0, 1]) - tm.assert_numpy_array_equal(result, [0, 0, 1, 1]) - tm.assert_almost_equal(bins, [0, 1.5, 3]) + result = cut(date_range('20130102', periods=5), + bins=date_range('20130101', periods=2)) + mask = result.categories.isnull() + self.assert_numpy_array_equal(mask, np.array([False])) + mask = result.isnull() + self.assert_numpy_array_equal( + mask, np.array([False, True, True, True, True])) def curpath(): diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index e7b2edeb57714..79d9fd84396e7 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -3,14 +3,15 @@ import numpy as np import pandas as pd -from pandas import Series, Categorical, date_range +from pandas import Series, Categorical, IntervalIndex, date_range -from pandas.types.dtypes import DatetimeTZDtype, PeriodDtype, CategoricalDtype +from pandas.types.dtypes import (DatetimeTZDtype, PeriodDtype, + IntervalDtype, CategoricalDtype) from pandas.types.common import (is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, - is_datetime64_dtype, + is_datetime64_dtype, is_interval_dtype, is_datetime64_any_dtype, is_string_dtype, _coerce_to_dtype) import pandas.util.testing as tm @@ -351,3 +352,114 @@ def test_empty(self): def test_not_string(self): # though PeriodDtype has object kind, it cannot be string self.assertFalse(is_string_dtype(PeriodDtype('D'))) + + +class TestIntervalDtype(Base, tm.TestCase): + + # TODO: placeholder + def setUp(self): + self.dtype = IntervalDtype('int64') + + def test_construction(self): + with tm.assertRaises(ValueError): + IntervalDtype('xx') + + for s in ['interval[int64]', 'Interval[int64]', 'int64']: + i = IntervalDtype(s) + self.assertEqual(i.subtype, np.dtype('int64')) + self.assertTrue(is_interval_dtype(i)) + + def test_construction_generic(self): + # generic + i = IntervalDtype('interval') + self.assertIs(i.subtype, None) 
+ self.assertTrue(is_interval_dtype(i)) + self.assertTrue(str(i) == 'interval') + + i = IntervalDtype() + self.assertIs(i.subtype, None) + self.assertTrue(is_interval_dtype(i)) + self.assertTrue(str(i) == 'interval') + + def test_subclass(self): + a = IntervalDtype('interval[int64]') + b = IntervalDtype('interval[int64]') + + self.assertTrue(issubclass(type(a), type(a))) + self.assertTrue(issubclass(type(a), type(b))) + + def test_is_dtype(self): + self.assertTrue(IntervalDtype.is_dtype(self.dtype)) + self.assertTrue(IntervalDtype.is_dtype('interval')) + self.assertTrue(IntervalDtype.is_dtype(IntervalDtype('float64'))) + self.assertTrue(IntervalDtype.is_dtype(IntervalDtype('int64'))) + self.assertTrue(IntervalDtype.is_dtype(IntervalDtype(np.int64))) + + self.assertFalse(IntervalDtype.is_dtype('D')) + self.assertFalse(IntervalDtype.is_dtype('3D')) + self.assertFalse(IntervalDtype.is_dtype('U')) + self.assertFalse(IntervalDtype.is_dtype('S')) + self.assertFalse(IntervalDtype.is_dtype('foo')) + self.assertFalse(IntervalDtype.is_dtype(np.object_)) + self.assertFalse(IntervalDtype.is_dtype(np.int64)) + self.assertFalse(IntervalDtype.is_dtype(np.float64)) + + def test_identity(self): + self.assertEqual(IntervalDtype('interval[int64]'), + IntervalDtype('interval[int64]')) + + def test_coerce_to_dtype(self): + self.assertEqual(_coerce_to_dtype('interval[int64]'), + IntervalDtype('interval[int64]')) + + def test_construction_from_string(self): + result = IntervalDtype('interval[int64]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + result = IntervalDtype.construct_from_string('interval[int64]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + with tm.assertRaises(TypeError): + IntervalDtype.construct_from_string('foo') + with tm.assertRaises(TypeError): + IntervalDtype.construct_from_string('interval[foo]') + with tm.assertRaises(TypeError): + IntervalDtype.construct_from_string('foo[int64]') + + def test_equality(self): + self.assertTrue(is_dtype_equal(self.dtype, 'interval[int64]')) + self.assertTrue(is_dtype_equal(self.dtype, IntervalDtype('int64'))) + self.assertTrue(is_dtype_equal(self.dtype, IntervalDtype('int64'))) + self.assertTrue(is_dtype_equal(IntervalDtype('int64'), + IntervalDtype('int64'))) + + self.assertFalse(is_dtype_equal(self.dtype, 'int64')) + self.assertFalse(is_dtype_equal(IntervalDtype('int64'), + IntervalDtype('float64'))) + + def test_basic(self): + self.assertTrue(is_interval_dtype(self.dtype)) + + ii = IntervalIndex.from_breaks(range(3)) + + self.assertTrue(is_interval_dtype(ii.dtype)) + self.assertTrue(is_interval_dtype(ii)) + + s = Series(ii, name='A') + + # dtypes + # series results in object dtype currently, + self.assertFalse(is_interval_dtype(s.dtype)) + self.assertFalse(is_interval_dtype(s)) + + def test_basic_dtype(self): + self.assertTrue(is_interval_dtype('interval[int64]')) + self.assertTrue(is_interval_dtype(IntervalIndex.from_tuples([(0, 1)]))) + self.assertTrue(is_interval_dtype + (IntervalIndex.from_breaks(np.arange(4)))) + self.assertTrue(is_interval_dtype( + IntervalIndex.from_breaks(date_range('20130101', periods=3)))) + self.assertFalse(is_interval_dtype('U')) + self.assertFalse(is_interval_dtype('S')) + self.assertFalse(is_interval_dtype('foo')) + self.assertFalse(is_interval_dtype(np.object_)) + self.assertFalse(is_interval_dtype(np.int64)) + self.assertFalse(is_interval_dtype(np.float64)) diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py index efd6dda02ab4b..31bf2817c8bab 100644 --- 
a/pandas/tests/types/test_missing.py +++ b/pandas/tests/types/test_missing.py @@ -55,6 +55,14 @@ def test_0d_array(self): self.assertFalse(isnull(np.array(0.0, dtype=object))) self.assertFalse(isnull(np.array(0, dtype=object))) + def test_empty_object(self): + + for shape in [(4, 0), (4,)]: + arr = np.empty(shape=shape, dtype=object) + result = isnull(arr) + expected = np.ones(shape=shape, dtype=bool) + tm.assert_numpy_array_equal(result, expected) + def test_isnull(self): self.assertFalse(isnull(1.)) self.assertTrue(isnull(None)) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index f1ca7ff4b19ba..c6b1ee417c64d 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -3,24 +3,22 @@ """ from pandas.types.missing import isnull -from pandas.types.common import (is_float, is_integer, - is_scalar, _ensure_int64) +from pandas.types.common import (is_integer, + is_scalar, + is_categorical_dtype, + is_datetime64_dtype, + is_timedelta64_dtype, + _ensure_int64) -from pandas.core.api import Series -from pandas.core.categorical import Categorical -from pandas.core.index import _ensure_index -from pandas.core.interval import IntervalIndex, Interval import pandas.core.algorithms as algos import pandas.core.nanops as nanops -from pandas.compat import zip -from pandas import to_timedelta, to_datetime -from pandas.types.common import is_datetime64_dtype, is_timedelta64_dtype from pandas._libs.lib import infer_dtype +from pandas import (to_timedelta, to_datetime, + Categorical, Timestamp, Timedelta, + Series, Interval, IntervalIndex) import numpy as np -import warnings - def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): @@ -97,7 +95,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") - # TODO: IntervalIndex try: # for array-like sz = x.size except AttributeError: @@ -124,13 +121,14 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, else: bins = np.asarray(bins) - bins = _convert_bin_to_numeric_type(bins) + bins = _convert_bin_to_numeric_type(bins, dtype) if (np.diff(bins) < 0).any(): raise ValueError('bins must increase monotonically.') fac, bins = _bins_to_cuts(x, bins, right=right, labels=labels, precision=precision, - include_lowest=include_lowest, dtype=dtype) + include_lowest=include_lowest, + dtype=dtype) return _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name) @@ -154,8 +152,8 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): the resulting bins. If False, return only integer indicators of the bins. retbins : bool, optional - Whether to return the bins or not. Can be useful if bins is given - as a scalar. + Whether to return the (bins, labels) or not. Can be useful if bins + is given as a scalar. 
precision : int, optional The precision at which to store and display the bins labels duplicates : {default 'raise', 'drop'}, optional @@ -232,42 +230,18 @@ def _bins_to_cuts(x, bins, right=True, labels=None, if labels is not False: if labels is None: - - # TODO: IntervalIndex - increases = 0 - while True: - try: - levels = _format_levels(bins, precision, right=right, - include_lowest=include_lowest, - dtype=dtype) - except ValueError: - increases += 1 - precision += 1 - if increases >= 20: - raise - else: - break - - # - #closed = 'right' if right else 'left' - #precision = _infer_precision(precision, bins) - #breaks = [_round_frac(b, precision) for b in bins] - #labels = IntervalIndex.from_breaks(breaks, closed=closed).values - - #if right and include_lowest: - # labels[0] = Interval(labels[0].left, labels[0].right, - # closed='both') - + labels = _format_labels(bins, precision, right=right, + include_lowest=include_lowest, + dtype=dtype) else: if len(labels) != len(bins) - 1: raise ValueError('Bin labels must be one fewer than ' 'the number of bin edges') - - if not com.is_categorical(labels): - labels = np.asarray(labels) + if not is_categorical_dtype(labels): + labels = Categorical(labels, ordered=True) np.putmask(ids, na_mask, 0) - result = com.take_nd(labels, ids - 1) + result = algos.take_nd(labels, ids - 1) else: result = ids - 1 @@ -277,42 +251,6 @@ def _bins_to_cuts(x, bins, right=True, labels=None, return result, bins -def _format_levels(bins, prec, right=True, - include_lowest=False, dtype=None): - fmt = lambda v: _format_label(v, precision=prec, dtype=dtype) - if right: - levels = [] - for a, b in zip(bins, bins[1:]): - fa, fb = fmt(a), fmt(b) - -def _round_frac(x, precision): - """Round the fractional part of the given number - """ - if not np.isfinite(x) or x == 0: - return x - else: - levels = ['[%s, %s)' % (fmt(a), fmt(b)) - for a, b in zip(bins, bins[1:])] - return levels - - -def _format_label(x, precision=3, dtype=None): - fmt_str = '%%.%dg' % precision - - if is_datetime64_dtype(dtype): - return to_datetime(x, unit='ns') - if is_timedelta64_dtype(dtype): - return to_timedelta(x, unit='ns') - if np.isinf(x): - return str(x) - elif is_float(x): - frac, whole = np.modf(x) - if whole == 0: - digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision - else: - digits = precision - return np.around(x, digits) - def _trim_zeros(x): while len(x) > 1 and x[-1] == '0': @@ -340,17 +278,65 @@ def _coerce_to_type(x): return x, dtype -def _convert_bin_to_numeric_type(x): +def _convert_bin_to_numeric_type(bins, dtype): """ if the passed bin is of datetime/timedelta type, this method converts it to integer + + Parameters + ---------- + bins : list-like of bins + dtype : dtype of data + + Raises + ------ + ValueError if bins are not of a dtype compatible with dtype """ - dtype = infer_dtype(x) - if dtype == 'timedelta' or dtype == 'timedelta64': - x = to_timedelta(x).view(np.int64) - elif dtype == 'datetime' or dtype == 'datetime64': - x = to_datetime(x).view(np.int64) - return x + bins_dtype = infer_dtype(bins) + if is_timedelta64_dtype(dtype): + if bins_dtype in ['timedelta', 'timedelta64']: + bins = to_timedelta(bins).view(np.int64) + else: + raise ValueError("bins must be of timedelta64 dtype") + elif is_datetime64_dtype(dtype): + if bins_dtype in ['datetime', 'datetime64']: + bins = to_datetime(bins).view(np.int64) + else: + raise ValueError("bins must be of datetime64 dtype") + + return bins + + +def _format_labels(bins, precision, right=True, + include_lowest=False, dtype=None): +
""" based on the dtype, return our labels """ + + closed = 'right' if right else 'left' + + if is_datetime64_dtype(dtype): + formatter = Timestamp + adjust = lambda x: x - Timedelta('1ns') + elif is_timedelta64_dtype(dtype): + formatter = Timedelta + adjust = lambda x: x - Timedelta('1ns') + else: + precision = _infer_precision(precision, bins) + formatter = lambda x: _round_frac(x, precision) + adjust = lambda x: x - 10 ** (-precision) + + breaks = [formatter(b) for b in bins] + labels = IntervalIndex.from_breaks(breaks, closed=closed) + + if right and include_lowest: + # we will adjust the left hand side by precision to + # account that we are all right closed + v = adjust(labels[0].left) + + i = IntervalIndex.from_intervals( + [Interval(v, labels[0].right, closed='right')]) + labels = i.append(labels[1:]) + + return labels def _preprocess_for_cut(x): @@ -372,7 +358,8 @@ def _preprocess_for_cut(x): return x_is_series, series_index, name, x -def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): +def _postprocess_for_cut(fac, bins, retbins, x_is_series, + series_index, name): """ handles post processing for the cut method where we combine the index information if the originally passed @@ -386,6 +373,22 @@ def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): return fac, bins + +def _round_frac(x, precision): + """ + Round the fractional part of the given number + """ + if not np.isfinite(x) or x == 0: + return x + else: + frac, whole = np.modf(x) + if whole == 0: + digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision + else: + digits = precision + return np.around(x, digits) + + def _infer_precision(base_precision, bins): """Infer an appropriate precision for _round_frac """ diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index ae40c2f66a590..fe7005418b362 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -31,6 +31,9 @@ import pandas.types.concat as _concat import pandas.tseries.frequencies as frequencies +import pandas.indexes.base as ibase +_index_doc_kwargs = dict(ibase._index_doc_kwargs) + class DatelikeOps(object): """ common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex """ @@ -242,6 +245,7 @@ def _box_values(self, values): def _format_with_header(self, header, **kwargs): return header + list(self._format_native_types(**kwargs)) + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): try: res = self.get_loc(key) @@ -249,6 +253,8 @@ def __contains__(self, key): except (KeyError, TypeError, ValueError): return False + _is_contained_in = __contains__ + def __getitem__(self, key): """ This getitem defers to the underlying array, which by-definition can @@ -381,7 +387,7 @@ def sort_values(self, return_indexer=False, ascending=True): return self._simple_new(sorted_values, **attribs) - @Appender(_index_shared_docs['take']) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) @@ -798,7 +804,7 @@ def repeat(self, repeats, *args, **kwargs): return self._shallow_copy(self.asi8.repeat(repeats), freq=freq) - @Appender(_index_shared_docs['where']) + @Appender(_index_shared_docs['where'] % _index_doc_kwargs) def where(self, cond, other=None): other = _ensure_datetimelike_to_i8(other) values = _ensure_datetimelike_to_i8(self) diff --git a/pandas/tseries/interval.py b/pandas/tseries/interval.py deleted file mode 100644 index 
22801318a1853..0000000000000 --- a/pandas/tseries/interval.py +++ /dev/null @@ -1,35 +0,0 @@ - -from pandas.core.index import Index - - -class Interval(object): - """ - Represents an interval of time defined by two timestamps - """ - - def __init__(self, start, end): - self.start = start - self.end = end - - -class PeriodInterval(object): - """ - Represents an interval of time defined by two Period objects (time - ordinals) - """ - - def __init__(self, start, end): - self.start = start - self.end = end - - -class IntervalIndex(Index): - """ - - """ - - def __new__(self, starts, ends): - pass - - def dtype(self): - return self.values.dtype diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 1e1496bbe9c27..30ebc4da459ff 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -347,6 +347,7 @@ def _coerce_scalar_to_index(self, item): """ return PeriodIndex([item], **self._get_attributes_dict()) + @Appender(_index_shared_docs['__contains__']) def __contains__(self, key): if isinstance(key, Period): if key.freq != self.freq: @@ -361,6 +362,8 @@ def __contains__(self, key): return False return False + _is_contained_in = __contains__ + @property def asi8(self): return self._values.view('i8') diff --git a/pandas/types/api.py b/pandas/types/api.py index e78514ce77822..6dbd3dc6b640c 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -10,6 +10,10 @@ is_categorical, is_categorical_dtype, + # interval + is_interval, + is_interval_dtype, + # datetimelike is_datetimetz, is_datetime64_dtype, diff --git a/pandas/types/common.py b/pandas/types/common.py index 7ab2e068ac69f..0b14e484d40a7 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -7,6 +7,7 @@ from .dtypes import (CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType, PeriodDtype, PeriodDtypeType, + IntervalDtype, IntervalDtypeType, ExtensionDtype) from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, @@ -139,6 +140,10 @@ def is_period_dtype(arr_or_dtype): return PeriodDtype.is_dtype(arr_or_dtype) +def is_interval_dtype(arr_or_dtype): + return IntervalDtype.is_dtype(arr_or_dtype) + + def is_categorical_dtype(arr_or_dtype): return CategoricalDtype.is_dtype(arr_or_dtype) @@ -501,6 +506,8 @@ def _coerce_to_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype): dtype = PeriodDtype(dtype) + elif is_interval_dtype(dtype): + dtype = IntervalDtype(dtype) else: dtype = np.dtype(dtype) return dtype @@ -538,6 +545,8 @@ def _get_dtype(arr_or_dtype): return arr_or_dtype elif isinstance(arr_or_dtype, PeriodDtype): return arr_or_dtype + elif isinstance(arr_or_dtype, IntervalDtype): + return arr_or_dtype elif isinstance(arr_or_dtype, string_types): if is_categorical_dtype(arr_or_dtype): return CategoricalDtype.construct_from_string(arr_or_dtype) @@ -545,6 +554,8 @@ def _get_dtype(arr_or_dtype): return DatetimeTZDtype.construct_from_string(arr_or_dtype) elif is_period_dtype(arr_or_dtype): return PeriodDtype.construct_from_string(arr_or_dtype) + elif is_interval_dtype(arr_or_dtype): + return IntervalDtype.construct_from_string(arr_or_dtype) if hasattr(arr_or_dtype, 'dtype'): arr_or_dtype = arr_or_dtype.dtype @@ -575,6 +586,8 @@ def _get_dtype_type(arr_or_dtype): return CategoricalDtypeType elif isinstance(arr_or_dtype, DatetimeTZDtype): return DatetimeTZDtypeType + elif isinstance(arr_or_dtype, IntervalDtype): + return IntervalDtypeType elif isinstance(arr_or_dtype, PeriodDtype): return PeriodDtypeType elif isinstance(arr_or_dtype, 
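The new is_interval_dtype helper added in pandas/types/common.py and exported via pandas/types/api.py (reachable through the public pandas.api.types namespace) accepts an array-like, the IntervalDtype itself, or its string form. A small usage sketch:

    import pandas as pd
    from pandas.api.types import is_interval_dtype

    ii = pd.IntervalIndex.from_breaks(range(4))

    is_interval_dtype(ii)                 # True - array-likes are inspected
    is_interval_dtype(ii.dtype)           # True - the IntervalDtype itself
    is_interval_dtype('interval[int64]')  # True - the string form is parsed
    is_interval_dtype('int64')            # False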
string_types):
@@ -584,6 +597,8 @@ def _get_dtype_type(arr_or_dtype):
             return DatetimeTZDtypeType
         elif is_period_dtype(arr_or_dtype):
             return PeriodDtypeType
+        elif is_interval_dtype(arr_or_dtype):
+            return IntervalDtypeType
         return _get_dtype_type(np.dtype(arr_or_dtype))
     try:
         return arr_or_dtype.dtype.type
@@ -695,6 +710,8 @@ def pandas_dtype(dtype):
         return dtype
     elif isinstance(dtype, CategoricalDtype):
         return dtype
+    elif isinstance(dtype, IntervalDtype):
+        return dtype
     elif isinstance(dtype, string_types):
         try:
             return DatetimeTZDtype.construct_from_string(dtype)
@@ -708,6 +725,12 @@ def pandas_dtype(dtype):
             except TypeError:
                 pass
 
+        elif dtype.startswith('interval[') or dtype.startswith('Interval['):
+            try:
+                return IntervalDtype.construct_from_string(dtype)
+            except TypeError:
+                pass
+
     try:
         return CategoricalDtype.construct_from_string(dtype)
     except TypeError:
diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py
index c3494df93476b..7913950a597c9 100644
--- a/pandas/types/dtypes.py
+++ b/pandas/types/dtypes.py
@@ -367,3 +367,112 @@ def is_dtype(cls, dtype):
         else:
             return False
         return super(PeriodDtype, cls).is_dtype(dtype)
+
+
+class IntervalDtypeType(type):
+    """
+    the type of IntervalDtype, this metaclass determines subclass ability
+    """
+    pass
+
+
+class IntervalDtype(ExtensionDtype):
+    __metaclass__ = IntervalDtypeType
+    """
+    An Interval duck-typed class, suitable for holding an interval
+
+    THIS IS NOT A REAL NUMPY DTYPE
+    """
+    type = IntervalDtypeType
+    kind = None
+    str = '|O08'
+    base = np.dtype('O')
+    num = 103
+    _metadata = ['subtype']
+    _match = re.compile("(I|i)nterval\[(?P<subtype>.+)\]")
+    _cache = {}
+
+    def __new__(cls, subtype=None):
+        """
+        Parameters
+        ----------
+        subtype : the dtype of the Interval
+        """
+
+        if isinstance(subtype, IntervalDtype):
+            return subtype
+        elif subtype is None or (isinstance(subtype, compat.string_types) and
+                                 subtype == 'interval'):
+            subtype = None
+        else:
+            if isinstance(subtype, compat.string_types):
+                m = cls._match.search(subtype)
+                if m is not None:
+                    subtype = m.group('subtype')
+
+            from pandas.types.common import pandas_dtype
+            try:
+                subtype = pandas_dtype(subtype)
+            except TypeError:
+                raise ValueError("could not construct IntervalDtype")
+
+        try:
+            return cls._cache[str(subtype)]
+        except KeyError:
+            u = object.__new__(cls)
+            u.subtype = subtype
+            cls._cache[str(subtype)] = u
+            return u
+
+    @classmethod
+    def construct_from_string(cls, string):
+        """
+        attempt to construct this type from a string, raise a TypeError
+        if it's not possible
+        """
+        if isinstance(string, compat.string_types):
+            try:
+                return cls(string)
+            except ValueError:
+                pass
+        raise TypeError("could not construct IntervalDtype")
+
+    def __unicode__(self):
+        if self.subtype is None:
+            return "interval"
+        return "interval[{subtype}]".format(subtype=self.subtype)
+
+    @property
+    def name(self):
+        return str(self)
+
+    def __hash__(self):
+        # make myself hashable
+        return hash(str(self))
+
+    def __eq__(self, other):
+        if isinstance(other, compat.string_types):
+            return other == self.name or other == self.name.title()
+
+        return (isinstance(other, IntervalDtype) and
+                self.subtype == other.subtype)
+
+    @classmethod
+    def is_dtype(cls, dtype):
+        """
+        Return a boolean if the passed type is an actual dtype that we
+        can match (via string or type)
+        """
+
+        if isinstance(dtype, compat.string_types):
+            if dtype.lower().startswith('interval'):
+                try:
+                    if cls.construct_from_string(dtype) is not None:
+                        return True
+                    else:
+                        return False
+                except ValueError:
+                    return
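A quick sketch of how the parametrized dtype defined above behaves. The import location is an assumption (recent pandas exposes IntervalDtype under pandas.api.types; within this patch it lives in pandas/types/dtypes.py):

    from pandas.api.types import IntervalDtype

    dt = IntervalDtype('interval[int64]')
    dt.subtype                    # dtype('int64')
    dt.name                       # 'interval[int64]'
    dt == 'interval[int64]'       # True - strings compare against the name
    dt == IntervalDtype('int64')  # True - a bare subtype is parsed the same way
    # expected to be True given the per-subtype cache in __new__ above
    IntervalDtype('interval[int64]') is dt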
False + else: + return False + return super(IntervalDtype, cls).is_dtype(dtype) diff --git a/pandas/types/generic.py b/pandas/types/generic.py index e7b54ccc6f25e..90608c18ae503 100644 --- a/pandas/types/generic.py +++ b/pandas/types/generic.py @@ -32,12 +32,14 @@ def _check(cls, inst): ("periodindex", )) ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex", )) +ABCIntervalIndex = create_pandas_abc_type("ABCIntervalIndex", "_typ", + ("intervalindex", )) ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", "int64index", "rangeindex", "float64index", "uint64index", "multiindex", "datetimeindex", "timedeltaindex", "periodindex", - "categoricalindex")) + "categoricalindex", "intervalindex")) ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", )) ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", )) diff --git a/pandas/types/inference.py b/pandas/types/inference.py index 91418677c6b19..b0a93d24228af 100644 --- a/pandas/types/inference.py +++ b/pandas/types/inference.py @@ -20,6 +20,8 @@ is_decimal = lib.is_decimal +is_interval = lib.is_interval + def is_number(obj): """ diff --git a/pandas/types/missing.py b/pandas/types/missing.py index ea49af9884f5a..af3a873bc2866 100644 --- a/pandas/types/missing.py +++ b/pandas/types/missing.py @@ -9,7 +9,7 @@ from .common import (is_string_dtype, is_datetimelike, is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, + is_timedelta64_dtype, is_interval_dtype, is_complex_dtype, is_categorical_dtype, is_string_like_dtype, is_bool_dtype, is_integer_dtype, is_dtype_equal, @@ -127,6 +127,9 @@ def _isnull_ndarraylike(obj): if not isinstance(values, Categorical): values = values.values result = values.isnull() + elif is_interval_dtype(values): + from pandas import IntervalIndex + result = IntervalIndex(obj).isnull() else: # Working around NumPy ticket 1542 diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 28214b1462cb7..c73cca56f975a 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -29,6 +29,7 @@ is_number, is_bool, needs_i8_conversion, is_categorical_dtype, + is_interval_dtype, is_sequence, is_list_like) from pandas.formats.printing import pprint_thing @@ -945,6 +946,9 @@ def _get_ilevel_values(index, level): assert_attr_equal('names', left, right, obj=obj) if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): assert_attr_equal('freq', left, right, obj=obj) + if (isinstance(left, pd.IntervalIndex) or + isinstance(right, pd.IntervalIndex)): + assert_attr_equal('closed', left, right, obj=obj) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): @@ -1309,6 +1313,12 @@ def assert_series_equal(left, right, check_dtype=True, else: assert_numpy_array_equal(left.get_values(), right.get_values(), check_dtype=check_dtype) + elif is_interval_dtype(left) or is_interval_dtype(right): + # TODO: big hack here + l = pd.IntervalIndex(left) + r = pd.IntervalIndex(right) + assert_index_equal(l, r, obj='{0}.index'.format(obj)) + else: libtesting.assert_almost_equal(left.get_values(), right.get_values(), check_less_precise=check_less_precise, @@ -1694,6 +1704,7 @@ def makeIntervalIndex(k=10, name=None): x = np.linspace(0, 100, num=(k + 1)) return IntervalIndex.from_breaks(x, name=name) + def makeBoolIndex(k=10, name=None): if k == 1: return Index([True], name=name) diff --git a/setup.py b/setup.py index 96b25f7427370..6707af7eb0908 100755 
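Two user-visible consequences of the changes above, sketched briefly; the constructor spellings follow current pandas (this patch itself would use IntervalIndex.from_intervals for the first construction):

    import numpy as np
    import pandas as pd

    # missing-value detection routes interval dtype through IntervalIndex.isnull()
    s = pd.Series(pd.IntervalIndex([pd.Interval(0, 1), np.nan]))
    s.isnull()    # [False, True]

    # assert_index_equal now also compares the `closed` attribute, so indexes
    # that differ only in closedness are reported as unequal
    left = pd.IntervalIndex.from_breaks([0, 1, 2], closed='right')
    right = pd.IntervalIndex.from_breaks([0, 1, 2], closed='left')
    try:
        pd.testing.assert_index_equal(left, right)
    except AssertionError as exc:
        print('indexes differ:', exc)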
--- a/setup.py +++ b/setup.py @@ -119,6 +119,7 @@ def is_platform_mac(): '_libs/hashtable_func_helper.pxi.in'], 'index': ['_libs/index_class_helper.pxi.in'], 'sparse': ['sparse/sparse_op_helper.pxi.in'], + 'interval': ['_libs/intervaltree.pxi.in'] } _pxifiles = [] @@ -335,6 +336,7 @@ class CheckSDist(sdist_class): 'pandas/_libs/index.pyx', 'pandas/_libs/algos.pyx', 'pandas/_libs/join.pyx', + 'pandas/_libs/interval.pyx', 'pandas/core/window.pyx', 'pandas/sparse/sparse.pyx', 'pandas/util/testing.pyx', @@ -508,6 +510,9 @@ def pxd(name): 'depends': _pxi_dep['join']}, '_libs.reshape': {'pyxfile': '_libs/reshape', 'depends': _pxi_dep['reshape']}, + '_libs.interval': {'pyxfile': '_libs/interval', + 'pxdfiles': ['_libs/hashtable'], + 'depends': _pxi_dep['interval']}, 'core.libwindow': {'pyxfile': 'core/window', 'pxdfiles': ['_libs/src/skiplist', '_libs/src/util'], 'depends': ['pandas/_libs/src/skiplist.pyx',
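Roughly, the registration above amounts to declaring a Cython extension whose sources depend on the Tempita template; a simplified, illustrative equivalent (pandas' real setup.py goes through its own helper machinery for pxd/pxi handling, so names below are assumptions):

    from setuptools import Extension

    interval_ext = Extension(
        name='pandas._libs.interval',
        sources=['pandas/_libs/interval.pyx'],
        # the .pxi generated from intervaltree.pxi.in is textually included
        # by interval.pyx, so it is tracked as a build dependency
        depends=['pandas/_libs/intervaltree.pxi.in',
                 'pandas/_libs/hashtable.pxd'],
    )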