diff --git a/doc/source/basics.rst b/doc/source/basics.rst index c460b19640f46..c18b94fea9a28 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1924,11 +1924,24 @@ untouched. If the data is modified, it is because you did so explicitly. dtypes ------ -The main types stored in pandas objects are ``float``, ``int``, ``bool``, -``datetime64[ns]`` and ``datetime64[ns, tz]``, ``timedelta[ns]``, -``category`` and ``object``. In addition these dtypes have item sizes, e.g. -``int64`` and ``int32``. See :ref:`Series with TZ <timeseries.timezone_series>` -for more detail on ``datetime64[ns, tz]`` dtypes. +For the most part, pandas uses NumPy arrays and dtypes for Series or individual +columns of a DataFrame. The main types allowed in pandas objects are ``float``, +``int``, ``bool``, and ``datetime64[ns]`` (note that NumPy does not support +timezone-aware datetimes). + +In addition to NumPy's types, pandas :ref:`extends <extending.extension-types>` +NumPy's type-system for a few cases. + +* :ref:`Categorical <categorical>` +* :ref:`Datetime with Timezone <timeseries.timezone_series>` +* :ref:`Period <timeseries.periods>` +* :ref:`Interval <indexing.intervallindex>` + +Pandas uses the ``object`` dtype for storing strings. + +Finally, arbitrary objects may be stored using the ``object`` dtype, but should +be avoided to the extent possible (for performance and interoperability with +other libraries and methods). See :ref:`basics.object_conversion`. A convenient :attr:`~DataFrame.dtypes` attribute for DataFrame returns a Series with the data type of each column. diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index ed4022d422b4d..a17bf7c8bd6e9 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -66,6 +66,36 @@ Current Behavior: result + +.. _whatsnew_0240.enhancements.interval: + +Storing Interval Data in Series and DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an +:class:`IntervalIndex` like previously (:issue:`19453`). + +.. 
ipython:: python + + ser = pd.Series(pd.interval_range(0, 5)) + ser + ser.dtype + +Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, +this should result in better performance when storing an array of intervals in +a :class:`Series`. + +Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy +array, but rather an ``ExtensionArray``: + +.. ipython:: python + + ser.values + +This is the same behavior as ``Series.values`` for categorical data. See +:ref:`whatsnew_0240.api_breaking.interval_values` for more. + + .. _whatsnew_0240.enhancements.other: Other Enhancements @@ -91,6 +121,45 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_0240.api_breaking.interval_values: + +``IntervalIndex.values`` is now an ``IntervalArray`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :attr:`~IntervalIndex.values` attribute of an :class:`IntervalIndex` now returns an +``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). + +Previous Behavior: + +.. code-block:: ipython + + In [1]: idx = pd.interval_range(0, 4) + + In [2]: idx.values + Out[2]: + array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), + Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], + dtype=object) + +New Behavior: + +.. ipython:: python + + idx = pd.interval_range(0, 4) + idx.values + +This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. + +For situations where you need an ``ndarray`` of ``Interval`` objects, use +:func:`numpy.asarray` or ``idx.values.astype(object)``. + +.. ipython:: python + + np.asarray(idx) + idx.values.astype(object) + + .. 
_whatsnew_0240.api.datetimelike.normalize: Tick DateOffset Normalize Restrictions @@ -350,6 +419,7 @@ Interval ^^^^^^^^ - Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) +- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) - - diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 4129132251682..ca669032aa058 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -98,6 +98,26 @@ cdef class IntervalMixin(object): msg = 'cannot compute length between {left!r} and {right!r}' raise TypeError(msg.format(left=self.left, right=self.right)) + def _check_closed_matches(self, other, name='other'): + """Check if the closed attribute of `other` matches. + + Note that 'left' and 'right' are considered different from 'both'. + + Parameters + ---------- + other : Interval, IntervalIndex, IntervalArray + name : str + Name to use for 'other' in the error message. + + Raises + ------ + ValueError + When `other` is not closed exactly the same as self. + """ + if self.closed != other.closed: + msg = "'{}.closed' is '{}', expected '{}'." 
+ raise ValueError(msg.format(name, other.closed, self.closed)) + cdef _interval_like(other): return (hasattr(other, 'left') diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 1b8a43d4293a5..72ff0828e3486 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -2,5 +2,6 @@ ExtensionScalarOpsMixin) from .categorical import Categorical # noqa from .datetimes import DatetimeArrayMixin # noqa +from .interval import IntervalArray # noqa from .period import PeriodArrayMixin # noqa from .timedelta import TimedeltaArrayMixin # noqa diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 83fd0ab499283..7a6253dffe235 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -20,6 +20,7 @@ _ensure_int64, _ensure_object, _ensure_platform_int, + is_extension_array_dtype, is_dtype_equal, is_datetimelike, is_datetime64_dtype, @@ -1243,6 +1244,11 @@ def __array__(self, dtype=None): ret = take_1d(self.categories.values, self._codes) if dtype and not is_dtype_equal(dtype, self.categories.dtype): return np.asarray(ret, dtype) + if is_extension_array_dtype(ret): + # When we're a Categorical[ExtensionArray], like Interval, + # we need to ensure __array__ gets all the way to an + # ndarray. 
+ ret = np.asarray(ret) return ret def __setstate__(self, state): diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py new file mode 100644 index 0000000000000..4ad53e16bc439 --- /dev/null +++ b/pandas/core/arrays/interval.py @@ -0,0 +1,1049 @@ +import textwrap +import numpy as np + +from pandas._libs.interval import (Interval, IntervalMixin, + intervals_to_interval_bounds) +from pandas.compat import add_metaclass +from pandas.compat.numpy import function as nv +import pandas.core.common as com +from pandas.core.config import get_option +from pandas.core.dtypes.cast import maybe_convert_platform +from pandas.core.dtypes.common import (is_categorical_dtype, is_float_dtype, + is_integer_dtype, is_interval_dtype, + is_scalar, is_string_dtype, + is_datetime64_any_dtype, + is_timedelta64_dtype, is_interval, + pandas_dtype) +from pandas.core.dtypes.dtypes import IntervalDtype +from pandas.core.dtypes.generic import (ABCDatetimeIndex, ABCPeriodIndex, + ABCSeries, ABCIntervalIndex, + ABCInterval) +from pandas.core.dtypes.missing import isna, notna +from pandas.core.indexes.base import Index, _ensure_index +from pandas.util._decorators import Appender +from pandas.util._doctools import _WritableDoc + +from . import ExtensionArray, Categorical + +_VALID_CLOSED = set(['left', 'right', 'both', 'neither']) +_interval_shared_docs = {} +_shared_docs_kwargs = dict( + klass='IntervalArray', + name='' +) + + +_interval_shared_docs['class'] = """%(summary)s + +.. versionadded:: %(versionadded)s + +.. warning:: + + The indexing behaviors are provisional and may change in + a future version of pandas. + +Parameters +---------- +data : array-like (1-dimensional) + Array-like containing Interval objects from which to build the + %(klass)s. +closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both or + neither. +%(name)s\ +copy : boolean, default False + Copy the meta-data. 
+dtype : dtype or None, default None + If None, dtype will be inferred + + .. versionadded:: 0.23.0 + +Attributes +---------- +left +right +closed +mid +length +values +is_non_overlapping_monotonic + +Methods +------- +from_arrays +from_tuples +from_breaks +set_closed +%(extra_methods)s\ + +%(examples)s\ + +Notes +------ +See the `user guide +`_ +for more. + +See Also +-------- +Index : The base pandas Index type +Interval : A bounded slice-like interval; the elements of an IntervalIndex +interval_range : Function to create a fixed frequency IntervalIndex +cut, qcut : Convert arrays of continuous data into Categoricals/Series of + Intervals +""" + + +@Appender(_interval_shared_docs['class'] % dict( + klass="IntervalArray", + summary="Pandas array for interval data that are closed on the same side", + versionadded="0.24.0", + name='', extra_methods='', examples='', +)) +@add_metaclass(_WritableDoc) +class IntervalArray(IntervalMixin, ExtensionArray): + dtype = IntervalDtype() + ndim = 1 + can_hold_na = True + _na_value = _fill_value = np.nan + + def __new__(cls, data, closed=None, dtype=None, copy=False, + fastpath=False, verify_integrity=True): + + if fastpath: + return cls._simple_new(data.left, data.right, closed, + copy=copy, dtype=dtype, + verify_integrity=False) + + if isinstance(data, ABCSeries) and is_interval_dtype(data): + data = data.values + + if isinstance(data, (cls, ABCIntervalIndex)): + left = data.left + right = data.right + closed = closed or data.closed + else: + + # don't allow scalars + if is_scalar(data): + msg = ("{}(...) 
must be called with a collection of some kind," + " {} was passed") + raise TypeError(msg.format(cls.__name__, data)) + + # might need to convert empty or purely na data + data = maybe_convert_platform_interval(data) + left, right, infer_closed = intervals_to_interval_bounds( + data, validate_closed=closed is None) + closed = closed or infer_closed + + return cls._simple_new(left, right, closed, copy=copy, dtype=dtype, + verify_integrity=verify_integrity) + + @classmethod + def _simple_new(cls, left, right, closed=None, + copy=False, dtype=None, verify_integrity=True): + result = IntervalMixin.__new__(cls) + + closed = closed or 'right' + left = _ensure_index(left, copy=copy) + right = _ensure_index(right, copy=copy) + + if dtype is not None: + # GH 19262: dtype must be an IntervalDtype to override inferred + dtype = pandas_dtype(dtype) + if not is_interval_dtype(dtype): + msg = 'dtype must be an IntervalDtype, got {dtype}' + raise TypeError(msg.format(dtype=dtype)) + elif dtype.subtype is not None: + left = left.astype(dtype.subtype) + right = right.astype(dtype.subtype) + + # coerce dtypes to match if needed + if is_float_dtype(left) and is_integer_dtype(right): + right = right.astype(left.dtype) + elif is_float_dtype(right) and is_integer_dtype(left): + left = left.astype(right.dtype) + + if type(left) != type(right): + msg = ('must not have differing left [{ltype}] and right ' + '[{rtype}] types') + raise ValueError(msg.format(ltype=type(left).__name__, + rtype=type(right).__name__)) + elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): + # GH 19016 + msg = ('category, object, and string subtypes are not supported ' + 'for IntervalArray') + raise TypeError(msg) + elif isinstance(left, ABCPeriodIndex): + msg = 'Period dtypes are not supported, use a PeriodIndex instead' + raise ValueError(msg) + elif (isinstance(left, ABCDatetimeIndex) and + str(left.tz) != str(right.tz)): + msg = ("left and right must have the same time zone, got " + 
"'{left_tz}' and '{right_tz}'") + raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) + + result._left = left + result._right = right + result._closed = closed + if verify_integrity: + result._validate() + return result + + @classmethod + def _from_sequence(cls, scalars): + return cls(scalars) + + @classmethod + def _from_factorized(cls, values, original): + return cls(values, closed=original.closed) + + _interval_shared_docs['from_breaks'] = """ + Construct an %(klass)s from an array of splits. + + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : boolean, default False + copy the data + dtype : dtype or None, default None + If None, dtype will be inferred + + .. versionadded:: 0.23.0 + + Examples + -------- + >>> pd.%(klass)s.from_breaks([0, 1, 2, 3]) + %(klass)s([(0, 1], (1, 2], (2, 3]] + closed='right', + dtype='interval[int64]') + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex + %(klass)s.from_arrays : Construct from a left and right array + %(klass)s.from_tuples : Construct from a sequence of tuples + """ + + @classmethod + @Appender(_interval_shared_docs['from_breaks'] % _shared_docs_kwargs) + def from_breaks(cls, breaks, closed='right', copy=False, dtype=None): + breaks = maybe_convert_platform_interval(breaks) + + return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, + dtype=dtype) + + _interval_shared_docs['from_arrays'] = """ + Construct from two arrays defining the left and right bounds. + + Parameters + ---------- + left : array-like (1-dimensional) + Left bounds for each interval. + right : array-like (1-dimensional) + Right bounds for each interval. 
+ closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : boolean, default False + Copy the data. + dtype : dtype, optional + If None, dtype will be inferred. + + .. versionadded:: 0.23.0 + + Returns + ------- + %(klass)s + + Notes + ----- + Each element of `left` must be less than or equal to the `right` + element at the same position. If an element is missing, it must be + missing in both `left` and `right`. A TypeError is raised when + using an unsupported type for `left` or `right`. At the moment, + 'category', 'object', and 'string' subtypes are not supported. + + Raises + ------ + ValueError + When a value is missing in only one of `left` or `right`. + When a value in `left` is greater than the corresponding value + in `right`. + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits. + %(klass)s.from_tuples : Construct an %(klass)s from an + array-like of tuples. + + + Examples + -------- + >>> %(klass)s.from_arrays([0, 1, 2], [1, 2, 3]) + %(klass)s([(0, 1], (1, 2], (2, 3]] + closed='right', + dtype='interval[int64]') + """ + + @classmethod + @Appender(_interval_shared_docs['from_arrays'] % _shared_docs_kwargs) + def from_arrays(cls, left, right, closed='right', copy=False, dtype=None): + left = maybe_convert_platform_interval(left) + right = maybe_convert_platform_interval(right) + + return cls._simple_new(left, right, closed, copy=copy, + dtype=dtype, verify_integrity=True) + + _interval_shared_docs['from_intervals'] = """ + Construct an %(klass)s from a 1d array of Interval objects + + .. deprecated:: 0.23.0 + + Parameters + ---------- + data : array-like (1-dimensional) + Array of Interval objects. All intervals must be closed on the same + sides. 
+ copy : boolean, default False + by-default copy the data, this is compat only and ignored + dtype : dtype or None, default None + If None, dtype will be inferred + + ..versionadded:: 0.23.0 + + Examples + -------- + >>> pd.%(klass)s.from_intervals([pd.Interval(0, 1), + ... pd.Interval(1, 2)]) + %(klass)s([(0, 1], (1, 2]] + closed='right', dtype='interval[int64]') + + The generic Index constructor work identically when it infers an array + of all intervals: + + >>> pd.Index([pd.Interval(0, 1), pd.Interval(1, 2)]) + %(klass)s([(0, 1], (1, 2]] + closed='right', dtype='interval[int64]') + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex + %(klass)s.from_arrays : Construct an %(klass)s from a left and + right array + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits + %(klass)s.from_tuples : Construct an %(klass)s from an + array-like of tuples + """ + + _interval_shared_docs['from_tuples'] = """ + Construct an %(klass)s from an array-like of tuples + + Parameters + ---------- + data : array-like (1-dimensional) + Array of tuples + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. 
+ copy : boolean, default False + by-default copy the data, this is compat only and ignored + dtype : dtype or None, default None + If None, dtype will be inferred + + ..versionadded:: 0.23.0 + + + Examples + -------- + >>> pd.%(klass)s.from_tuples([(0, 1), (1, 2)]) + %(klass)s([(0, 1], (1, 2]], + closed='right', dtype='interval[int64]') + + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex + %(klass)s.from_arrays : Construct an %(klass)s from a left and + right array + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits + """ + + @classmethod + @Appender(_interval_shared_docs['from_tuples'] % _shared_docs_kwargs) + def from_tuples(cls, data, closed='right', copy=False, dtype=None): + if len(data): + left, right = [], [] + else: + # ensure that empty data keeps input dtype + left = right = data + + for d in data: + if isna(d): + lhs = rhs = np.nan + else: + name = cls.__name__ + try: + # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] + lhs, rhs = d + except ValueError: + msg = ('{name}.from_tuples requires tuples of ' + 'length 2, got {tpl}').format(name=name, tpl=d) + raise ValueError(msg) + except TypeError: + msg = ('{name}.from_tuples received an invalid ' + 'item, {tpl}').format(name=name, tpl=d) + raise TypeError(msg) + lhs, rhs = d + left.append(lhs) + right.append(rhs) + + return cls.from_arrays(left, right, closed, copy=False, + dtype=dtype) + + def _validate(self): + """Verify that the IntervalArray is valid. 
+ + Checks that + + * closed is valid + * left and right match lengths + * left and right have the same missing values + * left is always below right + """ + if self.closed not in _VALID_CLOSED: + raise ValueError("invalid option for 'closed': {closed}" + .format(closed=self.closed)) + if len(self.left) != len(self.right): + raise ValueError('left and right must have the same length') + left_mask = notna(self.left) + right_mask = notna(self.right) + if not (left_mask == right_mask).all(): + raise ValueError('missing values must be missing in the same ' + 'location both left and right sides') + if not (self.left[left_mask] <= self.right[left_mask]).all(): + raise ValueError('left side of interval must be <= right side') + + # --------- + # Interface + # --------- + def __iter__(self): + return iter(np.asarray(self)) + + def __len__(self): + return len(self.left) + + def __getitem__(self, value): + left = self.left[value] + right = self.right[value] + + # scalar + if not isinstance(left, Index): + if isna(left): + return self._fill_value + return Interval(left, right, self.closed) + + return self._shallow_copy(left, right) + + def __setitem__(self, key, value): + # na value: need special casing to set directly on numpy arrays + needs_float_conversion = False + if is_scalar(value) and isna(value): + if is_integer_dtype(self.dtype.subtype): + # can't set NaN on a numpy integer array + needs_float_conversion = True + elif is_datetime64_any_dtype(self.dtype.subtype): + # need proper NaT to set directly on the numpy array + value = np.datetime64('NaT') + elif is_timedelta64_dtype(self.dtype.subtype): + # need proper NaT to set directly on the numpy array + value = np.timedelta64('NaT') + value_left, value_right = value, value + + # scalar interval + elif is_interval_dtype(value) or isinstance(value, ABCInterval): + self._check_closed_matches(value, name="value") + value_left, value_right = value.left, value.right + + else: + # list-like of intervals + try: + array = 
IntervalArray(value) + value_left, value_right = array.left, array.right + except TypeError: + # wrong type: not interval or NA + msg = "'value' should be an interval type, got {} instead." + raise TypeError(msg.format(type(value))) + + # Need to ensure that left and right are updated atomically, so we're + # forced to copy, update the copy, and swap in the new values. + left = self.left.copy(deep=True) + if needs_float_conversion: + left = left.astype('float') + left.values[key] = value_left + self._left = left + + right = self.right.copy(deep=True) + if needs_float_conversion: + right = right.astype('float') + right.values[key] = value_right + self._right = right + + def fillna(self, value=None, method=None, limit=None): + """ + Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, dict, Series + If a scalar value is passed it is used to fill all missing values. + Alternatively, a Series or dict can be used to fill in different + values for each index. The value should not be a list. The + value(s) passed should be either Interval objects or NA/NaN. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + (Not implemented yet for IntervalArray) + Method to use for filling holes in reindexed Series + limit : int, default None + (Not implemented yet for IntervalArray) + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. 
+ + Returns + ------- + filled : IntervalArray with NA/NaN filled + """ + if method is not None: + raise TypeError('Filling by method is not supported for ' + 'IntervalArray.') + if limit is not None: + raise TypeError('limit is not supported for IntervalArray.') + + if not isinstance(value, ABCInterval): + msg = ("'IntervalArray.fillna' only supports filling with a " + "scalar 'pandas.Interval'. Got a '{}' instead." + .format(type(value).__name__)) + raise TypeError(msg) + + value = getattr(value, '_values', value) + self._check_closed_matches(value, name="value") + + left = self.left.fillna(value=value.left) + right = self.right.fillna(value=value.right) + return self._shallow_copy(left, right) + + @property + def dtype(self): + return IntervalDtype(self.left.dtype) + + def astype(self, dtype, copy=True): + """ + Cast to an ExtensionArray or NumPy array with dtype 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ExtensionArray or ndarray + ExtensionArray or NumPy ndarray with 'dtype' for its dtype. + """ + dtype = pandas_dtype(dtype) + if is_interval_dtype(dtype): + if dtype == self.dtype: + return self.copy() if copy else self + + # need to cast to different subtype + try: + new_left = self.left.astype(dtype.subtype) + new_right = self.right.astype(dtype.subtype) + except TypeError: + msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are ' + 'incompatible') + raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype)) + return self._shallow_copy(new_left, new_right) + elif is_categorical_dtype(dtype): + return Categorical(np.asarray(self)) + # TODO: This try/except will be repeated. 
+ try: + return np.asarray(self).astype(dtype, copy=copy) + except (TypeError, ValueError): + msg = 'Cannot cast {name} to dtype {dtype}' + raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) + + @classmethod + def _concat_same_type(cls, to_concat): + """ + Concatenate multiple IntervalArray + + Parameters + ---------- + to_concat : sequence of IntervalArray + + Returns + ------- + IntervalArray + """ + closed = set(interval.closed for interval in to_concat) + if len(closed) != 1: + raise ValueError("Intervals must all be closed on the same side.") + closed = closed.pop() + + left = np.concatenate([interval.left for interval in to_concat]) + right = np.concatenate([interval.right for interval in to_concat]) + return cls._simple_new(left, right, closed=closed, copy=False) + + def _shallow_copy(self, left=None, right=None, closed=None): + """ + Return a new IntervalArray with the replacement attributes + + Parameters + ---------- + left : array-like + Values to be used for the left-side of the intervals. + If None, the existing left and right values will be used. + + right : array-like + Values to be used for the right-side of the intervals. + If None and left is IntervalArray-like, the left and right + of the IntervalArray-like will be used. + + closed : {'left', 'right', 'both', 'neither'}, optional + Whether the intervals are closed on the left-side, right-side, both + or neither. If None, the existing closed will be used. 
+ """ + if left is None: + + # no values passed + left, right = self.left, self.right + + elif right is None: + + # only single value passed, could be an IntervalArray + # or array of Intervals + if not isinstance(left, (type(self), ABCIntervalIndex)): + left = type(self)(left) + + left, right = left.left, left.right + else: + + # both left and right are values + pass + + closed = closed or self.closed + return self._simple_new( + left, right, closed=closed, verify_integrity=False) + + def copy(self, deep=False): + """ + Return a copy of the array. + + Parameters + ---------- + deep : bool, default False + Also copy the underlying data backing this array. + + Returns + ------- + IntervalArray + """ + left = self.left.copy(deep=True) if deep else self.left + right = self.right.copy(deep=True) if deep else self.right + closed = self.closed + # TODO: Could skip verify_integrity here. + return type(self).from_arrays(left, right, closed=closed) + + def _formatting_values(self): + return np.asarray(self) + + def isna(self): + return isna(self.left) + + @property + def nbytes(self): + return self.left.nbytes + self.right.nbytes + + @property + def size(self): + # Avoid materializing self.values + return self.left.size + + @property + def shape(self): + return self.left.shape + + @property + def itemsize(self): + return self.left.itemsize + self.right.itemsize + + def take(self, indices, allow_fill=False, fill_value=None, axis=None, + **kwargs): + """ + Take elements from the IntervalArray. + + Parameters + ---------- + indices : sequence of integers + Indices to be taken. + + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. 
+ + fill_value : Interval or NA, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + axis : any, default None + Present for compat with IntervalIndex; does nothing. + + Returns + ------- + IntervalArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + """ + from pandas.core.algorithms import take + + nv.validate_take(tuple(), kwargs) + + fill_left = fill_right = fill_value + if allow_fill: + if fill_value is None: + fill_left = fill_right = self.left._na_value + elif is_interval(fill_value): + self._check_closed_matches(fill_value, name='fill_value') + fill_left, fill_right = fill_value.left, fill_value.right + elif not is_scalar(fill_value) and notna(fill_value): + msg = ("'IntervalArray.fillna' only supports filling with a " + "'scalar pandas.Interval or NA'. Got a '{}' instead." + .format(type(fill_value).__name__)) + raise ValueError(msg) + + left_take = take(self.left, indices, + allow_fill=allow_fill, fill_value=fill_left) + right_take = take(self.right, indices, + allow_fill=allow_fill, fill_value=fill_right) + + return self._shallow_copy(left_take, right_take) + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each interval. + + Parameters + ---------- + dropna : boolean, default True + Don't include counts of NaN. 
+ + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + # TODO: implement this in a non-naive way! + from pandas.core.algorithms import value_counts + return value_counts(np.asarray(self), dropna=dropna) + + # Formatting + + def _format_data(self): + + # TODO: integrate with categorical and make generic + # name argument is unused here; just for compat with base / categorical + n = len(self) + max_seq_items = min((get_option( + 'display.max_seq_items') or n) // 10, 10) + + formatter = str + + if n == 0: + summary = '[]' + elif n == 1: + first = formatter(self[0]) + summary = '[{first}]'.format(first=first) + elif n == 2: + first = formatter(self[0]) + last = formatter(self[-1]) + summary = '[{first}, {last}]'.format(first=first, last=last) + else: + + if n > max_seq_items: + n = min(max_seq_items // 2, 10) + head = [formatter(x) for x in self[:n]] + tail = [formatter(x) for x in self[-n:]] + summary = '[{head} ... {tail}]'.format( + head=', '.join(head), tail=', '.join(tail)) + else: + head = [] + tail = [formatter(x) for x in self] + summary = '[{tail}]'.format(tail=', '.join(tail)) + + return summary + + def __repr__(self): + tpl = textwrap.dedent("""\ + {cls}({data}, + {lead}closed='{closed}', + {lead}dtype='{dtype}')""") + return tpl.format(cls=self.__class__.__name__, + data=self._format_data(), + lead=' ' * len(self.__class__.__name__) + ' ', + closed=self.closed, dtype=self.dtype) + + def _format_space(self): + space = ' ' * (len(self.__class__.__name__) + 1) + return "\n{space}".format(space=space) + + @property + def left(self): + """ + Return the left endpoints of each Interval in the IntervalArray as + an Index + """ + return self._left + + @property + def right(self): + """ + Return the right endpoints of each Interval in the IntervalArray as + an Index + """ + return self._right + + @property + def closed(self): + """ + Whether the intervals are closed on the left-side, right-side, both or + neither + """ + return 
self._closed + + _interval_shared_docs['set_closed'] = """ + Return an %(klass)s identical to the current one, but closed on the + specified side + + .. versionadded:: 0.24.0 + + Parameters + ---------- + closed : {'left', 'right', 'both', 'neither'} + Whether the intervals are closed on the left-side, right-side, both + or neither. + + Returns + ------- + new_index : %(klass)s + + Examples + -------- + >>> index = pd.interval_range(0, 3) + >>> index + %(klass)s([(0, 1], (1, 2], (2, 3]] + closed='right', + dtype='interval[int64]') + >>> index.set_closed('both') + %(klass)s([[0, 1], [1, 2], [2, 3]] + closed='both', + dtype='interval[int64]') + """ + + @Appender(_interval_shared_docs['set_closed'] % _shared_docs_kwargs) + def set_closed(self, closed): + if closed not in _VALID_CLOSED: + msg = "invalid option for 'closed': {closed}" + raise ValueError(msg.format(closed=closed)) + + return self._shallow_copy(closed=closed) + + @property + def length(self): + """ + Return an Index with entries denoting the length of each Interval in + the IntervalArray + """ + try: + return self.right - self.left + except TypeError: + # length not defined for some types, e.g. string + msg = ('IntervalArray contains Intervals without defined length, ' + 'e.g. Intervals with string endpoints') + raise TypeError(msg) + + @property + def mid(self): + """ + Return the midpoint of each Interval in the IntervalArray as an Index + """ + try: + return 0.5 * (self.left + self.right) + except TypeError: + # datetime safe version + return self.left + 0.5 * self.length + + @property + def is_non_overlapping_monotonic(self): + """ + Return True if the IntervalArray is non-overlapping (no Intervals share + points) and is either monotonic increasing or monotonic decreasing, + else False + """ + # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) + # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) 
+ # we already require left <= right + + # strict inequality for closed == 'both'; equality implies overlapping + # at a point when both sides of intervals are included + if self.closed == 'both': + return bool((self.right[:-1] < self.left[1:]).all() or + (self.left[:-1] > self.right[1:]).all()) + + # non-strict inequality when closed != 'both'; at least one side is + # not included in the intervals, so equality does not imply overlapping + return bool((self.right[:-1] <= self.left[1:]).all() or + (self.left[:-1] >= self.right[1:]).all()) + + # Conversion + def __array__(self, dtype=None): + """ + Return the IntervalArray's data as a numpy array of Interval + objects (with dtype='object') + """ + left = self.left + right = self.right + mask = self.isna() + closed = self._closed + + result = np.empty(len(left), dtype=object) + for i in range(len(left)): + if mask[i]: + result[i] = np.nan + else: + result[i] = Interval(left[i], right[i], closed) + return result + + _interval_shared_docs['to_tuples'] = """\ + Return an %(return_type)s of tuples of the form (left, right) + + Parameters + ---------- + na_tuple : boolean, default True + Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA + value itself if False, ``nan``. + + .. versionadded:: 0.23.0 + + Returns + ------- + tuples: %(return_type)s + %(examples)s\ + """ + + @Appender(_interval_shared_docs['to_tuples'] % dict( + return_type='ndarray', + examples='', + )) + def to_tuples(self, na_tuple=True): + tuples = com._asarray_tuplesafe(zip(self.left, self.right)) + if not na_tuple: + # GH 18756 + tuples = np.where(~self.isna(), tuples, np.nan) + return tuples + + def repeat(self, repeats, **kwargs): + """ + Repeat elements of an IntervalArray. + + Returns a new IntervalArray where each element of the current + IntervalArray is repeated consecutively a given number of times. + + Parameters + ---------- + repeats : int + The number of repetitions for each element. 
+ + **kwargs + Additional keywords have no effect but might be accepted for + compatibility with numpy. + + Returns + ------- + IntervalArray + Newly created IntervalArray with repeated elements. + + See Also + -------- + Index.repeat : Equivalent function for Index + Series.repeat : Equivalent function for Series + numpy.repeat : Underlying implementation + """ + left_repeat = self.left.repeat(repeats, **kwargs) + right_repeat = self.right.repeat(repeats, **kwargs) + return self._shallow_copy(left=left_repeat, right=right_repeat) + + +def maybe_convert_platform_interval(values): + """ + Try to do platform conversion, with special casing for IntervalArray. + Wrapper around maybe_convert_platform that alters the default return + dtype in certain cases to be compatible with IntervalArray. For example, + empty lists return with integer dtype instead of object dtype, which is + prohibited for IntervalArray. + + Parameters + ---------- + values : array-like + + Returns + ------- + array + """ + if isinstance(values, (list, tuple)) and len(values) == 0: + # GH 19016 + # empty lists/tuples get object dtype by default, but this is + # prohibited for IntervalArray, so coerce to integer instead + return np.array([], dtype=np.int64) + elif is_categorical_dtype(values): + values = np.asarray(values) + + return maybe_convert_platform(values) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 285e386e25613..4a41b14cee071 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -15,6 +15,7 @@ is_period_dtype, is_object_dtype, is_bool_dtype, + is_interval_dtype, is_dtype_equal, _NS_DTYPE, _TD_DTYPE) @@ -58,6 +59,8 @@ def get_dtype_kinds(l): typ = 'bool' elif is_period_dtype(dtype): typ = str(arr.dtype) + elif is_interval_dtype(dtype): + typ = str(arr.dtype) else: typ = dtype.kind typs.add(typ) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 4f7e9136022a5..57b1d81d94754 100644 --- 
a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -690,14 +690,13 @@ class IntervalDtypeType(type): pass -class IntervalDtype(PandasExtensionDtype): +class IntervalDtype(PandasExtensionDtype, ExtensionDtype): """ A Interval duck-typed class, suitable for holding an interval THIS IS NOT A REAL NUMPY DTYPE """ name = 'interval' - type = IntervalDtypeType kind = None str = '|O08' base = np.dtype('O') @@ -751,6 +750,17 @@ def __new__(cls, subtype=None): cls._cache[str(subtype)] = u return u + @classmethod + def construct_array_type(cls): + """Return the array type associated with this dtype + + Returns + ------- + type + """ + from pandas.core.arrays import IntervalArray + return IntervalArray + @classmethod def construct_from_string(cls, string): """ @@ -765,6 +775,11 @@ def construct_from_string(cls, string): msg = "a string needs to be passed, got type {typ}" raise TypeError(msg.format(typ=type(string))) + @property + def type(self): + from pandas import Interval + return Interval + def __unicode__(self): if self.subtype is None: return "interval" diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 6683612eb8a3e..7ef4a7674753e 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -10,7 +10,7 @@ from .common import (is_string_dtype, is_datetimelike, is_datetimelike_v_numeric, is_float_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, is_interval_dtype, + is_timedelta64_dtype, is_period_dtype, is_complex_dtype, is_string_like_dtype, is_bool_dtype, @@ -196,10 +196,6 @@ def _isna_ndarraylike(obj): else: values = obj result = values.isna() - elif is_interval_dtype(values): - # TODO(IntervalArray): remove this if block - from pandas import IntervalIndex - result = IntervalIndex(obj).isna() elif is_string_dtype(dtype): # Working around NumPy ticket 1542 shape = values.shape diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 
78fa6f8217157..1053dc8f89640 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -272,7 +272,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, **kwargs) # interval - if is_interval_dtype(data) or is_interval_dtype(dtype): + if ((is_interval_dtype(data) or is_interval_dtype(dtype)) and + not is_object_dtype(dtype)): from .interval import IntervalIndex closed = kwargs.get('closed', None) return IntervalIndex(data, dtype=dtype, name=name, copy=copy, diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 6b9e9dc2f9377..9375a60d0964c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,20 +1,17 @@ """ define the IntervalIndex """ +import textwrap +import warnings import numpy as np -import warnings -from pandas.core.dtypes.missing import notna, isna -from pandas.core.dtypes.generic import ABCDatetimeIndex, ABCPeriodIndex -from pandas.core.dtypes.dtypes import IntervalDtype -from pandas.core.dtypes.cast import ( - maybe_convert_platform, find_common_type, maybe_downcast_to_dtype) +from pandas.compat import add_metaclass +from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.cast import find_common_type, maybe_downcast_to_dtype from pandas.core.dtypes.common import ( _ensure_platform_int, is_list_like, is_datetime_or_timedelta_dtype, is_datetime64tz_dtype, - is_categorical_dtype, - is_string_dtype, is_integer_dtype, is_float_dtype, is_interval_dtype, @@ -22,8 +19,7 @@ is_scalar, is_float, is_number, - is_integer, - pandas_dtype) + is_integer) from pandas.core.indexes.base import ( Index, _ensure_index, default_pprint, _index_shared_docs) @@ -31,26 +27,33 @@ from pandas._libs import Timestamp, Timedelta from pandas._libs.interval import ( Interval, IntervalMixin, IntervalTree, - intervals_to_interval_bounds) +) from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.timedeltas import timedelta_range from 
pandas.core.indexes.multi import MultiIndex -from pandas.compat.numpy import function as nv import pandas.core.common as com from pandas.util._decorators import cache_readonly, Appender +from pandas.util._doctools import _WritableDoc +from pandas.util._exceptions import rewrite_exception from pandas.core.config import get_option from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import DateOffset import pandas.core.indexes.base as ibase +from pandas.core.arrays.interval import (IntervalArray, + _interval_shared_docs) + +_VALID_CLOSED = set(['left', 'right', 'both', 'neither']) _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( dict(klass='IntervalIndex', - target_klass='IntervalIndex or list of Intervals')) - - -_VALID_CLOSED = set(['left', 'right', 'both', 'neither']) + target_klass='IntervalIndex or list of Intervals', + name=textwrap.dedent("""\ + name : object, optional + to be stored in the index. + """), + )) def _get_next_label(label): @@ -96,34 +99,6 @@ def _get_interval_closed_bounds(interval): return left, right -def maybe_convert_platform_interval(values): - """ - Try to do platform conversion, with special casing for IntervalIndex. - Wrapper around maybe_convert_platform that alters the default return - dtype in certain cases to be compatible with IntervalIndex. For example, - empty lists return with integer dtype instead of object dtype, which is - prohibited for IntervalIndex. 
- - Parameters - ---------- - values : array-like - - Returns - ------- - array - """ - if is_categorical_dtype(values): - # GH 21243/21253 - values = np.array(values) - - if isinstance(values, (list, tuple)) and len(values) == 0: - # GH 19016 - # empty lists/tuples get object dtype by default, but this is not - # prohibited for IntervalIndex, so coerce to integer instead - return np.array([], dtype=np.int64) - return maybe_convert_platform(values) - - def _new_IntervalIndex(cls, d): """ This is called upon unpickling, rather than the default which doesn't have @@ -132,58 +107,16 @@ def _new_IntervalIndex(cls, d): return cls.from_arrays(**d) -class IntervalIndex(IntervalMixin, Index): - """ - Immutable Index implementing an ordered, sliceable set. IntervalIndex - represents an Index of Interval objects that are all closed on the same - side. - - .. versionadded:: 0.20.0 - - .. warning:: - - The indexing behaviors are provisional and may change in - a future version of pandas. - - Parameters - ---------- - data : array-like (1-dimensional) - Array-like containing Interval objects from which to build the - IntervalIndex - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both or - neither. - name : object, optional - Name to be stored in the index. - copy : boolean, default False - Copy the meta-data - dtype : dtype or None, default None - If None, dtype will be inferred - - .. 
versionadded:: 0.23.0 - - Attributes - ---------- - closed - is_non_overlapping_monotonic - left - length - mid - right - values - - Methods - ------- - contains - from_arrays - from_breaks - from_tuples - get_indexer - get_loc - set_closed +@Appender(_interval_shared_docs['class'] % dict( + klass="IntervalIndex", + summary="Immutable index of intervals that are closed on the same side.", + name=_index_doc_kwargs['name'], + versionadded="0.20.0", + extra_methods="contains\n", + examples=textwrap.dedent("""\ Examples - --------- + -------- A new ``IntervalIndex`` is typically constructed using :func:`interval_range`: @@ -197,21 +130,11 @@ class IntervalIndex(IntervalMixin, Index): See further examples in the doc strings of ``interval_range`` and the mentioned constructor methods. + """), - Notes - ------ - See the `user guide - `_ - for more. - - See Also - -------- - Index : The base pandas Index type - Interval : A bounded slice-like interval; the elements of an IntervalIndex - interval_range : Function to create a fixed frequency IntervalIndex - cut, qcut : Convert arrays of continuous data into Categoricals/Series of - Intervals - """ +)) +@add_metaclass(_WritableDoc) +class IntervalIndex(IntervalMixin, Index): _typ = 'intervalindex' _comparables = ['name'] _attributes = ['name', 'closed'] @@ -219,131 +142,50 @@ class IntervalIndex(IntervalMixin, Index): # we would like our indexing holder to defer to us _defer_to_indexing = True + # Immutable, so we are able to cache computations like isna in '_mask' _mask = None def __new__(cls, data, closed=None, dtype=None, copy=False, name=None, fastpath=False, verify_integrity=True): if fastpath: - return cls._simple_new(data.left, data.right, closed, name, - copy=copy, verify_integrity=False) + return cls._simple_new(data, name) if name is None and hasattr(data, 'name'): name = data.name - if isinstance(data, IntervalIndex): - left = data.left - right = data.right - closed = closed or data.closed - else: - - # don't allow 
scalars - if is_scalar(data): - cls._scalar_data_error(data) - - data = maybe_convert_platform_interval(data) - left, right, infer_closed = intervals_to_interval_bounds( - data, validate_closed=closed is None) - closed = closed or infer_closed + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype, + fastpath=fastpath, + verify_integrity=verify_integrity) - return cls._simple_new(left, right, closed, name, copy=copy, - dtype=dtype, verify_integrity=verify_integrity) + return cls._simple_new(array, name) @classmethod - def _simple_new(cls, left, right, closed=None, name=None, copy=False, - dtype=None, verify_integrity=True): - result = IntervalMixin.__new__(cls) + def _simple_new(cls, array, name, closed=None): + """ + Construct from an IntervalArray - closed = closed or 'right' - left = _ensure_index(left, copy=copy) - right = _ensure_index(right, copy=copy) - - if dtype is not None: - # GH 19262: dtype must be an IntervalDtype to override inferred - dtype = pandas_dtype(dtype) - if not is_interval_dtype(dtype): - msg = 'dtype must be an IntervalDtype, got {dtype}' - raise TypeError(msg.format(dtype=dtype)) - elif dtype.subtype is not None: - left = left.astype(dtype.subtype) - right = right.astype(dtype.subtype) - - # coerce dtypes to match if needed - if is_float_dtype(left) and is_integer_dtype(right): - right = right.astype(left.dtype) - elif is_float_dtype(right) and is_integer_dtype(left): - left = left.astype(right.dtype) - - if type(left) != type(right): - msg = ('must not have differing left [{ltype}] and right ' - '[{rtype}] types') - raise ValueError(msg.format(ltype=type(left).__name__, - rtype=type(right).__name__)) - elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): - # GH 19016 - msg = ('category, object, and string subtypes are not supported ' - 'for IntervalIndex') - raise TypeError(msg) - elif isinstance(left, ABCPeriodIndex): - msg = 'Period dtypes are not 
supported, use a PeriodIndex instead' - raise ValueError(msg) - elif (isinstance(left, ABCDatetimeIndex) and - str(left.tz) != str(right.tz)): - msg = ("left and right must have the same time zone, got " - "'{left_tz}' and '{right_tz}'") - raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) - - result._left = left - result._right = right - result._closed = closed + Parameters + ---------- + array : IntervalArray + name : str + Attached as result.name + closed : Any + Ignored. + """ + result = IntervalMixin.__new__(cls) + result._data = array result.name = name - if verify_integrity: - result._validate() result._reset_identity() return result @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, left=None, right=None, **kwargs): - if left is None: - - # no values passed - left, right = self.left, self.right - - elif right is None: - - # only single value passed, could be an IntervalIndex - # or array of Intervals - if not isinstance(left, IntervalIndex): - left = self._constructor(left) - - left, right = left.left, left.right - else: - - # both left and right are values - pass - + result = self._data._shallow_copy(left=left, right=right) attributes = self._get_attributes_dict() attributes.update(kwargs) - attributes['verify_integrity'] = False - return self._simple_new(left, right, **attributes) - - def _validate(self): - """ - Verify that the IntervalIndex is valid. 
- """ - if self.closed not in _VALID_CLOSED: - raise ValueError("invalid option for 'closed': {closed}" - .format(closed=self.closed)) - if len(self.left) != len(self.right): - raise ValueError('left and right must have the same length') - left_mask = notna(self.left) - right_mask = notna(self.right) - if not (left_mask == right_mask).all(): - raise ValueError('missing values must be missing in the same ' - 'location both left and right sides') - if not (self.left[left_mask] <= self.right[left_mask]).all(): - raise ValueError('left side of interval must be <= right side') - self._mask = ~left_mask + return self._simple_new(result, **attributes) @cache_readonly def hasnans(self): @@ -412,272 +254,60 @@ def contains(self, key): return False @classmethod + @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) def from_breaks(cls, breaks, closed='right', name=None, copy=False, dtype=None): - """ - Construct an IntervalIndex from an array of splits - - Parameters - ---------- - breaks : array-like (1-dimensional) - Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - name : object, optional - Name to be stored in the index. - copy : boolean, default False - copy the data - dtype : dtype or None, default None - If None, dtype will be inferred - - .. 
versionadded:: 0.23.0 - - Examples - -------- - >>> pd.IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]] - closed='right', - dtype='interval[int64]') - - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex - IntervalIndex.from_arrays : Construct an IntervalIndex from a left and - right array - IntervalIndex.from_tuples : Construct an IntervalIndex from a - list/array of tuples - """ - breaks = maybe_convert_platform_interval(breaks) - - return cls.from_arrays(breaks[:-1], breaks[1:], closed, - name=name, copy=copy, dtype=dtype) + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) @classmethod + @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) def from_arrays(cls, left, right, closed='right', name=None, copy=False, dtype=None): - """ - Construct from two arrays defining the left and right bounds. - - Parameters - ---------- - left : array-like (1-dimensional) - Left bounds for each interval. - right : array-like (1-dimensional) - Right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - name : object, optional - Name to be stored in the index. - copy : boolean, default False - Copy the data. - dtype : dtype, optional - If None, dtype will be inferred. - - .. versionadded:: 0.23.0 - - Returns - ------- - index : IntervalIndex - - Notes - ----- - Each element of `left` must be less than or equal to the `right` - element at the same position. If an element is missing, it must be - missing in both `left` and `right`. A TypeError is raised when - using an unsupported type for `left` or `right`. At the moment, - 'category', 'object', and 'string' subtypes are not supported. 
- - Raises - ------ - ValueError - When a value is missing in only one of `left` or `right`. - When a value in `left` is greater than the corresponding value - in `right`. - - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex. - IntervalIndex.from_breaks : Construct an IntervalIndex from an array of - splits. - IntervalIndex.from_tuples : Construct an IntervalIndex from a - list/array of tuples. - - Examples - -------- - >>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]] - closed='right', - dtype='interval[int64]') - - If you want to segment different groups of people based on - ages, you can apply the method as follows: - - >>> ages = pd.IntervalIndex.from_arrays([0, 2, 13], - ... [2, 13, 19], closed='left') - >>> ages - IntervalIndex([[0, 2), [2, 13), [13, 19)] - closed='left', - dtype='interval[int64]') - >>> s = pd.Series(['baby', 'kid', 'teen'], ages) - >>> s - [0, 2) baby - [2, 13) kid - [13, 19) teen - dtype: object - - Values may be missing, but they must be missing in both arrays. - - >>> pd.IntervalIndex.from_arrays([0, np.nan, 13], - ... [2, np.nan, 19]) - IntervalIndex([(0.0, 2.0], nan, (13.0, 19.0]] - closed='right', - dtype='interval[float64]') - """ - left = maybe_convert_platform_interval(left) - right = maybe_convert_platform_interval(right) - - return cls._simple_new(left, right, closed, name=name, copy=copy, - dtype=dtype, verify_integrity=True) + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_arrays(left, right, closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) @classmethod + @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) def from_intervals(cls, data, closed=None, name=None, copy=False, dtype=None): - """ - Construct an IntervalIndex from a 1d array of Interval objects - - .. 
deprecated:: 0.23.0 - - Parameters - ---------- - data : array-like (1-dimensional) - Array of Interval objects. All intervals must be closed on the same - sides. - name : object, optional - Name to be stored in the index. - copy : boolean, default False - by-default copy the data, this is compat only and ignored - dtype : dtype or None, default None - If None, dtype will be inferred - - .. versionadded:: 0.23.0 - - Examples - -------- - >>> pd.IntervalIndex.from_intervals([pd.Interval(0, 1), - ... pd.Interval(1, 2)]) - IntervalIndex([(0, 1], (1, 2]] - closed='right', dtype='interval[int64]') - - The generic Index constructor work identically when it infers an array - of all intervals: - - >>> pd.Index([pd.Interval(0, 1), pd.Interval(1, 2)]) - IntervalIndex([(0, 1], (1, 2]] - closed='right', dtype='interval[int64]') - - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex - IntervalIndex.from_arrays : Construct an IntervalIndex from a left and - right array - IntervalIndex.from_breaks : Construct an IntervalIndex from an array of - splits - IntervalIndex.from_tuples : Construct an IntervalIndex from a - list/array of tuples - """ msg = ('IntervalIndex.from_intervals is deprecated and will be ' - 'removed in a future version; use IntervalIndex(...) instead') + 'removed in a future version; Use IntervalIndex(...) 
instead') warnings.warn(msg, FutureWarning, stacklevel=2) - return cls(data, closed=closed, name=name, copy=copy, dtype=dtype) + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) + + if name is None and isinstance(data, cls): + name = data.name + + return cls._simple_new(array, name=name) @classmethod + @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) def from_tuples(cls, data, closed='right', name=None, copy=False, dtype=None): - """ - Construct an IntervalIndex from a list/array of tuples - - Parameters - ---------- - data : array-like (1-dimensional) - Array of tuples - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - name : object, optional - Name to be stored in the index. - copy : boolean, default False - by-default copy the data, this is compat only and ignored - dtype : dtype or None, default None - If None, dtype will be inferred - - .. versionadded:: 0.23.0 - - Examples - -------- - >>> pd.IntervalIndex.from_tuples([(0, 1), (1, 2)]) - IntervalIndex([(0, 1], (1, 2]], - closed='right', dtype='interval[int64]') - - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex - IntervalIndex.from_arrays : Construct an IntervalIndex from a left and - right array - IntervalIndex.from_breaks : Construct an IntervalIndex from an array of - splits - """ - if len(data): - left, right = [], [] - else: - left = right = data - - for d in data: - if isna(d): - lhs = rhs = np.nan - else: - try: - # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...] 
- lhs, rhs = d - except ValueError: - msg = ('IntervalIndex.from_tuples requires tuples of ' - 'length 2, got {tpl}').format(tpl=d) - raise ValueError(msg) - except TypeError: - msg = ('IntervalIndex.from_tuples received an invalid ' - 'item, {tpl}').format(tpl=d) - raise TypeError(msg) - left.append(lhs) - right.append(rhs) - - return cls.from_arrays(left, right, closed, name=name, copy=False, - dtype=dtype) - - def to_tuples(self, na_tuple=True): - """ - Return an Index of tuples of the form (left, right) - - Parameters - ---------- - na_tuple : boolean, default True - Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA - value itself if False, ``nan``. - - .. versionadded:: 0.23.0 - + with rewrite_exception("IntervalArray", cls.__name__): + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(arr, name=name) + + @Appender(_interval_shared_docs['to_tuples'] % dict( + return_type="Index", + examples=""" Examples -------- >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3]) >>> idx.to_tuples() Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') >>> idx.to_tuples(na_tuple=False) - Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object') - """ - tuples = com._asarray_tuplesafe(zip(self.left, self.right)) - if not na_tuple: - # GH 18756 - tuples = np.where(~self._isnan, tuples, np.nan) + Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')""", + )) + def to_tuples(self, na_tuple=True): + tuples = self._data.to_tuples(na_tuple=na_tuple) return Index(tuples) @cache_readonly @@ -691,7 +321,7 @@ def left(self): Return the left endpoints of each Interval in the IntervalIndex as an Index """ - return self._left + return self._data._left @property def right(self): @@ -699,7 +329,7 @@ def right(self): Return the right endpoints of each Interval in the IntervalIndex as an Index """ - return self._right + return self._data._right @property def closed(self): @@ -707,42 +337,17 @@ def 
closed(self): Whether the intervals are closed on the left-side, right-side, both or neither """ - return self._closed + return self._data._closed + @Appender(_interval_shared_docs['set_closed'] % _index_doc_kwargs) def set_closed(self, closed): - """ - Return an IntervalIndex identical to the current one, but closed on the - specified side - - .. versionadded:: 0.24.0 - - Parameters - ---------- - closed : {'left', 'right', 'both', 'neither'} - Whether the intervals are closed on the left-side, right-side, both - or neither. - - Returns - ------- - new_index : IntervalIndex - - Examples - -------- - >>> index = pd.interval_range(0, 3) - >>> index - IntervalIndex([(0, 1], (1, 2], (2, 3]] - closed='right', - dtype='interval[int64]') - >>> index.set_closed('both') - IntervalIndex([[0, 1], [1, 2], [2, 3]] - closed='both', - dtype='interval[int64]') - """ if closed not in _VALID_CLOSED: msg = "invalid option for 'closed': {closed}" raise ValueError(msg.format(closed=closed)) - return self._shallow_copy(closed=closed) + # return self._shallow_copy(closed=closed) + array = self._data.set_closed(closed) + return self._simple_new(array, self.name) @property def length(self): @@ -750,23 +355,22 @@ def length(self): Return an Index with entries denoting the length of each Interval in the IntervalIndex """ - try: - return self.right - self.left - except TypeError: - # length not defined for some types, e.g. string - msg = ('IntervalIndex contains Intervals without defined length, ' - 'e.g. 
Intervals with string endpoints') - raise TypeError(msg) + return self._data.length @property def size(self): - # Avoid materializing self.values - return self.left.size + # Avoid materializing ndarray[Interval] + return self._data.size @property def shape(self): - # Avoid materializing self.values - return self.left.shape + # Avoid materializing ndarray[Interval] + return self._data.shape + + @property + def itemsize(self): + # Avoid materializing ndarray[Interval] + return self._data.itemsize def __len__(self): return len(self.left) @@ -774,13 +378,20 @@ def __len__(self): @cache_readonly def values(self): """ - Return the IntervalIndex's data as a numpy array of Interval - objects (with dtype='object') + Return the IntervalIndex's data as an IntervalArray. """ + return self._data + + @cache_readonly + def _values(self): + return self._data + + @cache_readonly + def _ndarray_values(self): left = self.left right = self.right mask = self._isnan - closed = self._closed + closed = self.closed result = np.empty(len(left), dtype=object) for i in range(len(left)): @@ -792,15 +403,12 @@ def values(self): def __array__(self, result=None): """ the array interface, return my values """ - return self.values + return self._ndarray_values def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result - def _array_values(self): - return self.values - def __reduce__(self): d = dict(left=self.left, right=self.right) @@ -809,30 +417,25 @@ def __reduce__(self): @Appender(_index_shared_docs['copy']) def copy(self, deep=False, name=None): - left = self.left.copy(deep=True) if deep else self.left - right = self.right.copy(deep=True) if deep else self.right - name = name if name is not None else self.name - closed = self.closed - return type(self).from_arrays(left, right, closed=closed, name=name) + array = self._data.copy(deep=deep) + attributes = self._get_attributes_dict() + if name is not None: + attributes.update(name=name) + + return 
self._simple_new(array, **attributes) @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): - dtype = pandas_dtype(dtype) - if is_interval_dtype(dtype) and dtype != self.dtype: - try: - new_left = self.left.astype(dtype.subtype) - new_right = self.right.astype(dtype.subtype) - except TypeError: - msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are ' - 'incompatible') - raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype)) - return self._shallow_copy(new_left, new_right) + with rewrite_exception('IntervalArray', self.__class__.__name__): + new_values = self.values.astype(dtype, copy=copy) + if is_interval_dtype(new_values): + return self._shallow_copy(new_values.left, new_values.right) return super(IntervalIndex, self).astype(dtype, copy=copy) @cache_readonly def dtype(self): """Return the dtype object of the underlying data""" - return IntervalDtype(self.left.dtype.name) + return self._data.dtype @property def inferred_type(self): @@ -851,11 +454,7 @@ def mid(self): """ Return the midpoint of each Interval in the IntervalIndex as an Index """ - try: - return 0.5 * (self.left + self.right) - except TypeError: - # datetime safe version - return self.left + 0.5 * self.length + return self._data.mid @cache_readonly def is_monotonic(self): @@ -890,25 +489,7 @@ def is_unique(self): @cache_readonly def is_non_overlapping_monotonic(self): - """ - Return True if the IntervalIndex is non-overlapping (no Intervals share - points) and is either monotonic increasing or monotonic decreasing, - else False - """ - # must be increasing (e.g., [0, 1), [1, 2), [2, 3), ... ) - # or decreasing (e.g., [-1, 0), [-2, -1), [-3, -2), ...) 
- # we already require left <= right - - # strict inequality for closed == 'both'; equality implies overlapping - # at a point when both sides of intervals are included - if self.closed == 'both': - return bool((self.right[:-1] < self.left[1:]).all() or - (self.left[:-1] > self.right[1:]).all()) - - # non-strict inequality when closed != 'both'; at least one side is - # not included in the intervals, so equality does not imply overlapping - return bool((self.right[:-1] <= self.left[1:]).all() or - (self.left[:-1] >= self.right[1:]).all()) + return self._data.is_non_overlapping_monotonic @Appender(_index_shared_docs['_convert_scalar_indexer']) def _convert_scalar_indexer(self, key, kind=None): @@ -1299,33 +880,10 @@ def _concat_same_dtype(self, to_concat, name): @Appender(_index_shared_docs['take'] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take(tuple(), kwargs) - indices = _ensure_platform_int(indices) - left, right = self.left, self.right - - if fill_value is None: - fill_value = self._na_value - mask = indices == -1 - - if not mask.any(): - # we won't change dtype here in this case - # if we don't need - allow_fill = False - - taker = lambda x: x.take(indices, allow_fill=allow_fill, - fill_value=fill_value) - - try: - new_left = taker(left) - new_right = taker(right) - except ValueError: - - # we need to coerce; migth have NA's in an - # integer dtype - new_left = taker(left.astype(float)) - new_right = taker(right.astype(float)) - - return self._shallow_copy(new_left, new_right) + result = self._data.take(indices, axis=axis, allow_fill=allow_fill, + fill_value=fill_value, **kwargs) + attributes = self._get_attributes_dict() + return self._simple_new(result, **attributes) def __getitem__(self, value): mask = self._isnan[value] @@ -1385,7 +943,7 @@ def _format_data(self, name=None): tail = [formatter(x) for x in self] summary = '[{tail}]'.format(tail=', '.join(tail)) - return summary + 
self._format_space() + return summary + ',' + self._format_space() def _format_attrs(self): attrs = [('closed', repr(self.closed))] diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index cc3f140d70832..e62d70847437c 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -205,7 +205,9 @@ def _hash_categorical(c, encoding, hash_key): ------- ndarray of hashed values array, same size as len(c) """ - hashed = hash_array(c.categories.values, encoding, hash_key, + # Convert ExtensionArrays to ndarrays + values = np.asarray(c.categories.values) + hashed = hash_array(values, encoding, hash_key, categorize=False) # we have uint64, as we don't directly support missing values diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 03a5e8528f72d..7a1e72637f4ce 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -57,6 +57,7 @@ Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, Categorical, CategoricalIndex, IntervalIndex, Interval, TimedeltaIndex) +from pandas.core.arrays import IntervalArray from pandas.core.sparse.api import SparseSeries, SparseDataFrame from pandas.core.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame @@ -402,13 +403,17 @@ def encode(obj): u'freq': u_safe(getattr(obj, 'freqstr', None)), u'tz': tz, u'compress': compressor} - elif isinstance(obj, IntervalIndex): - return {u'typ': u'interval_index', + elif isinstance(obj, (IntervalIndex, IntervalArray)): + if isinstance(obj, IntervalIndex): + typ = u'interval_index' + else: + typ = u'interval_array' + return {u'typ': typ, u'klass': u(obj.__class__.__name__), u'name': getattr(obj, 'name', None), - u'left': getattr(obj, '_left', None), - u'right': getattr(obj, '_right', None), - u'closed': getattr(obj, '_closed', None)} + u'left': getattr(obj, 'left', None), + u'right': getattr(obj, 'right', None), + u'closed': getattr(obj, 'closed', None)} elif isinstance(obj, MultiIndex): return {u'typ': u'multi_index', u'klass': 
u(obj.__class__.__name__), @@ -610,7 +615,7 @@ def decode(obj): result = result.tz_localize('UTC').tz_convert(tz) return result - elif typ == u'interval_index': + elif typ in (u'interval_index', 'interval_array'): return globals()[obj[u'klass']].from_arrays(obj[u'left'], obj[u'right'], obj[u'closed'], diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 62e0f1cb717f0..02ac7fc7d5ed7 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -552,10 +552,8 @@ def test_basic(self): s = Series(ii, name='A') - # dtypes - # series results in object dtype currently, - assert not is_interval_dtype(s.dtype) - assert not is_interval_dtype(s) + assert is_interval_dtype(s.dtype) + assert is_interval_dtype(s) def test_basic_dtype(self): assert is_interval_dtype('interval[int64]') diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 8fd3d1a57f6c8..0832e9f7d08df 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -161,6 +161,10 @@ class TestGroupby(BaseDecimal, base.BaseGroupbyTests): pass +class TestSetitem(BaseDecimal, base.BaseSetitemTests): + pass + + # TODO(extension) @pytest.mark.xfail(reason=( "raising AssertionError as this is not implemented, " diff --git a/pandas/tests/extension/interval/__init__.py b/pandas/tests/extension/interval/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/extension/interval/test_interval.py b/pandas/tests/extension/interval/test_interval.py new file mode 100644 index 0000000000000..a10a56ddfdfac --- /dev/null +++ b/pandas/tests/extension/interval/test_interval.py @@ -0,0 +1,193 @@ +import pytest +import numpy as np + +from pandas import Index, Interval, IntervalIndex, date_range, timedelta_range +from pandas.core.arrays import IntervalArray +from pandas.core.dtypes.dtypes import IntervalDtype +from 
pandas.tests.extension import base +import pandas.util.testing as tm + + +def make_data(): + N = 100 + left = np.random.uniform(size=N).cumsum() + right = left + np.random.uniform(size=N) + return [Interval(l, r) for l, r in zip(left, right)] + + +@pytest.fixture(params=[ + (Index([0, 2, 4]), Index([1, 3, 5])), + (Index([0., 1., 2.]), Index([1., 2., 3.])), + (timedelta_range('0 days', periods=3), + timedelta_range('1 day', periods=3)), + (date_range('20170101', periods=3), date_range('20170102', periods=3)), + (date_range('20170101', periods=3, tz='US/Eastern'), + date_range('20170102', periods=3, tz='US/Eastern'))], + ids=lambda x: str(x[0].dtype)) +def left_right_dtypes(request): + """ + Fixture for building an IntervalArray from various dtypes + """ + return request.param + + +@pytest.fixture +def dtype(): + return IntervalDtype() + + +@pytest.fixture +def data(): + """Length-100 IntervalArray for semantics test.""" + return IntervalArray(make_data()) + + +@pytest.fixture +def data_missing(): + """Length 2 array with [NA, Valid]""" + return IntervalArray.from_tuples([None, (0, 1)]) + + +@pytest.fixture +def data_repeated(): + """Return different versions of data for count times""" + def gen(count): + for _ in range(count): + yield IntervalArray(make_data()) + yield gen + + +@pytest.fixture +def data_for_sorting(): + return IntervalArray.from_tuples([(1, 2), (2, 3), (0, 1)]) + + +@pytest.fixture +def data_missing_for_sorting(): + return IntervalArray.from_tuples([(1, 2), None, (0, 1)]) + + +@pytest.fixture +def na_value(): + return np.nan + + +@pytest.fixture +def data_for_grouping(): + a = (0, 1) + b = (1, 2) + c = (2, 3) + return IntervalArray.from_tuples([b, b, None, None, a, a, b, c]) + + +class BaseInterval(object): + pass + + +class TestDtype(BaseInterval, base.BaseDtypeTests): + + def test_array_type_with_arg(self, data, dtype): + assert dtype.construct_array_type() is IntervalArray + + +class TestCasting(BaseInterval, base.BaseCastingTests): + pass + +
+class TestConstructors(BaseInterval, base.BaseConstructorsTests): + pass + + +class TestGetitem(BaseInterval, base.BaseGetitemTests): + pass + + +class TestGrouping(BaseInterval, base.BaseGroupbyTests): + pass + + +class TestInterface(BaseInterval, base.BaseInterfaceTests): + pass + + +class TestMethods(BaseInterval, base.BaseMethodsTests): + @pytest.mark.parametrize('repeats', [0, 1, 5]) + def test_repeat(self, left_right_dtypes, repeats): + left, right = left_right_dtypes + result = IntervalArray.from_arrays(left, right).repeat(repeats) + expected = IntervalArray.from_arrays( + left.repeat(repeats), right.repeat(repeats)) + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize('bad_repeats, msg', [ + (-1, 'negative dimensions are not allowed'), + ('foo', r'invalid literal for (int|long)\(\) with base 10')]) + def test_repeat_errors(self, bad_repeats, msg): + array = IntervalArray.from_breaks(range(4)) + with tm.assert_raises_regex(ValueError, msg): + array.repeat(bad_repeats) + + @pytest.mark.parametrize('new_closed', [ + 'left', 'right', 'both', 'neither']) + def test_set_closed(self, closed, new_closed): + # GH 21670 + array = IntervalArray.from_breaks(range(10), closed=closed) + result = array.set_closed(new_closed) + expected = IntervalArray.from_breaks(range(10), closed=new_closed) + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.skip(reason='addition is not defined for intervals') + def test_combine_add(self, data_repeated): + pass + + +class TestMissing(BaseInterval, base.BaseMissingTests): + # Index.fillna only accepts scalar `value`, so we have to skip all + # non-scalar fill tests. 
+ unsupported_fill = pytest.mark.skip("Unsupported fillna option.") + + @unsupported_fill + def test_fillna_limit_pad(self): + pass + + @unsupported_fill + def test_fillna_series_method(self): + pass + + @unsupported_fill + def test_fillna_limit_backfill(self): + pass + + @unsupported_fill + def test_fillna_series(self): + pass + + def test_non_scalar_raises(self, data_missing): + msg = "Got a 'list' instead." + with tm.assert_raises_regex(TypeError, msg): + data_missing.fillna([1, 1]) + + +class TestReshaping(BaseInterval, base.BaseReshapingTests): + pass + + +class TestSetitem(BaseInterval, base.BaseSetitemTests): + + def test_set_na(self, left_right_dtypes): + left, right = left_right_dtypes + result = IntervalArray.from_arrays(left, right) + result[0] = np.nan + + expected_left = Index([left._na_value] + list(left[1:])) + expected_right = Index([right._na_value] + list(right[1:])) + expected = IntervalArray.from_arrays(expected_left, expected_right) + + self.assert_extension_array_equal(result, expected) + + +def test_repr_matches(): + idx = IntervalIndex.from_breaks([1, 2, 3]) + a = repr(idx) + b = repr(idx.values) + assert a.replace("Index", "Array") == b diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 589134632c7e9..44b818be84e31 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -70,7 +70,6 @@ def test_astype_no_copy(): @pytest.mark.parametrize('dtype', [ dtypes.DatetimeTZDtype('ns', 'US/Central'), dtypes.PeriodDtype("D"), - dtypes.IntervalDtype(), ]) def test_is_not_extension_array_dtype(dtype): assert not isinstance(dtype, dtypes.ExtensionDtype) @@ -79,6 +78,7 @@ def test_is_not_extension_array_dtype(dtype): @pytest.mark.parametrize('dtype', [ dtypes.CategoricalDtype(), + dtypes.IntervalDtype(), ]) def test_is_extension_array_dtype(dtype): assert isinstance(dtype, dtypes.ExtensionDtype) diff --git a/pandas/tests/indexes/common.py 
b/pandas/tests/indexes/common.py index f78bd583288a4..bb82d5578481b 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -886,7 +886,7 @@ def test_hasnans_isnans(self): assert not idx.hasnans idx = index.copy() - values = idx.values + values = np.asarray(idx.values) if len(index) == 0: continue @@ -928,7 +928,7 @@ def test_fillna(self): idx.fillna([idx[0]]) idx = index.copy() - values = idx.values + values = np.asarray(idx.values) if isinstance(index, DatetimeIndexOpsMixin): values[1] = iNaT diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index 3745f79d7d65d..d46e19ef56dd0 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -8,6 +8,7 @@ Interval, IntervalIndex, Index, Int64Index, Float64Index, Categorical, CategoricalIndex, date_range, timedelta_range, period_range, notna) from pandas.compat import lzip +from pandas.core.arrays import IntervalArray from pandas.core.dtypes.common import is_categorical_dtype from pandas.core.dtypes.dtypes import IntervalDtype import pandas.core.common as com @@ -74,7 +75,7 @@ def test_constructor_nan(self, constructor, breaks, closed): assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result.values, expected_values) + tm.assert_numpy_array_equal(result._ndarray_values, expected_values) @pytest.mark.parametrize('breaks', [ [], @@ -93,7 +94,7 @@ def test_constructor_empty(self, constructor, breaks, closed): assert result.empty assert result.closed == closed assert result.dtype.subtype == expected_subtype - tm.assert_numpy_array_equal(result.values, expected_values) + tm.assert_numpy_array_equal(result._ndarray_values, expected_values) @pytest.mark.parametrize('breaks', [ tuple('0123456789'), @@ -348,6 +349,17 @@ def test_override_inferred_closed(self, constructor, data, closed): result = constructor(data, 
closed=closed) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('values_constructor', [ + list, np.array, IntervalIndex, IntervalArray]) + def test_index_object_dtype(self, values_constructor): + # Index(intervals, dtype=object) is an Index (not an IntervalIndex) + intervals = [Interval(0, 1), Interval(1, 2), Interval(2, 3)] + values = values_constructor(intervals) + result = Index(values, dtype=object) + + assert type(result) is Index + tm.assert_numpy_array_equal(result.values, np.array(values)) + class TestFromIntervals(TestClassConstructors): """ @@ -368,3 +380,7 @@ def test_deprecated(self): ivs = [Interval(0, 1), Interval(1, 2)] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): IntervalIndex.from_intervals(ivs) + + @pytest.mark.skip(reason='parent class test that is not applicable') + def test_index_object_dtype(self): + pass diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 431833f2627d8..0dc5970c22803 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -50,7 +50,6 @@ def test_properties(self, closed): ivs = [Interval(l, r, closed) for l, r in zip(range(10), range(1, 11))] expected = np.array(ivs, dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) - tm.assert_numpy_array_equal(index.values, expected) # with nans index = self.create_index_with_nan(closed=closed) @@ -71,7 +70,6 @@ def test_properties(self, closed): for l, r in zip(expected_left, expected_right)] expected = np.array(ivs, dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) - tm.assert_numpy_array_equal(index.values, expected) @pytest.mark.parametrize('breaks', [ [1, 1, 2, 5, 15, 53, 217, 1014, 5335, 31240, 201608], @@ -136,7 +134,7 @@ def test_ensure_copied_data(self, closed): check_same='same') # by-definition make a copy - result = IntervalIndex(index.values, copy=False) + result = 
IntervalIndex(index._ndarray_values, copy=False) tm.assert_numpy_array_equal(index.left.values, result.left.values, check_same='copy') tm.assert_numpy_array_equal(index.right.values, result.right.values, @@ -978,6 +976,24 @@ def test_to_tuples_na(self, tuples, na_tuple): else: assert isna(result_na) + def test_nbytes(self): + # GH 19209 + left = np.arange(0, 4, dtype='i8') + right = np.arange(1, 5, dtype='i8') + + result = IntervalIndex.from_arrays(left, right).nbytes + expected = 64 # 4 * 8 * 2 + assert result == expected + + def test_itemsize(self): + # GH 19209 + left = np.arange(0, 4, dtype='i8') + right = np.arange(1, 5, dtype='i8') + + result = IntervalIndex.from_arrays(left, right).itemsize + expected = 16 # 8 * 2 + assert result == expected + @pytest.mark.parametrize('new_closed', [ 'left', 'right', 'both', 'neither']) def test_set_closed(self, name, closed, new_closed): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index a5d83c1c26948..31e5bd88523d2 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1200,7 +1200,8 @@ def test_iter_box(self): 'datetime64[ns, US/Central]'), (pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]'), (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), - (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray, 'object'), + (pd.IntervalIndex.from_breaks([0, 1, 2]), pd.core.arrays.IntervalArray, + 'interval'), ]) def test_values_consistent(array, expected_type, dtype): l_values = pd.Series(array)._values @@ -1214,6 +1215,8 @@ def test_values_consistent(array, expected_type, dtype): tm.assert_index_equal(l_values, r_values) elif pd.api.types.is_categorical(l_values): tm.assert_categorical_equal(l_values, r_values) + elif pd.api.types.is_interval_dtype(l_values): + tm.assert_interval_array_equal(l_values, r_values) else: raise TypeError("Unexpected type {}".format(type(l_values))) diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index 
95ea4658212e9..dee01ab6efff6 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import textwrap import os import pandas as pd import pytest @@ -820,6 +821,21 @@ def test_categorical_equal_message(self): tm.assert_categorical_equal(a, b) +class TestAssertIntervalArrayEqual(object): + def test_interval_array_equal_message(self): + a = pd.interval_range(0, periods=4).values + b = pd.interval_range(1, periods=4).values + + msg = textwrap.dedent("""\ + IntervalArray.left are different + + IntervalArray.left values are different \\(100.0 %\\) + \\[left\\]: Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\) + \\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""") + with tm.assert_raises_regex(AssertionError, msg): + tm.assert_interval_array_equal(a, b) + + class TestRNGContext(object): def test_RNGContext(self): diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index 667c5d9526563..c9e6e27363aed 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -163,6 +163,14 @@ def _make_table(self, ax, df, title, height=None): ax.axis('off') +class _WritableDoc(type): + # Remove this when Python2 support is dropped + # __doc__ is not mutable for new-style classes in Python2, which means + # we can't use @Appender to share class docstrings. This can be used + # with `add_metaclass` to make cls.__doc__ mutable. 
+ pass + + if __name__ == "__main__": import matplotlib.pyplot as plt diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py new file mode 100644 index 0000000000000..953c8a43a21b8 --- /dev/null +++ b/pandas/util/_exceptions.py @@ -0,0 +1,16 @@ +import contextlib + + +@contextlib.contextmanager +def rewrite_exception(old_name, new_name): + """Replace old_name with new_name in the message of a raised exception.""" + try: + yield + except Exception as e: + # Exceptions raised without arguments have no message to rewrite. + if not e.args: + raise + msg = e.args[0].replace(old_name, new_name) + # Preserve any additional arguments after the message. + e.args = (msg,) + e.args[1:] + raise diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 54ae8cfb3d39e..9697c991122dd 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -19,7 +19,7 @@ import numpy as np import pandas as pd -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ExtensionArray, IntervalArray from pandas.core.dtypes.missing import array_equivalent from pandas.core.dtypes.common import ( is_datetimelike_v_numeric, @@ -885,7 +885,7 @@ def _get_ilevel_values(index, level): assert_attr_equal('freq', left, right, obj=obj) if (isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex)): - assert_attr_equal('closed', left, right, obj=obj) + assert_interval_array_equal(left.values, right.values) if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): @@ -1023,6 +1023,31 @@ def assert_categorical_equal(left, right, check_dtype=True, assert_attr_equal('ordered', left, right, obj=obj) +def assert_interval_array_equal(left, right, exact='equiv', + obj='IntervalArray'): + """Test that two IntervalArrays are equivalent. + + Parameters + ---------- + left, right : IntervalArray + The IntervalArrays to compare. + exact : bool / string {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. If 'equiv', then RangeIndex can be substituted for + Int64Index as well.
+ obj : str, default 'IntervalArray' + Specify object name being compared, internally used to show appropriate + assertion message + """ + _check_isinstance(left, right, IntervalArray) + + assert_index_equal(left.left, right.left, exact=exact, + obj='{obj}.left'.format(obj=obj)) + assert_index_equal(left.right, right.right, exact=exact, + obj='{obj}.right'.format(obj=obj)) + assert_attr_equal('closed', left, right, obj=obj) + + def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(left, np.ndarray): left = pprint_thing(left) @@ -1251,10 +1276,7 @@ def assert_series_equal(left, right, check_dtype=True, assert_numpy_array_equal(left.get_values(), right.get_values(), check_dtype=check_dtype) elif is_interval_dtype(left) or is_interval_dtype(right): - # TODO: big hack here - left = pd.IntervalIndex(left) - right = pd.IntervalIndex(right) - assert_index_equal(left, right, obj='{obj}.index'.format(obj=obj)) + assert_interval_array_equal(left.values, right.values) elif (is_extension_array_dtype(left) and not is_categorical_dtype(left) and is_extension_array_dtype(right) and not is_categorical_dtype(right)):