REF: Define extension base classes

pandas-dev · Jan 18, 2018 · 2ef5216 · 2ef5216
1 parent ca2d261
commit 2ef5216
Show file tree

Hide file tree

Showing 9 changed files with 566 additions and 80 deletions.
diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py
@@ -1 +1,2 @@
+from .base import ExtensionArray  # noqa
 from .categorical import Categorical  # noqa
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -0,0 +1,201 @@
+"""An interface for extending pandas with custom arrays."""
+import abc
+
+import numpy as np
+
+from pandas.compat import add_metaclass
+
+
+_not_implemented_message = "{} does not implement {}."
+
+
+@add_metaclass(abc.ABCMeta)
+class ExtensionArray(object):
+    """Abstract base class for custom array types.
+
+    pandas will recognize instances of this class as proper arrays
+    with a custom type and will not attempt to coerce them to objects.
+
+    Subclasses must implement every member decorated with
+    ``abc.abstractmethod`` below. The remaining members have default
+    implementations that may be overridden.
+    """
+    # ------------------------------------------------------------------------
+    # Must be a Sequence
+    # ------------------------------------------------------------------------
+    @abc.abstractmethod
+    def __getitem__(self, item):
+        # type: (Any) -> Any
+        """Select a subset of self.
+
+        Notes
+        -----
+        As a sequence, __getitem__ should expect integer or slice ``item``.
+
+        For slice ``item``, you should return an instance of yourself, even
+        if the slice is length 0 or 1.
+
+        For scalar ``item``, you may return a scalar suitable for your type.
+        The scalar need not be an instance or subclass of your array type.
+
+        The default ``value_counts`` additionally indexes with a boolean
+        mask when ``dropna`` is True.
+        """
+
+    def __setitem__(self, key, value):
+        # type: (Any, Any) -> None
+        """Set values in place; not supported by default.
+
+        Raises
+        ------
+        NotImplementedError
+            Always, unless a subclass overrides this method.
+        """
+        raise NotImplementedError(_not_implemented_message.format(
+            type(self), '__setitem__')
+        )
+
+    @abc.abstractmethod
+    def __iter__(self):
+        # type: () -> Iterator
+        """Iterate over the elements of the array."""
+        pass
+
+    @abc.abstractmethod
+    def __len__(self):
+        # type: () -> int
+        """Length of the array; also used by the default ``shape``."""
+        pass
+
+    # ------------------------------------------------------------------------
+    # Required attributes
+    # ------------------------------------------------------------------------
+    @property
+    def base(self):
+        # type: () -> Any
+        """The base array I am a view of. None by default."""
+        return None
+
+    @property
+    @abc.abstractmethod
+    def dtype(self):
+        # type: () -> ExtensionDtype
+        """An instance of 'ExtensionDtype'."""
+        pass
+
+    @property
+    def shape(self):
+        # type: () -> Tuple[int, ...]
+        """One-element tuple holding ``len(self)``."""
+        return (len(self),)
+
+    @property
+    def ndim(self):
+        # type: () -> int
+        """Extension Arrays are only allowed to be 1-dimensional."""
+        return 1
+
+    @property
+    @abc.abstractmethod
+    def nbytes(self):
+        # type: () -> int
+        """The number of bytes needed to store this object in memory."""
+        pass
+
+    # ------------------------------------------------------------------------
+    # Additional Methods
+    # ------------------------------------------------------------------------
+    @abc.abstractmethod
+    def isna(self):
+        # type: () -> np.ndarray
+        """Boolean NumPy array indicating if each value is missing."""
+        pass
+
+    # ------------------------------------------------------------------------
+    # Indexing methods
+    # ------------------------------------------------------------------------
+    @abc.abstractmethod
+    def take(self, indexer, allow_fill=True, fill_value=None):
+        # type: (Sequence, bool, Optional[Any]) -> ExtensionArray
+        """For slicing"""
+
+    def take_nd(self, indexer, allow_fill=True, fill_value=None):
+        # type: (Sequence, bool, Optional[Any]) -> ExtensionArray
+        """For slicing; forwards every argument to ``take``."""
+        # TODO: this isn't really necessary for 1-D
+        return self.take(indexer, allow_fill=allow_fill,
+                         fill_value=fill_value)
+
+    @abc.abstractmethod
+    def copy(self, deep=False):
+        # type: (bool) -> ExtensionArray
+        """Return a copy of the array."""
+
+    # ------------------------------------------------------------------------
+    # Block-related methods
+    # ------------------------------------------------------------------------
+    @property
+    def _fill_value(self):
+        # type: () -> Any
+        """The missing value for this type, e.g. np.nan. None by default."""
+        return None
+
+    @abc.abstractmethod
+    def _formatting_values(self):
+        # type: () -> np.ndarray
+        # At the moment, this has to be an array since we use result.dtype
+        """An array of values to be printed in, e.g. the Series repr"""
+
+    @classmethod
+    @abc.abstractmethod
+    def _concat_same_type(cls, to_concat):
+        # type: (Sequence[ExtensionArray]) -> ExtensionArray
+        """Concatenate multiple arrays.
+
+        Parameters
+        ----------
+        to_concat : sequence of this type
+
+        Returns
+        -------
+        ExtensionArray
+        """
+
+    @abc.abstractmethod
+    def get_values(self):
+        # type: () -> np.ndarray
+        """Get the underlying values backing your data
+        """
+        pass
+
+    @property
+    def _can_hold_na(self):
+        # type: () -> bool
+        """Whether your array can hold missing values. True by default.
+
+        Notes
+        -----
+        Setting this to false will optimize some operations like fillna.
+
+        This is a property so that it is read the same way as the
+        ``Categorical._can_hold_na`` override, which is also a property.
+        """
+        return True
+
+    @property
+    def is_sparse(self):
+        # type: () -> bool
+        """Whether your array is sparse. False by default."""
+        return False
+
+    def _slice(self, slicer):
+        # type: (Union[tuple, Sequence, int]) -> 'ExtensionArray'
+        """Return a new array sliced by `slicer`.
+
+        Parameters
+        ----------
+        slicer : slice or np.ndarray
+            If an array, it should just be a boolean mask
+
+        Returns
+        -------
+        array : ExtensionArray
+            Should return an ExtensionArray, even if ``self[slicer]``
+            would return a scalar.
+        """
+        # Re-wrap through the constructor so a scalar result from
+        # ``__getitem__`` is still handed back as an array.
+        return type(self)(self[slicer])
+
+    def value_counts(self, dropna=True):
+        # type: (bool) -> Series
+        """Optional method for computing the histogram of the counts.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            whether to exclude missing values from the computation
+
+        Returns
+        -------
+        counts : Series
+
+        Notes
+        -----
+        The default implementation converts the values with
+        ``np.asarray`` and, when ``dropna`` is True, indexes ``self``
+        with a boolean mask.
+        """
+        from pandas.core.algorithms import value_counts
+        if dropna:
+            # XXX: this imposes boolean indexing
+            values = self[~np.asarray(self.isna())]
+        else:
+            # Keep the missing values so they can be counted; masking
+            # them out unconditionally made ``dropna=False`` a no-op.
+            values = self
+        return value_counts(np.asarray(values), dropna=dropna)
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -44,6 +44,8 @@
 from pandas.util._validators import validate_bool_kwarg
 from pandas.core.config import get_option
 
+from .base import ExtensionArray
+
 
 def _cat_compare_op(op):
     def f(self, other):
@@ -149,7 +151,7 @@ def _maybe_to_categorical(array):
 """
 
 
-class Categorical(PandasObject):
+class Categorical(ExtensionArray, PandasObject):
     """
     Represents a categorical variable in classic R / S-plus fashion
 
@@ -2131,6 +2133,20 @@ def repeat(self, repeats, *args, **kwargs):
         return self._constructor(values=codes, categories=self.categories,
                                  ordered=self.ordered, fastpath=True)
 
+    # Interface things
+    # can_hold_na, concat_same_type, formatting_values
+    @property
+    def _can_hold_na(self):
+        """Whether this array can hold missing values; always True."""
+        return True
+
+    @classmethod
+    def _concat_same_type(cls, to_concat):
+        # type: (Sequence[Categorical]) -> Categorical
+        """Concatenate multiple Categoricals into a single one.
+
+        Parameters
+        ----------
+        to_concat : sequence of Categorical
+
+        Returns
+        -------
+        Categorical
+            The result of ``union_categoricals(to_concat)``.
+        """
+        # Imported lazily, as in the original. The import now targets
+        # ``pandas.core.dtypes``, the package the rest of this change
+        # uses, rather than the legacy ``pandas.types`` path.
+        from pandas.core.dtypes.concat import union_categoricals
+        return union_categoricals(to_concat)
+
+    def _formatting_values(self):
+        """Values to be printed, e.g. in a repr: the Categorical itself."""
+        return self
+
 # The Series.cat accessor
 
 

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -0,0 +1,92 @@
+"""Extend pandas with custom array types"""
+import abc
+
+from pandas.compat import add_metaclass
+
+
+@add_metaclass(abc.ABCMeta)
+class ExtensionDtype(object):
+    """A custom data type for your array.
+
+    Subclasses must define ``name``. The default
+    ``construct_from_string`` compares its argument with ``cls.name``,
+    so ``name`` has to be readable on the class itself (for example a
+    plain class attribute) for that comparison to ever succeed.
+    """
+    @property
+    def type(self):
+        """Typically a metaclass inheriting from 'type' with no methods.
+
+        The default builds a new, empty class named ``self.name`` on
+        every access.
+        """
+        return type(self.name, (), {})
+
+    @property
+    def kind(self):
+        """A character code (one of 'biufcmMOSUV'), default 'O'
+
+        See Also
+        --------
+        numpy.dtype.kind
+        """
+        return 'O'
+
+    @property
+    @abc.abstractmethod
+    def name(self):
+        """A string identifying the data type.
+
+        Will be used in, e.g. ``Series.dtype``
+        """
+
+    @property
+    def names(self):
+        """Ordered list of field names, or None if there are no fields"""
+        return None
+
+    @classmethod
+    def construct_from_string(cls, string):
+        """Attempt to construct this type from a string.
+
+        Parameters
+        ----------
+        string : str
+
+        Returns
+        -------
+        self : instance of 'cls'
+
+        Raises
+        ------
+        TypeError
+            If 'string' does not equal ``cls.name``.
+
+        Notes
+        -----
+        The default implementation checks if 'string' matches your
+        type's name. If so, it calls your class with no arguments.
+        """
+        if string == cls.name:
+            return cls()
+        raise TypeError("Cannot construct a '{}' from "
+                        "'{}'".format(cls, string))
+
+    @classmethod
+    def is_dtype(cls, dtype):
+        """Check if we match 'dtype'
+
+        Parameters
+        ----------
+        dtype : str or dtype
+
+        Returns
+        -------
+        is_dtype : bool
+
+        Notes
+        -----
+        The default implementation is True if
+
+        1. 'dtype' is an instance of ``cls``.
+        2. 'dtype' is ``cls`` or a subclass of ``cls``.
+        3. 'dtype' is a string that returns true for
+           ``cls.construct_from_string``
+
+        Any other object gives False rather than raising.
+        """
+        if isinstance(dtype, cls):
+            return True
+        if isinstance(dtype, type):
+            # ``issubclass`` raises TypeError for non-class arguments,
+            # so it is only reached for real classes.
+            return issubclass(dtype, cls)
+        # Everything else is treated like a string. This also covers
+        # Python 2 ``unicode``, which ``isinstance(dtype, str)`` missed.
+        try:
+            return isinstance(cls.construct_from_string(dtype), cls)
+        except TypeError:
+            return False
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
@@ -1685,6 +1685,38 @@ def is_extension_type(arr):
     return False
 
 
+def is_extension_array_dtype(arr_or_dtype):
+    """Return whether an object is a pandas extension array or dtype.
+
+    Parameters
+    ----------
+    arr_or_dtype : object
+        A Series is unwrapped to its ``.values`` before the check.
+
+    Returns
+    -------
+    bool
+        True if the (unwrapped) object is an instance of
+        ``ExtensionDtype`` or ``ExtensionArray``.
+
+    Notes
+    -----
+    This checks whether an object implements the pandas extension
+    array interface. In pandas, this includes:
+
+    * Categorical
+    * PeriodArray
+    * IntervalArray
+    * SparseArray
+
+    Third-party libraries may implement arrays or types satisfying
+    this interface as well.
+    """
+    from pandas.core.arrays import ExtensionArray
+
+    # Only Series are unpacked; any other object is tested as given.
+    candidate = (arr_or_dtype.values
+                 if isinstance(arr_or_dtype, ABCSeries)
+                 else arr_or_dtype)
+    return isinstance(candidate, (ExtensionDtype, ExtensionArray))
+
+
 def is_complex_dtype(arr_or_dtype):
     """
     Check whether the provided array or dtype is of a complex dtype.