Skip to content

Commit

Permalink
REF: Define extension base classes
Browse files Browse the repository at this point in the history
  • Loading branch information
TomAugspurger committed Jan 18, 2018
1 parent ca2d261 commit 2ef5216
Show file tree
Hide file tree
Showing 9 changed files with 566 additions and 80 deletions.
1 change: 1 addition & 0 deletions pandas/core/arrays/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .base import ExtensionArray # noqa
from .categorical import Categorical # noqa
201 changes: 201 additions & 0 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
"""An interface for extending pandas with custom arrays."""
import abc

import numpy as np

from pandas.compat import add_metaclass


# Template for NotImplementedError messages raised by optional methods:
# first placeholder is the array type, second is the missing method's name.
_not_implemented_message = "{} does not implement {}."


@add_metaclass(abc.ABCMeta)
class ExtensionArray(object):
    """Abstract base class for custom array types.

    pandas will recognize instances of this class as proper arrays
    with a custom type and will not attempt to coerce them to objects.

    Subclasses must implement every abstract member declared below:
    ``__getitem__``, ``__iter__``, ``__len__``, ``dtype``, ``nbytes``,
    ``isna``, ``take``, ``copy``, ``_formatting_values``,
    ``_concat_same_type`` and ``get_values``.  All other members come with
    a default implementation that subclasses may override.
    """

    # ------------------------------------------------------------------------
    # Must be a Sequence
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def __getitem__(self, item):
        # type: (Any) -> Any
        """Select a subset of self.

        Notes
        -----
        As a sequence, __getitem__ should expect integer or slice ``item``.

        For slice ``item``, you should return an instance of yourself, even
        if the slice is length 0 or 1.

        For scalar ``item``, you may return a scalar suitable for your type.
        The scalar need not be an instance or subclass of your array type.
        """

    def __setitem__(self, key, value):
        # type: (Any, Any) -> None
        """Item assignment is optional; the default always raises.

        Raises
        ------
        NotImplementedError
            Always, naming the concrete array type and ``__setitem__``.
        """
        raise NotImplementedError(_not_implemented_message.format(
            type(self), '__setitem__')
        )

    @abc.abstractmethod
    def __iter__(self):
        # type: () -> Iterator
        """Iterate over the elements of the array."""

    @abc.abstractmethod
    def __len__(self):
        # type: () -> int
        """Return the number of elements in the array."""

    # ------------------------------------------------------------------------
    # Required attributes
    # ------------------------------------------------------------------------
    @property
    def base(self):
        """The base array I am a view of. None by default."""
        return None

    @property
    @abc.abstractmethod
    def dtype(self):
        # type: () -> ExtensionDtype
        """An instance of 'ExtensionDtype'."""

    @property
    def shape(self):
        # type: () -> Tuple[int, ...]
        """One-element tuple holding the length of the array."""
        return (len(self),)

    @property
    def ndim(self):
        # type: () -> int
        """Extension Arrays are only allowed to be 1-dimensional."""
        return 1

    @property
    @abc.abstractmethod
    def nbytes(self):
        # type: () -> int
        """The number of bytes needed to store this object in memory."""

    # ------------------------------------------------------------------------
    # Additional Methods
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def isna(self):
        # type: () -> np.ndarray
        """Boolean NumPy array indicating if each value is missing."""

    # ------------------------------------------------------------------------
    # Indexing methods
    # ------------------------------------------------------------------------
    @abc.abstractmethod
    def take(self, indexer, allow_fill=True, fill_value=None):
        # type: (Sequence, bool, Optional[Any]) -> ExtensionArray
        """Select the elements located at ``indexer`` (used for slicing).

        Parameters
        ----------
        indexer : sequence
            Positions to take.
        allow_fill : bool, default True
        fill_value : any, default None
            NOTE(review): the exact fill semantics are not specified by
            this base class -- confirm against the callers.

        Returns
        -------
        ExtensionArray
        """

    def take_nd(self, indexer, allow_fill=True, fill_value=None):
        """For slicing; forwards every argument to :meth:`take`."""
        # TODO: this isn't really necessary for 1-D
        return self.take(indexer, allow_fill=allow_fill,
                         fill_value=fill_value)

    @abc.abstractmethod
    def copy(self, deep=False):
        # type: (bool) -> ExtensionArray
        """Return a copy of the array."""

    # ------------------------------------------------------------------------
    # Block-related methods
    # ------------------------------------------------------------------------
    @property
    def _fill_value(self):
        # type: () -> Any
        """The missing value for this type, e.g. np.nan. None by default."""
        return None

    @abc.abstractmethod
    def _formatting_values(self):
        # type: () -> np.ndarray
        # At the moment, this has to be an array since we use result.dtype
        """An array of values to be printed in, e.g. the Series repr"""

    @classmethod
    @abc.abstractmethod
    def _concat_same_type(cls, to_concat):
        # type: (Sequence[ExtensionArray]) -> ExtensionArray
        """Concatenate multiple arrays of this type.

        Parameters
        ----------
        to_concat : sequence of this type

        Returns
        -------
        ExtensionArray
        """

    @abc.abstractmethod
    def get_values(self):
        # type: () -> np.ndarray
        """Get the underlying values backing your data."""

    @property
    def _can_hold_na(self):
        # type: () -> bool
        """Whether your array can hold missing values. True by default.

        Notes
        -----
        Setting this to false will optimize some operations like fillna.

        This is a property (as on ``Categorical``): with a plain method,
        an attribute-style check such as ``if arr._can_hold_na`` would
        test the bound method, which is always truthy.
        """
        return True

    @property
    def is_sparse(self):
        # type: () -> bool
        """Whether your array is sparse. False by default."""
        return False

    def _slice(self, slicer):
        # type: (Union[tuple, Sequence, int]) -> 'ExtensionArray'
        """Return a new array sliced by `slicer`.

        Parameters
        ----------
        slicer : slice or np.ndarray
            If an array, it should just be a boolean mask

        Returns
        -------
        array : ExtensionArray
            Should return an ExtensionArray, even if ``self[slicer]``
            would return a scalar.
        """
        # Re-wrapping through the constructor guarantees an array result
        # even when ``self[slicer]`` hands back a scalar.
        return type(self)(self[slicer])

    def value_counts(self, dropna=True):
        """Optional method for computing the histogram of the counts.

        Parameters
        ----------
        dropna : bool, default True
            whether to exclude missing values from the computation

        Returns
        -------
        counts : Series
        """
        from pandas.core.algorithms import value_counts

        if dropna:
            # XXX: this imposes boolean indexing on subclasses
            values = self[~np.asarray(self.isna())]
        else:
            # Keep the missing entries so that ``dropna=False`` is honoured;
            # previously they were stripped unconditionally.
            values = self
        return value_counts(np.asarray(values), dropna=dropna)
18 changes: 17 additions & 1 deletion pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
from pandas.util._validators import validate_bool_kwarg
from pandas.core.config import get_option

from .base import ExtensionArray


def _cat_compare_op(op):
def f(self, other):
Expand Down Expand Up @@ -149,7 +151,7 @@ def _maybe_to_categorical(array):
"""


class Categorical(PandasObject):
class Categorical(ExtensionArray, PandasObject):
"""
Represents a categorical variable in classic R / S-plus fashion
Expand Down Expand Up @@ -2131,6 +2133,20 @@ def repeat(self, repeats, *args, **kwargs):
return self._constructor(values=codes, categories=self.categories,
ordered=self.ordered, fastpath=True)

# Interface things
# can_hold_na, concat_same_type, formatting_values
@property
def _can_hold_na(self):
    """Always True: this array type reports that it can hold missing values."""
    return True

@classmethod
def _concat_same_type(cls, to_concat):
    """Concatenate several Categoricals into a single one.

    Parameters
    ----------
    to_concat : sequence of Categorical

    Returns
    -------
    The result of ``union_categoricals(to_concat)``.
    """
    # Imported inside the method, as in the original (presumably to avoid
    # a circular import at module load time -- confirm).  The function is
    # taken from ``pandas.core.dtypes.concat`` rather than the legacy
    # ``pandas.types.concat`` path.
    from pandas.core.dtypes.concat import union_categoricals
    return union_categoricals(to_concat)

def _formatting_values(self):
    """Return the values used for printing; the array itself is returned."""
    return self

# The Series.cat accessor


Expand Down
92 changes: 92 additions & 0 deletions pandas/core/dtypes/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Extend pandas with custom array types"""
import abc

from pandas.compat import add_metaclass


@add_metaclass(abc.ABCMeta)
class ExtensionDtype(object):
    """A custom data type for your array.

    Subclasses must define ``name``.  ``type``, ``kind``, ``names``,
    ``construct_from_string`` and ``is_dtype`` have default
    implementations that may be overridden.
    """

    @property
    def type(self):
        """A class named after ``self.name``.

        The default builds an empty class (no bases, no attributes) with
        that name.  Note that a new class object is created on every
        access, so two accesses do not return the identical object.
        """
        return type(self.name, (), {})

    @property
    def kind(self):
        """A character code (one of 'biufcmMOSUV'), default 'O'

        See Also
        --------
        numpy.dtype.kind
        """
        return 'O'

    @property
    @abc.abstractmethod
    def name(self):
        """A string identifying the data type.

        Will be used in, e.g. ``Series.dtype``
        """

    @property
    def names(self):
        """Ordered list of field names, or None if there are no fields"""
        return None

    @classmethod
    def construct_from_string(cls, string):
        """Attempt to construct this type from a string.

        Parameters
        ----------
        string : str

        Returns
        -------
        self : instance of 'cls'

        Raises
        ------
        TypeError
            If 'string' does not match the type's name.

        Notes
        -----
        The default implementation checks if 'string' matches your
        type's name. If so, it calls your class with no arguments.

        The comparison uses ``cls.name``, i.e. the attribute as seen on
        the class itself, so ``name`` has to resolve to a plain string at
        class level for the default implementation to match.
        """
        if string == cls.name:
            return cls()
        else:
            raise TypeError("Cannot construct a '{}' from "
                            "'{}'".format(cls, string))

    @classmethod
    def is_dtype(cls, dtype):
        """Check if we match 'dtype'

        Parameters
        ----------
        dtype : str or dtype

        Returns
        -------
        is_dtype : bool

        Notes
        -----
        The default implementation is True if

        1. 'dtype' is a string that returns true for
           ``cls.construct_from_string``
        2. 'dtype' is ``cls`` or a subclass of ``cls``.
        3. 'dtype' is an instance of ``cls``.

        Any other object (``None``, unrelated instances, ...) yields
        False rather than raising.
        """
        if isinstance(dtype, str):
            try:
                return isinstance(cls.construct_from_string(dtype), cls)
            except TypeError:
                return False
        if isinstance(dtype, type):
            return issubclass(dtype, cls)
        # ``issubclass`` raises TypeError for non-class arguments, so
        # instances are checked with ``isinstance`` instead.
        return isinstance(dtype, cls)
32 changes: 32 additions & 0 deletions pandas/core/dtypes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1685,6 +1685,38 @@ def is_extension_type(arr):
return False


def is_extension_array_dtype(arr_or_dtype):
    """Check if an object is a pandas extension array type

    Parameters
    ----------
    arr_or_dtype : object
        A Series is first replaced by its ``.values``; anything else is
        tested as given.

    Returns
    -------
    bool
        True when the (unwrapped) object is an instance of
        ``ExtensionDtype`` or ``ExtensionArray``.

    Notes
    -----
    This checks whether an object implements the pandas extension
    array interface. In pandas, this includes:

    * Categorical
    * PeriodArray
    * IntervalArray
    * SparseArray

    Third-party libraries may implement arrays or types satisfying
    this interface as well.
    """
    from pandas.core.arrays import ExtensionArray

    # we want to unpack series, anything else?
    candidate = (arr_or_dtype.values
                 if isinstance(arr_or_dtype, ABCSeries)
                 else arr_or_dtype)
    accepted = (ExtensionDtype, ExtensionArray)
    return isinstance(candidate, accepted)


def is_complex_dtype(arr_or_dtype):
"""
Check whether the provided array or dtype is of a complex dtype.
Expand Down
Loading

0 comments on commit 2ef5216

Please sign in to comment.