DEPR: __array__ for tz-aware Series/Index (pandas-dev#24596)
TomAugspurger authored and Pingviinituutti committed Feb 28, 2019
1 parent cde1c89 commit 1a0b845
Showing 18 changed files with 329 additions and 38 deletions.
3 changes: 3 additions & 0 deletions doc/source/api/series.rst
@@ -26,6 +26,7 @@ Attributes
.. autosummary::
:toctree: generated/

Series.array
Series.values
Series.dtype
Series.ftype
@@ -58,10 +59,12 @@ Conversion
Series.convert_objects
Series.copy
Series.bool
Series.to_numpy
Series.to_period
Series.to_timestamp
Series.to_list
Series.get_values
Series.__array__

Indexing, iteration
-------------------
70 changes: 69 additions & 1 deletion doc/source/whatsnew/v0.24.0.rst
@@ -1227,7 +1227,7 @@ Deprecations
.. _whatsnew_0240.deprecations.datetimelike_int_ops:

Integer Addition/Subtraction with Datetimes and Timedeltas is Deprecated
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In the past, users could—in some cases—add or subtract integers or integer-dtype
arrays from :class:`Timestamp`, :class:`DatetimeIndex` and :class:`TimedeltaIndex`.
@@ -1265,6 +1265,74 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`).
    dti = pd.date_range('2001-01-01', periods=2, freq='7D')
    dti + pd.Index([1 * dti.freq, 2 * dti.freq])

.. _whatsnew_0240.deprecations.tz_aware_array:

Converting Timezone-Aware Series and Index to NumPy Arrays
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The conversion from a :class:`Series` or :class:`Index` with timezone-aware
datetime data to a NumPy array will change to preserve timezones by default
(:issue:`23569`).

NumPy doesn't have a dedicated dtype for timezone-aware datetimes.
In the past, converting a :class:`Series` or :class:`DatetimeIndex` with
timezone-aware datetimes would convert to a NumPy array by

1. converting the tz-aware data to UTC
2. dropping the timezone info
3. returning a :class:`numpy.ndarray` with ``datetime64[ns]`` dtype

Future versions of pandas will preserve the timezone information by returning an
object-dtype NumPy array where each value is a :class:`Timestamp` with the correct
timezone attached.

.. ipython:: python

    ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
    ser

The default behavior remains the same, but issues a warning:

.. code-block:: python

    In [8]: np.asarray(ser)
    /bin/ipython:1: FutureWarning: Converting timezone-aware DatetimeArray to timezone-naive
    ndarray with 'datetime64[ns]' dtype. In the future, this will return an ndarray
    with 'object' dtype where each element is a 'pandas.Timestamp' with the correct 'tz'.
    To accept the future behavior, pass 'dtype=object'.
    To keep the old behavior, pass 'dtype="datetime64[ns]"'.
    #!/bin/python3
    Out[8]:
    array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00.000000000'],
          dtype='datetime64[ns]')

The previous or future behavior can be obtained, without any warnings, by specifying
the ``dtype``.

*Previous Behavior*

.. ipython:: python

    np.asarray(ser, dtype='datetime64[ns]')

*Future Behavior*

.. ipython:: python

    # New behavior
    np.asarray(ser, dtype=object)

Or by using :meth:`Series.to_numpy`:

.. ipython:: python

    ser.to_numpy()
    ser.to_numpy(dtype="datetime64[ns]")

All the above applies to a :class:`DatetimeIndex` with tz-aware values as well.
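
For example, the same options apply to a ``DatetimeIndex`` (a rough sketch; the
exact output is omitted here rather than reproduced):

.. code-block:: python

    idx = pd.date_range('2000', periods=2, tz="CET")

    # Old behavior: timezone-naive datetime64[ns] values (converted to UTC)
    np.asarray(idx, dtype="datetime64[ns]")

    # Future behavior: object-dtype array of tz-aware Timestamps
    np.asarray(idx, dtype=object)

    # Or convert explicitly with to_numpy
    idx.to_numpy(dtype=object)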

.. _whatsnew_0240.prior_deprecations:

Removal of prior version deprecations/changes
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimes.py
@@ -524,7 +524,7 @@ def _resolution(self):
    # Array-Like / EA-Interface Methods

    def __array__(self, dtype=None):
        if is_object_dtype(dtype):
        if is_object_dtype(dtype) or (dtype is None and self.tz):
            return np.array(list(self), dtype=object)
        elif is_int64_dtype(dtype):
            return self.asi8
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
@@ -1020,7 +1020,7 @@ def maybe_cast_to_datetime(value, dtype, errors='raise'):
                # datetime64tz is assumed to be naive which should
                # be localized to the timezone.
                is_dt_string = is_string_dtype(value)
                value = to_datetime(value, errors=errors)
                value = to_datetime(value, errors=errors).array
                if is_dt_string:
                    # Strings here are naive, so directly localize
                    value = value.tz_localize(dtype.tz)
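
For context, a minimal sketch (not part of the diff; the 'CET' timezone and the
example values are placeholders) of a construction that flows through this
datetime64tz branch of maybe_cast_to_datetime:

    import pandas as pd

    # Naive strings cast to a tz-aware dtype are parsed by to_datetime and
    # then localized to the target timezone (the is_dt_string path above).
    ser = pd.Series(['2000-01-01', '2000-01-02'], dtype='datetime64[ns, CET]')
    ser.dtype  # datetime64[ns, CET]
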
6 changes: 6 additions & 0 deletions pandas/core/dtypes/dtypes.py
@@ -403,6 +403,7 @@ def _hash_categories(categories, ordered=True):
        from pandas.core.util.hashing import (
            hash_array, _combine_hash_arrays, hash_tuples
        )
        from pandas.core.dtypes.common import is_datetime64tz_dtype, _NS_DTYPE

        if len(categories) and isinstance(categories[0], tuple):
            # assumes if any individual category is a tuple, then all our. ATM

@@ -420,6 +421,11 @@ def _hash_categories(categories, ordered=True):

                    # find a better solution
                    hashed = hash((tuple(categories), ordered))
                    return hashed

            if is_datetime64tz_dtype(categories.dtype):
                # Avoid future warning.
                categories = categories.astype(_NS_DTYPE)

            cat_array = hash_array(np.asarray(categories), categorize=False)
            if ordered:
                cat_array = np.vstack([
5 changes: 2 additions & 3 deletions pandas/core/groupby/groupby.py
@@ -1271,8 +1271,8 @@ def f(self, **kwargs):
        def first_compat(x, axis=0):

            def first(x):
                x = x.to_numpy()

                x = np.asarray(x)
                x = x[notna(x)]
                if len(x) == 0:
                    return np.nan

@@ -1286,8 +1286,7 @@ def first(x):

        def last_compat(x, axis=0):

            def last(x):

                x = np.asarray(x)
                x = x.to_numpy()
                x = x[notna(x)]
                if len(x) == 0:
                    return np.nan
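
For context, a minimal sketch (not part of the diff; assumes pandas >= 0.24) of how
the two conversions differ for tz-aware values, which is presumably why ``first``
and ``last`` now go through ``to_numpy``:

    import numpy as np
    import pandas as pd

    s = pd.Series(pd.date_range('2000', periods=2, tz='CET'))

    # Explicit conversion: an object-dtype ndarray of tz-aware Timestamps,
    # with no FutureWarning.
    vals = s.to_numpy()
    vals.dtype  # dtype('O')

    # A bare np.asarray(s) would emit the FutureWarning introduced in this
    # commit; passing the dtype keeps the old timezone-naive result explicitly.
    np.asarray(s, dtype="datetime64[ns]")
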
16 changes: 15 additions & 1 deletion pandas/core/indexes/datetimes.py
@@ -339,6 +339,21 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None):

    # --------------------------------------------------------------------

    def __array__(self, dtype=None):
        if (dtype is None and isinstance(self._data, DatetimeArray)
                and getattr(self.dtype, 'tz', None)):
            msg = (
                "Converting timezone-aware DatetimeArray to timezone-naive "
                "ndarray with 'datetime64[ns]' dtype. In the future, this "
                "will return an ndarray with 'object' dtype where each "
                "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t"
                "To accept the future behavior, pass 'dtype=object'.\n\t"
                "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'."
            )
            warnings.warn(msg, FutureWarning, stacklevel=3)
            dtype = 'M8[ns]'
        return np.asarray(self._data, dtype=dtype)

    @property
    def dtype(self):
        return self._data.dtype

@@ -1114,7 +1129,6 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None):

    strftime = ea_passthrough(DatetimeArray.strftime)
    _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz)
    __array__ = ea_passthrough(DatetimeArray.__array__)

    @property
    def offset(self):
7 changes: 6 additions & 1 deletion pandas/core/indexing.py
@@ -581,7 +581,12 @@ def can_do_equal_len():
                        setter(item, v)

                # we have an equal len ndarray/convertible to our labels
                elif np.array(value).ndim == 2:
                # hasattr first, to avoid coercing to ndarray without reason.
                # But we may be relying on the ndarray coercion to check ndim.
                # Why not just convert to an ndarray earlier on if needed?
                elif ((hasattr(value, 'ndim') and value.ndim == 2)
                      or (not hasattr(value, 'ndim') and
                          np.array(value).ndim) == 2):

                    # note that this coerces the dtype if we are mixed
                    # GH 7551
26 changes: 20 additions & 6 deletions pandas/core/internals/blocks.py
@@ -1447,8 +1447,20 @@ def quantile(self, qs, interpolation='linear', axis=0):
        -------
        Block
        """
        values = self.get_values()
        values, _ = self._try_coerce_args(values, values)
        if self.is_datetimetz:
            # TODO: cleanup this special case.
            # We need to operate on i8 values for datetimetz
            # but `Block.get_values()` returns an ndarray of objects
            # right now. We need an API for "values to do numeric-like ops on"
            values = self.values.asi8

            # TODO: NonConsolidatableMixin shape
            # Usual shape inconsistencies for ExtensionBlocks
            if self.ndim > 1:
                values = values[None, :]
        else:
            values = self.get_values()
            values, _ = self._try_coerce_args(values, values)

        is_empty = values.shape[axis] == 0
        orig_scalar = not is_list_like(qs)

@@ -2055,10 +2067,6 @@ def _na_value(self):

    def fill_value(self):
        return tslibs.iNaT

    def to_dense(self):
        # TODO(DatetimeBlock): remove
        return np.asarray(self.values)

    def get_values(self, dtype=None):
        """
        return object dtype as boxed values, such as Timestamps/Timedelta

@@ -2330,6 +2338,12 @@ def get_values(self, dtype=None):

            values = values.reshape(1, -1)
        return values

    def to_dense(self):
        # we request M8[ns] dtype here, even though it discards tzinfo,
        # as lots of code (e.g. anything using values_from_object)
        # expects that behavior.
        return np.asarray(self.values, dtype=_NS_DTYPE)

    def _slice(self, slicer):
        """ return a slice of my values """
        if isinstance(slicer, tuple):
6 changes: 4 additions & 2 deletions pandas/core/internals/construction.py
@@ -34,6 +34,7 @@
from pandas.core.indexes import base as ibase
from pandas.core.internals import (
    create_block_manager_from_arrays, create_block_manager_from_blocks)
from pandas.core.internals.arrays import extract_array

# ---------------------------------------------------------------------
# BlockManager Interface

@@ -539,7 +540,6 @@ def sanitize_array(data, index, dtype=None, copy=False,

    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.
    """

    if dtype is not None:
        dtype = pandas_dtype(dtype)

@@ -552,8 +552,10 @@ def sanitize_array(data, index, dtype=None, copy=False,

        else:
            data = data.copy()

    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, (np.ndarray, Index, ABCSeries)):
    if isinstance(data, np.ndarray):

        if dtype is not None:
            subarr = np.array(data, copy=False)
4 changes: 3 additions & 1 deletion pandas/core/nanops.py
@@ -144,7 +144,9 @@ def f(values, axis=None, skipna=True, **kwds):

def _bn_ok_dtype(dt, name):
    # Bottleneck chokes on datetime64
    if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)):
    if (not is_object_dtype(dt) and
            not (is_datetime_or_timedelta_dtype(dt) or
                 is_datetime64tz_dtype(dt))):

        # GH 15507
        # bottleneck does not properly upcast during the sum
7 changes: 5 additions & 2 deletions pandas/core/reshape/tile.py
@@ -8,7 +8,7 @@
from pandas._libs.lib import infer_dtype

from pandas.core.dtypes.common import (
    ensure_int64, is_categorical_dtype, is_datetime64_dtype,
    _NS_DTYPE, ensure_int64, is_categorical_dtype, is_datetime64_dtype,
    is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer,
    is_scalar, is_timedelta64_dtype)
from pandas.core.dtypes.missing import isna

@@ -226,7 +226,10 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3,

            raise ValueError('Overlapping IntervalIndex is not accepted.')

    else:
        bins = np.asarray(bins)
        if is_datetime64tz_dtype(bins):
            bins = np.asarray(bins, dtype=_NS_DTYPE)
        else:
            bins = np.asarray(bins)
        bins = _convert_bin_to_numeric_type(bins, dtype)
        if (np.diff(bins) < 0).any():
            raise ValueError('bins must increase monotonically.')
66 changes: 61 additions & 5 deletions pandas/core/series.py
@@ -21,7 +21,8 @@
    is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
    is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
from pandas.core.dtypes.generic import (
    ABCDataFrame, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries)
    ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, ABCSeries,
    ABCSparseArray, ABCSparseSeries)
from pandas.core.dtypes.missing import (
    isna, na_value_for_dtype, notna, remove_na_arraylike)

@@ -658,11 +659,66 @@ def view(self, dtype=None):
    # ----------------------------------------------------------------------
    # NDArray Compat

    def __array__(self, result=None):
    def __array__(self, dtype=None):
        """
        The array interface, return my values.
        return self.get_values()

        Return the values as a NumPy array.

        Users should not call this directly. Rather, it is invoked by
        :func:`numpy.array` and :func:`numpy.asarray`.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to use for the resulting NumPy array. By default,
            the dtype is inferred from the data.

        Returns
        -------
        numpy.ndarray
            The values in the series converted to a :class:`numpy.ndarray`
            with the specified `dtype`.

        See Also
        --------
        pandas.array : Create a new array from data.
        Series.array : Zero-copy view to the array backing the Series.
        Series.to_numpy : Series method for similar behavior.

        Examples
        --------
        >>> ser = pd.Series([1, 2, 3])
        >>> np.asarray(ser)
        array([1, 2, 3])

        For timezone-aware data, the timezones may be retained with
        ``dtype='object'``

        >>> tzser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
        >>> np.asarray(tzser, dtype="object")
        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'),
               Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')],
              dtype=object)

        Or the values may be localized to UTC and the tzinfo discarded with
        ``dtype='datetime64[ns]'``

        >>> np.asarray(tzser, dtype="datetime64[ns]")  # doctest: +ELLIPSIS
        array(['1999-12-31T23:00:00.000000000', ...],
              dtype='datetime64[ns]')
        """
        if (dtype is None and isinstance(self.array, ABCDatetimeArray)
                and getattr(self.dtype, 'tz', None)):
            msg = (
                "Converting timezone-aware DatetimeArray to timezone-naive "
                "ndarray with 'datetime64[ns]' dtype. In the future, this "
                "will return an ndarray with 'object' dtype where each "
                "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t"
                "To accept the future behavior, pass 'dtype=object'.\n\t"
                "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'."
            )
            warnings.warn(msg, FutureWarning, stacklevel=3)
            dtype = 'M8[ns]'
        return np.asarray(self.array, dtype)

    def __array_wrap__(self, result, context=None):
        """