From 8b1f99648efde7a519f5c3416cff73da73807d0e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 28 Oct 2020 18:06:25 -0700 Subject: [PATCH] BUG: DataFrame.min/max dt64 with skipna=False (#37425) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/compat/numpy/function.py | 4 +-- pandas/core/arrays/datetimelike.py | 54 +++++++++++++++++----------- pandas/core/nanops.py | 11 +++++- pandas/tests/frame/test_analytics.py | 25 +++++++++++++ 5 files changed, 71 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f1f24ab7a101b..d6169d6a61038 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -404,6 +404,7 @@ Numeric - Bug in :class:`IntervalArray` comparisons with :class:`Series` not returning :class:`Series` (:issue:`36908`) - Bug in :class:`DataFrame` allowing arithmetic operations with list of array-likes with undefined results. Behavior changed to raising ``ValueError`` (:issue:`36702`) - Bug in :meth:`DataFrame.std`` with ``timedelta64`` dtype and ``skipna=False`` (:issue:`37392`) +- Bug in :meth:`DataFrame.min` and :meth:`DataFrame.max` with ``datetime64`` dtype and ``skipna=False`` (:issue:`36907`) Conversion ^^^^^^^^^^ diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index c074b06042e26..c2e91c7877d35 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -387,7 +387,7 @@ def validate_resampler_func(method: str, args, kwargs) -> None: raise TypeError("too many arguments passed in") -def validate_minmax_axis(axis: Optional[int]) -> None: +def validate_minmax_axis(axis: Optional[int], ndim: int = 1) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero or None, as otherwise it will be incorrectly ignored. @@ -395,12 +395,12 @@ def validate_minmax_axis(axis: Optional[int]) -> None: Parameters ---------- axis : int or None + ndim : int, default 1 Raises ------ ValueError """ - ndim = 1 # hard-coded for Index if axis is None: return if axis >= ndim or (axis < 0 and ndim + axis < 0): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8e3b26503a61b..a6c74294c4c75 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1264,13 +1264,24 @@ def min(self, axis=None, skipna=True, *args, **kwargs): Series.min : Return the minimum value in a Series. """ nv.validate_min(args, kwargs) - nv.validate_minmax_axis(axis) + nv.validate_minmax_axis(axis, self.ndim) - result = nanops.nanmin(self.asi8, skipna=skipna, mask=self.isna()) - if isna(result): - # Period._from_ordinal does not handle np.nan gracefully - return NaT - return self._box_func(result) + if is_period_dtype(self.dtype): + # pass datetime64 values to nanops to get correct NaT semantics + result = nanops.nanmin( + self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna + ) + if result is NaT: + return NaT + result = result.view("i8") + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) + + result = nanops.nanmin(self._ndarray, axis=axis, skipna=skipna) + if lib.is_scalar(result): + return self._box_func(result) + return self._from_backing_data(result) def max(self, axis=None, skipna=True, *args, **kwargs): """ @@ -1286,23 +1297,24 @@ def max(self, axis=None, skipna=True, *args, **kwargs): # TODO: skipna is broken with max. # See https://github.com/pandas-dev/pandas/issues/24265 nv.validate_max(args, kwargs) - nv.validate_minmax_axis(axis) - - mask = self.isna() - if skipna: - values = self[~mask].asi8 - elif mask.any(): - return NaT - else: - values = self.asi8 + nv.validate_minmax_axis(axis, self.ndim) - if not len(values): - # short-circuit for empty max / min - return NaT + if is_period_dtype(self.dtype): + # pass datetime64 values to nanops to get correct NaT semantics + result = nanops.nanmax( + self._ndarray.view("M8[ns]"), axis=axis, skipna=skipna + ) + if result is NaT: + return result + result = result.view("i8") + if axis is None or self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) - result = nanops.nanmax(values, skipna=skipna) - # Don't have to worry about NA `result`, since no NA went in. - return self._box_func(result) + result = nanops.nanmax(self._ndarray, axis=axis, skipna=skipna) + if lib.is_scalar(result): + return self._box_func(result) + return self._from_backing_data(result) def mean(self, skipna=True, axis: Optional[int] = 0): """ diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d2f596a5a6800..3ab08ab2cda56 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -340,6 +340,7 @@ def _wrap_results(result, dtype: DtypeObj, fill_value=None): if result == fill_value: result = np.nan if tz is not None: + # we get here e.g. via nanmean when we call it on a DTA[tz] result = Timestamp(result, tz=tz) elif isna(result): result = np.datetime64("NaT", "ns") @@ -919,10 +920,13 @@ def reduction( mask: Optional[np.ndarray] = None, ) -> Dtype: + orig_values = values values, mask, dtype, dtype_max, fill_value = _get_values( values, skipna, fill_value_typ=fill_value_typ, mask=mask ) + datetimelike = orig_values.dtype.kind in ["m", "M"] + if (axis is not None and values.shape[axis] == 0) or values.size == 0: try: result = getattr(values, meth)(axis, dtype=dtype_max) @@ -933,7 +937,12 @@ def reduction( result = getattr(values, meth)(axis) result = _wrap_results(result, dtype, fill_value) - return _maybe_null_out(result, axis, mask, values.shape) + result = _maybe_null_out(result, axis, mask, values.shape) + + if datetimelike and not skipna: + result = _mask_datetimelike_result(result, axis, mask, orig_values) + + return result return reduction diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index ee4da37ce10f3..f2847315f4959 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1169,6 +1169,31 @@ def test_min_max_dt64_with_NaT(self): exp = Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + def test_min_max_dt64_with_NaT_skipna_false(self, tz_naive_fixture): + # GH#36907 + tz = tz_naive_fixture + df = DataFrame( + { + "a": [ + Timestamp("2020-01-01 08:00:00", tz=tz), + Timestamp("1920-02-01 09:00:00", tz=tz), + ], + "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT], + } + ) + + res = df.min(axis=1, skipna=False) + expected = Series([df.loc[0, "a"], pd.NaT]) + assert expected.dtype == df["a"].dtype + + tm.assert_series_equal(res, expected) + + res = df.max(axis=1, skipna=False) + expected = Series([df.loc[0, "b"], pd.NaT]) + assert expected.dtype == df["a"].dtype + + tm.assert_series_equal(res, expected) + def test_min_max_dt64_api_consistency_with_NaT(self): # Calling the following sum functions returned an error for dataframes but # returned NaT for series. These tests check that the API is consistent in