Skip to content

Commit

Permalink
Backport PR pandas-dev#38120: API: preserve freq in DTI/TDI.factorize
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and simonjayhawkins committed Nov 30, 2020
1 parent 8a2b8e2 commit a135e4a
Show file tree
Hide file tree
Showing 8 changed files with 100 additions and 20 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.5.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Fixed regressions
- Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` for ``__setitem__`` when one-dimensional tuple was given to select from :class:`MultiIndex` (:issue:`37711`)
- Fixed regression in inplace operations on :class:`Series` with ``ExtensionDtype`` with NumPy dtyped operand (:issue:`37910`)
- Fixed regression in metadata propagation for ``groupby`` iterator (:issue:`37343`)
- Fixed regression in :class:`MultiIndex` constructed from a :class:`DatetimeIndex` not retaining frequency (:issue:`35563`)
- Fixed regression in indexing on a :class:`Series` with ``CategoricalDtype`` after unpickling (:issue:`37631`)
- Fixed regression in :meth:`DataFrame.groupby` aggregation with out-of-bounds datetime objects in an object-dtype column (:issue:`36003`)
- Fixed regression in ``df.groupby(..).rolling(..)`` with the resulting :class:`MultiIndex` when grouping by a label that is in the index (:issue:`37641`)
Expand Down
28 changes: 26 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,13 @@
pandas_dtype,
)
from pandas.core.dtypes.generic import (
ABCDatetimeArray,
ABCExtensionArray,
ABCIndex,
ABCIndexClass,
ABCMultiIndex,
ABCSeries,
ABCTimedeltaArray,
)
from pandas.core.dtypes.missing import isna, na_value_for_dtype

Expand Down Expand Up @@ -191,8 +193,16 @@ def _reconstruct_data(
-------
ExtensionArray or np.ndarray
"""
if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
# Catch DatetimeArray/TimedeltaArray
return values

if is_extension_array_dtype(dtype):
values = dtype.construct_array_type()._from_sequence(values)
cls = dtype.construct_array_type()
if isinstance(values, cls) and values.dtype == dtype:
return values

values = cls._from_sequence(values)
elif is_bool_dtype(dtype):
values = values.astype(dtype, copy=False)

Expand Down Expand Up @@ -654,6 +664,8 @@ def factorize(

values = _ensure_arraylike(values)
original = values
if not isinstance(values, ABCMultiIndex):
values = extract_array(values, extract_numpy=True)

# GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques
# of values, assign na_sentinel=-1 to replace code value for NaN.
Expand All @@ -662,8 +674,20 @@ def factorize(
na_sentinel = -1
dropna = False

if (
isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
and values.freq is not None
):
codes, uniques = values.factorize(sort=sort)
if isinstance(original, ABCIndexClass):
uniques = original._shallow_copy(uniques, name=None)
elif isinstance(original, ABCSeries):
from pandas import Index

uniques = Index(uniques)
return codes, uniques

if is_extension_array_dtype(values.dtype):
values = extract_array(values)
codes, uniques = values.factorize(na_sentinel=na_sentinel)
dtype = original.dtype
else:
Expand Down
14 changes: 14 additions & 0 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -1660,6 +1660,20 @@ def mean(self, skipna=True):
# Don't have to worry about NA `result`, since no NA went in.
return self._box_func(result)

# --------------------------------------------------------------

def factorize(self, na_sentinel=-1, sort: bool = False):
if self.freq is not None:
# We must be unique, so can short-circuit (and retain freq)
codes = np.arange(len(self), dtype=np.intp)
uniques = self.copy() # TODO: copy or view?
if sort and self.freq.n < 0:
codes = codes[::-1]
uniques = uniques[::-1]
return codes, uniques
# FIXME: shouldn't get here; we are ignoring sort
return super().factorize(na_sentinel=na_sentinel)


DatetimeLikeArrayMixin._add_comparison_ops()

Expand Down
4 changes: 4 additions & 0 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@

import pandas.core.algorithms as algos
from pandas.core.arrays import datetimelike as dtl
from pandas.core.arrays.base import ExtensionArray
import pandas.core.common as com


Expand Down Expand Up @@ -766,6 +767,9 @@ def _check_timedeltalike_freq_compat(self, other):

raise raise_on_incompatible(self, other)

def factorize(self, na_sentinel=-1):
return ExtensionArray.factorize(self, na_sentinel=na_sentinel)


def raise_on_incompatible(left, right):
"""
Expand Down
51 changes: 35 additions & 16 deletions pandas/tests/indexes/datetimes/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,10 +271,12 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

arr, idx = idx1.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

# tz must be preserved
idx1 = idx1.tz_localize("Asia/Tokyo")
Expand All @@ -283,6 +285,7 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

idx2 = pd.DatetimeIndex(
["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"]
Expand All @@ -293,49 +296,65 @@ def test_factorize(self):
arr, idx = idx2.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp)
exp_idx = DatetimeIndex(["2014-03", "2014-02", "2014-01"])
arr, idx = idx2.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

# freq must be preserved
def test_factorize_preserves_freq(self):
# GH#38120 freq should be preserved
idx3 = date_range("2000-01", periods=4, freq="M", tz="Asia/Tokyo")
exp_arr = np.array([0, 1, 2, 3], dtype=np.intp)

arr, idx = idx3.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, idx3)
assert idx.freq == idx3.freq

arr, idx = pd.factorize(idx3)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, idx3)
assert idx.freq == idx3.freq

def test_factorize_tz(self, tz_naive_fixture):
def test_factorize_tz(self, tz_naive_fixture, index_or_series):
tz = tz_naive_fixture
# GH#13750
base = pd.date_range("2016-11-05", freq="H", periods=100, tz=tz)
idx = base.repeat(5)

exp_arr = np.arange(100, dtype=np.intp).repeat(5)

for obj in [idx, pd.Series(idx)]:
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
expected = base._with_freq(None)
tm.assert_index_equal(res, expected)
obj = index_or_series(idx)

arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
expected = base._with_freq(None)
tm.assert_index_equal(res, expected)
assert res.freq == expected.freq

def test_factorize_dst(self):
def test_factorize_dst(self, index_or_series):
# GH 13750
idx = pd.date_range("2016-11-06", freq="H", periods=12, tz="US/Eastern")
obj = index_or_series(idx)

for obj in [idx, pd.Series(idx)]:
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
if index_or_series is Index:
assert res.freq == idx.freq

idx = pd.date_range("2016-06-13", freq="H", periods=12, tz="US/Eastern")
obj = index_or_series(idx)

for obj in [idx, pd.Series(idx)]:
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
arr, res = obj.factorize()
tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp))
tm.assert_index_equal(res, idx)
if index_or_series is Index:
assert res.freq == idx.freq

@pytest.mark.parametrize(
"arr, expected",
Expand Down
11 changes: 10 additions & 1 deletion pandas/tests/indexes/timedeltas/test_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,26 @@ def test_factorize(self):
arr, idx = idx1.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

arr, idx = idx1.factorize(sort=True)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, exp_idx)
assert idx.freq == exp_idx.freq

# freq must be preserved
def test_factorize_preserves_freq(self):
# GH#38120 freq should be preserved
idx3 = timedelta_range("1 day", periods=4, freq="s")
exp_arr = np.array([0, 1, 2, 3], dtype=np.intp)
arr, idx = idx3.factorize()
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, idx3)
assert idx.freq == idx3.freq

arr, idx = pd.factorize(idx3)
tm.assert_numpy_array_equal(arr, exp_arr)
tm.assert_index_equal(idx, idx3)
assert idx.freq == idx3.freq

def test_sort_values(self):

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/indexing/multiindex/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,13 @@ def test_multiindex_get_loc_list_raises(self):
msg = "unhashable type"
with pytest.raises(TypeError, match=msg):
idx.get_loc([])

def test_multiindex_with_datatime_level_preserves_freq(self):
# https://github.com/pandas-dev/pandas/issues/35563
idx = Index(range(2), name="A")
dti = pd.date_range("2020-01-01", periods=7, freq="D", name="B")
mi = MultiIndex.from_product([idx, dti])
df = DataFrame(np.random.randn(14, 2), index=mi)
result = df.loc[0].index
tm.assert_index_equal(result, dti)
assert result.freq == dti.freq
1 change: 0 additions & 1 deletion pandas/tests/window/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ def get_result(obj, obj2=None):
result = result.loc[(slice(None), 1), 5]
result.index = result.index.droplevel(1)
expected = get_result(frame[1], frame[5])
expected.index = expected.index._with_freq(None)
tm.assert_series_equal(result, expected, check_names=False)


Expand Down

0 comments on commit a135e4a

Please sign in to comment.