From 63292d45a3e1d5347a822c54a747b1b96573e7b4 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 Apr 2023 16:57:10 -0700 Subject: [PATCH 01/12] DEPR: concat with empty objects --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/dtypes/concat.py | 22 ++++++++ pandas/core/internals/concat.py | 55 +++++++++++++++++-- pandas/tests/dtypes/test_concat.py | 7 ++- pandas/tests/groupby/test_groupby.py | 10 +++- pandas/tests/indexes/test_base.py | 4 +- pandas/tests/reshape/concat/test_append.py | 4 +- .../reshape/concat/test_append_common.py | 21 ++++--- pandas/tests/reshape/concat/test_concat.py | 10 +++- pandas/tests/reshape/concat/test_datetimes.py | 4 +- pandas/tests/reshape/concat/test_empty.py | 12 ++-- pandas/tests/reshape/concat/test_series.py | 4 +- pandas/tests/reshape/merge/test_merge.py | 6 +- .../series/methods/test_combine_first.py | 8 ++- 14 files changed, 137 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index a037e50593737..c4d3f6af1e007 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -152,6 +152,7 @@ Deprecations - Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`) - Deprecated :meth:`DataFrame._data` and :meth:`Series._data`, use public APIs instead (:issue:`33333`) - Deprecated :meth:`.Groupby.all` and :meth:`.GroupBy.any` with datetime64 or :class:`PeriodDtype` values, matching the :class:`Series` and :class:`DataFrame` deprecations (:issue:`34479`) +- Deprecated :func:`concat` behavior when any of the objects being concatenated have length 0; in the past the dtypes of empty objects were ignored when determining the resulting dtype, in a future version they will not (:issue:`39122`) - Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) - Deprecated the behavior of :func:`concat` with both ``len(keys) != len(objs)``, in a future version this will raise instead of truncating to the shorter of the two sequences (:issue:`43485`) - Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby`; this will default to ``True`` in a future version (:issue:`43999`) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b55c8cd31c110..e33b8682b0d2e 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -8,10 +8,12 @@ Sequence, cast, ) +import warnings import numpy as np from pandas._libs import lib +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( @@ -42,6 +44,9 @@ ) +_dtype_obj = np.dtype(object) + + def _is_nonempty(x, axis) -> bool: # filter empty arrays # 1-d dtypes always are included here @@ -104,6 +109,23 @@ def concat_compat( non_empties = [x for x in to_concat if _is_nonempty(x, axis)] if non_empties and axis == 0 and not ea_compat_axis: # ea_compat_axis see GH#39574 + if len(non_empties) < len(to_concat) and not any( + obj.dtype == _dtype_obj for obj in non_empties + ): + # Check for object dtype is an imperfect proxy for checking if + # the result dtype is going to change once the deprecation is + # enforced. + # GH#39122 + warnings.warn( + "The behavior of array concatenation with empty entries is " + "deprecated. 
In a future version, this will no longer exclude " + "empty items when determining the result dtype. To opt in to " + "the future behavior, set pd.set_option('future.concat_empty', True). " + "To retain the old behavior, exclude the empty entries before " + "the concat operation.", + FutureWarning, + stacklevel=find_stack_level(), + ) to_concat = non_empties dtypes = {obj.dtype for obj in to_concat} diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index fda445af5c0c4..18d63c28f1b26 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -7,6 +7,7 @@ Sequence, cast, ) +import warnings import numpy as np @@ -16,6 +17,7 @@ ) from pandas._libs.missing import NA from pandas.util._decorators import cache_readonly +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.cast import ( @@ -470,7 +472,9 @@ def is_na(self) -> bool: values = blk.values if values.size == 0: + # GH#39122 this case will return False once deprecation is enforced return True + if isinstance(values.dtype, SparseDtype): return False @@ -488,6 +492,19 @@ def is_na(self) -> bool: return False return all(isna_all(row) for row in values) + @cache_readonly + def is_na_after_size_deprecation(self) -> bool: + """ + Will self.is_na be True after values.size == 0 deprecation is enforced? + """ + blk = self.block + if blk.dtype.kind == "V": + return True + + if not blk._can_hold_na: + return False + return self.is_na and blk.values.size != 0 + def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: values: ArrayLike @@ -578,7 +595,7 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike """ Concatenate values from several join units along axis=1. """ - empty_dtype = _get_empty_dtype(join_units) + empty_dtype, empty_dtype_future = _get_empty_dtype(join_units) has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) @@ -617,6 +634,18 @@ def _concatenate_join_units(join_units: list[JoinUnit], copy: bool) -> ArrayLike else: concat_values = concat_compat(to_concat, axis=1) + if empty_dtype != empty_dtype_future: + if empty_dtype == concat_values.dtype: + # GH#39122 + warnings.warn( + "The behavior of DataFrame concatenation with empty entries is " + "deprecated. In a future version, this will no longer exclude " + "empty frames when determining the result dtypes. " + "To retain the old behavior, exclude the empty entries before " + "the concat operation.", + FutureWarning, + stacklevel=find_stack_level(), + ) return concat_values @@ -643,7 +672,7 @@ def _dtype_to_na_value(dtype: DtypeObj, has_none_blocks: bool): raise NotImplementedError -def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: +def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj]: """ Return dtype and N/A values to use when concatenating specified units. 
@@ -655,11 +684,11 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: """ if len(join_units) == 1: blk = join_units[0].block - return blk.dtype + return blk.dtype, blk.dtype if _is_uniform_reindex(join_units): empty_dtype = join_units[0].block.dtype - return empty_dtype + return empty_dtype, empty_dtype has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) @@ -670,7 +699,23 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: dtype = find_common_type(dtypes) if has_none_blocks: dtype = ensure_dtype_can_hold_na(dtype) - return dtype + + dtype_future = dtype + if len(dtypes) != len(join_units): + dtypes_future = [ + unit.dtype for unit in join_units if not unit.is_na_after_size_deprecation + ] + if not len(dtypes_future): + dtypes_future = [ + unit.dtype for unit in join_units if unit.block.dtype.kind != "V" + ] + + if len(dtypes) != len(dtypes_future): + dtype_future = find_common_type(dtypes_future) + if has_none_blocks: + dtype_future = ensure_dtype_can_hold_na(dtype_future) + + return dtype, dtype_future def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 772dfdfe8fb03..97718386dabb7 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -12,8 +12,11 @@ def test_concat_mismatched_categoricals_with_empty(): ser1 = Series(["a", "b", "c"], dtype="category") ser2 = Series([], dtype="category") - result = _concat.concat_compat([ser1._values, ser2._values]) - expected = pd.concat([ser1, ser2])._values + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = _concat.concat_compat([ser1._values, ser2._values]) + with tm.assert_produces_warning(FutureWarning, match=msg): + expected = pd.concat([ser1, ser2])._values tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4388913511be2..168dce0c6f3eb 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -362,8 +362,11 @@ def f3(x): df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) # correct result - result1 = df.groupby("a").apply(f1) - result2 = df2.groupby("a").apply(f1) + depr_msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result1 = df.groupby("a").apply(f1) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -377,7 +380,8 @@ def f3(x): with pytest.raises(AssertionError, match=msg): df.groupby("a").apply(f3) with pytest.raises(AssertionError, match=msg): - df2.groupby("a").apply(f3) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + df2.groupby("a").apply(f3) def test_attr_wrapper(ts): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e681223933abb..4c97d07f39bbf 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -616,7 +616,9 @@ def test_append_empty_preserve_name(self, name, expected): left = Index([], name="foo") right = Index([1, 2, 3], name=name) - result = left.append(right) + msg = "The behavior of array concatenation with empty entries is deprecated" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + result = left.append(right) assert result.name == expected @pytest.mark.parametrize( diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index b540cd514c0b5..e794f5abf4b21 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -162,7 +162,9 @@ def test_append_preserve_index_name(self): df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"]) df2 = df2.set_index(["A"]) - result = df1._append(df2) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = df1._append(df2) assert result.index.name == "A" indexes_can_append = [ diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index 2d84de8145111..e5d42d9cb1bfb 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -693,11 +693,14 @@ def test_concat_categorical_empty(self): s1 = Series([], dtype="category") s2 = Series([1, 2], dtype="category") - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) - tm.assert_series_equal(s1._append(s2, ignore_index=True), s2) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1._append(s2, ignore_index=True), s2) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) - tm.assert_series_equal(s2._append(s1, ignore_index=True), s2) + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2._append(s1, ignore_index=True), s2) s1 = Series([], dtype="category") s2 = Series([], dtype="category") @@ -719,11 +722,13 @@ def test_concat_categorical_empty(self): # empty Series is ignored exp = Series([np.nan, np.nan]) - tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) - tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1._append(s2, ignore_index=True), exp) - tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) - tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) + with tm.assert_produces_warning(FutureWarning, match=msg): + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2._append(s1, ignore_index=True), exp) def test_categorical_concat_append(self): cat = Categorical(["a", "b"], categories=["a", "b"]) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 244fe6a7927fe..d9fa3232db964 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -747,7 +747,15 @@ def test_concat_ignore_empty_object_float(empty_dtype, df_dtype): # https://github.com/pandas-dev/pandas/issues/45637 df = DataFrame({"foo": [1, 2], "bar": [1, 2]}, dtype=df_dtype) empty = DataFrame(columns=["foo", "bar"], dtype=empty_dtype) - result = concat([empty, df]) + + msg = "The behavior of DataFrame concatenation with empty entries is deprecated" + warn = None + if df_dtype == "datetime64[ns]" 
or ( + df_dtype == "float64" and empty_dtype != "float64" + ): + warn = FutureWarning + with tm.assert_produces_warning(warn, match=msg): + result = concat([empty, df]) expected = df if df_dtype == "int64": # TODO what exact behaviour do we want for integer eventually? diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 43c6bb03b6a9a..950b90e581060 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -567,7 +567,9 @@ def test_concat_float_datetime64(using_array_manager): if not using_array_manager: expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}) - result = concat([df_time, df_float.iloc[:0]]) + msg = "The behavior of DataFrame concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = concat([df_time, df_float.iloc[:0]]) tm.assert_frame_equal(result, expected) else: expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}).astype( diff --git a/pandas/tests/reshape/concat/test_empty.py b/pandas/tests/reshape/concat/test_empty.py index 919bcb8b2e577..6ef54b907cf34 100644 --- a/pandas/tests/reshape/concat/test_empty.py +++ b/pandas/tests/reshape/concat/test_empty.py @@ -58,7 +58,9 @@ def test_concat_empty_series(self): s1 = Series([1, 2, 3], name="x") s2 = Series(name="y", dtype="float64") - res = concat([s1, s2], axis=0) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + res = concat([s1, s2], axis=0) # name will be reset exp = Series([1, 2, 3]) tm.assert_series_equal(res, exp) @@ -238,9 +240,11 @@ def test_concat_inner_join_empty(self): df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64") df_expected = DataFrame({"a": []}, index=RangeIndex(0), dtype="int64") - for how, expected in [("inner", df_expected), ("outer", df_a)]: - result = concat([df_a, df_empty], axis=1, join=how) - tm.assert_frame_equal(result, expected) + result = concat([df_a, df_empty], axis=1, join="inner") + tm.assert_frame_equal(result, df_expected) + + result = concat([df_a, df_empty], axis=1, join="outer") + tm.assert_frame_equal(result, df_a) def test_empty_dtype_coerce(self): # xref to #12411 diff --git a/pandas/tests/reshape/concat/test_series.py b/pandas/tests/reshape/concat/test_series.py index c5d3a8a7c74d1..2711b6a34c62c 100644 --- a/pandas/tests/reshape/concat/test_series.py +++ b/pandas/tests/reshape/concat/test_series.py @@ -40,7 +40,9 @@ def test_concat_empty_and_non_empty_series_regression(self): s2 = Series([], dtype=object) expected = s1 - result = concat([s1, s2]) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = concat([s1, s2]) tm.assert_series_equal(result, expected) def test_concat_series_axis1(self): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 3a822c8134eb4..96d268b7841fd 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -682,8 +682,10 @@ def test_join_append_timedeltas(self, using_array_manager): {"d": [datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500)]} ) df = DataFrame(columns=list("dt")) - df = concat([df, d], ignore_index=True) - result = concat([df, d], ignore_index=True) + msg = "The behavior of DataFrame concatenation with empty entries is deprecated" + with 
tm.assert_produces_warning(FutureWarning, match=msg): + df = concat([df, d], ignore_index=True) + result = concat([df, d], ignore_index=True) expected = DataFrame( { "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)], diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 46af5f509d6ab..fb6f7e386d5d5 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -63,7 +63,9 @@ def test_combine_first(self): # corner case ser = Series([1.0, 2, 3], index=[0, 1, 2]) empty = Series([], index=[], dtype=object) - result = ser.combine_first(empty) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = ser.combine_first(empty) ser.index = ser.index.astype("O") tm.assert_series_equal(ser, result) @@ -110,7 +112,9 @@ def test_combine_first_timezone_series_with_empty_series(self): ) s1 = Series(range(10), index=time_index) s2 = Series(index=time_index) - result = s1.combine_first(s2) + msg = "The behavior of array concatenation with empty entries is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + result = s1.combine_first(s2) tm.assert_series_equal(result, s1) def test_combine_first_preserves_dtype(self): From 2ace79c60ff9f5423d5ce6aa436dc436053ab80e Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 7 Apr 2023 18:25:29 -0700 Subject: [PATCH 02/12] xfail on 32bit --- pandas/core/dtypes/concat.py | 3 +-- pandas/tests/io/formats/test_info.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index e33b8682b0d2e..f53f74e69da46 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -119,8 +119,7 @@ def concat_compat( warnings.warn( "The behavior of array concatenation with empty entries is " "deprecated. In a future version, this will no longer exclude " - "empty items when determining the result dtype. To opt in to " - "the future behavior, set pd.set_option('future.concat_empty', True). " + "empty items when determining the result dtype. 
" "To retain the old behavior, exclude the empty entries before " "the concat operation.", FutureWarning, diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index e79e135208995..fdb467cd2d1b0 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -495,6 +495,7 @@ def test_info_int_columns(): assert result == expected +@pytest.mark.xfail(not IS64) def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) From 6258adf6f14099c5b2997c4b67689a1876a6c1b9 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 8 Apr 2023 08:49:21 -0700 Subject: [PATCH 03/12] missing reason --- pandas/tests/io/formats/test_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index fdb467cd2d1b0..42aebb3d9ac1c 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -495,7 +495,7 @@ def test_info_int_columns(): assert result == expected -@pytest.mark.xfail(not IS64) +@pytest.mark.xfail(not IS64, reason="concat will cast to int64") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) From 51e6d360b32b8e3c422021551fcf591e6cfc4c70 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 10 Apr 2023 14:11:31 -0700 Subject: [PATCH 04/12] Fix AM build --- pandas/tests/reshape/merge/test_merge.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 96d268b7841fd..491795a87d15a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -683,7 +683,10 @@ def test_join_append_timedeltas(self, using_array_manager): ) df = DataFrame(columns=list("dt")) msg = "The behavior of DataFrame concatenation with empty entries is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): + warn = FutureWarning + if using_array_manager: + warn = None + with tm.assert_produces_warning(warn, match=msg): df = concat([df, d], ignore_index=True) result = concat([df, d], ignore_index=True) expected = DataFrame( From 52ce0d7732586a5ffeb6f9c9e7b8999fbfa90f43 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 10 Apr 2023 15:56:35 -0700 Subject: [PATCH 05/12] post-merge fixup --- pandas/core/internals/concat.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 7ccb5b434a058..826378d60a506 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -653,11 +653,13 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> tuple[DtypeObj, DtypeObj dtype_future = dtype if len(dtypes) != len(join_units): dtypes_future = [ - unit.dtype for unit in join_units if not unit.is_na_after_size_deprecation + unit.block.dtype + for unit in join_units + if not unit.is_na_after_size_deprecation ] if not len(dtypes_future): dtypes_future = [ - unit.dtype for unit in join_units if unit.block.dtype.kind != "V" + unit.block.dtype for unit in join_units if unit.block.dtype.kind != "V" ] if len(dtypes) != len(dtypes_future): From 163bf8a9e62c75b5d416c03fb51990ee0063c13b Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 11 Apr 2023 16:56:56 -0700 Subject: [PATCH 06/12] catch more specifically --- pandas/core/dtypes/concat.py | 128 +++++++++++------------------------ 1 file changed, 39 insertions(+), 89 deletions(-) diff --git 
a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index f53f74e69da46..c2e059952abb5 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -21,14 +21,9 @@ find_common_type, ) from pandas.core.dtypes.common import is_dtype_equal -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - DatetimeTZDtype, - ExtensionDtype, -) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ( ABCCategoricalIndex, - ABCExtensionArray, ABCSeries, ) @@ -106,15 +101,17 @@ def concat_compat( # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. + orig = to_concat non_empties = [x for x in to_concat if _is_nonempty(x, axis)] if non_empties and axis == 0 and not ea_compat_axis: # ea_compat_axis see GH#39574 - if len(non_empties) < len(to_concat) and not any( - obj.dtype == _dtype_obj for obj in non_empties - ): - # Check for object dtype is an imperfect proxy for checking if - # the result dtype is going to change once the deprecation is - # enforced. + to_concat = non_empties + + any_ea, kinds, target_dtype = _get_result_dtype(to_concat, non_empties) + + if len(to_concat) < len(orig): + _, _, alt_dtype = _get_result_dtype(orig, non_empties) + if alt_dtype != target_dtype: # GH#39122 warnings.warn( "The behavior of array concatenation with empty entries is " @@ -125,42 +122,42 @@ def concat_compat( FutureWarning, stacklevel=find_stack_level(), ) - to_concat = non_empties - dtypes = {obj.dtype for obj in to_concat} - kinds = {obj.dtype.kind for obj in to_concat} - contains_datetime = any( - isinstance(dtype, (np.dtype, DatetimeTZDtype)) and dtype.kind in "mM" - for dtype in dtypes - ) or any(isinstance(obj, ABCExtensionArray) and obj.ndim > 1 for obj in to_concat) + if target_dtype is not None: + to_concat = [astype_array(arr, target_dtype, copy=False) for arr in to_concat] - all_empty = not len(non_empties) - single_dtype = len(dtypes) == 1 - any_ea = any(isinstance(x, ExtensionDtype) for x in dtypes) + if not isinstance(to_concat[0], np.ndarray): + # i.e. isinstance(to_concat[0], ExtensionArray) + to_concat_eas = cast("Sequence[ExtensionArray]", to_concat) + cls = type(to_concat[0]) + return cls._concat_same_type(to_concat_eas) + else: + to_concat_arrs = cast("Sequence[np.ndarray]", to_concat) + result = np.concatenate(to_concat_arrs, axis=axis) + + if not any_ea and "b" in kinds and result.dtype.kind in "iuf": + # GH#39817 cast to object instead of casting bools to numeric + result = result.astype(object, copy=False) + return result - if contains_datetime: - return _concat_datetime(to_concat, axis=axis) +def _get_result_dtype(to_concat: Sequence[ArrayLike], non_empties: Sequence[ArrayLike]): + target_dtype = None + + dtypes = {obj.dtype for obj in to_concat} + kinds = {obj.dtype.kind for obj in to_concat} + + any_ea = any(not isinstance(x, np.ndarray) for x in to_concat) if any_ea: + # i.e. any ExtensionArrays + # we ignore axis here, as internally concatting with EAs is always # for axis=0 - if not single_dtype: + if len(dtypes) != 1: target_dtype = find_common_type([x.dtype for x in to_concat]) target_dtype = common_dtype_categorical_compat(to_concat, target_dtype) - to_concat = [ - astype_array(arr, target_dtype, copy=False) for arr in to_concat - ] - - if isinstance(to_concat[0], ABCExtensionArray): - # TODO: what about EA-backed Index? 
- to_concat_eas = cast("Sequence[ExtensionArray]", to_concat) - cls = type(to_concat[0]) - return cls._concat_same_type(to_concat_eas) - else: - to_concat_arrs = cast("Sequence[np.ndarray]", to_concat) - return np.concatenate(to_concat_arrs) - elif all_empty: + elif not len(non_empties): # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise # cast this to float) @@ -170,17 +167,12 @@ def concat_compat( pass else: # coerce to object - to_concat = [x.astype("object") for x in to_concat] + target_dtype = np.dtype(object) kinds = {"o"} + else: + target_dtype = np.find_common_type(list(dtypes), []) - # error: Argument 1 to "concatenate" has incompatible type - # "Sequence[Union[ExtensionArray, ndarray[Any, Any]]]"; expected - # "Union[_SupportsArray[dtype[Any]], _NestedSequence[_SupportsArray[dtype[Any]]]]" - result: np.ndarray = np.concatenate(to_concat, axis=axis) # type: ignore[arg-type] - if "b" in kinds and result.dtype.kind in "iuf": - # GH#39817 cast to object instead of casting bools to numeric - result = result.astype(object, copy=False) - return result + return any_ea, kinds, target_dtype def union_categoricals( @@ -347,45 +339,3 @@ def _maybe_unwrap(x): dtype = CategoricalDtype(categories=categories, ordered=ordered) return Categorical._simple_new(new_codes, dtype=dtype) - - -def _concatenate_2d(to_concat: Sequence[np.ndarray], axis: AxisInt) -> np.ndarray: - # coerce to 2d if needed & concatenate - if axis == 1: - to_concat = [np.atleast_2d(x) for x in to_concat] - return np.concatenate(to_concat, axis=axis) - - -def _concat_datetime(to_concat: Sequence[ArrayLike], axis: AxisInt = 0) -> ArrayLike: - """ - provide concatenation of an datetimelike array of arrays each of which is a - single M8[ns], datetime64[ns, tz] or m8[ns] dtype - - Parameters - ---------- - to_concat : sequence of arrays - axis : axis to provide concatenation - - Returns - ------- - a single array, preserving the combined dtypes - """ - from pandas.core.construction import ensure_wrapped_if_datetimelike - - to_concat = [ensure_wrapped_if_datetimelike(x) for x in to_concat] - - single_dtype = lib.dtypes_all_equal([x.dtype for x in to_concat]) - - # multiple types, need to coerce to object - if not single_dtype: - # ensure_wrapped_if_datetimelike ensures that astype(object) wraps - # in Timestamp/Timedelta - return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) - - # error: Unexpected keyword argument "axis" for "_concat_same_type" of - # "ExtensionArray" - to_concat_eas = cast("list[ExtensionArray]", to_concat) - result = type(to_concat_eas[0])._concat_same_type( # type: ignore[call-arg] - to_concat_eas, axis=axis - ) - return result From 03a06412e09fc5135dba9c35d9e9ed625559332a Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 11 Apr 2023 19:56:18 -0700 Subject: [PATCH 07/12] un-xfail --- pandas/tests/io/formats/test_info.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 42aebb3d9ac1c..e79e135208995 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -495,7 +495,6 @@ def test_info_int_columns(): assert result == expected -@pytest.mark.xfail(not IS64, reason="concat will cast to int64") def test_memory_usage_empty_no_warning(): # GH#50066 df = DataFrame(index=["a", "b"]) From 7e2e995e4272f208f8e00bd98704fb10f02c8bce Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 12 Apr 2023 07:17:29 -0700 
Subject: [PATCH 08/12] mypy fixup --- pandas/core/dtypes/concat.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index c2e059952abb5..64cef1ba52c97 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -170,7 +170,11 @@ def _get_result_dtype(to_concat: Sequence[ArrayLike], non_empties: Sequence[Arra target_dtype = np.dtype(object) kinds = {"o"} else: - target_dtype = np.find_common_type(list(dtypes), []) + # Argument 1 to "list" has incompatible type "Set[Union[ExtensionDtype, + # Any]]"; expected "Iterable[Union[dtype[Any], None, Type[Any], + # _SupportsDType[dtype[Any]], str, Tuple[Any, Union[SupportsIndex, + # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" + target_dtype = np.find_common_type(list(dtypes), []) # type: ignore[arg-type] return any_ea, kinds, target_dtype From 75d5041dd4b7dc0818f8ad7fa209a2e826b48971 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 18 Apr 2023 14:21:38 -0700 Subject: [PATCH 09/12] update test --- pandas/tests/reshape/concat/test_datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 950b90e581060..ce4a2abc82a55 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -567,7 +567,7 @@ def test_concat_float_datetime64(using_array_manager): if not using_array_manager: expected = DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")}) - msg = "The behavior of DataFrame concatenation with empty entries is deprecated" + msg = "The behavior of DataFrame concatenation with empty or all-NA entries" with tm.assert_produces_warning(FutureWarning, match=msg): result = concat([df_time, df_float.iloc[:0]]) tm.assert_frame_equal(result, expected) From 392b40ab0b5e07faa539e94658abf64121e99fce Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 May 2023 21:44:07 -0700 Subject: [PATCH 10/12] Fix broken test --- pandas/tests/groupby/test_groupby.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index daf2d76548f11..9ade9c9b3b111 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -374,9 +374,13 @@ def f3(x): df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)}) + depr_msg = "The behavior of array concatenation with empty entries is deprecated" + # correct result - result1 = df.groupby("a").apply(f1) - result2 = df2.groupby("a").apply(f1) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result1 = df.groupby("a").apply(f1) + with tm.assert_produces_warning(FutureWarning, match=depr_msg): + result2 = df2.groupby("a").apply(f1) tm.assert_frame_equal(result1, result2) # should fail (not the same number of levels) @@ -387,7 +391,6 @@ def f3(x): df2.groupby("a").apply(f2) # should fail (incorrect shape) - depr_msg = "The behavior of array concatenation with empty entries is deprecated" with pytest.raises(AssertionError, match=msg): df.groupby("a").apply(f3) with pytest.raises(AssertionError, match=msg): From 3666bca065000a6261582574e780cfc9c392295d Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 May 2023 08:28:44 -0700 Subject: [PATCH 11/12] remove duplicate whatsnew entries --- doc/source/whatsnew/v2.1.0.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 94abccdda638b..9176773172f04 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -225,23 +225,17 @@ Deprecations - Deprecated :meth:`Categorical.to_list`, use ``obj.tolist()`` instead (:issue:`51254`) - Deprecated :meth:`DataFrame._data` and :meth:`Series._data`, use public APIs instead (:issue:`33333`) - Deprecated :meth:`DataFrameGroupBy.dtypes`, check ``dtypes`` on the underlying object instead (:issue:`51045`) -- Deprecated :meth:`DataFrameGroupBy.dtypes`, check ``dtypes`` on the underlying object instead (:issue:`51045`) - Deprecated ``axis=1`` in :meth:`DataFrame.ewm`, :meth:`DataFrame.rolling`, :meth:`DataFrame.expanding`, transpose before calling the method instead (:issue:`51778`) - Deprecated ``axis=1`` in :meth:`DataFrame.groupby` and in :class:`Grouper` constructor, do ``frame.T.groupby(...)`` instead (:issue:`51203`) -- Deprecated ``axis=1`` in :meth:`DataFrame.groupby` and in :class:`Grouper` constructor, do ``frame.T.groupby(...)`` instead (:issue:`51203`) - Deprecated accepting slices in :meth:`DataFrame.take`, call ``obj[slicer]`` or pass a sequence of integers instead (:issue:`51539`) - Deprecated explicit support for subclassing :class:`Index` (:issue:`45289`) -- Deprecated explicit support for subclassing :class:`Index` (:issue:`45289`) - Deprecated passing a :class:`DataFrame` to :meth:`DataFrame.from_records`, use :meth:`DataFrame.set_index` or :meth:`DataFrame.drop` instead (:issue:`51353`) - Deprecated silently dropping unrecognized timezones when parsing strings to datetimes (:issue:`18702`) - Deprecated the ``axis`` keyword in :meth:`DataFrame.ewm`, :meth:`Series.ewm`, :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, :meth:`Series.expanding` (:issue:`51778`) - Deprecated the ``axis`` keyword in :meth:`DataFrame.resample`, :meth:`Series.resample` (:issue:`51778`) - Deprecated the behavior of :func:`concat` with both ``len(keys) != len(objs)``, in a future version this will raise instead of truncating to the shorter of the two sequences (:issue:`43485`) -- Deprecated the behavior of :func:`concat` with both ``len(keys) != len(objs)``, in a future version this will raise instead of truncating to the shorter of the two sequences (:issue:`43485`) -- Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby`; this will default to ``True`` in a future version (:issue:`43999`) - Deprecated the default of ``observed=False`` in :meth:`DataFrame.groupby` and :meth:`Series.groupby`; this will default to ``True`` in a future version (:issue:`43999`) - Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) -- Deprecating pinning ``group.name`` to each group in :meth:`SeriesGroupBy.aggregate` aggregations; if your operation requires utilizing the groupby keys, iterate over the groupby object instead (:issue:`41090`) - Deprecated the 'axis' keyword in :meth:`.GroupBy.idxmax`, :meth:`.GroupBy.idxmin`, :meth:`.GroupBy.fillna`, :meth:`.GroupBy.take`, :meth:`.GroupBy.skew`, :meth:`.GroupBy.rank`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cummax`, :meth:`.GroupBy.cummin`, :meth:`.GroupBy.pct_change`, :meth:`GroupBy.diff`, :meth:`.GroupBy.shift`, and :meth:`DataFrameGroupBy.corrwith`; for ``axis=1`` operate on the underlying 
:class:`DataFrame` instead (:issue:`50405`, :issue:`51046`) - Deprecated :class:`.DataFrameGroupBy` with ``as_index=False`` not including groupings in the result when they are not columns of the DataFrame (:issue:`49519`) - Deprecated :func:`is_categorical_dtype`, use ``isinstance(obj.dtype, pd.CategoricalDtype)`` instead (:issue:`52527`) From e696c539ab2816eeda00bea2fef7bab2ffa85e39 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 25 May 2023 13:04:42 -0700 Subject: [PATCH 12/12] remove unused --- pandas/core/dtypes/concat.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 7de76157143e6..35ebd9a4f4f52 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -40,9 +40,6 @@ ) -_dtype_obj = np.dtype(object) - - def _is_nonempty(x, axis) -> bool: # filter empty arrays # 1-d dtypes always are included here
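
For reference, a minimal sketch of the array-level deprecation this series introduces. It is an editorial illustration, not part of the patches: the variable names (ser1, ser2, result) are made up, and it assumes a pandas build with this series applied. It mirrors the updated test_concat_empty_series case above (int64 values concatenated with an empty float64 Series).

import pandas as pd

ser1 = pd.Series([1, 2, 3], dtype="int64")
ser2 = pd.Series([], dtype="float64")

# Today the empty float64 entry is excluded before the result dtype is chosen,
# so the result is int64 and the new FutureWarning ("The behavior of array
# concatenation with empty entries is deprecated...") is emitted; once the
# deprecation is enforced, the empty entry will participate in dtype
# resolution and the result will be float64.
result = pd.concat([ser1, ser2], ignore_index=True)

# To keep the current result and silence the warning, exclude the empty
# entries before concatenating, as the warning message suggests.
result = pd.concat([obj for obj in (ser1, ser2) if not obj.empty], ignore_index=True)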
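
A corresponding sketch of the frame-level warning raised from _concatenate_join_units, again an illustration under the assumption that the patches are applied (df_time, df_float, result are hypothetical names); it loosely follows test_concat_float_datetime64.

import pandas as pd

df_time = pd.DataFrame({"A": pd.array(["2000"], dtype="datetime64[ns]")})
df_float = pd.DataFrame({"A": [1.0]})  # float64 column

# The empty float64 frame is currently ignored when the result dtype is
# determined, so column "A" stays datetime64[ns]; the FutureWarning ("The
# behavior of DataFrame concatenation with empty entries is deprecated...")
# fires because the result dtype would differ (object, per the array-manager
# branch of the same test) once empty entries are no longer excluded.
result = pd.concat([df_time, df_float.iloc[:0]])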