From a7f7570bbf5c8c6baf17bf5f677288b2be27441b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 28 Jan 2021 12:02:33 -0800 Subject: [PATCH 1/4] BUG: incorrect casting in DataFrame.append --- pandas/core/internals/concat.py | 32 ++++++++++++------- pandas/tests/extension/test_categorical.py | 4 +-- pandas/tests/reshape/concat/test_append.py | 22 ++++++++++--- .../tests/reshape/concat/test_categorical.py | 1 + 4 files changed, 39 insertions(+), 20 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 0b611bfdb1f10..8d2af010fd1fe 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -11,7 +11,7 @@ from pandas._typing import ArrayLike, DtypeObj, Manager, Shape from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.cast import maybe_promote +from pandas.core.dtypes.cast import find_common_type, maybe_promote from pandas.core.dtypes.common import ( get_dtype, is_categorical_dtype, @@ -393,7 +393,11 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype - upcasted_na = join_units[0].block.fill_value + if is_extension_array_dtype(empty_dtype): + # for dt64tz we need this to get NaT instead of np.datetime64("NaT") + upcasted_na = empty_dtype.na_value + else: + upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False @@ -404,25 +408,29 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A else: dtypes[i] = unit.dtype + filtered_dtypes = [ + unit.dtype for unit in join_units if unit.block is not None and not unit.is_na + ] + if not len(filtered_dtypes): + filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None] + dtype_alt = find_common_type(filtered_dtypes) + upcast_classes = _get_upcast_classes(join_units, dtypes) + if is_extension_array_dtype(dtype_alt): + return dtype_alt, dtype_alt.na_value + elif dtype_alt == object: + return dtype_alt, np.nan + # TODO: de-duplicate with maybe_promote? # create the result if "extension" in upcast_classes: - if len(upcast_classes) == 1: - cls = upcast_classes["extension"][0] - return cls, cls.na_value - else: - return np.dtype("object"), np.nan - elif "object" in upcast_classes: - return np.dtype(np.object_), np.nan + return np.dtype("object"), np.nan elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None - elif "category" in upcast_classes: - return np.dtype(np.object_), np.nan elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. @@ -480,7 +488,7 @@ def _get_upcast_classes( def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: """Select upcast class name based on dtype.""" if is_categorical_dtype(dtype): - return "category" + return "extension" elif is_datetime64tz_dtype(dtype): return "datetimetz" elif is_extension_array_dtype(dtype): diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 9cea274a118c0..10e82a8c9bff1 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -117,9 +117,7 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - @pytest.mark.xfail(reason="Deliberately upcast to object?") - def test_concat_with_reindex(self, data): - super().test_concat_with_reindex(data) + pass class TestGetitem(base.BaseGetitemTests): diff --git a/pandas/tests/reshape/concat/test_append.py b/pandas/tests/reshape/concat/test_append.py index ffeda703cd890..dd6dbd79113e5 100644 --- a/pandas/tests/reshape/concat/test_append.py +++ b/pandas/tests/reshape/concat/test_append.py @@ -365,13 +365,25 @@ def test_append_empty_tz_frame_with_datetime64ns(self): # pd.NaT gets inferred as tz-naive, so append result is tz-naive result = df.append({"a": pd.NaT}, ignore_index=True) - expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + expected = DataFrame({"a": [pd.NaT]}).astype(object) tm.assert_frame_equal(result, expected) # also test with typed value to append df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") - result = df.append( - Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True - ) - expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + other = Series({"a": pd.NaT}, dtype="datetime64[ns]") + result = df.append(other, ignore_index=True) + expected = DataFrame({"a": [pd.NaT]}).astype(object) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"] + ) + def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str): + # https://github.com/pandas-dev/pandas/issues/35460 + df = DataFrame(columns=["a"]).astype(dtype_str) + + other = DataFrame({"a": [np.timedelta64("NaT", "ns")]}) + result = df.append(other, ignore_index=True) + + expected = other.astype(object) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_categorical.py b/pandas/tests/reshape/concat/test_categorical.py index 6dae28003d3b6..357274b66332f 100644 --- a/pandas/tests/reshape/concat/test_categorical.py +++ b/pandas/tests/reshape/concat/test_categorical.py @@ -42,6 +42,7 @@ def test_categorical_concat(self, sort): "h": [None] * 6 + cat_values, } ) + exp["h"] = exp["h"].astype(df2["h"].dtype) tm.assert_frame_equal(res, exp) def test_categorical_concat_dtypes(self): From 4ae9061f1a1fddae70a4a9bcd31ec789fdbaff5e Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 28 Jan 2021 12:34:23 -0800 Subject: [PATCH 2/4] whatsnew --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 7fe0b53d7d2ff..feb2b570528ea 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -411,7 +411,7 @@ Reshaping - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`) - :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`) - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) -- +- Bug in :meth:`DataFrame.apply` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`???`) Sparse ^^^^^^ From 27ad4bbf586d5631f75b91908a293e4a0e7590d4 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 28 Jan 2021 12:36:28 -0800 Subject: [PATCH 3/4] update GH ref --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index feb2b570528ea..0d360d238a8a0 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -411,7 +411,7 @@ Reshaping - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`) - :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`) - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) -- Bug in :meth:`DataFrame.apply` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`???`) +- Bug in :meth:`DataFrame.apply` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) Sparse ^^^^^^ From a4abcf64ccb6f8bfcaa4eecb8f44f3a337346283 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 28 Jan 2021 13:33:03 -0800 Subject: [PATCH 4/4] typo fixup --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 0d360d238a8a0..3c9fea201eebf 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -411,7 +411,7 @@ Reshaping - Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`) - :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`) - Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`) -- Bug in :meth:`DataFrame.apply` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) +- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) Sparse ^^^^^^