Skip to content

Commit

Permalink
BUG: incorrect casting in DataFrame.append (#39454)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Feb 1, 2021
1 parent a9c803c commit 082023e
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 21 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ Reshaping
- Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`)
- :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`)
- Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
-
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)

Sparse
^^^^^^
Expand Down
32 changes: 20 additions & 12 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pandas._typing import ArrayLike, DtypeObj, Manager, Shape
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.cast import find_common_type, maybe_promote
from pandas.core.dtypes.common import (
get_dtype,
is_categorical_dtype,
Expand Down Expand Up @@ -394,7 +394,11 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A
if _is_uniform_reindex(join_units):
# FIXME: integrate property
empty_dtype = join_units[0].block.dtype
upcasted_na = join_units[0].block.fill_value
if is_extension_array_dtype(empty_dtype):
# for dt64tz we need this to get NaT instead of np.datetime64("NaT")
upcasted_na = empty_dtype.na_value
else:
upcasted_na = join_units[0].block.fill_value
return empty_dtype, upcasted_na

has_none_blocks = False
Expand All @@ -405,25 +409,29 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A
else:
dtypes[i] = unit.dtype

filtered_dtypes = [
unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
]
if not len(filtered_dtypes):
filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None]
dtype_alt = find_common_type(filtered_dtypes)

upcast_classes = _get_upcast_classes(join_units, dtypes)

if is_extension_array_dtype(dtype_alt):
return dtype_alt, dtype_alt.na_value
elif dtype_alt == object:
return dtype_alt, np.nan

# TODO: de-duplicate with maybe_promote?
# create the result
if "extension" in upcast_classes:
if len(upcast_classes) == 1:
cls = upcast_classes["extension"][0]
return cls, cls.na_value
else:
return np.dtype("object"), np.nan
elif "object" in upcast_classes:
return np.dtype(np.object_), np.nan
return np.dtype("object"), np.nan
elif "bool" in upcast_classes:
if has_none_blocks:
return np.dtype(np.object_), np.nan
else:
return np.dtype(np.bool_), None
elif "category" in upcast_classes:
return np.dtype(np.object_), np.nan
elif "datetimetz" in upcast_classes:
# GH-25014. We use NaT instead of iNaT, since this eventually
# ends up in DatetimeArray.take, which does not allow iNaT.
Expand Down Expand Up @@ -481,7 +489,7 @@ def _get_upcast_classes(
def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
"""Select upcast class name based on dtype."""
if is_categorical_dtype(dtype):
return "category"
return "extension"
elif is_datetime64tz_dtype(dtype):
return "datetimetz"
elif is_extension_array_dtype(dtype):
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,7 @@ class TestConstructors(base.BaseConstructorsTests):


class TestReshaping(base.BaseReshapingTests):
@pytest.mark.xfail(reason="Deliberately upcast to object?")
def test_concat_with_reindex(self, data):
super().test_concat_with_reindex(data)
pass


class TestGetitem(base.BaseGetitemTests):
Expand Down
22 changes: 17 additions & 5 deletions pandas/tests/reshape/concat/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,13 +365,25 @@ def test_append_empty_tz_frame_with_datetime64ns(self):

# pd.NaT gets inferred as tz-naive, so append result is tz-naive
result = df.append({"a": pd.NaT}, ignore_index=True)
expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]")
expected = DataFrame({"a": [pd.NaT]}).astype(object)
tm.assert_frame_equal(result, expected)

# also test with typed value to append
df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
result = df.append(
Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True
)
expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]")
other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
result = df.append(other, ignore_index=True)
expected = DataFrame({"a": [pd.NaT]}).astype(object)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
)
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str):
# https://github.com/pandas-dev/pandas/issues/35460
df = DataFrame(columns=["a"]).astype(dtype_str)

other = DataFrame({"a": [np.timedelta64("NaT", "ns")]})
result = df.append(other, ignore_index=True)

expected = other.astype(object)
tm.assert_frame_equal(result, expected)
1 change: 1 addition & 0 deletions pandas/tests/reshape/concat/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_categorical_concat(self, sort):
"h": [None] * 6 + cat_values,
}
)
exp["h"] = exp["h"].astype(df2["h"].dtype)
tm.assert_frame_equal(res, exp)

def test_categorical_concat_dtypes(self):
Expand Down

0 comments on commit 082023e

Please sign in to comment.