Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: incorrect casting in DataFrame.append #39454

Merged
merged 4 commits into from
Feb 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ Reshaping
- Bug in :meth:`DataFrame.join` not assigning values correctly when having :class:`MultiIndex` where at least one dimension is from dtype ``Categorical`` with non-alphabetically sorted categories (:issue:`38502`)
- :meth:`Series.value_counts` and :meth:`Series.mode` return consistent keys in original order (:issue:`12679`, :issue:`11227` and :issue:`39007`)
- Bug in :meth:`DataFrame.apply` would give incorrect results when used with a string argument and ``axis=1`` when the axis argument was not supported and now raises a ``ValueError`` instead (:issue:`39211`)
-
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`)

Sparse
^^^^^^
Expand Down
32 changes: 20 additions & 12 deletions pandas/core/internals/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pandas._typing import ArrayLike, DtypeObj, Manager, Shape
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.cast import find_common_type, maybe_promote
from pandas.core.dtypes.common import (
get_dtype,
is_categorical_dtype,
Expand Down Expand Up @@ -393,7 +393,11 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A
if _is_uniform_reindex(join_units):
# FIXME: integrate property
empty_dtype = join_units[0].block.dtype
upcasted_na = join_units[0].block.fill_value
if is_extension_array_dtype(empty_dtype):
# for dt64tz we need this to get NaT instead of np.datetime64("NaT")
upcasted_na = empty_dtype.na_value
else:
upcasted_na = join_units[0].block.fill_value
return empty_dtype, upcasted_na

has_none_blocks = False
Expand All @@ -404,25 +408,29 @@ def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, A
else:
dtypes[i] = unit.dtype

filtered_dtypes = [
unit.dtype for unit in join_units if unit.block is not None and not unit.is_na
]
if not len(filtered_dtypes):
filtered_dtypes = [unit.dtype for unit in join_units if unit.block is not None]
dtype_alt = find_common_type(filtered_dtypes)

upcast_classes = _get_upcast_classes(join_units, dtypes)

if is_extension_array_dtype(dtype_alt):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we just drop the upcast_classes stuff and just do an if/elseif here on dtype_alt? (as you have done with extension / object)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isnt't this heavily overlapping with: #39453 e.g. i like #39453 as it gets what we want ultimately rigth?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think we're going to get there eventually, but there are a handful of tests that break if we do that right away. i think the behavior in those tests can be considered buggy (xref #39122), but want to address them in bite-size pieces

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok that's fine.

return dtype_alt, dtype_alt.na_value
elif dtype_alt == object:
return dtype_alt, np.nan

# TODO: de-duplicate with maybe_promote?
# create the result
if "extension" in upcast_classes:
if len(upcast_classes) == 1:
cls = upcast_classes["extension"][0]
return cls, cls.na_value
else:
return np.dtype("object"), np.nan
elif "object" in upcast_classes:
return np.dtype(np.object_), np.nan
return np.dtype("object"), np.nan
elif "bool" in upcast_classes:
if has_none_blocks:
return np.dtype(np.object_), np.nan
else:
return np.dtype(np.bool_), None
elif "category" in upcast_classes:
return np.dtype(np.object_), np.nan
elif "datetimetz" in upcast_classes:
# GH-25014. We use NaT instead of iNaT, since this eventually
# ends up in DatetimeArray.take, which does not allow iNaT.
Expand Down Expand Up @@ -480,7 +488,7 @@ def _get_upcast_classes(
def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
"""Select upcast class name based on dtype."""
if is_categorical_dtype(dtype):
return "category"
return "extension"
elif is_datetime64tz_dtype(dtype):
return "datetimetz"
elif is_extension_array_dtype(dtype):
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,7 @@ class TestConstructors(base.BaseConstructorsTests):


class TestReshaping(base.BaseReshapingTests):
@pytest.mark.xfail(reason="Deliberately upcast to object?")
def test_concat_with_reindex(self, data):
super().test_concat_with_reindex(data)
pass


class TestGetitem(base.BaseGetitemTests):
Expand Down
22 changes: 17 additions & 5 deletions pandas/tests/reshape/concat/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,13 +365,25 @@ def test_append_empty_tz_frame_with_datetime64ns(self):

# pd.NaT gets inferred as tz-naive, so append result is tz-naive
result = df.append({"a": pd.NaT}, ignore_index=True)
expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]")
expected = DataFrame({"a": [pd.NaT]}).astype(object)
tm.assert_frame_equal(result, expected)

# also test with typed value to append
df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
result = df.append(
Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True
)
expected = DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]")
other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
result = df.append(other, ignore_index=True)
expected = DataFrame({"a": [pd.NaT]}).astype(object)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
)
def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str):
# https://github.com/pandas-dev/pandas/issues/35460
df = DataFrame(columns=["a"]).astype(dtype_str)

other = DataFrame({"a": [np.timedelta64("NaT", "ns")]})
result = df.append(other, ignore_index=True)

expected = other.astype(object)
tm.assert_frame_equal(result, expected)
1 change: 1 addition & 0 deletions pandas/tests/reshape/concat/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def test_categorical_concat(self, sort):
"h": [None] * 6 + cat_values,
}
)
exp["h"] = exp["h"].astype(df2["h"].dtype)
tm.assert_frame_equal(res, exp)

def test_categorical_concat_dtypes(self):
Expand Down