Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: combine_first does not retain dtypes with Timestamp DataFrames #38145

Merged
merged 23 commits into from
Nov 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ Categorical

Datetimelike
^^^^^^^^^^^^
- Bug in :meth:`DataFrame.combine_first` that would convert datetime-like column on other :class:`DataFrame` to integer when the column is not present in original :class:`DataFrame` (:issue:`28481`)
- Bug in :attr:`.DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`)
- Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`)
- Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6386,7 +6386,7 @@ def combine(
otherSeries = otherSeries.astype(new_dtype)

arr = func(series, otherSeries)
arr = maybe_downcast_to_dtype(arr, this_dtype)
arr = maybe_downcast_to_dtype(arr, new_dtype)

result[col] = arr

Expand Down
103 changes: 74 additions & 29 deletions pandas/tests/frame/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def test_combine_first_mixed_bug(self):
combined = frame1.combine_first(frame2)
assert len(combined.columns) == 5

def test_combine_first_same_as_in_update(self):
# gh 3016 (same as in update)
df = DataFrame(
[[1.0, 2.0, False, True], [4.0, 5.0, True, False]],
Expand All @@ -118,6 +119,7 @@ def test_combine_first_mixed_bug(self):
df.loc[0, "A"] = 45
tm.assert_frame_equal(result, df)

def test_combine_first_doc_example(self):
# doc example
df1 = DataFrame(
{"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
Expand All @@ -134,38 +136,56 @@ def test_combine_first_mixed_bug(self):
expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]})
tm.assert_frame_equal(result, expected)

# GH3552, return object dtype with bools
def test_combine_first_return_obj_type_with_bools(self):
# GH3552

df1 = DataFrame(
[[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]]
)
df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2])

result = df1.combine_first(df2)[2]
expected = Series([True, True, False], name=2)
tm.assert_series_equal(result, expected)

# GH 3593, converting datetime64[ns] incorrectly
df0 = DataFrame(
{"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
)
df1 = DataFrame({"a": [None, None, None]})
df2 = df1.combine_first(df0)
tm.assert_frame_equal(df2, df0)

df2 = df0.combine_first(df1)
tm.assert_frame_equal(df2, df0)

df0 = DataFrame(
{"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]}
)
df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]})
df2 = df1.combine_first(df0)
result = df0.copy()
result.iloc[0, :] = df1.iloc[0, :]
tm.assert_frame_equal(df2, result)
expected = Series([True, True, False], name=2, dtype=object)

result_12 = df1.combine_first(df2)[2]
tm.assert_series_equal(result_12, expected)

result_21 = df2.combine_first(df1)[2]
tm.assert_series_equal(result_21, expected)

@pytest.mark.parametrize(
"data1, data2, data_expected",
(
(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
[None, None, None],
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
),
(
[None, None, None],
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
),
(
[datetime(2000, 1, 2), None, None],
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
[datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)],
),
(
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
[datetime(2000, 1, 2), None, None],
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
),
),
)
def test_combine_first_convert_datatime_correctly(
self, data1, data2, data_expected
):
# GH 3593

df2 = df0.combine_first(df1)
tm.assert_frame_equal(df2, df0)
df1, df2 = DataFrame({"a": data1}), DataFrame({"a": data2})
result = df1.combine_first(df2)
expected = DataFrame({"a": data_expected})
tm.assert_frame_equal(result, expected)

def test_combine_first_align_nan(self):
# GH 7509 (not fixed)
Expand Down Expand Up @@ -339,9 +359,14 @@ def test_combine_first_int(self):
df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64")
df2 = DataFrame({"a": [1, 4]}, dtype="int64")

res = df1.combine_first(df2)
tm.assert_frame_equal(res, df1)
assert res["a"].dtype == "int64"
result_12 = df1.combine_first(df2)
expected_12 = DataFrame({"a": [0, 1, 3, 5]}, dtype="float64")
tm.assert_frame_equal(result_12, expected_12)

result_21 = df2.combine_first(df1)
expected_21 = DataFrame({"a": [1, 4, 3, 5]}, dtype="float64")

tm.assert_frame_equal(result_21, expected_21)

@pytest.mark.parametrize("val", [1, 1.0])
def test_combine_first_with_asymmetric_other(self, val):
Expand All @@ -367,6 +392,26 @@ def test_combine_first_string_dtype_only_na(self):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"scalar1, scalar2",
[
(datetime(2020, 1, 1), datetime(2020, 1, 2)),
(pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can add an Interval example here as well

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

(pd.Timedelta("89 days"), pd.Timedelta("60 min")),
(pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")),
],
)
def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):
# GH28481
na_value = nulls_fixture
frame = DataFrame([[na_value, na_value]], columns=["a", "b"])
other = DataFrame([[scalar1, scalar2]], columns=["b", "c"])

result = frame.combine_first(other)
expected = DataFrame([[na_value, scalar1, scalar2]], columns=["a", "b", "c"])
tm.assert_frame_equal(result, expected)


def test_combine_first_with_nan_multiindex():
# gh-36562

Expand Down