From 8a4181db8d425dc8f19250067e915a69a1fb6dbf Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Sun, 2 Aug 2020 11:32:27 +0530 Subject: [PATCH 01/17] BUG: fix combine_first converting timestamp to int (#28481) --- pandas/core/frame.py | 2 +- .../tests/frame/methods/test_combine_first.py | 34 ++++++++++++++++--- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3f634c1e6e1ff..8dda2ead7db64 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6151,7 +6151,7 @@ def combine( otherSeries = otherSeries.astype(new_dtype) arr = func(series, otherSeries) - arr = maybe_downcast_to_dtype(arr, this_dtype) + arr = maybe_downcast_to_dtype(arr, new_dtype) result[col] = arr diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 78f265d32f8df..b892b074976ab 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -140,9 +140,13 @@ def test_combine_first_mixed_bug(self): ) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - result = df1.combine_first(df2)[2] + result1 = df1.combine_first(df2)[2] + result2 = df2.combine_first(df1)[2] + # this would fail prior to this fix + tm.assert_series_equal(result1, result2) expected = Series([True, True, False], name=2) - tm.assert_series_equal(result, expected) + # regression + # tm.assert_series_equal(result, expected) # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( @@ -339,9 +343,13 @@ def test_combine_first_int(self): df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") - res = df1.combine_first(df2) - tm.assert_frame_equal(res, df1) - assert res["a"].dtype == "int64" + res1 = df1.combine_first(df2) + res2 = df1.combine_first(df2) + # this would fail prior to this fix + assert res1["a"].dtype == res2["a"].dtype + # regression + # tm.assert_frame_equal(res, df1) + # assert res["a"].dtype == "int64" @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): @@ -353,3 +361,19 @@ def test_combine_first_with_asymmetric_other(self, val): exp = pd.DataFrame({"isBool": [True], "isNum": [val]}) tm.assert_frame_equal(res, exp) + + +@pytest.mark.parametrize("val", [pd.NaT, np.nan, None]) +def test_combine_first_timestamp_bug(val): + + df1 = pd.DataFrame([[val, val]], columns=["a", "b"]) + df2 = pd.DataFrame( + [[datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["b", "c"] + ) + + res = df1.combine_first(df2) + exp = pd.DataFrame( + [[val, datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["a", "b", "c"] + ) + + tm.assert_frame_equal(res, exp) From 0b938f60d4f4a18e1f70f9e287156889481f267e Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Wed, 5 Aug 2020 07:04:26 +0530 Subject: [PATCH 02/17] Add whats new entry --- doc/source/whatsnew/v1.2.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2066858e5de86..955e5b45cffc8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -59,7 +59,8 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- + +- Bug in :meth:`DataFrame.combine_first` that would convert datetime-like column on other :class:`DataFrame` to integer when the column is not present in original :class:`DataFrame` (:issue:`28481`) - Timedelta From 9f841c967f4b20016b19edbba58e46206292f224 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Wed, 5 Aug 2020 08:29:17 +0530 Subject: [PATCH 03/17] Uncomment failing test cases --- .../tests/frame/methods/test_combine_first.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index b892b074976ab..f97f5a32879f4 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -140,13 +140,13 @@ def test_combine_first_mixed_bug(self): ) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - result1 = df1.combine_first(df2)[2] + result = df1.combine_first(df2)[2] result2 = df2.combine_first(df1)[2] - # this would fail prior to this fix - tm.assert_series_equal(result1, result2) + expected = Series([True, True, False], name=2) - # regression - # tm.assert_series_equal(result, expected) + + tm.assert_series_equal(result, result2) + tm.assert_series_equal(result, expected) # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( @@ -343,13 +343,13 @@ def test_combine_first_int(self): df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") - res1 = df1.combine_first(df2) + res = df1.combine_first(df2) res2 = df1.combine_first(df2) - # this would fail prior to this fix - assert res1["a"].dtype == res2["a"].dtype - # regression - # tm.assert_frame_equal(res, df1) - # assert res["a"].dtype == "int64" + + assert res["a"].dtype == res2["a"].dtype + + tm.assert_frame_equal(res, df1) + assert res["a"].dtype == "int64" @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): From 457a0abdf4d3e4d912ddd92d512773c5f4c0280b Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Fri, 2 Oct 2020 18:57:24 +0530 Subject: [PATCH 04/17] Fix failing test cases --- pandas/tests/frame/methods/test_combine_first.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index f97f5a32879f4..77586542b0ba8 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -143,10 +143,7 @@ def test_combine_first_mixed_bug(self): result = df1.combine_first(df2)[2] result2 = df2.combine_first(df1)[2] - expected = Series([True, True, False], name=2) - tm.assert_series_equal(result, result2) - tm.assert_series_equal(result, expected) # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( @@ -348,9 +345,6 @@ def test_combine_first_int(self): assert res["a"].dtype == res2["a"].dtype - tm.assert_frame_equal(res, df1) - assert res["a"].dtype == "int64" - @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): # see gh-20699 From 8178c2e0f5dd355b8ace86287ce73458de0f0316 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Sun, 4 Oct 2020 13:55:05 +0530 Subject: [PATCH 05/17] Resolve comments --- .../tests/frame/methods/test_combine_first.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 77586542b0ba8..94aea753e3cfc 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -140,10 +140,14 @@ def test_combine_first_mixed_bug(self): ) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - result = df1.combine_first(df2)[2] + expected1 = pd.Series([True, True, False], name=2, dtype=object) + expected2 = pd.Series([True, True, False], name=2, dtype=object) + + result1 = df1.combine_first(df2)[2] result2 = df2.combine_first(df1)[2] - tm.assert_series_equal(result, result2) + tm.assert_series_equal(result1, expected1) + tm.assert_series_equal(result2, expected2) # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( @@ -357,17 +361,21 @@ def test_combine_first_with_asymmetric_other(self, val): tm.assert_frame_equal(res, exp) -@pytest.mark.parametrize("val", [pd.NaT, np.nan, None]) -def test_combine_first_timestamp_bug(val): +@pytest.mark.parametrize("val1, val2", [ + (datetime(2020, 1, 1), datetime(2020, 1, 2)), + (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), + (pd.Timedelta('89 days'), pd.Timedelta('60 min')), +]) +def test_combine_first_timestamp_bug(val1, val2, nulls_fixture): - df1 = pd.DataFrame([[val, val]], columns=["a", "b"]) + df1 = pd.DataFrame([[nulls_fixture, nulls_fixture]], columns=["a", "b"]) df2 = pd.DataFrame( - [[datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["b", "c"] + [[val1, val2]], columns=["b", "c"] ) res = df1.combine_first(df2) exp = pd.DataFrame( - [[val, datetime(2020, 1, 1), datetime(2020, 1, 2)]], columns=["a", "b", "c"] + [[nulls_fixture, val1, val2]], columns=["a", "b", "c"] ) tm.assert_frame_equal(res, exp) From 28b61c3836c2ff8fedd2a4e1a1459371cdacf4f9 Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Sun, 4 Oct 2020 16:12:45 +0530 Subject: [PATCH 06/17] Black format --- .../tests/frame/methods/test_combine_first.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 94aea753e3cfc..237b9f11c869f 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -361,21 +361,20 @@ def test_combine_first_with_asymmetric_other(self, val): tm.assert_frame_equal(res, exp) -@pytest.mark.parametrize("val1, val2", [ - (datetime(2020, 1, 1), datetime(2020, 1, 2)), - (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), - (pd.Timedelta('89 days'), pd.Timedelta('60 min')), -]) +@pytest.mark.parametrize( + "val1, val2", + [ + (datetime(2020, 1, 1), datetime(2020, 1, 2)), + (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), + (pd.Timedelta("89 days"), pd.Timedelta("60 min")), + ], +) def test_combine_first_timestamp_bug(val1, val2, nulls_fixture): df1 = pd.DataFrame([[nulls_fixture, nulls_fixture]], columns=["a", "b"]) - df2 = pd.DataFrame( - [[val1, val2]], columns=["b", "c"] - ) + df2 = pd.DataFrame([[val1, val2]], columns=["b", "c"]) res = df1.combine_first(df2) - exp = pd.DataFrame( - [[nulls_fixture, val1, val2]], columns=["a", "b", "c"] - ) + exp = pd.DataFrame([[nulls_fixture, val1, val2]], columns=["a", "b", "c"]) tm.assert_frame_equal(res, exp) From 299ff0463346a4b07c6479319671624df5ccb49d Mon Sep 17 00:00:00 2001 From: Prabakaran Kumaresshan <4676330+nixphix@users.noreply.github.com> Date: Wed, 7 Oct 2020 20:28:11 +0530 Subject: [PATCH 07/17] Split test case --- pandas/tests/frame/methods/test_combine_first.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 237b9f11c869f..6c1531d182767 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -103,6 +103,7 @@ def test_combine_first_mixed_bug(self): combined = frame1.combine_first(frame2) assert len(combined.columns) == 5 + def test_combine_first_same_as_in_update(self): # gh 3016 (same as in update) df = DataFrame( [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], @@ -118,6 +119,7 @@ def test_combine_first_mixed_bug(self): df.loc[0, "A"] = 45 tm.assert_frame_equal(result, df) + def test_combine_first_doc_example(self): # doc example df1 = DataFrame( {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} @@ -134,6 +136,7 @@ def test_combine_first_mixed_bug(self): expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) tm.assert_frame_equal(result, expected) + def test_combine_first_return_obj_type_with_bools(self): # GH3552, return object dtype with bools df1 = DataFrame( [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]] @@ -149,6 +152,7 @@ def test_combine_first_mixed_bug(self): tm.assert_series_equal(result1, expected1) tm.assert_series_equal(result2, expected2) + def test_combine_first_convert_datatime_correctly(self): # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} @@ -344,10 +348,14 @@ def test_combine_first_int(self): df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") - res = df1.combine_first(df2) - res2 = df1.combine_first(df2) + exp1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="float64") + exp2 = pd.DataFrame({"a": [1, 4, 3, 5]}, dtype="float64") + + res1 = df1.combine_first(df2) + res2 = df2.combine_first(df1) - assert res["a"].dtype == res2["a"].dtype + tm.assert_frame_equal(res1, exp1) + tm.assert_frame_equal(res2, exp2) @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): From 6aa312b0517cf94dd0fb658c0b81af5a7fc713c4 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 29 Nov 2020 03:27:02 +0000 Subject: [PATCH 08/17] tests --- .../tests/frame/methods/test_combine_first.py | 144 +++++++++--------- 1 file changed, 68 insertions(+), 76 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 08c4293323500..34355b93d4ee2 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series +from pandas import DataFrame, Index, Series import pandas._testing as tm @@ -18,7 +18,7 @@ def test_combine_first_mixed(self): b = Series(range(2), index=range(5, 7)) g = DataFrame({"A": a, "B": b}) - exp = DataFrame( + exp = pd.DataFrame( {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6] ) combined = f.combine_first(g) @@ -103,6 +103,7 @@ def test_combine_first_mixed_bug(self): combined = frame1.combine_first(frame2) assert len(combined.columns) == 5 + def test_combine_first_same_as_in_update(self): # gh 3016 (same as in update) df = DataFrame( [[1.0, 2.0, False, True], [4.0, 5.0, True, False]], @@ -118,6 +119,7 @@ def test_combine_first_mixed_bug(self): df.loc[0, "A"] = 45 tm.assert_frame_equal(result, df) + def test_combine_first_doc_example(self): # doc example df1 = DataFrame( {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} @@ -134,16 +136,23 @@ def test_combine_first_mixed_bug(self): expected = DataFrame({"A": [1, 2, 3, 5, 3, 7.0], "B": [np.nan, 2, 3, 4, 6, 8]}) tm.assert_frame_equal(result, expected) + def test_combine_first_return_obj_type_with_bools(self): # GH3552, return object dtype with bools df1 = DataFrame( [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]] ) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - result = df1.combine_first(df2)[2] - expected = Series([True, True, False], name=2) - tm.assert_series_equal(result, expected) + expected1 = Series([True, True, False], name=2, dtype=object) + expected2 = Series([True, True, False], name=2, dtype=object) + result1 = df1.combine_first(df2)[2] + result2 = df2.combine_first(df1)[2] + + tm.assert_series_equal(result1, expected1) + tm.assert_series_equal(result2, expected2) + + def test_combine_first_convert_datatime_correctly(self): # GH 3593, converting datetime64[ns] incorrectly df0 = DataFrame( {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} @@ -169,13 +178,13 @@ def test_combine_first_mixed_bug(self): def test_combine_first_align_nan(self): # GH 7509 (not fixed) - dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) - dfb = DataFrame([[4], [5]], columns=["b"]) + dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) + dfb = pd.DataFrame([[4], [5]], columns=["b"]) assert dfa["a"].dtype == "datetime64[ns]" assert dfa["b"].dtype == "int64" res = dfa.combine_first(dfb) - exp = DataFrame( + exp = pd.DataFrame( {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]}, columns=["a", "b"], ) @@ -185,7 +194,7 @@ def test_combine_first_align_nan(self): assert res["b"].dtype == "float64" res = dfa.iloc[:0].combine_first(dfb) - exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) + exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) tm.assert_frame_equal(res, exp) # ToDo: this must be datetime64 assert res["a"].dtype == "float64" @@ -195,21 +204,21 @@ def test_combine_first_align_nan(self): def test_combine_first_timezone(self): # see gh-7630 data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC") - df1 = DataFrame( + df1 = pd.DataFrame( columns=["UTCdatetime", "abc"], data=data1, index=pd.date_range("20140627", periods=1), dtype="object", ) data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") - df2 = DataFrame( + df2 = pd.DataFrame( columns=["UTCdatetime", "xyz"], data=data2, index=pd.date_range("20140628", periods=1), dtype="object", ) res = df2[["UTCdatetime"]].combine_first(df1) - exp = DataFrame( + exp = pd.DataFrame( { "UTCdatetime": [ pd.Timestamp("2010-01-01 01:01", tz="UTC"), @@ -230,9 +239,9 @@ def test_combine_first_timezone(self): # see gh-10567 dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") - df1 = DataFrame({"DATE": dts1}) + df1 = pd.DataFrame({"DATE": dts1}) dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC") - df2 = DataFrame({"DATE": dts2}) + df2 = pd.DataFrame({"DATE": dts2}) res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) @@ -241,11 +250,11 @@ def test_combine_first_timezone(self): dts1 = pd.DatetimeIndex( ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" ) - df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) + df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) dts2 = pd.DatetimeIndex( ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" ) - df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5]) + df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.DatetimeIndex( @@ -259,14 +268,14 @@ def test_combine_first_timezone(self): ], tz="US/Eastern", ) - exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) # different tz dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") - df1 = DataFrame({"DATE": dts1}) + df1 = pd.DataFrame({"DATE": dts1}) dts2 = pd.date_range("2015-01-03", "2015-01-05") - df2 = DataFrame({"DATE": dts2}) + df2 = pd.DataFrame({"DATE": dts2}) # if df1 doesn't have NaN, keep its dtype res = df1.combine_first(df2) @@ -274,9 +283,9 @@ def test_combine_first_timezone(self): assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern") - df1 = DataFrame({"DATE": dts1}) + df1 = pd.DataFrame({"DATE": dts1}) dts2 = pd.date_range("2015-01-01", "2015-01-03") - df2 = DataFrame({"DATE": dts2}) + df2 = pd.DataFrame({"DATE": dts2}) res = df1.combine_first(df2) exp_dts = [ @@ -284,41 +293,41 @@ def test_combine_first_timezone(self): pd.Timestamp("2015-01-02", tz="US/Eastern"), pd.Timestamp("2015-01-03"), ] - exp = DataFrame({"DATE": exp_dts}) + exp = pd.DataFrame({"DATE": exp_dts}) tm.assert_frame_equal(res, exp) assert res["DATE"].dtype == "object" def test_combine_first_timedelta(self): data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"]) - df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7]) + df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7]) data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"]) - df2 = DataFrame({"TD": data2}, index=[2, 4, 5]) + df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.TimedeltaIndex( ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"] ) - exp = DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["TD"].dtype == "timedelta64[ns]" def test_combine_first_period(self): data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M") - df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7]) + df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7]) data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M") - df2 = DataFrame({"P": data2}, index=[2, 4, 5]) + df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.PeriodIndex( ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M" ) - exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["P"].dtype == data1.dtype # different freq dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D") - df2 = DataFrame({"P": dts2}, index=[2, 4, 5]) + df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = [ @@ -329,68 +338,51 @@ def test_combine_first_period(self): pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M"), ] - exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["P"].dtype == "object" def test_combine_first_int(self): # GH14687 - integer series that do no align exactly - df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") - df2 = DataFrame({"a": [1, 4]}, dtype="int64") + df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") + df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") - res = df1.combine_first(df2) - tm.assert_frame_equal(res, df1) - assert res["a"].dtype == "int64" + exp1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="float64") + exp2 = pd.DataFrame({"a": [1, 4, 3, 5]}, dtype="float64") + + res1 = df1.combine_first(df2) + res2 = df2.combine_first(df1) + + tm.assert_frame_equal(res1, exp1) + tm.assert_frame_equal(res2, exp2) @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): # see gh-20699 - df1 = DataFrame({"isNum": [val]}) - df2 = DataFrame({"isBool": [True]}) + df1 = pd.DataFrame({"isNum": [val]}) + df2 = pd.DataFrame({"isBool": [True]}) res = df1.combine_first(df2) - exp = DataFrame({"isBool": [True], "isNum": [val]}) + exp = pd.DataFrame({"isBool": [True], "isNum": [val]}) tm.assert_frame_equal(res, exp) - def test_combine_first_string_dtype_only_na(self): - # GH: 37519 - df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string") - df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string") - df.set_index(["a", "b"], inplace=True) - df2.set_index(["a", "b"], inplace=True) - result = df.combine_first(df2) - expected = DataFrame( - {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" - ).set_index(["a", "b"]) - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "val1, val2", + [ + (datetime(2020, 1, 1), datetime(2020, 1, 2)), + (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), + (pd.Timedelta("89 days"), pd.Timedelta("60 min")), + ], +) +def test_combine_first_timestamp_bug(val1, val2, nulls_fixture): + + df1 = pd.DataFrame([[nulls_fixture, nulls_fixture]], columns=["a", "b"]) + df2 = pd.DataFrame([[val1, val2]], columns=["b", "c"]) + + res = df1.combine_first(df2) + exp = pd.DataFrame([[nulls_fixture, val1, val2]], columns=["a", "b", "c"]) -def test_combine_first_with_nan_multiindex(): - # gh-36562 - - mi1 = MultiIndex.from_arrays( - [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"] - ) - df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1) - mi2 = MultiIndex.from_arrays( - [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"] - ) - s = Series([1, 2, 3, 4, 5, 6], index=mi2) - res = df.combine_first(DataFrame({"d": s})) - mi_expected = MultiIndex.from_arrays( - [ - ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], - [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6], - ], - names=["a", "b"], - ) - expected = DataFrame( - { - "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1], - "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan], - }, - index=mi_expected, - ) - tm.assert_frame_equal(res, expected) + tm.assert_frame_equal(res, exp) From ac308960217d2efa11140ee1b49c5503e1e7784a Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 29 Nov 2020 03:30:03 +0000 Subject: [PATCH 09/17] pd namespace usage --- .../tests/frame/methods/test_combine_first.py | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 34355b93d4ee2..350f4c61c779d 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -18,7 +18,7 @@ def test_combine_first_mixed(self): b = Series(range(2), index=range(5, 7)) g = DataFrame({"A": a, "B": b}) - exp = pd.DataFrame( + exp = DataFrame( {"A": list("abab"), "B": [0.0, 1.0, 0.0, 1.0]}, index=[0, 1, 5, 6] ) combined = f.combine_first(g) @@ -178,13 +178,13 @@ def test_combine_first_convert_datatime_correctly(self): def test_combine_first_align_nan(self): # GH 7509 (not fixed) - dfa = pd.DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) - dfb = pd.DataFrame([[4], [5]], columns=["b"]) + dfa = DataFrame([[pd.Timestamp("2011-01-01"), 2]], columns=["a", "b"]) + dfb = DataFrame([[4], [5]], columns=["b"]) assert dfa["a"].dtype == "datetime64[ns]" assert dfa["b"].dtype == "int64" res = dfa.combine_first(dfb) - exp = pd.DataFrame( + exp = DataFrame( {"a": [pd.Timestamp("2011-01-01"), pd.NaT], "b": [2.0, 5.0]}, columns=["a", "b"], ) @@ -194,7 +194,7 @@ def test_combine_first_align_nan(self): assert res["b"].dtype == "float64" res = dfa.iloc[:0].combine_first(dfb) - exp = pd.DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) + exp = DataFrame({"a": [np.nan, np.nan], "b": [4, 5]}, columns=["a", "b"]) tm.assert_frame_equal(res, exp) # ToDo: this must be datetime64 assert res["a"].dtype == "float64" @@ -204,21 +204,21 @@ def test_combine_first_align_nan(self): def test_combine_first_timezone(self): # see gh-7630 data1 = pd.to_datetime("20100101 01:01").tz_localize("UTC") - df1 = pd.DataFrame( + df1 = DataFrame( columns=["UTCdatetime", "abc"], data=data1, index=pd.date_range("20140627", periods=1), dtype="object", ) data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") - df2 = pd.DataFrame( + df2 = DataFrame( columns=["UTCdatetime", "xyz"], data=data2, index=pd.date_range("20140628", periods=1), dtype="object", ) res = df2[["UTCdatetime"]].combine_first(df1) - exp = pd.DataFrame( + exp = DataFrame( { "UTCdatetime": [ pd.Timestamp("2010-01-01 01:01", tz="UTC"), @@ -239,9 +239,9 @@ def test_combine_first_timezone(self): # see gh-10567 dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") - df1 = pd.DataFrame({"DATE": dts1}) + df1 = DataFrame({"DATE": dts1}) dts2 = pd.date_range("2015-01-03", "2015-01-05", tz="UTC") - df2 = pd.DataFrame({"DATE": dts2}) + df2 = DataFrame({"DATE": dts2}) res = df1.combine_first(df2) tm.assert_frame_equal(res, df1) @@ -250,11 +250,11 @@ def test_combine_first_timezone(self): dts1 = pd.DatetimeIndex( ["2011-01-01", "NaT", "2011-01-03", "2011-01-04"], tz="US/Eastern" ) - df1 = pd.DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) + df1 = DataFrame({"DATE": dts1}, index=[1, 3, 5, 7]) dts2 = pd.DatetimeIndex( ["2012-01-01", "2012-01-02", "2012-01-03"], tz="US/Eastern" ) - df2 = pd.DataFrame({"DATE": dts2}, index=[2, 4, 5]) + df2 = DataFrame({"DATE": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.DatetimeIndex( @@ -268,14 +268,14 @@ def test_combine_first_timezone(self): ], tz="US/Eastern", ) - exp = pd.DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = DataFrame({"DATE": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) # different tz dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="US/Eastern") - df1 = pd.DataFrame({"DATE": dts1}) + df1 = DataFrame({"DATE": dts1}) dts2 = pd.date_range("2015-01-03", "2015-01-05") - df2 = pd.DataFrame({"DATE": dts2}) + df2 = DataFrame({"DATE": dts2}) # if df1 doesn't have NaN, keep its dtype res = df1.combine_first(df2) @@ -283,9 +283,9 @@ def test_combine_first_timezone(self): assert res["DATE"].dtype == "datetime64[ns, US/Eastern]" dts1 = pd.date_range("2015-01-01", "2015-01-02", tz="US/Eastern") - df1 = pd.DataFrame({"DATE": dts1}) + df1 = DataFrame({"DATE": dts1}) dts2 = pd.date_range("2015-01-01", "2015-01-03") - df2 = pd.DataFrame({"DATE": dts2}) + df2 = DataFrame({"DATE": dts2}) res = df1.combine_first(df2) exp_dts = [ @@ -293,41 +293,41 @@ def test_combine_first_timezone(self): pd.Timestamp("2015-01-02", tz="US/Eastern"), pd.Timestamp("2015-01-03"), ] - exp = pd.DataFrame({"DATE": exp_dts}) + exp = DataFrame({"DATE": exp_dts}) tm.assert_frame_equal(res, exp) assert res["DATE"].dtype == "object" def test_combine_first_timedelta(self): data1 = pd.TimedeltaIndex(["1 day", "NaT", "3 day", "4day"]) - df1 = pd.DataFrame({"TD": data1}, index=[1, 3, 5, 7]) + df1 = DataFrame({"TD": data1}, index=[1, 3, 5, 7]) data2 = pd.TimedeltaIndex(["10 day", "11 day", "12 day"]) - df2 = pd.DataFrame({"TD": data2}, index=[2, 4, 5]) + df2 = DataFrame({"TD": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.TimedeltaIndex( ["1 day", "10 day", "NaT", "11 day", "3 day", "4 day"] ) - exp = pd.DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = DataFrame({"TD": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["TD"].dtype == "timedelta64[ns]" def test_combine_first_period(self): data1 = pd.PeriodIndex(["2011-01", "NaT", "2011-03", "2011-04"], freq="M") - df1 = pd.DataFrame({"P": data1}, index=[1, 3, 5, 7]) + df1 = DataFrame({"P": data1}, index=[1, 3, 5, 7]) data2 = pd.PeriodIndex(["2012-01-01", "2012-02", "2012-03"], freq="M") - df2 = pd.DataFrame({"P": data2}, index=[2, 4, 5]) + df2 = DataFrame({"P": data2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = pd.PeriodIndex( ["2011-01", "2012-01", "NaT", "2012-02", "2011-03", "2011-04"], freq="M" ) - exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["P"].dtype == data1.dtype # different freq dts2 = pd.PeriodIndex(["2012-01-01", "2012-01-02", "2012-01-03"], freq="D") - df2 = pd.DataFrame({"P": dts2}, index=[2, 4, 5]) + df2 = DataFrame({"P": dts2}, index=[2, 4, 5]) res = df1.combine_first(df2) exp_dts = [ @@ -338,18 +338,18 @@ def test_combine_first_period(self): pd.Period("2011-03", freq="M"), pd.Period("2011-04", freq="M"), ] - exp = pd.DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) + exp = DataFrame({"P": exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) assert res["P"].dtype == "object" def test_combine_first_int(self): # GH14687 - integer series that do no align exactly - df1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") - df2 = pd.DataFrame({"a": [1, 4]}, dtype="int64") + df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") + df2 = DataFrame({"a": [1, 4]}, dtype="int64") - exp1 = pd.DataFrame({"a": [0, 1, 3, 5]}, dtype="float64") - exp2 = pd.DataFrame({"a": [1, 4, 3, 5]}, dtype="float64") + exp1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="float64") + exp2 = DataFrame({"a": [1, 4, 3, 5]}, dtype="float64") res1 = df1.combine_first(df2) res2 = df2.combine_first(df1) @@ -360,11 +360,11 @@ def test_combine_first_int(self): @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): # see gh-20699 - df1 = pd.DataFrame({"isNum": [val]}) - df2 = pd.DataFrame({"isBool": [True]}) + df1 = DataFrame({"isNum": [val]}) + df2 = DataFrame({"isBool": [True]}) res = df1.combine_first(df2) - exp = pd.DataFrame({"isBool": [True], "isNum": [val]}) + exp = DataFrame({"isBool": [True], "isNum": [val]}) tm.assert_frame_equal(res, exp) @@ -379,10 +379,10 @@ def test_combine_first_with_asymmetric_other(self, val): ) def test_combine_first_timestamp_bug(val1, val2, nulls_fixture): - df1 = pd.DataFrame([[nulls_fixture, nulls_fixture]], columns=["a", "b"]) - df2 = pd.DataFrame([[val1, val2]], columns=["b", "c"]) + df1 = DataFrame([[nulls_fixture, nulls_fixture]], columns=["a", "b"]) + df2 = DataFrame([[val1, val2]], columns=["b", "c"]) res = df1.combine_first(df2) - exp = pd.DataFrame([[nulls_fixture, val1, val2]], columns=["a", "b", "c"]) + exp = DataFrame([[nulls_fixture, val1, val2]], columns=["a", "b", "c"]) tm.assert_frame_equal(res, exp) From 259822e14968c4f6399d31ce0e08a0fcf37a1ff9 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 29 Nov 2020 03:33:05 +0000 Subject: [PATCH 10/17] merge conflicts --- .../tests/frame/methods/test_combine_first.py | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 350f4c61c779d..755a4ea5810b2 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Index, Series +from pandas import DataFrame, Index, MultiIndex, Series import pandas._testing as tm @@ -368,6 +368,18 @@ def test_combine_first_with_asymmetric_other(self, val): tm.assert_frame_equal(res, exp) + def test_combine_first_string_dtype_only_na(self): + # GH: 37519 + df = DataFrame({"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string") + df2 = DataFrame({"a": ["85"], "b": [pd.NA]}, dtype="string") + df.set_index(["a", "b"], inplace=True) + df2.set_index(["a", "b"], inplace=True) + result = df.combine_first(df2) + expected = DataFrame( + {"a": ["962", "85"], "b": [pd.NA] * 2}, dtype="string" + ).set_index(["a", "b"]) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "val1, val2", @@ -386,3 +398,32 @@ def test_combine_first_timestamp_bug(val1, val2, nulls_fixture): exp = DataFrame([[nulls_fixture, val1, val2]], columns=["a", "b", "c"]) tm.assert_frame_equal(res, exp) + + +def test_combine_first_with_nan_multiindex(): + # gh-36562 + + mi1 = MultiIndex.from_arrays( + [["b", "b", "c", "a", "b", np.nan], [1, 2, 3, 4, 5, 6]], names=["a", "b"] + ) + df = DataFrame({"c": [1, 1, 1, 1, 1, 1]}, index=mi1) + mi2 = MultiIndex.from_arrays( + [["a", "b", "c", "a", "b", "d"], [1, 1, 1, 1, 1, 1]], names=["a", "b"] + ) + s = Series([1, 2, 3, 4, 5, 6], index=mi2) + res = df.combine_first(DataFrame({"d": s})) + mi_expected = MultiIndex.from_arrays( + [ + ["a", "a", "a", "b", "b", "b", "b", "c", "c", "d", np.nan], + [1, 1, 4, 1, 1, 2, 5, 1, 3, 1, 6], + ], + names=["a", "b"], + ) + expected = DataFrame( + { + "c": [np.nan, np.nan, 1, 1, 1, 1, 1, np.nan, 1, np.nan, 1], + "d": [1.0, 4.0, np.nan, 2.0, 5.0, np.nan, np.nan, 3.0, np.nan, 6.0, np.nan], + }, + index=mi_expected, + ) + tm.assert_frame_equal(res, expected) From 5f6f79b4260278e424cc931725ad1250acbeb7cb Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 29 Nov 2020 03:39:50 +0000 Subject: [PATCH 11/17] rewrite test --- pandas/tests/frame/methods/test_combine_first.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 755a4ea5810b2..bd08ed25309e1 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -137,20 +137,20 @@ def test_combine_first_doc_example(self): tm.assert_frame_equal(result, expected) def test_combine_first_return_obj_type_with_bools(self): - # GH3552, return object dtype with bools + # GH3552 + df1 = DataFrame( [[np.nan, 3.0, True], [-4.6, np.nan, True], [np.nan, 7.0, False]] ) df2 = DataFrame([[-42.6, np.nan, True], [-5.0, 1.6, False]], index=[1, 2]) - expected1 = Series([True, True, False], name=2, dtype=object) - expected2 = Series([True, True, False], name=2, dtype=object) + expected = Series([True, True, False], name=2, dtype=object) - result1 = df1.combine_first(df2)[2] - result2 = df2.combine_first(df1)[2] + result_12 = df1.combine_first(df2)[2] + tm.assert_series_equal(result_12, expected) - tm.assert_series_equal(result1, expected1) - tm.assert_series_equal(result2, expected2) + result_21 = df2.combine_first(df1)[2] + tm.assert_series_equal(result_21, expected) def test_combine_first_convert_datatime_correctly(self): # GH 3593, converting datetime64[ns] incorrectly From 2240e956d88a15b49747927d4cb10423a87548cb Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 29 Nov 2020 04:08:38 +0000 Subject: [PATCH 12/17] rewrite test --- .../tests/frame/methods/test_combine_first.py | 57 +++++++++++-------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index bd08ed25309e1..a7eb0ecc678bb 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -152,29 +152,40 @@ def test_combine_first_return_obj_type_with_bools(self): result_21 = df2.combine_first(df1)[2] tm.assert_series_equal(result_21, expected) - def test_combine_first_convert_datatime_correctly(self): - # GH 3593, converting datetime64[ns] incorrectly - df0 = DataFrame( - {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} - ) - df1 = DataFrame({"a": [None, None, None]}) - df2 = df1.combine_first(df0) - tm.assert_frame_equal(df2, df0) - - df2 = df0.combine_first(df1) - tm.assert_frame_equal(df2, df0) - - df0 = DataFrame( - {"a": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)]} - ) - df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) - df2 = df1.combine_first(df0) - result = df0.copy() - result.iloc[0, :] = df1.iloc[0, :] - tm.assert_frame_equal(df2, result) - - df2 = df0.combine_first(df1) - tm.assert_frame_equal(df2, df0) + @pytest.mark.parametrize( + "data0, data1, data_expected", + ( + ( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [None, None, None], + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), + ( + [None, None, None], + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), + ( + [datetime(2000, 1, 2), None, None], + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [datetime(2000, 1, 2), datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), + ( + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + [datetime(2000, 1, 2), None, None], + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + ), + ), + ) + def test_combine_first_convert_datatime_correctly( + self, data0, data1, data_expected + ): + # GH 3593 + + df0, df1 = DataFrame({"a": data0}), DataFrame({"a": data1}) + result = df0.combine_first(df1) + expected = DataFrame({"a": data_expected}) + tm.assert_frame_equal(result, expected) def test_combine_first_align_nan(self): # GH 7509 (not fixed) From 268c0627ce31edb9bcb8cdbc115871f7e1fdc33d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 29 Nov 2020 04:14:18 +0000 Subject: [PATCH 13/17] rewrite test --- pandas/tests/frame/methods/test_combine_first.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index a7eb0ecc678bb..b83d9d8322234 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -359,14 +359,14 @@ def test_combine_first_int(self): df1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="int64") df2 = DataFrame({"a": [1, 4]}, dtype="int64") - exp1 = DataFrame({"a": [0, 1, 3, 5]}, dtype="float64") - exp2 = DataFrame({"a": [1, 4, 3, 5]}, dtype="float64") + result_12 = df1.combine_first(df2) + expected_12 = DataFrame({"a": [0, 1, 3, 5]}, dtype="float64") + tm.assert_frame_equal(result_12, expected_12) - res1 = df1.combine_first(df2) - res2 = df2.combine_first(df1) + result_21 = df2.combine_first(df1) + expected_21 = DataFrame({"a": [1, 4, 3, 5]}, dtype="float64") - tm.assert_frame_equal(res1, exp1) - tm.assert_frame_equal(res2, exp2) + tm.assert_frame_equal(result_21, expected_21) @pytest.mark.parametrize("val", [1, 1.0]) def test_combine_first_with_asymmetric_other(self, val): From b43926906fa4d0262f347e202594bc33a93257a3 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 29 Nov 2020 04:21:41 +0000 Subject: [PATCH 14/17] rewrite test --- .../tests/frame/methods/test_combine_first.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index b83d9d8322234..9988bf0282880 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -393,22 +393,22 @@ def test_combine_first_string_dtype_only_na(self): @pytest.mark.parametrize( - "val1, val2", + "scalar1, scalar2", [ (datetime(2020, 1, 1), datetime(2020, 1, 2)), (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), (pd.Timedelta("89 days"), pd.Timedelta("60 min")), ], ) -def test_combine_first_timestamp_bug(val1, val2, nulls_fixture): - - df1 = DataFrame([[nulls_fixture, nulls_fixture]], columns=["a", "b"]) - df2 = DataFrame([[val1, val2]], columns=["b", "c"]) - - res = df1.combine_first(df2) - exp = DataFrame([[nulls_fixture, val1, val2]], columns=["a", "b", "c"]) - - tm.assert_frame_equal(res, exp) +def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture): + # GH28481 + na_value = nulls_fixture + frame = DataFrame([[na_value, na_value]], columns=["a", "b"]) + other = DataFrame([[scalar1, scalar2]], columns=["b", "c"]) + + result = frame.combine_first(other) + expected = DataFrame([[na_value, scalar1, scalar2]], columns=["a", "b", "c"]) + tm.assert_frame_equal(result, expected) def test_combine_first_with_nan_multiindex(): From c0fdfe21233cf8a81a0f5f6483bba5b76e9aa394 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 29 Nov 2020 04:23:17 +0000 Subject: [PATCH 15/17] rewrite test --- pandas/tests/frame/methods/test_combine_first.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 9988bf0282880..492fa14f95135 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -153,7 +153,7 @@ def test_combine_first_return_obj_type_with_bools(self): tm.assert_series_equal(result_21, expected) @pytest.mark.parametrize( - "data0, data1, data_expected", + "data1, data2, data_expected", ( ( [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], @@ -178,12 +178,12 @@ def test_combine_first_return_obj_type_with_bools(self): ), ) def test_combine_first_convert_datatime_correctly( - self, data0, data1, data_expected + self, data1, data2, data_expected ): # GH 3593 - df0, df1 = DataFrame({"a": data0}), DataFrame({"a": data1}) - result = df0.combine_first(df1) + df1, df2 = DataFrame({"a": data1}), DataFrame({"a": data2}) + result = df1.combine_first(df2) expected = DataFrame({"a": data_expected}) tm.assert_frame_equal(result, expected) From a4f30eb235508f7fc576224eaf803010e559455c Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 29 Nov 2020 04:28:55 +0000 Subject: [PATCH 16/17] merge error --- doc/source/whatsnew/v1.2.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index efb8bcf4afc5a..54339acb5be91 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -532,7 +532,6 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :meth:`DataFrame.combine_first` that would convert datetime-like column on other :class:`DataFrame` to integer when the column is not present in original :class:`DataFrame` (:issue:`28481`) -- Bug in :attr:`DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) - Bug in :attr:`.DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) - Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) - Bug in :class:`.DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) From 76ca62018f6ef20ff61f6f991c3c080286f09735 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sun, 29 Nov 2020 23:14:33 -0500 Subject: [PATCH 17/17] add Interval example --- pandas/tests/frame/methods/test_combine_first.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 492fa14f95135..934ad9eb8213a 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -398,6 +398,7 @@ def test_combine_first_string_dtype_only_na(self): (datetime(2020, 1, 1), datetime(2020, 1, 2)), (pd.Period("2020-01-01", "D"), pd.Period("2020-01-02", "D")), (pd.Timedelta("89 days"), pd.Timedelta("60 min")), + (pd.Interval(left=0, right=1), pd.Interval(left=2, right=3, closed="left")), ], ) def test_combine_first_timestamp_bug(scalar1, scalar2, nulls_fixture):