Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge nonstring columns #46879

Closed
wants to merge 26 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ Other enhancements
- ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`)
- :class:`DataError`, :class:`SpecificationError`, :class:`SettingWithCopyError`, :class:`SettingWithCopyWarning`, and :class:`NumExprClobberingError` are now exposed in ``pandas.errors`` (:issue:`27656`)
- Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
- :func:`merge` and :meth:`DataFrame.merge` now allow passing ``None`` or ``(None, None)`` for the ``suffixes`` argument, keeping column labels unchanged in the resulting :class:`DataFrame`, potentially with duplicate column labels (:issue:`46885`)
- :meth:`DataFrame.join` now allows passing an empty string for the ``lsuffix`` and ``rsuffix`` arguments, keeping column labels unchanged in the resulting :class:`DataFrame`, potentially with duplicate column labels (:issue:`46885`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.notable_bug_fixes:
Expand Down
2 changes: 1 addition & 1 deletion pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@
IndexLabel = Union[Hashable, Sequence[Hashable]]
Level = Union[Hashable, int]
Shape = Tuple[int, ...]
Suffixes = Tuple[Optional[str], Optional[str]]
Suffixes = Optional[Tuple[Optional[str], Optional[str]]]
Ordered = Optional[bool]
JSONSerializable = Optional[Union[PythonScalar, List, Dict]]
Frequency = Union[str, "DateOffset"]
Expand Down
27 changes: 15 additions & 12 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,13 +322,13 @@
sort : bool, default False
Sort the join keys lexicographically in the result DataFrame. If False,
the order of the join keys depends on the join type (how keyword).
suffixes : list-like, default is ("_x", "_y")
A length-2 sequence where each element is optionally a string
suffixes : list-like or None, default is ("_x", "_y")
An optional length-2 sequence where each element is optionally a string
indicating the suffix to add to overlapping column names in
`left` and `right` respectively. Pass a value of `None` instead
of a string to indicate that the column name from `left` or
`right` should be left as-is, with no suffix. At least one of the
values must not be None.
`right` should be left as-is, with no suffix. Pass `None` instead of a
sequence to keep both column labels as-is.
copy : bool, default True
If False, avoid copy if possible.
indicator : bool or str, default False
Expand Down Expand Up @@ -412,14 +412,17 @@
4 bar 2 bar 6
5 baz 3 baz 7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
Index(['value'], dtype='object')
Merge DataFrames df1 and df2 with ``suffixes=None`` to keep
the original column names.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=None)
lkey value rkey value
0 foo 1 foo 5
1 foo 1 foo 8
2 foo 5 foo 5
3 foo 5 foo 8
4 bar 2 bar 6
5 baz 3 baz 7

>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
Expand Down
11 changes: 4 additions & 7 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2359,7 +2359,7 @@ def _items_overlap_with_suffix(
If corresponding suffix is empty, the entry is simply converted to string.

"""
if not is_list_like(suffixes, allow_sets=False):
if not (is_list_like(suffixes, allow_sets=False) or suffixes is None):
warnings.warn(
f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give "
"unexpected results. Provide 'suffixes' as a tuple instead. In the "
jreback marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -2372,10 +2372,7 @@ def _items_overlap_with_suffix(
if len(to_rename) == 0:
return left, right

lsuffix, rsuffix = suffixes

if not lsuffix and not rsuffix:
raise ValueError(f"columns overlap but no suffix specified: {to_rename}")
lsuffix, rsuffix = suffixes if suffixes else (None, None)

def renamer(x, suffix):
"""
Expand All @@ -2386,12 +2383,12 @@ def renamer(x, suffix):

Parameters
----------
x : original column name
x : original column label
suffix : str or None

Returns
-------
x : renamed column name
x : renamed column label
"""
if x in to_rename and suffix is not None:
return f"{x}{suffix}"
Expand Down
6 changes: 0 additions & 6 deletions pandas/tests/frame/methods/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,12 +280,6 @@ def test_join_index(float_frame):
with pytest.raises(ValueError, match="join method"):
f.join(f2, how="foo")

# corner case - overlapping columns
msg = "columns overlap but no suffix"
for how in ("outer", "left", "inner"):
with pytest.raises(ValueError, match=msg):
float_frame.join(float_frame, how=how)


def test_join_index_more(float_frame):
af = float_frame.loc[:, ["A", "B"]]
Expand Down
19 changes: 4 additions & 15 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2209,6 +2209,10 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
(0, 0, {"suffixes": ("_a", None)}, ["0_a", 0]),
("a", "a", {}, ["a_x", "a_y"]),
(0, 0, {}, ["0_x", "0_y"]),
(0, 0, {"suffixes": None}, [0, 0]),
(0, 0, {"suffixes": (None, None)}, [0, 0]),
("a", "a", {"suffixes": None}, ["a", "a"]),
("a", "a", {"suffixes": (None, None)}, ["a", "a"]),
jreback marked this conversation as resolved.
Show resolved Hide resolved
],
)
def test_merge_suffix(col1, col2, kwargs, expected_cols):
Expand Down Expand Up @@ -2255,21 +2259,6 @@ def test_merge_duplicate_suffix(how, expected):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"col1, col2, suffixes",
[("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))],
)
def test_merge_suffix_error(col1, col2, suffixes):
# issue: 24782
a = DataFrame({col1: [1, 2, 3]})
b = DataFrame({col2: [3, 4, 5]})

# TODO: might reconsider current raise behaviour, see issue 24782
msg = "columns overlap but no suffix specified"
with pytest.raises(ValueError, match=msg):
merge(a, b, left_index=True, right_index=True, suffixes=suffixes)


@pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}])
def test_merge_suffix_warns(suffixes):
a = DataFrame({"a": [1, 2, 3]})
Expand Down
6 changes: 0 additions & 6 deletions pandas/tests/reshape/merge/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,12 +636,6 @@ def test_join_multi_levels_invalid(self, portfolio, household):
):
household.join(portfolio, how="inner")

portfolio2 = portfolio.copy()
portfolio2.index.set_names(["household_id", "foo"])

with pytest.raises(ValueError, match="columns overlap but no suffix specified"):
jreback marked this conversation as resolved.
Show resolved Hide resolved
portfolio2.join(portfolio, how="inner")

def test_join_multi_levels2(self):

# some more advanced merges
Expand Down