Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enh: pandas-like semi join #439

Merged
merged 5 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def join(
self,
other: Self,
*,
how: Literal["left", "inner", "outer", "cross", "anti"] = "inner",
how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner",
left_on: str | list[str] | None = None,
right_on: str | list[str] | None = None,
) -> Self:
Expand Down Expand Up @@ -339,7 +339,7 @@ def join(
n_bytes=8, columns=[*self.columns, *other.columns]
)

other = (
other_native = (
other._native_dataframe.loc[:, right_on]
.rename( # rename to avoid creating extra columns in join
columns=dict(zip(right_on, left_on)) # type: ignore[arg-type]
Expand All @@ -348,7 +348,7 @@ def join(
)
return self._from_native_dataframe(
self._native_dataframe.merge(
other,
other_native,
how="outer",
indicator=indicator_token,
left_on=left_on,
Expand All @@ -359,6 +359,23 @@ def join(
.reset_index(drop=True)
)

if how == "semi":
other_native = (
other._native_dataframe.loc[:, right_on]
.rename( # rename to avoid creating extra columns in join
columns=dict(zip(right_on, left_on)) # type: ignore[arg-type]
)
.drop_duplicates() # avoids potential rows duplication from inner join
)
return self._from_native_dataframe(
self._native_dataframe.merge(
other_native,
how="inner",
left_on=left_on,
right_on=left_on,
).reset_index(drop=True)
)

return self._from_native_dataframe(
self._native_dataframe.merge(
other._native_dataframe,
Expand Down
21 changes: 12 additions & 9 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,14 @@ def join(
self,
other: Self,
*,
how: Literal["inner", "cross", "anti"] = "inner",
how: Literal["inner", "cross", "semi", "anti"] = "inner",
left_on: str | list[str] | None = None,
right_on: str | list[str] | None = None,
) -> Self:
_supported_joins = {"inner", "cross", "anti"}
_supported_joins = ("inner", "cross", "anti", "semi")

if how not in _supported_joins:
msg = f"Only the following join stragies are supported: {_supported_joins}"
msg = f"Only the following join stragies are supported: {_supported_joins}; found '{how}'."
raise NotImplementedError(msg)

if how == "cross" and (left_on or right_on):
Expand Down Expand Up @@ -1475,7 +1476,7 @@ def join(
self,
other: Self,
*,
how: Literal["inner", "cross", "anti"] = "inner",
how: Literal["inner", "cross", "semi", "anti"] = "inner",
left_on: str | list[str] | None = None,
right_on: str | list[str] | None = None,
) -> Self:
Expand All @@ -1487,8 +1488,9 @@ def join(

how: Join strategy.

* *inner*: Returns rows that have matching values in both tables
* *cross*: Returns the Cartesian product of rows from both tables
* *inner*: Returns rows that have matching values in both tables.
* *cross*: Returns the Cartesian product of rows from both tables.
* *semi*: Filter rows that have a match in the right table..
* *anti*: Filter rows that do not have a match in the right table.

left_on: Name(s) of the left join column(s).
Expand Down Expand Up @@ -2900,7 +2902,7 @@ def join(
self,
other: Self,
*,
how: Literal["inner", "cross", "anti"] = "inner",
how: Literal["inner", "cross", "semi", "anti"] = "inner",
left_on: str | list[str] | None = None,
right_on: str | list[str] | None = None,
) -> Self:
Expand All @@ -2912,8 +2914,9 @@ def join(

how: Join strategy.

* *inner*: Returns rows that have matching values in both tables
* *cross*: Returns the Cartesian product of rows from both tables
* *inner*: Returns rows that have matching values in both tables.
* *cross*: Returns the Cartesian product of rows from both tables.
* *semi*: Filter rows that have a match in the right table.
* *anti*: Filter rows that do not have a match in the right table.

left_on: Join column of the left DataFrame.
Expand Down
110 changes: 110 additions & 0 deletions tests/frame/join_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from __future__ import annotations

import re
from typing import Any

import pandas as pd
import pytest

import narwhals.stable.v1 as nw
from narwhals._pandas_like.utils import Implementation
from tests.utils import compare_dicts


def test_inner_join(constructor_with_lazy: Any) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
df = nw.from_native(constructor_with_lazy(data)).lazy()
df_right = df
result = df.join(df_right, left_on=["a", "b"], right_on=["a", "b"], how="inner")
result_native = nw.to_native(result)
expected = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9], "z_right": [7.0, 8, 9]}
compare_dicts(result_native, expected)

result = df.collect().join(df_right.collect(), left_on="a", right_on="a", how="inner") # type: ignore[assignment]
result_native = nw.to_native(result)
expected = {
"a": [1, 3, 2],
"b": [4, 4, 6],
"b_right": [4, 4, 6],
"z": [7.0, 8, 9],
"z_right": [7.0, 8, 9],
}
compare_dicts(result_native, expected)


def test_cross_join(constructor_with_lazy: Any) -> None:
data = {"a": [1, 3, 2]}
df = nw.from_native(constructor_with_lazy(data))
result = df.join(df, how="cross") # type: ignore[arg-type]

expected = {"a": [1, 1, 1, 3, 3, 3, 2, 2, 2], "a_right": [1, 3, 2, 1, 3, 2, 1, 3, 2]}
compare_dicts(result, expected)

with pytest.raises(ValueError, match="Can not pass left_on, right_on for cross join"):
df.join(df, how="cross", left_on="a") # type: ignore[arg-type]


def test_cross_join_non_pandas() -> None:
data = {"a": [1, 3, 2]}
df = nw.from_native(pd.DataFrame(data))
# HACK to force testing for a non-pandas codepath
df._dataframe._implementation = Implementation.MODIN
result = df.join(df, how="cross") # type: ignore[arg-type]
expected = {"a": [1, 1, 1, 3, 3, 3, 2, 2, 2], "a_right": [1, 3, 2, 1, 3, 2, 1, 3, 2]}
compare_dicts(result, expected)


@pytest.mark.parametrize(
("join_key", "filter_expr", "expected"),
[
(["a", "b"], (nw.col("b") < 5), {"a": [2], "b": [6], "z": [9]}),
(["b"], (nw.col("b") < 5), {"a": [2], "b": [6], "z": [9]}),
(["b"], (nw.col("b") > 5), {"a": [1, 3], "b": [4, 4], "z": [7.0, 8.0]}),
],
)
def test_anti_join(
constructor_with_lazy: Any,
join_key: list[str],
filter_expr: nw.Expr,
expected: dict[str, list[Any]],
) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
df = nw.from_native(constructor_with_lazy(data))
other = df.filter(filter_expr)
result = df.join(other, how="anti", left_on=join_key, right_on=join_key) # type: ignore[arg-type]
compare_dicts(result, expected)


@pytest.mark.parametrize(
("join_key", "filter_expr", "expected"),
[
(["a"], (nw.col("b") > 5), {"a": [2], "b": [6], "z": [9]}),
(["b"], (nw.col("b") < 5), {"a": [1, 3], "b": [4, 4], "z": [7, 8]}),
(["a", "b"], (nw.col("b") < 5), {"a": [1, 3], "b": [4, 4], "z": [7, 8]}),
],
)
def test_semi_join(
constructor: Any,
join_key: list[str],
filter_expr: nw.Expr,
expected: dict[str, list[Any]],
) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
df = nw.from_native(constructor(data))
other = df.filter(filter_expr)
result = df.join(other, how="semi", left_on=join_key, right_on=join_key) # type: ignore[arg-type]
compare_dicts(result, expected)


@pytest.mark.parametrize("how", ["left", "right", "full"])
def test_join_not_implemented(constructor_with_lazy: Any, how: str) -> None:
data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
df = nw.from_native(constructor_with_lazy(data))

with pytest.raises(
NotImplementedError,
match=re.escape(
f"Only the following join stragies are supported: ('inner', 'cross', 'anti', 'semi'); found '{how}'."
),
):
df.join(df, left_on="a", right_on="a", how=how) # type: ignore[arg-type]
74 changes: 0 additions & 74 deletions tests/frame/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import pytest

import narwhals.stable.v1 as nw
from narwhals._pandas_like.utils import Implementation
from narwhals.functions import _get_deps_info
from narwhals.functions import _get_sys_info
from narwhals.functions import show_versions
Expand Down Expand Up @@ -132,79 +131,6 @@ def test_lit_error(df_raw: Any) -> None:
_ = df.with_columns(nw.lit([1, 2]).alias("lit"))


@pytest.mark.parametrize(
"df_raw", [df_pandas, df_lazy, df_pandas_nullable, df_pandas_pyarrow]
)
def test_join(df_raw: Any) -> None:
df = nw.from_native(df_raw).lazy()
df_right = df
result = df.join(df_right, left_on=["a", "b"], right_on=["a", "b"], how="inner")
result_native = nw.to_native(result)
expected = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9], "z_right": [7.0, 8, 9]}
compare_dicts(result_native, expected)

with pytest.raises(NotImplementedError):
result = df.join(df_right, left_on="a", right_on="a", how="left") # type: ignore[arg-type]

result = df.collect().join(df_right.collect(), left_on="a", right_on="a", how="inner") # type: ignore[assignment]
result_native = nw.to_native(result)
expected = {
"a": [1, 3, 2],
"b": [4, 4, 6],
"b_right": [4, 4, 6],
"z": [7.0, 8, 9],
"z_right": [7.0, 8, 9],
}
compare_dicts(result_native, expected)


@pytest.mark.parametrize("df_raw", [df_polars, df_lazy, df_pandas, df_mpd])
def test_cross_join(df_raw: Any) -> None:
df = nw.from_native(df_raw).select("a")
result = df.join(df, how="cross") # type: ignore[arg-type]

expected = {"a": [1, 1, 1, 3, 3, 3, 2, 2, 2], "a_right": [1, 3, 2, 1, 3, 2, 1, 3, 2]}
compare_dicts(result, expected)

with pytest.raises(ValueError, match="Can not pass left_on, right_on for cross join"):
df.join(df, how="cross", left_on="a") # type: ignore[arg-type]


def test_cross_join_non_pandas() -> None:
df = nw.from_native(df_pandas).select("a")
# HACK to force testing for a non-pandas codepath
df._dataframe._implementation = Implementation.MODIN
result = df.join(df, how="cross") # type: ignore[arg-type]
expected = {"a": [1, 1, 1, 3, 3, 3, 2, 2, 2], "a_right": [1, 3, 2, 1, 3, 2, 1, 3, 2]}
compare_dicts(result, expected)


@pytest.mark.parametrize(
"df_raw",
[
df_polars,
df_lazy,
df_pandas,
# df_mpd, (TODO(Unassigned): understand the difference between ipython/jupyter and pytest runs)
],
)
@pytest.mark.parametrize(
("join_key", "filter_expr", "expected"),
[
(["a", "b"], (nw.col("b") < 5), {"a": [2], "b": [6], "z": [9]}),
(["b"], (nw.col("b") < 5), {"a": [2], "b": [6], "z": [9]}),
(["b"], (nw.col("b") > 5), {"a": [1, 3], "b": [4, 4], "z": [7.0, 8.0]}),
],
)
def test_anti_join(
df_raw: Any, join_key: list[str], filter_expr: nw.Expr, expected: dict[str, list[Any]]
) -> None:
df = nw.from_native(df_raw)
other = df.filter(filter_expr)
result = df.join(other, how="anti", left_on=join_key, right_on=join_key) # type: ignore[arg-type]
compare_dicts(result, expected)


@pytest.mark.parametrize(
"df_raw", [df_pandas, df_lazy, df_pandas_nullable, df_pandas_pyarrow]
)
Expand Down
Loading