Skip to content

Commit

Permalink
feat: str.split (narwhals-dev#1932)
Browse files Browse the repository at this point in the history
  * implemented without inclusive option (https://docs.pola.rs/api/python/dev/reference/expressions/api/polars.Expr.str.split.html)
  * support for _arrow, _dask, _duckdb, spark_like
  * support for pandas_like(pyarrow backed)
  • Loading branch information
Stelios Kritsotalakis authored and Stelios Kritsotalakis committed Feb 23, 2025
1 parent d00b962 commit bd06949
Show file tree
Hide file tree
Showing 13 changed files with 259 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/api-reference/expr_str.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- replace
- replace_all
- slice
- split
- starts_with
- strip_chars
- tail
Expand Down
1 change: 1 addition & 0 deletions docs/api-reference/series_str.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- replace
- replace_all
- slice
- split
- starts_with
- strip_chars
- tail
Expand Down
5 changes: 5 additions & 0 deletions narwhals/_arrow/expr_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ def slice(self: Self, offset: int, length: int | None) -> ArrowExpr:
self._compliant_expr, "str", "slice", offset=offset, length=length
)

def split(self: Self, by: str) -> ArrowExpr:
    """Split each string value by the delimiter `by`.

    Delegates to `reuse_series_namespace_implementation`, targeting the
    `split` method of the `str` namespace and forwarding `by` as a keyword.

    Arguments:
        by: The delimiter string by which to split the values.

    Returns:
        The expression produced by the delegated call.
    """
    compliant = self._compliant_expr
    return reuse_series_namespace_implementation(compliant, "str", "split", by=by)

def to_datetime(self: Self, format: str | None) -> ArrowExpr: # noqa: A002
return reuse_series_namespace_implementation(
self._compliant_expr, "str", "to_datetime", format=format
Expand Down
4 changes: 4 additions & 0 deletions narwhals/_arrow/series_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ def slice(self: Self, offset: int, length: int | None) -> ArrowSeries:
)
)

def split(self: Self, by: str) -> ArrowSeries:
    """Split each string value by the delimiter `by`.

    Applies `pc.split_pattern` to the native series and wraps the result
    back into a compliant series.

    Arguments:
        by: The delimiter string by which to split the values.

    Returns:
        A new series built from the split native result.
    """
    compliant = self._compliant_series
    native_split = pc.split_pattern(compliant._native_series, by)  # type: ignore[call-overload]
    return compliant._from_native_series(native_split)

def to_datetime(self: Self, format: str | None) -> ArrowSeries: # noqa: A002
native = self._compliant_series._native_series
format = parse_datetime_format(native) if format is None else format
Expand Down
8 changes: 8 additions & 0 deletions narwhals/_dask/expr_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,14 @@ def slice(self: Self, offset: int, length: int | None) -> DaskExpr:
length=length,
)

def split(self: Self, by: str) -> DaskExpr:
    """Split each string value by the delimiter `by`.

    Registers a call named "split" through `_from_call`; the callable
    invokes `.str.split(pat=by)` on its input. The parent expression's
    `_returns_scalar` flag is passed through unchanged.

    Arguments:
        by: The delimiter string by which to split the values.

    Returns:
        The expression produced by `_from_call`.
    """
    compliant = self._compliant_expr
    return compliant._from_call(
        lambda _input, by: _input.str.split(pat=by),
        "split",
        by=by,
        returns_scalar=compliant._returns_scalar,
    )

def to_datetime(self: Self, format: str | None) -> DaskExpr: # noqa: A002
return self._compliant_expr._from_call(
lambda _input, format: dd.to_datetime(_input, format=format), # noqa: A006
Expand Down
7 changes: 7 additions & 0 deletions narwhals/_duckdb/expr_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ def func(_input: duckdb.Expression) -> duckdb.Expression:

return self._compliant_expr._from_call(func, "slice")

def split(self: Self, by: str) -> DuckDBExpr:
    """Split each string value by the delimiter `by`.

    Registers a call named "split" through `_from_call`; the callable
    builds a `str_split` function expression over its input and `lit(by)`.
    The parent expression's `_expr_kind` is passed through unchanged.

    Arguments:
        by: The delimiter string by which to split the values.

    Returns:
        The expression produced by `_from_call`.
    """
    compliant = self._compliant_expr

    def func(_input: duckdb.Expression) -> duckdb.Expression:
        return FunctionExpression("str_split", _input, lit(by))

    return compliant._from_call(func, "split", expr_kind=compliant._expr_kind)

def len_chars(self: Self) -> DuckDBExpr:
return self._compliant_expr._from_call(
lambda _input: FunctionExpression("length", _input), "len_chars"
Expand Down
5 changes: 5 additions & 0 deletions narwhals/_pandas_like/expr_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ def slice(self: Self, offset: int, length: int | None) -> PandasLikeExpr:
self._compliant_expr, "str", "slice", offset=offset, length=length
)

def split(self: Self, by: str) -> PandasLikeExpr:
    """Split each string value by the delimiter `by`.

    Delegates to `reuse_series_namespace_implementation`, targeting the
    `split` method of the `str` namespace and forwarding `by` as a keyword.

    Arguments:
        by: The delimiter string by which to split the values.

    Returns:
        The expression produced by the delegated call.
    """
    compliant = self._compliant_expr
    return reuse_series_namespace_implementation(compliant, "str", "split", by=by)

def to_datetime(self: Self, format: str | None) -> PandasLikeExpr: # noqa: A002
return reuse_series_namespace_implementation(
self._compliant_expr,
Expand Down
14 changes: 14 additions & 0 deletions narwhals/_pandas_like/series_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from typing import TYPE_CHECKING

from narwhals._pandas_like.utils import get_dtype_backend
from narwhals._pandas_like.utils import to_datetime

if TYPE_CHECKING:
Expand Down Expand Up @@ -61,6 +62,19 @@ def slice(self: Self, offset: int, length: int | None) -> PandasLikeSeries:
self._compliant_series._native_series.str.slice(start=offset, stop=stop),
)

def split(self: Self, by: str) -> PandasLikeSeries:
    """Split each string value by the delimiter `by`.

    Only series whose dtype backend (as reported by `get_dtype_backend`)
    is "pyarrow" are accepted; the split itself is the native
    `.str.split(pat=by)`.

    Arguments:
        by: The delimiter string by which to split the values.

    Returns:
        A new series wrapping the native split result.

    Raises:
        TypeError: If the dtype backend of the native series is not "pyarrow".
    """
    compliant = self._compliant_series
    native = compliant._native_series
    backend = get_dtype_backend(native.dtype, compliant._implementation)
    if backend == "pyarrow":
        return compliant._from_native_series(native.str.split(pat=by))

    # The trailing space is part of the message that callers/tests match on.
    msg = "This operation requires a pyarrow-backed series. "
    raise TypeError(msg)

def to_datetime(self: Self, format: str | None) -> PandasLikeSeries: # noqa: A002
return self._compliant_series._from_native_series(
to_datetime(self._compliant_series._implementation)(
Expand Down
7 changes: 7 additions & 0 deletions narwhals/_spark_like/expr_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,13 @@ def func(_input: Column) -> Column:

return self._compliant_expr._from_call(func, "slice")

def split(self: Self, by: str) -> SparkLikeExpr:
    """Split each string value by the literal delimiter `by`.

    Registers a call named "split" through `_from_call`; the callable
    invokes `_F.split` on its input. The parent expression's `_expr_kind`
    is passed through unchanged.

    Arguments:
        by: The delimiter string by which to split the values. It is
            matched literally, not as a regular expression.

    Returns:
        The expression produced by `_from_call`.
    """
    import re  # local import so this method does not depend on file-level imports

    compliant = self._compliant_expr
    # NOTE(review): assumes `_F` is a pyspark.sql.functions-compatible module,
    # whose `split` interprets its second argument as a regular expression.
    # Escaping keeps delimiters such as "." or "|" literal, in line with the
    # "split by a substring" contract of the other backends.
    pattern = re.escape(by)
    return compliant._from_call(
        lambda _input: compliant._F.split(_input, pattern),
        "split",
        expr_kind=compliant._expr_kind,
    )

def to_uppercase(self: Self) -> SparkLikeExpr:
return self._compliant_expr._from_call(
self._compliant_expr._F.upper, "to_uppercase"
Expand Down
60 changes: 60 additions & 0 deletions narwhals/expr_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,6 +574,66 @@ def slice(self: Self, offset: int, length: int | None = None) -> ExprT:
self._expr._metadata,
)

def split(self: Self, by: str) -> ExprT:
    r"""Split the string by a substring.

    Arguments:
        by: The delimiter string by which to split the values.

    Returns:
        A new expression.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoFrameT
        >>>
        >>> data = {"s": ["foo bar", "foo_bar", "foo_bar_baz", "foo,bar"]}
        >>> df_pd = pd.DataFrame(data, dtype=pd.ArrowDtype(pa.string()))
        >>> df_pl = pl.DataFrame(data)
        >>> df_pa = pa.table(data)

        We define a dataframe-agnostic function:

        >>> def agnostic_str_split(df_native: IntoFrameT) -> IntoFrameT:
        ...     df = nw.from_native(df_native)
        ...     return df.with_columns(s_split=nw.col("s").str.split("_")).to_native()

        We can then pass any supported library such as pandas (pyarrow backed),
        Polars, or PyArrow to `agnostic_str_split`:

        >>> agnostic_str_split(df_pd)  # doctest: +NORMALIZE_WHITESPACE
                     s              s_split
        0      foo bar          ['foo bar']
        1      foo_bar        ['foo' 'bar']
        2  foo_bar_baz  ['foo' 'bar' 'baz']
        3      foo,bar          ['foo,bar']

        >>> agnostic_str_split(df_pl)
        shape: (4, 2)
        ┌─────────────┬───────────────────────┐
        │ s           ┆ s_split               │
        │ ---         ┆ ---                   │
        │ str         ┆ list[str]             │
        ╞═════════════╪═══════════════════════╡
        │ foo bar     ┆ ["foo bar"]           │
        │ foo_bar     ┆ ["foo", "bar"]        │
        │ foo_bar_baz ┆ ["foo", "bar", "baz"] │
        │ foo,bar     ┆ ["foo,bar"]           │
        └─────────────┴───────────────────────┘

        >>> agnostic_str_split(df_pa)
        pyarrow.Table
        s: string
        s_split: list<item: string>
          child 0, item: string
        ----
        s: [["foo bar","foo_bar","foo_bar_baz","foo,bar"]]
        s_split: [[["foo bar"],["foo","bar"],["foo","bar","baz"],["foo,bar"]]]
    """
    # Build a new expression of the same class; the compliant expression is
    # resolved lazily when the namespace `plx` is supplied.
    return self._expr.__class__(
        lambda plx: self._expr._to_compliant_expr(plx).str.split(by=by),
        self._expr._metadata,
    )

def head(self: Self, n: int = 5) -> ExprT:
r"""Take the first n elements of each string.
Expand Down
72 changes: 72 additions & 0 deletions narwhals/series_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,78 @@ def slice(self: Self, offset: int, length: int | None = None) -> SeriesT:
)
)

def split(self: Self, by: str) -> SeriesT:
    r"""Split the string by a substring.

    Arguments:
        by: The delimiter string by which to split the values.

    Returns:
        A new Series containing lists of substrings.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import pyarrow as pa
        >>> import narwhals as nw
        >>> from narwhals.typing import IntoSeriesT

        >>> data = ["foo bar", "foo_bar", "foo_bar_baz", "foo,bar"]
        >>> s_pd = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
        >>> s_pl = pl.Series(data)
        >>> s_pa = pa.chunked_array([data])

        We define a dataframe-agnostic function:

        >>> def agnostic_split(s_native: IntoSeriesT) -> IntoSeriesT:
        ...     s = nw.from_native(s_native, series_only=True)
        ...     return s.str.split("_").to_native()

        We can then pass any supported library such as pandas (pyarrow backed),
        Polars, or PyArrow to `agnostic_split`:

        >>> agnostic_split(s_pd)  # doctest: +NORMALIZE_WHITESPACE
        0            ['foo bar']
        1          ['foo' 'bar']
        2    ['foo' 'bar' 'baz']
        3            ['foo,bar']
        dtype: list<item: string>[pyarrow]

        >>> agnostic_split(s_pl)  # doctest: +NORMALIZE_WHITESPACE
        shape: (4,)
        Series: '' [list[str]]
        [
            ["foo bar"]
            ["foo", "bar"]
            ["foo", "bar", "baz"]
            ["foo,bar"]
        ]

        >>> agnostic_split(s_pa)  # doctest: +ELLIPSIS
        <pyarrow.lib.ChunkedArray object at ...>
        [
          [
            [
              "foo bar"
            ],
            [
              "foo",
              "bar"
            ],
            [
              "foo",
              "bar",
              "baz"
            ],
            [
              "foo,bar"
            ]
          ]
        ]
    """
    # Run the split on the compliant series, then wrap the result back into
    # a narwhals series.
    return self._narwhals_series._from_compliant_series(
        self._narwhals_series._compliant_series.str.split(by=by)
    )

def head(self: Self, n: int = 5) -> SeriesT:
r"""Take the first n elements of each string.
Expand Down
71 changes: 71 additions & 0 deletions tests/expr_and_series/str/split_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from __future__ import annotations

import re
from typing import Any

import pytest

import narwhals.stable.v1 as nw
from tests.utils import Constructor
from tests.utils import ConstructorEager
from tests.utils import assert_equal_data

data = {"s": ["foo bar", "foo_bar", "foo_bar_baz", "foo,bar"]}


@pytest.mark.parametrize(
    ("by", "expected"),
    [
        ("_", {"s": [["foo bar"], ["foo", "bar"], ["foo", "bar", "baz"], ["foo,bar"]]}),
        (
            ",",
            {"s": [["foo bar"], ["foo_bar"], ["foo_bar_baz"], ["foo", "bar"]]},
        ),
    ],
)
def test_str_split(
    constructor: Constructor,
    by: str,
    expected: Any,
) -> None:
    """Check `Expr.str.split` output, or the TypeError for non-pyarrow pandas."""
    df = nw.from_native(constructor(data))
    constructor_name = constructor.__name__
    is_non_pyarrow_pandas = (
        constructor_name.startswith("pandas") and "pyarrow" not in constructor_name
    )

    if not is_non_pyarrow_pandas:
        result_frame = df.select(nw.col("s").str.split(by=by))
        assert_equal_data(result_frame, expected)
        return

    # Constructors named pandas* without "pyarrow" are expected to be rejected.
    msg = re.escape("This operation requires a pyarrow-backed series. ")
    with pytest.raises(TypeError, match=msg):
        df.select(nw.col("s").str.split(by=by))


@pytest.mark.parametrize(
    ("by", "expected"),
    [
        ("_", {"s": [["foo bar"], ["foo", "bar"], ["foo", "bar", "baz"], ["foo,bar"]]}),
        (
            ",",
            {"s": [["foo bar"], ["foo_bar"], ["foo_bar_baz"], ["foo", "bar"]]},
        ),
    ],
)
def test_str_split_series(
    constructor_eager: ConstructorEager,
    by: str,
    expected: Any,
) -> None:
    """Check `Series.str.split` output, or the TypeError for non-pyarrow pandas."""
    df = nw.from_native(constructor_eager(data), eager_only=True)
    constructor_name = constructor_eager.__name__
    is_non_pyarrow_pandas = (
        constructor_name.startswith("pandas") and "pyarrow" not in constructor_name
    )

    if not is_non_pyarrow_pandas:
        result_series = df["s"].str.split(by=by)
        assert_equal_data({"s": result_series}, expected)
        return

    # Constructors named pandas* without "pyarrow" are expected to be rejected.
    msg = re.escape("This operation requires a pyarrow-backed series. ")
    with pytest.raises(TypeError, match=msg):
        df["s"].str.split(by=by)
4 changes: 4 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ def assert_equal_data(result: Any, expected: Mapping[str, Any]) -> None:
are_equivalent_values = lhs is None or math.isnan(lhs)
elif lhs is None:
are_equivalent_values = rhs is None
elif isinstance(lhs, list) and isinstance(rhs, list):
are_equivalent_values = all(
left_side == right_side for left_side, right_side in zip(lhs, rhs)
)
elif pd.isna(lhs):
are_equivalent_values = pd.isna(rhs)
else:
Expand Down

0 comments on commit bd06949

Please sign in to comment.