feat: str.split (narwhals-dev#1932)

* implemented without the `inclusive` option (https://docs.pola.rs/api/python/dev/reference/expressions/api/polars.Expr.str.split.html)
* support for _arrow, _dask, _duckdb, spark_like
* support for pandas_like (pyarrow-backed)
skritsotalakis · Feb 23, 2025 · bd06949 · bd06949
1 parent d00b962
commit bd06949
Show file tree

Hide file tree

Showing 13 changed files with 259 additions and 0 deletions.
diff --git a/docs/api-reference/expr_str.md b/docs/api-reference/expr_str.md
@@ -11,6 +11,7 @@
         - replace
         - replace_all
         - slice
+        - split
         - starts_with
         - strip_chars
         - tail

diff --git a/docs/api-reference/series_str.md b/docs/api-reference/series_str.md
@@ -11,6 +11,7 @@
         - replace
         - replace_all
         - slice
+        - split
         - starts_with
         - strip_chars
         - tail

diff --git a/narwhals/_arrow/expr_str.py b/narwhals/_arrow/expr_str.py
@@ -67,6 +67,11 @@ def slice(self: Self, offset: int, length: int | None) -> ArrowExpr:
             self._compliant_expr, "str", "slice", offset=offset, length=length
         )
 
+    def split(self: Self, by: str) -> ArrowExpr:
+        """Split each string value on the delimiter `by`.
+
+        Delegates through `reuse_series_namespace_implementation`, passing the
+        `"str"` namespace and the `"split"` method name along with `by`.
+        """
+        compliant_expr = self._compliant_expr
+        return reuse_series_namespace_implementation(
+            compliant_expr, "str", "split", by=by
+        )
+
     def to_datetime(self: Self, format: str | None) -> ArrowExpr:  # noqa: A002
         return reuse_series_namespace_implementation(
             self._compliant_expr, "str", "to_datetime", format=format

diff --git a/narwhals/_arrow/series_str.py b/narwhals/_arrow/series_str.py
@@ -71,6 +71,10 @@ def slice(self: Self, offset: int, length: int | None) -> ArrowSeries:
             )
         )
 
+    def split(self: Self, by: str) -> ArrowSeries:
+        """Split each string value on `by` using `pc.split_pattern`.
+
+        The result of the compute call is wrapped back into a compliant series.
+        """
+        compliant = self._compliant_series
+        return compliant._from_native_series(
+            pc.split_pattern(compliant._native_series, by)  # type: ignore[call-overload]
+        )
+
     def to_datetime(self: Self, format: str | None) -> ArrowSeries:  # noqa: A002
         native = self._compliant_series._native_series
         format = parse_datetime_format(native) if format is None else format

diff --git a/narwhals/_dask/expr_str.py b/narwhals/_dask/expr_str.py
@@ -81,6 +81,14 @@ def slice(self: Self, offset: int, length: int | None) -> DaskExpr:
             length=length,
         )
 
+    def split(self: Self, by: str) -> DaskExpr:
+        """Split each string value on `by` via the native `.str.split(pat=...)`.
+
+        The `returns_scalar` flag of the wrapped expression is carried over
+        unchanged.
+        """
+        compliant_expr = self._compliant_expr
+        return compliant_expr._from_call(
+            lambda _input, by: _input.str.split(pat=by),
+            "split",
+            by=by,
+            returns_scalar=compliant_expr._returns_scalar,
+        )
+
     def to_datetime(self: Self, format: str | None) -> DaskExpr:  # noqa: A002
         return self._compliant_expr._from_call(
             lambda _input, format: dd.to_datetime(_input, format=format),  # noqa: A006

diff --git a/narwhals/_duckdb/expr_str.py b/narwhals/_duckdb/expr_str.py
@@ -54,6 +54,13 @@ def func(_input: duckdb.Expression) -> duckdb.Expression:
 
         return self._compliant_expr._from_call(func, "slice")
 
+    def split(self: Self, by: str) -> DuckDBExpr:
+        """Split each string value on `by` with DuckDB's `str_split` function.
+
+        The expression kind of the wrapped expression is carried over unchanged.
+        """
+
+        def func(_input: duckdb.Expression) -> duckdb.Expression:
+            # `by` is passed to DuckDB as a literal expression.
+            return FunctionExpression("str_split", _input, lit(by))
+
+        compliant_expr = self._compliant_expr
+        return compliant_expr._from_call(
+            func, "split", expr_kind=compliant_expr._expr_kind
+        )
+
     def len_chars(self: Self) -> DuckDBExpr:
         return self._compliant_expr._from_call(
             lambda _input: FunctionExpression("length", _input), "len_chars"

diff --git a/narwhals/_pandas_like/expr_str.py b/narwhals/_pandas_like/expr_str.py
@@ -93,6 +93,11 @@ def slice(self: Self, offset: int, length: int | None) -> PandasLikeExpr:
             self._compliant_expr, "str", "slice", offset=offset, length=length
         )
 
+    def split(self: Self, by: str) -> PandasLikeExpr:
+        """Split each string value on the delimiter `by`.
+
+        Delegates through `reuse_series_namespace_implementation`, passing the
+        `"str"` namespace and the `"split"` method name along with `by`.
+        """
+        compliant_expr = self._compliant_expr
+        return reuse_series_namespace_implementation(
+            compliant_expr, "str", "split", by=by
+        )
+
     def to_datetime(self: Self, format: str | None) -> PandasLikeExpr:  # noqa: A002
         return reuse_series_namespace_implementation(
             self._compliant_expr,

diff --git a/narwhals/_pandas_like/series_str.py b/narwhals/_pandas_like/series_str.py
@@ -2,6 +2,7 @@
 
 from typing import TYPE_CHECKING
 
+from narwhals._pandas_like.utils import get_dtype_backend
 from narwhals._pandas_like.utils import to_datetime
 
 if TYPE_CHECKING:
@@ -61,6 +62,19 @@ def slice(self: Self, offset: int, length: int | None) -> PandasLikeSeries:
             self._compliant_series._native_series.str.slice(start=offset, stop=stop),
         )
 
+    def split(self: Self, by: str) -> PandasLikeSeries:
+        """Split each string value on `by`; only pyarrow-backed series are accepted.
+
+        Raises:
+            TypeError: If the dtype backend reported by `get_dtype_backend` for
+                the native series is not `"pyarrow"`.
+        """
+        compliant = self._compliant_series
+        native = compliant._native_series
+        if get_dtype_backend(native.dtype, compliant._implementation) != "pyarrow":
+            msg = "This operation requires a pyarrow-backed series. "
+            raise TypeError(msg)
+        return compliant._from_native_series(native.str.split(pat=by))
+
     def to_datetime(self: Self, format: str | None) -> PandasLikeSeries:  # noqa: A002
         return self._compliant_series._from_native_series(
             to_datetime(self._compliant_series._implementation)(

diff --git a/narwhals/_spark_like/expr_str.py b/narwhals/_spark_like/expr_str.py
@@ -90,6 +90,13 @@ def func(_input: Column) -> Column:
 
         return self._compliant_expr._from_call(func, "slice")
 
+    def split(self: Self, by: str) -> SparkLikeExpr:
+        """Split each string value on `by` using the backend's `split` function.
+
+        The expression kind of the wrapped expression is carried over unchanged.
+        """
+
+        def func(_input: Column) -> Column:
+            # NOTE(review): PySpark documents the second argument of `split` as a
+            # regex pattern - confirm whether `by` should be escaped so that it
+            # is treated literally, as the other backends appear to do.
+            return self._compliant_expr._F.split(_input, by)
+
+        compliant_expr = self._compliant_expr
+        return compliant_expr._from_call(
+            func, "split", expr_kind=compliant_expr._expr_kind
+        )
+
     def to_uppercase(self: Self) -> SparkLikeExpr:
         return self._compliant_expr._from_call(
             self._compliant_expr._F.upper, "to_uppercase"

diff --git a/narwhals/expr_str.py b/narwhals/expr_str.py
@@ -574,6 +574,66 @@ def slice(self: Self, offset: int, length: int | None = None) -> ExprT:
             self._expr._metadata,
         )
 
+    def split(self: Self, by: str) -> ExprT:
+        r"""Split each string value into a list of substrings.
+
+        Arguments:
+            by: The delimiter substring on which the values are split.
+
+        Returns:
+            A new expression.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoFrameT
+            >>>
+            >>> data = {"s": ["foo bar", "foo_bar", "foo_bar_baz", "foo,bar"]}
+            >>> df_pd = pd.DataFrame(data, dtype=pd.ArrowDtype(pa.string()))
+            >>> df_pl = pl.DataFrame(data)
+            >>> df_pa = pa.table(data)
+
+            We define a dataframe-agnostic function:
+
+            >>> def agnostic_str_split(df_native: IntoFrameT) -> IntoFrameT:
+            ...     df = nw.from_native(df_native)
+            ...     return df.with_columns(s_split=nw.col("s").str.split("_")).to_native()
+
+            We can then pass any supported library such as pandas (pyarrow backed),
+            Polars, or PyArrow to `agnostic_str_split`:
+
+            >>> agnostic_str_split(df_pd)  # doctest: +NORMALIZE_WHITESPACE
+                         s	s_split
+            0	foo bar	['foo bar']
+            1	foo_bar	['foo' 'bar']
+            2	foo_bar_baz	['foo' 'bar' 'baz']
+            3	foo,bar	['foo,bar']
+
+            >>> agnostic_str_split(df_pl)
+            shape: (4, 2)
+            s	s_split
+            str	list[str]
+            "foo bar"	["foo bar"]
+            "foo_bar"	["foo", "bar"]
+            "foo_bar_baz"	["foo", "bar", "baz"]
+            "foo,bar"	["foo,bar"]
+
+            >>> agnostic_str_split(df_pa)
+            pyarrow.Table
+            s: string
+            s_split: list<item: string>
+            child 0, item: string
+            ----
+            s: [["foo bar","foo_bar","foo_bar_baz","foo,bar"]]
+            s_split: [[["foo bar"],["foo","bar"],["foo","bar","baz"],["foo,bar"]]]
+        """
+        expr = self._expr
+        return expr.__class__(
+            lambda plx: expr._to_compliant_expr(plx).str.split(by=by),
+            expr._metadata,
+        )
+
     def head(self: Self, n: int = 5) -> ExprT:
         r"""Take the first n elements of each string.
 

diff --git a/narwhals/series_str.py b/narwhals/series_str.py
@@ -547,6 +547,78 @@ def slice(self: Self, offset: int, length: int | None = None) -> SeriesT:
             )
         )
 
+    def split(self: Self, by: str) -> SeriesT:
+        r"""Split each string value into a list of substrings.
+
+        Arguments:
+            by: The delimiter string by which to split the values. This argument
+                is required.
+
+        Returns:
+            A new Series containing lists of substrings.
+
+        Examples:
+            >>> import pandas as pd
+            >>> import polars as pl
+            >>> import pyarrow as pa
+            >>> import narwhals as nw
+            >>> from narwhals.typing import IntoSeriesT
+
+            >>> data = ["foo bar", "foo_bar", "foo_bar_baz", "foo,bar"]
+            >>> s_pd = pd.Series(data, dtype=pd.ArrowDtype(pa.string()))
+            >>> s_pl = pl.Series(data)
+            >>> s_pa = pa.chunked_array([data])
+
+            We define a dataframe-agnostic function:
+
+            >>> def agnostic_split(s_native: IntoSeriesT) -> IntoSeriesT:
+            ...     s = nw.from_native(s_native, series_only=True)
+            ...     return s.str.split("_").to_native()
+
+            We can then pass any supported library such as pandas (pyarrow backed), Polars, or
+            PyArrow to `agnostic_split`:
+
+            >>> agnostic_split(s_pd)  # doctest: +NORMALIZE_WHITESPACE
+            0            ['foo bar']
+            1          ['foo' 'bar']
+            2    ['foo' 'bar' 'baz']
+            3            ['foo,bar']
+            dtype: list<item: string>[pyarrow]
+
+            >>> agnostic_split(s_pl)  # doctest: +NORMALIZE_WHITESPACE
+            shape: (4,)
+            list[str]
+            ["foo bar"]
+            ["foo", "bar"]
+            ["foo", "bar", "baz"]
+            ["foo,bar"]
+
+            >>> agnostic_split(s_pa)  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
+            <pyarrow.lib.ChunkedArray object at ...>
+            [
+            [
+                [
+                "foo bar"
+                ],
+                [
+                "foo",
+                "bar"
+                ],
+                [
+                "foo",
+                "bar",
+                "baz"
+                ],
+                [
+                "foo,bar"
+                ]
+            ]
+            ]
+        """
+        narwhals_series = self._narwhals_series
+        return narwhals_series._from_compliant_series(
+            narwhals_series._compliant_series.str.split(by=by)
+        )
+
     def head(self: Self, n: int = 5) -> SeriesT:
         r"""Take the first n elements of each string.
 

diff --git a/tests/expr_and_series/str/split_test.py b/tests/expr_and_series/str/split_test.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+import re
+from typing import Any
+
+import pytest
+
+import narwhals.stable.v1 as nw
+from tests.utils import Constructor
+from tests.utils import ConstructorEager
+from tests.utils import assert_equal_data
+
+data = {"s": ["foo bar", "foo_bar", "foo_bar_baz", "foo,bar"]}
+
+
+@pytest.mark.parametrize(
+    ("by", "expected"),
+    [
+        ("_", {"s": [["foo bar"], ["foo", "bar"], ["foo", "bar", "baz"], ["foo,bar"]]}),
+        (
+            ",",
+            {"s": [["foo bar"], ["foo_bar"], ["foo_bar_baz"], ["foo", "bar"]]},
+        ),
+    ],
+)
+def test_str_split(
+    constructor: Constructor,
+    by: str,
+    expected: Any,
+) -> None:
+    """Check `Expr.str.split` against every constructor under test."""
+    constructor_name = constructor.__name__
+    df = nw.from_native(constructor(data))
+    if constructor_name.startswith("pandas") and "pyarrow" not in constructor_name:
+        # pandas constructors without pyarrow in their name must raise.
+        expected_msg = re.escape("This operation requires a pyarrow-backed series. ")
+        with pytest.raises(TypeError, match=expected_msg):
+            df.select(nw.col("s").str.split(by=by))
+        return
+    assert_equal_data(df.select(nw.col("s").str.split(by=by)), expected)
+
+
+@pytest.mark.parametrize(
+    ("by", "expected"),
+    [
+        ("_", {"s": [["foo bar"], ["foo", "bar"], ["foo", "bar", "baz"], ["foo,bar"]]}),
+        (
+            ",",
+            {"s": [["foo bar"], ["foo_bar"], ["foo_bar_baz"], ["foo", "bar"]]},
+        ),
+    ],
+)
+def test_str_split_series(
+    constructor_eager: ConstructorEager,
+    by: str,
+    expected: Any,
+) -> None:
+    """Check `Series.str.split` against every eager constructor under test."""
+    constructor_name = constructor_eager.__name__
+    df = nw.from_native(constructor_eager(data), eager_only=True)
+    if constructor_name.startswith("pandas") and "pyarrow" not in constructor_name:
+        # pandas constructors without pyarrow in their name must raise.
+        expected_msg = re.escape("This operation requires a pyarrow-backed series. ")
+        with pytest.raises(TypeError, match=expected_msg):
+            df["s"].str.split(by=by)
+        return
+    assert_equal_data({"s": df["s"].str.split(by=by)}, expected)
diff --git a/tests/utils.py b/tests/utils.py
@@ -131,6 +131,10 @@ def assert_equal_data(result: Any, expected: Mapping[str, Any]) -> None:
                 are_equivalent_values = lhs is None or math.isnan(lhs)
             elif lhs is None:
                 are_equivalent_values = rhs is None
+            elif isinstance(lhs, list) and isinstance(rhs, list):
+                are_equivalent_values = all(
+                    left_side == right_side for left_side, right_side in zip(lhs, rhs)
+                )
             elif pd.isna(lhs):
                 are_equivalent_values = pd.isna(rhs)
             else:
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ @@
             - replace
             - replace_all
             - slice
+            - split
             - starts_with
             - strip_chars
             - tail
@@ Expand Down @@