From db1b8ab2fc1a65c2e520dc9ffb6390743a00d63b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 2 Sep 2024 11:47:39 -0700 Subject: [PATCH] API (string): str.center with pyarrow-backed string dtype (#59624) --- doc/source/whatsnew/v2.3.0.rst | 3 ++- pandas/core/arrays/_arrow_string_mixins.py | 20 ++++++++++++++++++-- pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/strings/test_case_justify.py | 6 +----- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 8a64aa7c609d6..03355f655eb28 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -103,7 +103,8 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) -- +- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) + Interval ^^^^^^^^ diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index c810af32f7480..b5cf0573e70ba 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,5 +1,6 @@ from __future__ import annotations +from functools import partial from typing import ( TYPE_CHECKING, Literal, @@ -7,7 +8,10 @@ import numpy as np -from pandas.compat import pa_version_under10p1 +from pandas.compat import ( + pa_version_under10p1, + pa_version_under17p0, +) from pandas.core.dtypes.missing import isna @@ -49,7 +53,19 @@ def _str_pad( elif side == "right": pa_pad = pc.utf8_rpad elif side == "both": - pa_pad = pc.utf8_center + if pa_version_under17p0: + # GH#59624 fall back to object dtype + from pandas import array + + obj_arr = self.astype(object, copy=False) # type: ignore[attr-defined] + obj = array(obj_arr, dtype=object) + result = obj._str_pad(width, side, fillchar) # type: ignore[attr-defined] + return type(self)._from_sequence(result, dtype=self.dtype) # type: ignore[attr-defined] + else: + # GH#54792 + # https://github.com/apache/arrow/issues/15053#issuecomment-2317032347 + lean_left = (width % 2) == 0 + pa_pad = partial(pc.utf8_center, lean_left_on_odd_padding=lean_left) else: raise ValueError( f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e8e74b0ba1215..a3169985828e8 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -284,6 +284,7 @@ def astype(self, dtype, copy: bool = True): _str_map = BaseStringArray._str_map _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith + _str_pad = ArrowStringArrayMixin._str_pad def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -546,7 +547,6 @@ class ArrowStringArrayNumpySemantics(ArrowStringArray): _str_get = ArrowStringArrayMixin._str_get _str_removesuffix = ArrowStringArrayMixin._str_removesuffix _str_capitalize = ArrowStringArrayMixin._str_capitalize - _str_pad = ArrowStringArrayMixin._str_pad _str_title = ArrowStringArrayMixin._str_title _str_swapcase = ArrowStringArrayMixin._str_swapcase _str_slice_replace = ArrowStringArrayMixin._str_slice_replace diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py index 41aedae90ca76..819556f961fa3 100644 --- a/pandas/tests/strings/test_case_justify.py +++ b/pandas/tests/strings/test_case_justify.py @@ -291,11 +291,7 @@ def test_center_ljust_rjust_mixed_object(): def test_center_ljust_rjust_fillchar(any_string_dtype): - if any_string_dtype == "string[pyarrow_numpy]": - pytest.skip( - "Arrow logic is different, " - "see https://github.com/pandas-dev/pandas/pull/54533/files#r1299808126", - ) + # GH#54533, GH#54792 s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype) result = s.str.center(5, fillchar="X")