From 74aedd96ba18b8395bc1bbfbbf3c6bd75f8f0b1b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 11 Aug 2024 11:04:16 +0200 Subject: [PATCH] BUG (string dtype): convert dictionary input to materialized string array in ArrowStringArray constructor --- pandas/core/arrays/string_arrow.py | 16 ++++++++++------ pandas/tests/arrays/string_/test_string_arrow.py | 11 +++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index cc37995969f0a..f48aec19685d3 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -130,18 +130,22 @@ class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringAr def __init__(self, values) -> None: _chk_pyarrow_available() - if isinstance(values, (pa.Array, pa.ChunkedArray)) and pa.types.is_string( - values.type + if isinstance(values, (pa.Array, pa.ChunkedArray)) and ( + pa.types.is_string(values.type) + or ( + pa.types.is_dictionary(values.type) + and ( + pa.types.is_string(values.type.value_type) + or pa.types.is_large_string(values.type.value_type) + ) + ) ): values = pc.cast(values, pa.large_string()) super().__init__(values) self._dtype = StringDtype(storage=self._storage, na_value=self._na_value) - if not pa.types.is_large_string(self._pa_array.type) and not ( - pa.types.is_dictionary(self._pa_array.type) - and pa.types.is_large_string(self._pa_array.type.value_type) - ): + if not pa.types.is_large_string(self._pa_array.type): raise ValueError( "ArrowStringArray requires a PyArrow (chunked) array of " "large_string type" diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index 7d4aae0f7bb4e..c01966c72f811 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -88,19 +88,18 @@ def test_constructor_not_string_type_value_dictionary_raises(chunked): 
ArrowStringArray(arr) -@pytest.mark.xfail( - reason="dict conversion does not seem to be implemented for large string in arrow" -) +@pytest.mark.parametrize("string_type", ["string", "large_string"]) @pytest.mark.parametrize("chunked", [True, False]) -def test_constructor_valid_string_type_value_dictionary(chunked): +def test_constructor_valid_string_type_value_dictionary(string_type, chunked): pa = pytest.importorskip("pyarrow") - arr = pa.array(["1", "2", "3"], pa.large_string()).dictionary_encode() + arr = pa.array(["1", "2", "3"], getattr(pa, string_type)()).dictionary_encode() if chunked: arr = pa.chunked_array(arr) arr = ArrowStringArray(arr) - assert pa.types.is_string(arr._pa_array.type.value_type) + # dictionary type gets converted to dense large string array + assert pa.types.is_large_string(arr._pa_array.type) def test_constructor_from_list():