diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 219c52c4a65b9..a6835ab1325a0 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -31,14 +31,17 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, + is_dtype_equal, is_integer, is_integer_dtype, is_object_dtype, is_scalar, is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna @@ -48,6 +51,7 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype +from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import ( check_array_indexer, @@ -290,10 +294,14 @@ def to_numpy( # type: ignore[override] """ # TODO: copy argument is ignored - if na_value is lib.no_default: - na_value = self._dtype.na_value - result = self._data.__array__(dtype=dtype) - result[isna(result)] = na_value + result = np.array(self._data, dtype=dtype) + if self._data.null_count > 0: + if na_value is lib.no_default: + if dtype and np.issubdtype(dtype, np.floating): + return result + na_value = self._dtype.na_value + mask = self.isna() + result[mask] = na_value return result def __len__(self) -> int: @@ -737,6 +745,24 @@ def value_counts(self, dropna: bool = True) -> Series: return Series(counts, index=index).astype("Int64") + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + + if is_dtype_equal(dtype, self.dtype): + if copy: + return self.copy() + return self + + elif isinstance(dtype, NumericDtype): + data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) + return dtype.__from_arrow__(data) + + elif isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + return cls._from_sequence(self, dtype=dtype, copy=copy) + + return super().astype(dtype, copy) + # ------------------------------------------------------------------------ # String methods interface diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index e3b43c544a477..c9533e239abe0 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -3,6 +3,8 @@ Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ +import re + import numpy as np import pytest @@ -325,12 +327,19 @@ def test_from_sequence_no_mutate(copy, cls, request): tm.assert_numpy_array_equal(nan_arr, expected) -def test_astype_int(dtype, request): - if dtype == "arrow_string": - reason = "Cannot interpret 'Int64Dtype()' as a data type" - mark = pytest.mark.xfail(raises=TypeError, reason=reason) - request.node.add_marker(mark) +def test_astype_int(dtype): + arr = pd.array(["1", "2", "3"], dtype=dtype) + result = arr.astype("int64") + expected = np.array([1, 2, 3], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + arr = pd.array(["1", pd.NA, "3"], dtype=dtype) + msg = re.escape("int() argument must be a string, a bytes-like object or a number") + with pytest.raises(TypeError, match=msg): + arr.astype("int64") + +def test_astype_nullable_int(dtype): arr = pd.array(["1", pd.NA, "3"], dtype=dtype) result = arr.astype("Int64") @@ -338,19 +347,9 @@ def test_astype_int(dtype, request): tm.assert_extension_array_equal(result, expected) -def test_astype_float(dtype, any_float_allowed_nullable_dtype, request): +def test_astype_float(dtype, any_float_allowed_nullable_dtype): # Don't compare arrays (37974) - - if dtype == "arrow_string": - if any_float_allowed_nullable_dtype in {"Float32", "Float64"}: - reason = "Cannot interpret 'Float32Dtype()' as a data type" - else: - reason = "float() argument must be a string or a number, not 'NAType'" - mark = pytest.mark.xfail(raises=TypeError, reason=reason) - request.node.add_marker(mark) - ser = pd.Series(["1.1", pd.NA, "3.3"], dtype=dtype) - result = ser.astype(any_float_allowed_nullable_dtype) expected = pd.Series([1.1, np.nan, 3.3], dtype=any_float_allowed_nullable_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index bebe6948cff9c..ffaecf1576364 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -379,7 +379,9 @@ class TestAstypeString: # currently no way to parse IntervalArray from a list of strings ], ) - def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request): + def test_astype_string_to_extension_dtype_roundtrip( + self, data, dtype, request, nullable_string_dtype + ): if dtype == "boolean" or ( dtype in ("period[M]", "datetime64[ns]", "timedelta64[ns]") and NaT in data ): @@ -389,7 +391,8 @@ def test_astype_string_to_extension_dtype_roundtrip(self, data, dtype, request): request.node.add_marker(mark) # GH-40351 s = Series(data, dtype=dtype) - tm.assert_series_equal(s, s.astype("string").astype(dtype)) + result = s.astype(nullable_string_dtype).astype(dtype) + tm.assert_series_equal(result, s) class TestAstypeCategorical: