From c8025354ab256804c0df45a759fc11c4afd24e7e Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 7 Aug 2024 15:28:31 -0700 Subject: [PATCH 1/5] REF: move ArrowStringArrayNumpySemantics methods to parent class --- pandas/core/arrays/string_arrow.py | 64 ++++++++++++------------------ 1 file changed, 26 insertions(+), 38 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f48aec19685d3..20943e29e8344 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -217,12 +217,17 @@ def dtype(self) -> StringDtype: # type: ignore[override] return self._dtype def insert(self, loc: int, item) -> ArrowStringArray: + if self.dtype.na_value is np.nan and item is np.nan: + item = libmissing.NA if not isinstance(item, str) and item is not libmissing.NA: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - @classmethod - def _result_converter(cls, values, na=None): + def _result_converter(self, values, na=None): + if self.dtype.na_value is np.nan: + if not isna(na): + values = values.fill_null(bool(na)) + return ArrowExtensionArray(values).to_numpy(na_value=np.nan) return BooleanDtype().__from_arrow__(values) def _maybe_convert_setitem_value(self, value): @@ -497,11 +502,30 @@ def _str_get_dummies(self, sep: str = "|"): return dummies.astype(np.int64, copy=False), labels def _convert_int_dtype(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + if result.dtype == np.int32: + result = result.astype(np.int64) + return result + return Int64Dtype().__from_arrow__(result) def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): + if self.dtype.na_value is np.nan and name in ["any", "all"]: + if not skipna: + nas = pc.is_null(self._pa_array) + arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, "")) + else: + arr = pc.not_equal(self._pa_array, "") + return ArrowExtensionArray(arr)._reduce( + name, skipna=skipna, keepdims=keepdims, **kwargs + ) + result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) if name in ("argmin", "argmax") and isinstance(result, pa.Array): return self._convert_int_dtype(result) @@ -534,15 +558,8 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): - _storage = "pyarrow" _na_value = np.nan - @classmethod - def _result_converter(cls, values, na=None): - if not isna(na): - values = values.fill_null(bool(na)) - return ArrowExtensionArray(values).to_numpy(na_value=np.nan) - def __getattribute__(self, item): # ArrowStringArray and we both inherit from ArrowExtensionArray, which # creates inheritance problems (Diamond inheritance) @@ -553,15 +570,6 @@ def __getattribute__(self, item): return partial(getattr(ArrowStringArrayMixin, item), self) return super().__getattribute__(item) - def _convert_int_dtype(self, result): - if isinstance(result, pa.Array): - result = result.to_numpy(zero_copy_only=False) - else: - result = result.to_numpy() - if result.dtype == np.int32: - result = result.astype(np.int64) - return result - def _cmp_method(self, other, op): try: result = super()._cmp_method(other, op) @@ -579,23 +587,3 @@ def value_counts(self, dropna: bool = True) -> Series: return Series( result._values.to_numpy(), index=result.index, name=result.name, copy=False ) - - def _reduce( - self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs - ): - if name in ["any", "all"]: - if not skipna: - nas = pc.is_null(self._pa_array) - arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, "")) - else: - arr = pc.not_equal(self._pa_array, "") - return ArrowExtensionArray(arr)._reduce( - name, skipna=skipna, keepdims=keepdims, **kwargs - ) - else: - return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs) - - def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics: - if item is np.nan: - item = libmissing.NA - return super().insert(loc, item) # type: ignore[return-value] From 459d6550e3c13a7103a8021955bd497daef61aa5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 13 Aug 2024 09:58:47 -0700 Subject: [PATCH 2/5] REF: move methods to ArrowStringArray --- pandas/core/arrays/arrow/array.py | 15 +++++++++++++-- pandas/core/arrays/string_arrow.py | 28 +++++++--------------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index d07bfeda50e1d..a4dad0282bd71 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -728,7 +728,14 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: raise NotImplementedError( f"{op.__name__} not implemented for {type(other)}" ) - return ArrowExtensionArray(result) + result = ArrowExtensionArray(result) + if self.dtype.na_value is np.nan: + # i.e. ArrowStringArray with Numpy Semantics + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) + return result def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pa_type = self._pa_array.type @@ -1523,7 +1530,11 @@ def value_counts(self, dropna: bool = True) -> Series: index = Index(type(self)(values)) - return Series(counts, index=index, name="count", copy=False) + result = Series(counts, index=index, name="count", copy=False) + if self.dtype.na_value is np.nan: + # i.e. ArrowStringArray with Numpy Semantics + return Series(counts.to_numpy(), index=index, name="count", copy=False) + return result @classmethod def _concat_same_type(cls, to_concat) -> Self: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 20943e29e8344..15663b7105db4 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,5 @@ from __future__ import annotations -from functools import partial import operator import re from typing import ( @@ -63,8 +62,6 @@ from pandas.core.dtypes.dtypes import ExtensionDtype - from pandas import Series - ArrowStringScalarOrNAT = Union[str, libmissing.NAType] @@ -559,16 +556,13 @@ def _rank( class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan - - def __getattribute__(self, item): - # ArrowStringArray and we both inherit from ArrowExtensionArray, which - # creates inheritance problems (Diamond inheritance) - if item in ArrowStringArrayMixin.__dict__ and item not in ( - "_pa_array", - "__dict__", - ): - return partial(getattr(ArrowStringArrayMixin, item), self) - return super().__getattribute__(item) + _str_get = ArrowStringArrayMixin._str_get + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_pad = ArrowStringArrayMixin._str_pad + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace def _cmp_method(self, other, op): try: @@ -579,11 +573,3 @@ def _cmp_method(self, other, op): return result.to_numpy(np.bool_, na_value=True) else: return result.to_numpy(np.bool_, na_value=False) - - def value_counts(self, dropna: bool = True) -> Series: - from pandas import Series - - result = super().value_counts(dropna) - return Series( - result._values.to_numpy(), index=result.index, name=result.name, copy=False - ) From d323ec3e83b07cb0196b24bf96df6f38bddb2af5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 13 Aug 2024 10:35:34 -0700 Subject: [PATCH 3/5] mypy fixup --- pandas/core/arrays/arrow/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a4dad0282bd71..05b49fba7f31f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -729,7 +729,7 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: f"{op.__name__} not implemented for {type(other)}" ) result = ArrowExtensionArray(result) - if self.dtype.na_value is np.nan: + if self.dtype.na_value is np.nan: # type: ignore[comparison-overlap] # i.e. ArrowStringArray with Numpy Semantics if op == operator.ne: return result.to_numpy(np.bool_, na_value=True) @@ -1531,7 +1531,7 @@ def value_counts(self, dropna: bool = True) -> Series: index = Index(type(self)(values)) result = Series(counts, index=index, name="count", copy=False) - if self.dtype.na_value is np.nan: + if self.dtype.na_value is np.nan: # type: ignore[comparison-overlap] # i.e. ArrowStringArray with Numpy Semantics return Series(counts.to_numpy(), index=index, name="count", copy=False) return result From 1816902362a09ed772ab6f2ea44be1b548cdd158 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 13 Aug 2024 14:15:07 -0700 Subject: [PATCH 4/5] Fix incorrect double-unpacking --- pandas/core/arrays/string_arrow.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 15663b7105db4..a6739038538ff 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,6 +1,5 @@ from __future__ import annotations -import operator import re from typing import ( TYPE_CHECKING, @@ -566,10 +565,6 @@ class ArrowStringArrayNumpySemantics(ArrowStringArray): def _cmp_method(self, other, op): try: - result = super()._cmp_method(other, op) + return super()._cmp_method(other, op) except pa.ArrowNotImplementedError: return invalid_comparison(self, other, op) - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) From 7dccab18931580848e68f2884d41050591fdaf8b Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 15 Aug 2024 10:15:41 -0700 Subject: [PATCH 5/5] move methods to subclass --- pandas/core/arrays/arrow/array.py | 15 ++------------- pandas/core/arrays/string_arrow.py | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 85d3c1c2f3a23..e95fa441e18fb 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -734,14 +734,7 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: raise NotImplementedError( f"{op.__name__} not implemented for {type(other)}" ) - result = ArrowExtensionArray(result) - if self.dtype.na_value is np.nan: # type: ignore[comparison-overlap] - # i.e. ArrowStringArray with Numpy Semantics - if op == operator.ne: - return result.to_numpy(np.bool_, na_value=True) - else: - return result.to_numpy(np.bool_, na_value=False) - return result + return ArrowExtensionArray(result) def _evaluate_op_method(self, other, op, arrow_funcs) -> Self: pa_type = self._pa_array.type @@ -1536,11 +1529,7 @@ def value_counts(self, dropna: bool = True) -> Series: index = Index(type(self)(values)) - result = Series(counts, index=index, name="count", copy=False) - if self.dtype.na_value is np.nan: # type: ignore[comparison-overlap] - # i.e. ArrowStringArray with Numpy Semantics - return Series(counts.to_numpy(), index=index, name="count", copy=False) - return result + return Series(counts, index=index, name="count", copy=False) @classmethod def _concat_same_type(cls, to_concat) -> Self: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ab66ef5646611..65ffce3595738 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +import operator import re from typing import ( TYPE_CHECKING, @@ -60,6 +61,8 @@ from pandas.core.dtypes.dtypes import ExtensionDtype + from pandas import Series + ArrowStringScalarOrNAT = Union[str, libmissing.NAType] @@ -551,6 +554,24 @@ def _rank( ) ) + def value_counts(self, dropna: bool = True) -> Series: + result = super().value_counts(dropna=dropna) + if self.dtype.na_value is np.nan: + res_values = result._values.to_numpy() + return result._constructor( + res_values, index=result.index, name=result.name, copy=False + ) + return result + + def _cmp_method(self, other, op): + result = super()._cmp_method(other, op) + if self.dtype.na_value is np.nan: + if op == operator.ne: + return result.to_numpy(np.bool_, na_value=True) + else: + return result.to_numpy(np.bool_, na_value=False) + return result + class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan