From c53a3c2dd7b1138d58c305b161894ed73652afdc Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 21 Nov 2020 19:54:02 +0000 Subject: [PATCH 01/10] more or less copy/paste from fletcher --- pandas/core/arrays/string_arrow.py | 46 +++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 184fbc050036b..8adceaa13eb47 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,7 +1,7 @@ from __future__ import annotations from distutils.version import LooseVersion -from typing import TYPE_CHECKING, Any, Sequence, Type, Union +from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union import numpy as np @@ -15,10 +15,12 @@ from pandas.api.types import ( is_array_like, is_bool_dtype, + is_int64_dtype, is_integer, is_integer_dtype, is_scalar, ) +from pandas.core.algorithms import factorize from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import check_array_indexer, validate_indices @@ -252,9 +254,45 @@ def __len__(self) -> int: """ return len(self._data) - @classmethod - def _from_factorized(cls, values, original): - return cls._from_sequence(values) + def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: + """Encode the extension array as an enumerated type. + Parameters + ---------- + na_sentinel : int, default -1 + Value to use in the `labels` array to indicate missing values. + Returns + ------- + labels : ndarray + An integer NumPy array that's an indexer into the original + ExtensionArray. + uniques : ExtensionArray + An ExtensionArray containing the unique values of `self`. + .. note:: + uniques will *not* contain an entry for the NA value of + the ExtensionArray if there are any missing values present + in `self`. + See Also + -------- + pandas.factorize : Top-level factorize method that dispatches here. 
+ Notes + ----- + :meth:`pandas.factorize` offers a `sort` keyword as well. + """ + if pa.types.is_dictionary(self._data.type): + raise NotImplementedError() + elif self._data.num_chunks == 1: + # Dictionaryencode and do the same as above + encoded = self._data.chunk(0).dictionary_encode() + indices = encoded.indices.to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(int) + if not is_int64_dtype(indices): + indices = indices.astype(np.int64) + return indices.values, type(self)(encoded.dictionary) + else: + np_array = self._data.to_pandas().values + return factorize(np_array, na_sentinel=na_sentinel) @classmethod def _concat_same_type(cls, to_concat) -> ArrowStringArray: From b7d0ab815c60f88ea88a8f8e7b855ffce8c39b60 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 22 Nov 2020 19:31:14 +0000 Subject: [PATCH 02/10] use docstring from base class --- pandas/core/arrays/string_arrow.py | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 8adceaa13eb47..dae635bbfe3a6 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -6,6 +6,7 @@ import numpy as np from pandas._libs import lib, missing as libmissing +from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.base import ExtensionDtype @@ -254,30 +255,8 @@ def __len__(self) -> int: """ return len(self._data) + @doc(ExtensionArray.factorize) def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: - """Encode the extension array as an enumerated type. - Parameters - ---------- - na_sentinel : int, default -1 - Value to use in the `labels` array to indicate missing values. - Returns - ------- - labels : ndarray - An integer NumPy array that's an indexer into the original - ExtensionArray. 
- uniques : ExtensionArray - An ExtensionArray containing the unique values of `self`. - .. note:: - uniques will *not* contain an entry for the NA value of - the ExtensionArray if there are any missing values present - in `self`. - See Also - -------- - pandas.factorize : Top-level factorize method that dispatches here. - Notes - ----- - :meth:`pandas.factorize` offers a `sort` keyword as well. - """ if pa.types.is_dictionary(self._data.type): raise NotImplementedError() elif self._data.num_chunks == 1: From 154496aea77fcf7776b2fbbf011a31f5f461eb56 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 22 Nov 2020 19:53:53 +0000 Subject: [PATCH 03/10] remove redundant type check --- pandas/core/arrays/string_arrow.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index dae635bbfe3a6..7c89b6d306437 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -257,10 +257,7 @@ def __len__(self) -> int: @doc(ExtensionArray.factorize) def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: - if pa.types.is_dictionary(self._data.type): - raise NotImplementedError() - elif self._data.num_chunks == 1: - # Dictionaryencode and do the same as above + if self._data.num_chunks == 1: encoded = self._data.chunk(0).dictionary_encode() indices = encoded.indices.to_pandas() if indices.dtype.kind == "f": From 73c7de9e9cdefc5a8fd48c6e393f038a30f0cfc0 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 15 Feb 2021 11:07:59 +0000 Subject: [PATCH 04/10] ignore new mypy error --- pandas/core/arrays/string_arrow.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 3e4881f445406..14cc422bdcfe7 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -273,7 +273,11 @@ def factorize(self, na_sentinel: int 
= -1) -> Tuple[np.ndarray, ExtensionArray]: return indices.values, type(self)(encoded.dictionary) else: np_array = self._data.to_pandas().values - return factorize(np_array, na_sentinel=na_sentinel) + # error: Incompatible return value type (got "Tuple[Any, Union[Any, + # Index]]", expected "Tuple[Any, ExtensionArray]") + return factorize( # type: ignore[return-value] + np_array, na_sentinel=na_sentinel + ) @classmethod def _concat_same_type(cls, to_concat) -> ArrowStringArray: From 42ca9c3dabdc220da8265840c7b54dbfd9749986 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 15 Feb 2021 15:14:43 +0000 Subject: [PATCH 05/10] update algorithms.Factorize.time_factorize --- asv_bench/benchmarks/algorithms.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 65e52e03c43c7..d2fb564b47cc0 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,6 +5,7 @@ from pandas._libs import lib import pandas as pd +from pandas.core.arrays.string_arrow import ArrowStringDtype from .pandas_vb_common import tm @@ -43,23 +44,34 @@ class Factorize: "datetime64[ns, tz]", "Int64", "boolean", + "string_arrow", ], ] param_names = ["unique", "sort", "dtype"] def setup(self, unique, sort, dtype): N = 10 ** 5 + string_index = tm.makeStringIndex(N) + try: + string_arrow = pd.array(string_index, dtype=ArrowStringDtype()) + except ImportError: + string_arrow = None + + if dtype == "string_arrow" and not string_arrow: + raise NotImplementedError + data = { "int": pd.Int64Index(np.arange(N)), "uint": pd.UInt64Index(np.arange(N)), "float": pd.Float64Index(np.random.randn(N)), - "string": tm.makeStringIndex(N), + "string": string_index, "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N), "datetime64[ns, tz]": pd.date_range( "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo" ), "Int64": pd.array(np.arange(N), dtype="Int64"), "boolean": 
pd.array(np.random.randint(0, 2, N), dtype="boolean"), + "string_arrow": string_arrow, }[dtype] if not unique: data = data.repeat(5) From a251537a5a2ddec60d5b1818b36b2568292a00ba Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 15 Feb 2021 18:16:17 +0000 Subject: [PATCH 06/10] test for arrays with 2 chunks --- pandas/tests/extension/test_string.py | 48 +++++++++++++++++++++------ 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index d0a3ef17afdbc..49aee76e10f6a 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -26,6 +26,29 @@ from pandas.tests.extension import base +def split_array(arr): + if not isinstance(arr.dtype, ArrowStringDtype): + pytest.skip("chunked array n/a") + + def _split_array(arr): + import pyarrow as pa + + arrow_array = arr._data + split = len(arrow_array) // 2 + arrow_array = pa.chunked_array( + [*arrow_array[:split].chunks, *arrow_array[split:].chunks] + ) + assert arrow_array.num_chunks == 2 + return type(arr)(arrow_array) + + return _split_array(arr) + + +@pytest.fixture(params=[True, False]) +def chunked(request): + return request.param + + @pytest.fixture( params=[ StringDtype, @@ -39,28 +62,32 @@ def dtype(request): @pytest.fixture -def data(dtype): +def data(dtype, chunked): strings = np.random.choice(list(string.ascii_letters), size=100) while strings[0] == strings[1]: strings = np.random.choice(list(string.ascii_letters), size=100) - return dtype.construct_array_type()._from_sequence(strings) + arr = dtype.construct_array_type()._from_sequence(strings) + return split_array(arr) if chunked else arr @pytest.fixture -def data_missing(dtype): +def data_missing(dtype, chunked): """Length 2 array with [NA, Valid]""" - return dtype.construct_array_type()._from_sequence([pd.NA, "A"]) + arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"]) + return split_array(arr) if chunked else arr 
@pytest.fixture -def data_for_sorting(dtype): - return dtype.construct_array_type()._from_sequence(["B", "C", "A"]) +def data_for_sorting(dtype, chunked): + arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"]) + return split_array(arr) if chunked else arr @pytest.fixture -def data_missing_for_sorting(dtype): - return dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"]) +def data_missing_for_sorting(dtype, chunked): + arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"]) + return split_array(arr) if chunked else arr @pytest.fixture @@ -69,10 +96,11 @@ def na_value(): @pytest.fixture -def data_for_grouping(dtype): - return dtype.construct_array_type()._from_sequence( +def data_for_grouping(dtype, chunked): + arr = dtype.construct_array_type()._from_sequence( ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"] ) + return split_array(arr) if chunked else arr class TestDtype(base.BaseDtypeTests): From ea59c38759e43f8f4953004528f3e184199facbc Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 18 Feb 2021 12:39:50 +0000 Subject: [PATCH 07/10] fix failing test_factorize_equivalence --- pandas/core/arrays/string_arrow.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 4a6cc15152e00..ee9059d7604a5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -36,7 +36,6 @@ is_integer_dtype, is_scalar, ) -from pandas.core.algorithms import factorize from pandas.core.arraylike import OpsMixin from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import ( @@ -279,22 +278,16 @@ def __len__(self) -> int: @doc(ExtensionArray.factorize) def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: - if self._data.num_chunks == 1: - encoded = self._data.chunk(0).dictionary_encode() - indices = encoded.indices.to_pandas() - if indices.dtype.kind == "f": 
- indices[np.isnan(indices)] = na_sentinel - indices = indices.astype(int) - if not is_int64_dtype(indices): - indices = indices.astype(np.int64) - return indices.values, type(self)(encoded.dictionary) - else: - np_array = self._data.to_pandas().values - # error: Incompatible return value type (got "Tuple[Any, Union[Any, - # Index]]", expected "Tuple[Any, ExtensionArray]") - return factorize( # type: ignore[return-value] - np_array, na_sentinel=na_sentinel - ) + encoded = self._data.dictionary_encode() + indices = pa.chunked_array( + [c.indices for c in encoded.chunks], type=encoded.type.index_type + ).to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(int) + if not is_int64_dtype(indices): + indices = indices.astype(np.int64) + return indices.values, type(self)(encoded.chunk(0).dictionary) @classmethod def _concat_same_type(cls, to_concat) -> ArrowStringArray: From 7d987275aeacb174ec2222d3a3058d12f416d489 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 18 Feb 2021 13:05:47 +0000 Subject: [PATCH 08/10] fix failing test_factorize_empty --- pandas/core/arrays/string_arrow.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ee9059d7604a5..c1b8d875ade1b 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -287,7 +287,13 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: indices = indices.astype(int) if not is_int64_dtype(indices): indices = indices.astype(np.int64) - return indices.values, type(self)(encoded.chunk(0).dictionary) + + if encoded.num_chunks: + uniques = type(self)(encoded.chunk(0).dictionary) + else: + uniques = type(self)(pa.array([], type=encoded.type.value_type)) + + return indices.values, uniques @classmethod def _concat_same_type(cls, to_concat) -> ArrowStringArray: From 
0023f0843cb7fe316a5b30393fef8957b5213d21 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 18 Feb 2021 13:25:22 +0000 Subject: [PATCH 09/10] address dtype comment --- pandas/core/arrays/string_arrow.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index c1b8d875ade1b..0ed213f5ba4cb 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -31,7 +31,6 @@ from pandas.api.types import ( is_array_like, is_bool_dtype, - is_int64_dtype, is_integer, is_integer_dtype, is_scalar, @@ -284,9 +283,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: ).to_pandas() if indices.dtype.kind == "f": indices[np.isnan(indices)] = na_sentinel - indices = indices.astype(int) - if not is_int64_dtype(indices): - indices = indices.astype(np.int64) + indices = indices.astype(np.int64, copy=False) if encoded.num_chunks: uniques = type(self)(encoded.chunk(0).dictionary) From 6a28414a7069067df6b93000bee88ff13e805d5f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 19 Feb 2021 16:46:42 +0000 Subject: [PATCH 10/10] move ArrowStringDtype import inside try/except --- asv_bench/benchmarks/algorithms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index d2fb564b47cc0..29c492442c0b2 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,7 +5,6 @@ from pandas._libs import lib import pandas as pd -from pandas.core.arrays.string_arrow import ArrowStringDtype from .pandas_vb_common import tm @@ -53,6 +52,8 @@ def setup(self, unique, sort, dtype): N = 10 ** 5 string_index = tm.makeStringIndex(N) try: + from pandas.core.arrays.string_arrow import ArrowStringDtype + string_arrow = pd.array(string_index, dtype=ArrowStringDtype()) except ImportError: string_arrow = None