From c53a3c2dd7b1138d58c305b161894ed73652afdc Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sat, 21 Nov 2020 19:54:02 +0000
Subject: [PATCH 01/10] more or less copy/paste from fletcher

---
 pandas/core/arrays/string_arrow.py | 46 +++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 184fbc050036b..8adceaa13eb47 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from distutils.version import LooseVersion
-from typing import TYPE_CHECKING, Any, Sequence, Type, Union
+from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
 
 import numpy as np
 
@@ -15,10 +15,12 @@
 from pandas.api.types import (
     is_array_like,
     is_bool_dtype,
+    is_int64_dtype,
     is_integer,
     is_integer_dtype,
     is_scalar,
 )
+from pandas.core.algorithms import factorize
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.indexers import check_array_indexer, validate_indices
@@ -252,9 +254,45 @@ def __len__(self) -> int:
         """
         return len(self._data)
 
-    @classmethod
-    def _from_factorized(cls, values, original):
-        return cls._from_sequence(values)
+    def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
+        """Encode the extension array as an enumerated type.
+        Parameters
+        ----------
+        na_sentinel : int, default -1
+            Value to use in the `labels` array to indicate missing values.
+        Returns
+        -------
+        labels : ndarray
+            An integer NumPy array that's an indexer into the original
+            ExtensionArray.
+        uniques : ExtensionArray
+            An ExtensionArray containing the unique values of `self`.
+            .. note::
+               uniques will *not* contain an entry for the NA value of
+               the ExtensionArray if there are any missing values present
+               in `self`.
+        See Also
+        --------
+        pandas.factorize : Top-level factorize method that dispatches here.
+        Notes
+        -----
+        :meth:`pandas.factorize` offers a `sort` keyword as well.
+        """
+        if pa.types.is_dictionary(self._data.type):
+            raise NotImplementedError()
+        elif self._data.num_chunks == 1:
+            # Dictionaryencode and do the same as above
+            encoded = self._data.chunk(0).dictionary_encode()
+            indices = encoded.indices.to_pandas()
+            if indices.dtype.kind == "f":
+                indices[np.isnan(indices)] = na_sentinel
+                indices = indices.astype(int)
+            if not is_int64_dtype(indices):
+                indices = indices.astype(np.int64)
+            return indices.values, type(self)(encoded.dictionary)
+        else:
+            np_array = self._data.to_pandas().values
+            return factorize(np_array, na_sentinel=na_sentinel)
 
     @classmethod
     def _concat_same_type(cls, to_concat) -> ArrowStringArray:

From b7d0ab815c60f88ea88a8f8e7b855ffce8c39b60 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sun, 22 Nov 2020 19:31:14 +0000
Subject: [PATCH 02/10] use docstring from base class

---
 pandas/core/arrays/string_arrow.py | 25 ++-----------------------
 1 file changed, 2 insertions(+), 23 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 8adceaa13eb47..dae635bbfe3a6 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -6,6 +6,7 @@
 import numpy as np
 
 from pandas._libs import lib, missing as libmissing
+from pandas.util._decorators import doc
 from pandas.util._validators import validate_fillna_kwargs
 
 from pandas.core.dtypes.base import ExtensionDtype
@@ -254,30 +255,8 @@ def __len__(self) -> int:
         """
         return len(self._data)
 
+    @doc(ExtensionArray.factorize)
     def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
-        """Encode the extension array as an enumerated type.
-        Parameters
-        ----------
-        na_sentinel : int, default -1
-            Value to use in the `labels` array to indicate missing values.
-        Returns
-        -------
-        labels : ndarray
-            An integer NumPy array that's an indexer into the original
-            ExtensionArray.
-        uniques : ExtensionArray
-            An ExtensionArray containing the unique values of `self`.
-            .. note::
-               uniques will *not* contain an entry for the NA value of
-               the ExtensionArray if there are any missing values present
-               in `self`.
-        See Also
-        --------
-        pandas.factorize : Top-level factorize method that dispatches here.
-        Notes
-        -----
-        :meth:`pandas.factorize` offers a `sort` keyword as well.
-        """
         if pa.types.is_dictionary(self._data.type):
             raise NotImplementedError()
         elif self._data.num_chunks == 1:

From 154496aea77fcf7776b2fbbf011a31f5f461eb56 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Sun, 22 Nov 2020 19:53:53 +0000
Subject: [PATCH 03/10] remove redundant type check

---
 pandas/core/arrays/string_arrow.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index dae635bbfe3a6..7c89b6d306437 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -257,10 +257,7 @@ def __len__(self) -> int:
 
     @doc(ExtensionArray.factorize)
     def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
-        if pa.types.is_dictionary(self._data.type):
-            raise NotImplementedError()
-        elif self._data.num_chunks == 1:
-            # Dictionaryencode and do the same as above
+        if self._data.num_chunks == 1:
             encoded = self._data.chunk(0).dictionary_encode()
             indices = encoded.indices.to_pandas()
             if indices.dtype.kind == "f":

From 73c7de9e9cdefc5a8fd48c6e393f038a30f0cfc0 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 15 Feb 2021 11:07:59 +0000
Subject: [PATCH 04/10] ignore new mypy error

---
 pandas/core/arrays/string_arrow.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 3e4881f445406..14cc422bdcfe7 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -273,7 +273,11 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
             return indices.values, type(self)(encoded.dictionary)
         else:
             np_array = self._data.to_pandas().values
-            return factorize(np_array, na_sentinel=na_sentinel)
+            # error: Incompatible return value type (got "Tuple[Any, Union[Any,
+            # Index]]", expected "Tuple[Any, ExtensionArray]")
+            return factorize(  # type: ignore[return-value]
+                np_array, na_sentinel=na_sentinel
+            )
 
     @classmethod
     def _concat_same_type(cls, to_concat) -> ArrowStringArray:

From 42ca9c3dabdc220da8265840c7b54dbfd9749986 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 15 Feb 2021 15:14:43 +0000
Subject: [PATCH 05/10] update algorithms.Factorize.time_factorize

---
 asv_bench/benchmarks/algorithms.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index 65e52e03c43c7..d2fb564b47cc0 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -5,6 +5,7 @@
 from pandas._libs import lib
 
 import pandas as pd
+from pandas.core.arrays.string_arrow import ArrowStringDtype
 
 from .pandas_vb_common import tm
 
@@ -43,23 +44,34 @@ class Factorize:
             "datetime64[ns, tz]",
             "Int64",
             "boolean",
+            "string_arrow",
         ],
     ]
     param_names = ["unique", "sort", "dtype"]
 
     def setup(self, unique, sort, dtype):
         N = 10 ** 5
+        string_index = tm.makeStringIndex(N)
+        try:
+            string_arrow = pd.array(string_index, dtype=ArrowStringDtype())
+        except ImportError:
+            string_arrow = None
+
+        if dtype == "string_arrow" and not string_arrow:
+            raise NotImplementedError
+
         data = {
             "int": pd.Int64Index(np.arange(N)),
             "uint": pd.UInt64Index(np.arange(N)),
             "float": pd.Float64Index(np.random.randn(N)),
-            "string": tm.makeStringIndex(N),
+            "string": string_index,
             "datetime64[ns]": pd.date_range("2011-01-01", freq="H", periods=N),
             "datetime64[ns, tz]": pd.date_range(
                 "2011-01-01", freq="H", periods=N, tz="Asia/Tokyo"
             ),
             "Int64": pd.array(np.arange(N), dtype="Int64"),
             "boolean": pd.array(np.random.randint(0, 2, N), dtype="boolean"),
+            "string_arrow": string_arrow,
         }[dtype]
         if not unique:
             data = data.repeat(5)

From a251537a5a2ddec60d5b1818b36b2568292a00ba Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Mon, 15 Feb 2021 18:16:17 +0000
Subject: [PATCH 06/10] test for arrays with 2 chunks

---
 pandas/tests/extension/test_string.py | 48 +++++++++++++++++++++------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index d0a3ef17afdbc..49aee76e10f6a 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -26,6 +26,29 @@
 from pandas.tests.extension import base
 
 
+def split_array(arr):
+    if not isinstance(arr.dtype, ArrowStringDtype):
+        pytest.skip("chunked array n/a")
+
+    def _split_array(arr):
+        import pyarrow as pa
+
+        arrow_array = arr._data
+        split = len(arrow_array) // 2
+        arrow_array = pa.chunked_array(
+            [*arrow_array[:split].chunks, *arrow_array[split:].chunks]
+        )
+        assert arrow_array.num_chunks == 2
+        return type(arr)(arrow_array)
+
+    return _split_array(arr)
+
+
+@pytest.fixture(params=[True, False])
+def chunked(request):
+    return request.param
+
+
 @pytest.fixture(
     params=[
         StringDtype,
@@ -39,28 +62,32 @@ def dtype(request):
 
 
 @pytest.fixture
-def data(dtype):
+def data(dtype, chunked):
     strings = np.random.choice(list(string.ascii_letters), size=100)
     while strings[0] == strings[1]:
         strings = np.random.choice(list(string.ascii_letters), size=100)
 
-    return dtype.construct_array_type()._from_sequence(strings)
+    arr = dtype.construct_array_type()._from_sequence(strings)
+    return split_array(arr) if chunked else arr
 
 
 @pytest.fixture
-def data_missing(dtype):
+def data_missing(dtype, chunked):
     """Length 2 array with [NA, Valid]"""
-    return dtype.construct_array_type()._from_sequence([pd.NA, "A"])
+    arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"])
+    return split_array(arr) if chunked else arr
 
 
 @pytest.fixture
-def data_for_sorting(dtype):
-    return dtype.construct_array_type()._from_sequence(["B", "C", "A"])
+def data_for_sorting(dtype, chunked):
+    arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"])
+    return split_array(arr) if chunked else arr
 
 
 @pytest.fixture
-def data_missing_for_sorting(dtype):
-    return dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"])
+def data_missing_for_sorting(dtype, chunked):
+    arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"])
+    return split_array(arr) if chunked else arr
 
 
 @pytest.fixture
@@ -69,10 +96,11 @@ def na_value():
 
 
 @pytest.fixture
-def data_for_grouping(dtype):
-    return dtype.construct_array_type()._from_sequence(
+def data_for_grouping(dtype, chunked):
+    arr = dtype.construct_array_type()._from_sequence(
         ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"]
     )
+    return split_array(arr) if chunked else arr
 
 
 class TestDtype(base.BaseDtypeTests):

From ea59c38759e43f8f4953004528f3e184199facbc Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 18 Feb 2021 12:39:50 +0000
Subject: [PATCH 07/10] fix failing test_factorize_equivalence

---
 pandas/core/arrays/string_arrow.py | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 4a6cc15152e00..ee9059d7604a5 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -36,7 +36,6 @@
     is_integer_dtype,
     is_scalar,
 )
-from pandas.core.algorithms import factorize
 from pandas.core.arraylike import OpsMixin
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.indexers import (
@@ -279,22 +278,16 @@ def __len__(self) -> int:
 
     @doc(ExtensionArray.factorize)
     def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
-        if self._data.num_chunks == 1:
-            encoded = self._data.chunk(0).dictionary_encode()
-            indices = encoded.indices.to_pandas()
-            if indices.dtype.kind == "f":
-                indices[np.isnan(indices)] = na_sentinel
-                indices = indices.astype(int)
-            if not is_int64_dtype(indices):
-                indices = indices.astype(np.int64)
-            return indices.values, type(self)(encoded.dictionary)
-        else:
-            np_array = self._data.to_pandas().values
-            # error: Incompatible return value type (got "Tuple[Any, Union[Any,
-            # Index]]", expected "Tuple[Any, ExtensionArray]")
-            return factorize(  # type: ignore[return-value]
-                np_array, na_sentinel=na_sentinel
-            )
+        encoded = self._data.dictionary_encode()
+        indices = pa.chunked_array(
+            [c.indices for c in encoded.chunks], type=encoded.type.index_type
+        ).to_pandas()
+        if indices.dtype.kind == "f":
+            indices[np.isnan(indices)] = na_sentinel
+            indices = indices.astype(int)
+        if not is_int64_dtype(indices):
+            indices = indices.astype(np.int64)
+        return indices.values, type(self)(encoded.chunk(0).dictionary)
 
     @classmethod
     def _concat_same_type(cls, to_concat) -> ArrowStringArray:

From 7d987275aeacb174ec2222d3a3058d12f416d489 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 18 Feb 2021 13:05:47 +0000
Subject: [PATCH 08/10] fix failing test_factorize_empty

---
 pandas/core/arrays/string_arrow.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index ee9059d7604a5..c1b8d875ade1b 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -287,7 +287,13 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
             indices = indices.astype(int)
         if not is_int64_dtype(indices):
             indices = indices.astype(np.int64)
-        return indices.values, type(self)(encoded.chunk(0).dictionary)
+
+        if encoded.num_chunks:
+            uniques = type(self)(encoded.chunk(0).dictionary)
+        else:
+            uniques = type(self)(pa.array([], type=encoded.type.value_type))
+
+        return indices.values, uniques
 
     @classmethod
     def _concat_same_type(cls, to_concat) -> ArrowStringArray:

From 0023f0843cb7fe316a5b30393fef8957b5213d21 Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Thu, 18 Feb 2021 13:25:22 +0000
Subject: [PATCH 09/10] address dtype comment

---
 pandas/core/arrays/string_arrow.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index c1b8d875ade1b..0ed213f5ba4cb 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -31,7 +31,6 @@
 from pandas.api.types import (
     is_array_like,
     is_bool_dtype,
-    is_int64_dtype,
     is_integer,
     is_integer_dtype,
     is_scalar,
@@ -284,9 +283,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]:
         ).to_pandas()
         if indices.dtype.kind == "f":
             indices[np.isnan(indices)] = na_sentinel
-            indices = indices.astype(int)
-        if not is_int64_dtype(indices):
-            indices = indices.astype(np.int64)
+        indices = indices.astype(np.int64, copy=False)
 
         if encoded.num_chunks:
             uniques = type(self)(encoded.chunk(0).dictionary)

From 6a28414a7069067df6b93000bee88ff13e805d5f Mon Sep 17 00:00:00 2001
From: Simon Hawkins <simonjayhawkins@gmail.com>
Date: Fri, 19 Feb 2021 16:46:42 +0000
Subject: [PATCH 10/10] move ArrowStringDtype import inside try/except

---
 asv_bench/benchmarks/algorithms.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
index d2fb564b47cc0..29c492442c0b2 100644
--- a/asv_bench/benchmarks/algorithms.py
+++ b/asv_bench/benchmarks/algorithms.py
@@ -5,7 +5,6 @@
 from pandas._libs import lib
 
 import pandas as pd
-from pandas.core.arrays.string_arrow import ArrowStringDtype
 
 from .pandas_vb_common import tm
 
@@ -53,6 +52,8 @@ def setup(self, unique, sort, dtype):
         N = 10 ** 5
         string_index = tm.makeStringIndex(N)
         try:
+            from pandas.core.arrays.string_arrow import ArrowStringDtype
+
             string_arrow = pd.array(string_index, dtype=ArrowStringDtype())
         except ImportError:
             string_arrow = None