GH-40153: [Python] Avoid using np.take in Array.to_numpy() (#40295)

### Rationale for this change

`Array.to_numpy` calls `np.take` to linearize dictionary arrays. This fails on 32-bit Numpy builds because we give Numpy 64-bit indices and Numpy would like to downcast them.

### What changes are included in this PR?

Avoid calling `np.take`, instead using our own dictionary decoding routine.

### Are these changes tested?

Yes. A test failure is fixed on 32-bit.

### Are there any user-facing changes?

No.

* GitHub Issue: #40153

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
apache · Feb 29, 2024 · 5c4869d · 5c4869d
1 parent fc48b89
commit 5c4869d
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 4 deletions.
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
@@ -1573,7 +1573,7 @@ cdef class Array(_PandasConvertible):
         # decoding the dictionary will make sure nulls are correctly handled.
         # Decoding a dictionary does imply a copy by the way,
         # so it can't be done if the user requested a zero_copy.
-        c_options.decode_dictionaries = not zero_copy_only
+        c_options.decode_dictionaries = True
         c_options.zero_copy_only = zero_copy_only
         c_options.to_numpy = True
 
@@ -1585,9 +1585,6 @@ cdef class Array(_PandasConvertible):
         # always convert to numpy array without pandas dependency
         array = PyObject_to_object(out)
 
-        if isinstance(array, dict):
-            array = np.take(array['dictionary'], array['indices'])
-
         if writable and not array.flags.writeable:
             # if the conversion already needed to a copy, writeable is True
             array = array.copy()

diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -2515,6 +2515,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options,
                                    std::shared_ptr<ChunkedArray> arr, PyObject* py_ref,
                                    PyObject** out) {
   if (options.decode_dictionaries && arr->type()->id() == Type::DICTIONARY) {
+    // XXX we should return an error as below if options.zero_copy_only
+    // is true, but that would break compatibility with existing tests.
     const auto& dense_type =
         checked_cast<const DictionaryType&>(*arr->type()).value_type();
     RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &arr));