From 0d3af8d766944d98f463b5cbc04a65b507e27f14 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Apr 2024 17:07:44 +0200 Subject: [PATCH] GH-41098: [Python] Add copy keyword in Array.__array__ for numpy 2.0+ compatibility (#41071) ### Rationale for this change Adapting for changes in numpy 2.0 as decribed at https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword and future changes to pass copy=True (https://github.com/numpy/numpy/issues/26208) ### What changes are included in this PR? Add a `copy=None` to the signatures of our `__array__` methods. This does have impact on the user facing behaviour, though. Questioning that upstream at https://github.com/numpy/numpy/issues/25941#issuecomment-2043035821 ### Are these changes tested? Yes ### Are there any user-facing changes? No (compared to usage with numpy<2) * GitHub Issue: #39532 * GitHub Issue: #41098 Authored-by: Joris Van den Bossche Signed-off-by: Antoine Pitrou --- python/pyarrow/array.pxi | 21 +++++++++++-- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/table.pxi | 23 ++++++++++++-- python/pyarrow/tests/test_array.py | 47 ++++++++++++++++++++++++++++ python/pyarrow/tests/test_table.py | 16 ++++++++++ 5 files changed, 103 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 45fd29ad3b3f3..60fc09ea861b6 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1516,11 +1516,28 @@ cdef class Array(_PandasConvertible): def _to_pandas(self, options, types_mapper=None, **kwargs): return _array_like_to_pandas(self, options, types_mapper=types_mapper) - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): + if copy is False: + try: + values = self.to_numpy(zero_copy_only=True) + except ArrowInvalid: + raise ValueError( + "Unable to avoid a copy while creating a numpy array as requested.\n" + "If using `np.array(obj, copy=False)` replace it with " + "`np.asarray(obj)` to allow a copy when needed" + ) + # values is already a numpy array at this point, but calling np.array(..) + # again to handle the `dtype` keyword with a no-copy guarantee + return np.array(values, dtype=dtype, copy=False) + values = self.to_numpy(zero_copy_only=False) + if copy is True and is_numeric(self.type.id) and self.null_count == 0: + # to_numpy did not yet make a copy (is_numeric = integer/floats, no decimal) + return np.array(values, dtype=dtype, copy=True) + if dtype is None: return values - return values.astype(dtype) + return np.asarray(values, dtype=dtype) def to_numpy(self, zero_copy_only=True, writable=False): """ diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index a35919579541a..6dae45ab80b1c 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -173,6 +173,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string ToString() c_bool is_primitive(Type type) + c_bool is_numeric(Type type) cdef cppclass CArrayData" arrow::ArrayData": shared_ptr[CDataType] type diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index d31ea0a5fa1e9..b35a321dd2ffc 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -525,11 +525,19 @@ cdef class ChunkedArray(_PandasConvertible): return values - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid a copy while creating a numpy array as requested " + "(converting a pyarrow.ChunkedArray always results in a copy).\n" + "If using `np.array(obj, copy=False)` replace it with " + "`np.asarray(obj)` to allow a copy when needed" + ) + # 'copy' can further be ignored because to_numpy() already returns a copy values = self.to_numpy() if dtype is None: return values - return values.astype(dtype) + return values.astype(dtype, copy=False) def cast(self, object target_type=None, safe=None, options=None): """ @@ -1562,7 +1570,16 @@ cdef class _Tabular(_PandasConvertible): raise TypeError(f"Do not call {self.__class__.__name__}'s constructor directly, use " f"one of the `{self.__class__.__name__}.from_*` functions instead.") - def __array__(self, dtype=None): + def __array__(self, dtype=None, copy=None): + if copy is False: + raise ValueError( + "Unable to avoid a copy while creating a numpy array as requested " + f"(converting a pyarrow.{self.__class__.__name__} always results " + "in a copy).\n" + "If using `np.array(obj, copy=False)` replace it with " + "`np.asarray(obj)` to allow a copy when needed" + ) + # 'copy' can further be ignored because stacking will result in a copy column_arrays = [ np.asarray(self.column(i), dtype=dtype) for i in range(self.num_columns) ] diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 8bcb28c0d41b9..156d58326b961 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -31,6 +31,7 @@ import pyarrow as pa import pyarrow.tests.strategies as past +from pyarrow.vendored.version import Version def test_total_bytes_allocated(): @@ -3302,6 +3303,52 @@ def test_array_from_large_pyints(): pa.array([int(2 ** 63)]) +def test_numpy_array_protocol(): + # test the __array__ method on pyarrow.Array + arr = pa.array([1, 2, 3]) + result = np.asarray(arr) + expected = np.array([1, 2, 3], dtype="int64") + np.testing.assert_array_equal(result, expected) + + # this should not raise a deprecation warning with numpy 2.0+ + result = np.array(arr, copy=False) + np.testing.assert_array_equal(result, expected) + + result = np.array(arr, dtype="int64", copy=False) + np.testing.assert_array_equal(result, expected) + + # no zero-copy is possible + arr = pa.array([1, 2, None]) + expected = np.array([1, 2, np.nan], dtype="float64") + result = np.asarray(arr) + np.testing.assert_array_equal(result, expected) + + if Version(np.__version__) < Version("2.0"): + # copy keyword is not strict and not passed down to __array__ + result = np.array(arr, copy=False) + np.testing.assert_array_equal(result, expected) + + result = np.array(arr, dtype="float64", copy=False) + np.testing.assert_array_equal(result, expected) + else: + # starting with numpy 2.0, the copy=False keyword is assumed to be strict + with pytest.raises(ValueError, match="Unable to avoid a copy"): + np.array(arr, copy=False) + + arr = pa.array([1, 2, 3]) + with pytest.raises(ValueError): + np.array(arr, dtype="float64", copy=False) + + # copy=True -> not yet passed by numpy, so we have to call this directly to test + arr = pa.array([1, 2, 3]) + result = arr.__array__(copy=True) + assert result.flags.writeable + + arr = pa.array([1, 2, 3]) + result = arr.__array__(dtype=np.dtype("float64"), copy=True) + assert result.dtype == "float64" + + def test_array_protocol(): class MyArray: diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 539da0e685381..e11efa64db4d4 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -24,6 +24,7 @@ import pytest import pyarrow as pa import pyarrow.compute as pc +from pyarrow.vendored.version import Version def test_chunked_array_basics(): @@ -3238,6 +3239,21 @@ def test_numpy_asarray(constructor): assert result.dtype == "int32" +@pytest.mark.parametrize("constructor", [pa.table, pa.record_batch]) +def test_numpy_array_protocol(constructor): + table = constructor([[1, 2, 3], [4.0, 5.0, 6.0]], names=["a", "b"]) + expected = np.array([[1, 4], [2, 5], [3, 6]], dtype="float64") + + if Version(np.__version__) < Version("2.0"): + # copy keyword is not strict and not passed down to __array__ + result = np.array(table, copy=False) + np.testing.assert_array_equal(result, expected) + else: + # starting with numpy 2.0, the copy=False keyword is assumed to be strict + with pytest.raises(ValueError, match="Unable to avoid a copy"): + np.array(table, copy=False) + + @pytest.mark.acero def test_invalid_non_join_column(): NUM_ITEMS = 30