From 1fe490cbe6cabb803c99fbbe7a2827ec2bec66d3 Mon Sep 17 00:00:00 2001
From: Alenka Frim <frim.alenka@gmail.com>
Date: Tue, 10 Jan 2023 15:52:24 +0100
Subject: [PATCH] Update the handling of allow_copy keyword

---
 python/pyarrow/interchange/from_dataframe.py  | 47 +++++++++++++++++--
 .../tests/interchange/test_conversion.py      | 46 ++++++++++++++++++
 2 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py
index b2f65508a3e4b..204530a33547c 100644
--- a/python/pyarrow/interchange/from_dataframe.py
+++ b/python/pyarrow/interchange/from_dataframe.py
@@ -82,7 +82,8 @@ def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table:
     if not hasattr(df, "__dataframe__"):
         raise ValueError("`df` does not support __dataframe__")
 
-    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
+    return _from_dataframe(df.__dataframe__(allow_copy=allow_copy),
+                           allow_copy=allow_copy)
 
 
 def _from_dataframe(df: DataFrameObject, allow_copy=True):
@@ -147,7 +148,7 @@ def protocol_df_chunk_to_pyarrow(
             DtypeKind.STRING,
             DtypeKind.DATETIME,
         ):
-            columns[name] = column_to_array(col)
+            columns[name] = column_to_array(col, allow_copy)
         elif dtype == DtypeKind.BOOL:
             columns[name] = bool_column_to_array(col, allow_copy)
         elif dtype == DtypeKind.CATEGORICAL:
@@ -160,6 +161,7 @@ def protocol_df_chunk_to_pyarrow(
 
 def column_to_array(
     col: ColumnObject,
+    allow_copy: bool = True,
 ) -> pa.Array:
     """
     Convert a column holding one of the primitive dtypes to a PyArrow array.
@@ -168,6 +170,9 @@ def column_to_array(
     Parameters
     ----------
     col : ColumnObject
+    allow_copy : bool, default: True
+        Whether to allow copying the memory to perform the conversion
+        (if false then zero-copy approach is requested).
 
     Returns
     -------
@@ -176,7 +181,8 @@ def column_to_array(
     buffers = col.get_buffers()
     data = buffers_to_array(buffers, col.size(),
                             col.describe_null,
-                            col.offset)
+                            col.offset,
+                            allow_copy)
     return data
 
 
@@ -298,6 +304,7 @@ def buffers_to_array(
     length: int,
     describe_null: ColumnNullType,
     offset: int = 0,
+    allow_copy: bool = True,
 ) -> pa.Array:
     """
     Build a PyArrow array from the passed buffer.
@@ -314,6 +321,9 @@ def buffers_to_array(
         as a tuple ``(kind, value)``
     offset : int, default: 0
         Number of elements to offset from the start of the buffer.
+    allow_copy : bool, default: True
+        Whether to allow copying the memory to perform the conversion
+        (if false then zero-copy approach is requested).
 
     Returns
     -------
@@ -345,13 +355,15 @@ def buffers_to_array(
                                                      validity_dtype,
                                                      describe_null,
                                                      length,
-                                                     offset)
+                                                     offset,
+                                                     allow_copy)
     else:
         validity_pa_buff = validity_buffer_nan_sentinel(data_pa_buffer,
                                                         data_type,
                                                         describe_null,
                                                         length,
-                                                        offset)
+                                                        offset,
+                                                        allow_copy)
 
     # Construct a pyarrow Array from buffers
     data_dtype = map_date_type(data_type)
@@ -394,6 +406,7 @@ def validity_buffer_from_mask(
     describe_null: ColumnNullType,
     length: int,
     offset: int = 0,
+    allow_copy: bool = True,
 ) -> pa.Buffer:
     """
     Build a PyArrow buffer from the passed mask buffer.
@@ -412,6 +425,9 @@ def validity_buffer_from_mask(
         The number of values in the array.
     offset : int, default: 0
         Number of elements to offset from the start of the buffer.
+    allow_copy : bool, default: True
+        Whether to allow copying the memory to perform the conversion
+        (if false then zero-copy approach is requested).
 
     Returns
     -------
@@ -435,6 +451,11 @@ def validity_buffer_from_mask(
                                  base=validity_buff)
 
         if null_kind == ColumnNullType.USE_BYTEMASK:
+            if not allow_copy:
+                raise RuntimeError(
+                    "To create a bitmask a copy of the data is "
+                    "required which is forbidden by allow_copy=False"
+                )
             mask = pa.Array.from_buffers(pa.int8(), length,
                                          [None, buff],
                                          offset=offset)
@@ -464,6 +485,7 @@ def validity_buffer_nan_sentinel(
     describe_null: ColumnNullType,
     length: int,
     offset: int = 0,
+    allow_copy: bool = True,
 ) -> pa.Buffer:
     """
     Build a PyArrow buffer from NaN or sentinel values.
@@ -482,6 +504,9 @@ def validity_buffer_nan_sentinel(
         The number of values in the array.
     offset : int, default: 0
         Number of elements to offset from the start of the buffer.
+    allow_copy : bool, default: True
+        Whether to allow copying the memory to perform the conversion
+        (if false then zero-copy approach is requested).
 
     Returns
     -------
@@ -493,6 +518,12 @@ def validity_buffer_nan_sentinel(
 
     # Check for float NaN values
     if null_kind == ColumnNullType.USE_NAN:
+        if not allow_copy:
+            raise RuntimeError(
+                "To create a bitmask a copy of the data is "
+                "required which is forbidden by allow_copy=False"
+            )
+
         if kind == DtypeKind.FLOAT and bit_width == 16:
             # 'pyarrow.compute.is_nan' kernel not yet implemented
             # for float16
@@ -511,6 +542,12 @@ def validity_buffer_nan_sentinel(
 
     # Check for sentinel values
     elif null_kind == ColumnNullType.USE_SENTINEL:
+        if not allow_copy:
+            raise RuntimeError(
+                "To create a bitmask a copy of the data is "
+                "required which is forbidden by allow_copy=False"
+            )
+
         if kind == DtypeKind.DATETIME:
             sentinel_dtype = pa.int64()
         else:
diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py
index 83aff6cd0c54d..e3d33e7b77d52 100644
--- a/python/pyarrow/tests/interchange/test_conversion.py
+++ b/python/pyarrow/tests/interchange/test_conversion.py
@@ -448,3 +448,49 @@ def test_nan_as_null():
     table = pa.table({"a": [1, 2, 3, 4]})
     with pytest.raises(RuntimeError):
         table.__dataframe__(nan_as_null=True)
+
+
+@pytest.mark.pandas
+def test_allow_copy_false():
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # With allow_copy=False the conversion must raise RuntimeError when
+    # a copy is needed to create the validity bitmask; exercised here
+    # with a float column and a datetime column holding a missing value.
+    df = pd.DataFrame({"a": [0, 1.0, 2.0]})
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+    df = pd.DataFrame({
+        "dt": [None, dt(2007, 7, 14), dt(2007, 7, 15)]
+    })
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+
+@pytest.mark.pandas
+def test_allow_copy_false_bool_categorical():
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # Boolean and categorical columns are expected to raise RuntimeError
+    # under allow_copy=False both with and without missing values,
+    # since their conversion always makes a copy.
+    df = pd.DataFrame({"a": [None, False, True]})
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+    df = pd.DataFrame({"a": [True, False, True]})
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+    df = pd.DataFrame({"weekday": ["a", "b", None]})
+    df = df.astype("category")
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+    df = pd.DataFrame({"weekday": ["a", "b", "c"]})
+    df = df.astype("category")
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)