Skip to content

Commit

Permalink
Update the handling of allow_copy keyword
Browse files (browse the repository at this point in the history)
  • Loading branch information
AlenkaF committed Jan 10, 2023
1 parent a99d0c1 commit 1fe490c
Showing 2 changed files with 88 additions and 5 deletions.
47 changes: 42 additions & 5 deletions python/pyarrow/interchange/from_dataframe.py
Original file line number Diff line number Diff line change
@@ -82,7 +82,8 @@ def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table:
if not hasattr(df, "__dataframe__"):
raise ValueError("`df` does not support __dataframe__")

return _from_dataframe(df.__dataframe__(allow_copy=allow_copy))
return _from_dataframe(df.__dataframe__(allow_copy=allow_copy),
allow_copy=allow_copy)


def _from_dataframe(df: DataFrameObject, allow_copy=True):
@@ -147,7 +148,7 @@ def protocol_df_chunk_to_pyarrow(
DtypeKind.STRING,
DtypeKind.DATETIME,
):
columns[name] = column_to_array(col)
columns[name] = column_to_array(col, allow_copy)
elif dtype == DtypeKind.BOOL:
columns[name] = bool_column_to_array(col, allow_copy)
elif dtype == DtypeKind.CATEGORICAL:
@@ -160,6 +161,7 @@ def protocol_df_chunk_to_pyarrow(

def column_to_array(
col: ColumnObject,
allow_copy: bool = True,
) -> pa.Array:
"""
Convert a column holding one of the primitive dtypes to a PyArrow array.
@@ -168,6 +170,9 @@ def column_to_array(
Parameters
----------
col : ColumnObject
allow_copy : bool, default: True
Whether to allow copying the memory to perform the conversion
(if false then zero-copy approach is requested).
Returns
-------
@@ -176,7 +181,8 @@ def column_to_array(
buffers = col.get_buffers()
data = buffers_to_array(buffers, col.size(),
col.describe_null,
col.offset)
col.offset,
allow_copy)
return data


@@ -298,6 +304,7 @@ def buffers_to_array(
length: int,
describe_null: ColumnNullType,
offset: int = 0,
allow_copy: bool = True,
) -> pa.Array:
"""
Build a PyArrow array from the passed buffer.
@@ -314,6 +321,9 @@ def buffers_to_array(
as a tuple ``(kind, value)``
offset : int, default: 0
Number of elements to offset from the start of the buffer.
allow_copy : bool, default: True
Whether to allow copying the memory to perform the conversion
(if false then zero-copy approach is requested).
Returns
-------
@@ -345,13 +355,15 @@ def buffers_to_array(
validity_dtype,
describe_null,
length,
offset)
offset,
allow_copy)
else:
validity_pa_buff = validity_buffer_nan_sentinel(data_pa_buffer,
data_type,
describe_null,
length,
offset)
offset,
allow_copy)

# Construct a pyarrow Array from buffers
data_dtype = map_date_type(data_type)
@@ -394,6 +406,7 @@ def validity_buffer_from_mask(
describe_null: ColumnNullType,
length: int,
offset: int = 0,
allow_copy: bool = True,
) -> pa.Buffer:
"""
Build a PyArrow buffer from the passed mask buffer.
@@ -412,6 +425,9 @@ def validity_buffer_from_mask(
The number of values in the array.
offset : int, default: 0
Number of elements to offset from the start of the buffer.
allow_copy : bool, default: True
Whether to allow copying the memory to perform the conversion
(if false then zero-copy approach is requested).
Returns
-------
@@ -435,6 +451,11 @@ def validity_buffer_from_mask(
base=validity_buff)

if null_kind == ColumnNullType.USE_BYTEMASK:
if not allow_copy:
raise RuntimeError(
"To create a bitmask a copy of the data is "
"required which is forbidden by allow_copy=False"
)
mask = pa.Array.from_buffers(pa.int8(), length,
[None, buff],
offset=offset)
@@ -464,6 +485,7 @@ def validity_buffer_nan_sentinel(
describe_null: ColumnNullType,
length: int,
offset: int = 0,
allow_copy: bool = True,
) -> pa.Buffer:
"""
Build a PyArrow buffer from NaN or sentinel values.
@@ -482,6 +504,9 @@ def validity_buffer_nan_sentinel(
The number of values in the array.
offset : int, default: 0
Number of elements to offset from the start of the buffer.
allow_copy : bool, default: True
Whether to allow copying the memory to perform the conversion
(if false then zero-copy approach is requested).
Returns
-------
@@ -493,6 +518,12 @@ def validity_buffer_nan_sentinel(

# Check for float NaN values
if null_kind == ColumnNullType.USE_NAN:
if not allow_copy:
raise RuntimeError(
"To create a bitmask a copy of the data is "
"required which is forbidden by allow_copy=False"
)

if kind == DtypeKind.FLOAT and bit_width == 16:
# 'pyarrow.compute.is_nan' kernel not yet implemented
# for float16
@@ -511,6 +542,12 @@ def validity_buffer_nan_sentinel(

# Check for sentinel values
elif null_kind == ColumnNullType.USE_SENTINEL:
if not allow_copy:
raise RuntimeError(
"To create a bitmask a copy of the data is "
"required which is forbidden by allow_copy=False"
)

if kind == DtypeKind.DATETIME:
sentinel_dtype = pa.int64()
else:
46 changes: 46 additions & 0 deletions python/pyarrow/tests/interchange/test_conversion.py
Original file line number Diff line number Diff line change
@@ -448,3 +448,49 @@ def test_nan_as_null():
table = pa.table({"a": [1, 2, 3, 4]})
with pytest.raises(RuntimeError):
table.__dataframe__(nan_as_null=True)


@pytest.mark.pandas
def test_allow_copy_false():
    """Check that ``allow_copy=False`` is rejected when a bitmask is needed.

    Each frame below is passed to ``pi.from_dataframe`` with
    ``allow_copy=False`` and the call is expected to raise ``RuntimeError``.
    The test is skipped on pandas older than 1.5.0, which does not provide
    ``__dataframe__``.
    """
    if Version(pd.__version__) < Version("1.5.0"):
        pytest.skip("__dataframe__ added to pandas in 1.5.0")

    # Inputs for which building the validity bitmask is expected to need
    # a copy: a float column and a datetime column with a missing value.
    payloads = (
        {"a": [0, 1.0, 2.0]},
        {"dt": [None, dt(2007, 7, 14), dt(2007, 7, 15)]},
    )

    for payload in payloads:
        frame = pd.DataFrame(payload)
        with pytest.raises(RuntimeError):
            pi.from_dataframe(frame, allow_copy=False)


@pytest.mark.pandas
def test_allow_copy_false_bool_categorical():
    """Check that bool and categorical columns reject ``allow_copy=False``.

    For boolean and categorical data, with and without missing values,
    ``pi.from_dataframe(..., allow_copy=False)`` is expected to raise
    ``RuntimeError`` (a copy is always made for these dtypes). The test is
    skipped on pandas older than 1.5.0, which does not provide
    ``__dataframe__``.
    """
    if Version(pd.__version__) < Version("1.5.0"):
        pytest.skip("__dataframe__ added to pandas in 1.5.0")

    # Boolean-like columns: one containing a missing value, one without.
    for flags in ([None, False, True], [True, False, True]):
        frame = pd.DataFrame({"a": flags})
        with pytest.raises(RuntimeError):
            pi.from_dataframe(frame, allow_copy=False)

    # Categorical columns: one containing a missing value, one without.
    for labels in (["a", "b", None], ["a", "b", "c"]):
        frame = pd.DataFrame({"weekday": labels})
        frame = frame.astype("category")
        with pytest.raises(RuntimeError):
            pi.from_dataframe(frame, allow_copy=False)

0 comments on commit 1fe490c

Please sign in to comment.