Skip to content

Commit

Permalink
API / CoW: Copy NumPy arrays by default in DataFrame constructor (pan…
Browse files Browse the repository at this point in the history
…das-dev#51731)

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
phofl and jorisvandenbossche committed Mar 17, 2023
1 parent a498448 commit 3826ad7
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 16 deletions.
7 changes: 7 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,13 @@ Copy-on-Write improvements
of Series objects and specifying ``copy=False``, will now use a lazy copy
of those Series objects for the columns of the DataFrame (:issue:`50777`)

- The :class:`DataFrame` constructor, when constructing from a NumPy array,
will now copy the array by default to avoid mutating the :class:`DataFrame`
when mutating the array. Specify ``copy=False`` to get the old behavior.
When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write
behavior when the NumPy array is modified after creation of the
:class:`DataFrame`.

- Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``)
will now always raise a warning when Copy-on-Write is enabled. In this mode,
chained assignment can never work because we are always setting into a temporary
Expand Down
4 changes: 4 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,10 @@ def __init__(
# INFO(ArrayManager) by default copy the 2D input array to get
# contiguous 1D arrays
copy = True
elif using_copy_on_write() and not isinstance(
data, (Index, DataFrame, Series)
):
copy = True
else:
copy = False

Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,19 @@ def test_dataframe_from_dict_of_series_with_dtype(index):
df.iloc[0, 0] = 100
arr_after = get_array(df, "a")
assert np.shares_memory(arr_before, arr_after)


@pytest.mark.parametrize("copy", [False, None, True])
def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager):
    """Check whether constructing a DataFrame from a 2D ndarray copies it.

    A copy is expected when any of the following holds:

    * Copy-on-Write is enabled and ``copy`` is not explicitly ``False``;
    * ``copy=True`` was requested;
    * the array manager is in use and ``copy`` was left as ``None``.

    In every other case the frame is expected to share memory with the
    input array.
    """
    arr = np.array([[1, 2], [3, 4]])
    df = DataFrame(arr, copy=copy)

    # Name each sub-condition so the and/or precedence is explicit.
    copied_under_cow = using_copy_on_write and copy is not False
    copied_under_array_manager = using_array_manager and copy is None
    expect_copy = copied_under_cow or copy is True or copied_under_array_manager

    # NOTE(review): get_array(df, 0) is presumably the array backing
    # column 0 -- confirm against the helper's definition.
    shares = np.shares_memory(get_array(df, 0), arr)
    if expect_copy:
        assert not shares
    else:
        assert shares
3 changes: 2 additions & 1 deletion pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,9 @@ def test_fillna_dict_inplace_nonunique_columns(self, using_copy_on_write):
def test_fillna_on_column_view(self, using_copy_on_write):
# GH#46149 avoid unnecessary copies
arr = np.full((40, 50), np.nan)
df = DataFrame(arr)
df = DataFrame(arr, copy=False)

# TODO(CoW): This should raise a chained assignment error
df[0].fillna(-1, inplace=True)
if using_copy_on_write:
assert np.isnan(arr[:, 0]).all()
Expand Down
17 changes: 14 additions & 3 deletions pandas/tests/frame/methods/test_to_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,24 @@ def test_to_numpy_dtype(self):
tm.assert_numpy_array_equal(result, expected)

@td.skip_array_manager_invalid_test
def test_to_numpy_copy(self):
def test_to_numpy_copy(self, using_copy_on_write):
arr = np.random.randn(4, 3)
df = DataFrame(arr)
assert df.values.base is arr
assert df.to_numpy(copy=False).base is arr
if using_copy_on_write:
assert df.values.base is not arr
assert df.to_numpy(copy=False).base is df.values.base
else:
assert df.values.base is arr
assert df.to_numpy(copy=False).base is arr
assert df.to_numpy(copy=True).base is not arr

# we still don't want a copy when na_value=np.nan is passed,
# and that can be respected because we are already numpy-float
if using_copy_on_write:
assert df.to_numpy(copy=False).base is df.values.base
else:
assert df.to_numpy(copy=False, na_value=np.nan).base is arr

def test_to_numpy_mixed_dtype_to_str(self):
# https://github.com/pandas-dev/pandas/issues/35455
df = DataFrame([[Timestamp("2020-01-01 00:00:00"), 100.0]])
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/frame/methods/test_transpose.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def test_transpose_get_view(self, float_frame, using_copy_on_write):
assert (float_frame.values[5:10] == 5).all()

@td.skip_array_manager_invalid_test
def test_transpose_get_view_dt64tzget_view(self):
def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write):
dti = date_range("2016-01-01", periods=6, tz="US/Pacific")
arr = dti._data.reshape(3, 2)
df = DataFrame(arr)
Expand All @@ -130,4 +130,7 @@ def test_transpose_get_view_dt64tzget_view(self):
assert result._mgr.nblocks == 1

rtrip = result._mgr.blocks[0].values
assert np.shares_memory(arr._ndarray, rtrip._ndarray)
if using_copy_on_write:
assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray)
else:
assert np.shares_memory(arr._ndarray, rtrip._ndarray)
18 changes: 12 additions & 6 deletions pandas/tests/frame/methods/test_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,29 +230,35 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame):

class TestPrivateValues:
@td.skip_array_manager_invalid_test
def test_private_values_dt64tz(self):
def test_private_values_dt64tz(self, using_copy_on_write):
dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1)

df = DataFrame(dta, columns=["A"])
tm.assert_equal(df._values, dta)

# we have a view
assert np.shares_memory(df._values._ndarray, dta._ndarray)
if using_copy_on_write:
assert not np.shares_memory(df._values._ndarray, dta._ndarray)
else:
# we have a view
assert np.shares_memory(df._values._ndarray, dta._ndarray)

# TimedeltaArray
tda = dta - dta
df2 = df - df
tm.assert_equal(df2._values, tda)

@td.skip_array_manager_invalid_test
def test_private_values_dt64tz_multicol(self):
def test_private_values_dt64tz_multicol(self, using_copy_on_write):
dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2)

df = DataFrame(dta, columns=["A", "B"])
tm.assert_equal(df._values, dta)

# we have a view
assert np.shares_memory(df._values._ndarray, dta._ndarray)
if using_copy_on_write:
assert not np.shares_memory(df._values._ndarray, dta._ndarray)
else:
# we have a view
assert np.shares_memory(df._values._ndarray, dta._ndarray)

# TimedeltaArray
tda = dta - dta
Expand Down
13 changes: 9 additions & 4 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,14 +309,14 @@ def test_constructor_dtype_nocast_view_2d_array(
def test_1d_object_array_does_not_copy(self):
# https://github.com/pandas-dev/pandas/issues/39272
arr = np.array(["a", "b"], dtype="object")
df = DataFrame(arr)
df = DataFrame(arr, copy=False)
assert np.shares_memory(df.values, arr)

@td.skip_array_manager_invalid_test
def test_2d_object_array_does_not_copy(self):
# https://github.com/pandas-dev/pandas/issues/39272
arr = np.array([["a", "b"], ["c", "d"]], dtype="object")
df = DataFrame(arr)
df = DataFrame(arr, copy=False)
assert np.shares_memory(df.values, arr)

def test_constructor_dtype_list_data(self):
Expand Down Expand Up @@ -2107,13 +2107,18 @@ def test_constructor_frame_shallow_copy(self, float_frame):
cop.index = np.arange(len(cop))
tm.assert_frame_equal(float_frame, orig)

def test_constructor_ndarray_copy(self, float_frame, using_array_manager):
def test_constructor_ndarray_copy(
self, float_frame, using_array_manager, using_copy_on_write
):
if not using_array_manager:
arr = float_frame.values.copy()
df = DataFrame(arr)

arr[5] = 5
assert (df.values[5] == 5).all()
if using_copy_on_write:
assert not (df.values[5] == 5).all()
else:
assert (df.values[5] == 5).all()

df = DataFrame(arr, copy=True)
arr[6] = 6
Expand Down

0 comments on commit 3826ad7

Please sign in to comment.