diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 3c3c490d2c468..d9495d843d939 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -193,12 +193,13 @@ Copy-on-Write improvements - The :class:`DataFrame` constructor, when constructing a DataFrame from a :class:`Series` and specifying ``copy=False``, will now respect Copy-on-Write. -- The :class:`DataFrame` constructor, when constructing from a NumPy array, - will now copy the array by default to avoid mutating the :class:`DataFrame` +- The :class:`DataFrame` and :class:`Series` constructors, when constructing from + a NumPy array, will now copy the array by default to avoid mutating + the :class:`DataFrame` / :class:`Series` when mutating the array. Specify ``copy=False`` to get the old behavior. When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write behavior when the NumPy array is modified after creation of the - :class:`DataFrame`. + :class:`DataFrame` / :class:`Series`. - The :meth:`DataFrame.from_records` will now respect Copy-on-Write when called with a :class:`DataFrame`. diff --git a/pandas/conftest.py b/pandas/conftest.py index 1aaad070dd313..8a9ba5f708d1c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -739,7 +739,7 @@ def _create_series(index): """Helper for the _series dict""" size = len(index) data = np.random.randn(size) - return Series(data, index=index, name="a") + return Series(data, index=index, name="a", copy=False) _series = { diff --git a/pandas/core/series.py b/pandas/core/series.py index d747a36002ffd..475920da20e88 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -88,6 +88,7 @@ validate_percentile, ) +from pandas.core.dtypes.astype import astype_is_view from pandas.core.dtypes.cast import ( LossySetitemError, convert_dtypes, @@ -370,14 +371,14 @@ def __init__( index=None, dtype: Dtype | None = None, name=None, - copy: bool = False, + copy: bool | None = None, fastpath: bool = False, ) -> None: if ( isinstance(data, (SingleBlockManager, SingleArrayManager)) and index is None and dtype is None - and copy is False + and (copy is False or copy is None) ): if using_copy_on_write(): data = data.copy(deep=False) @@ -390,6 +391,13 @@ def __init__( self.name = name return + if isinstance(data, (ExtensionArray, np.ndarray)): + if copy is not False and using_copy_on_write(): + if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)): + data = data.copy() + if copy is None: + copy = False + # we are called internally, so short-circuit if fastpath: # data is a ndarray, index is defined diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index ee9e1dbc81e12..d38f0b8719de0 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -60,7 +60,7 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): # GH#26988 cat = Categorical(["a", "b"]) expected = Categorical(result) - result = pd.Series(cat).replace(to_replace, value)._values + result = pd.Series(cat, copy=False).replace(to_replace, value)._values tm.assert_categorical_equal(result, expected) if to_replace == "b": # the "c" test is supposed to be unchanged @@ -68,7 +68,7 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg): # ensure non-inplace call does not affect original tm.assert_categorical_equal(cat, expected) - pd.Series(cat).replace(to_replace, value, inplace=True) + pd.Series(cat, copy=False).replace(to_replace, value, inplace=True) tm.assert_categorical_equal(cat, expected) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 47477b9e2c79d..dd4310c19aaff 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -200,7 +200,9 @@ def test_astype_arrow_timestamp(using_copy_on_write): result = df.astype("timestamp[ns][pyarrow]") if using_copy_on_write: assert not result._mgr._has_no_reference(0) - assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._data) + # TODO(CoW): arrow is not setting copy=False in the Series constructor + # under the hood + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")._data) def test_convert_dtypes_infer_objects(using_copy_on_write): diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 76b281b9fad68..2f2548ad0f238 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -30,7 +30,7 @@ def test_series_from_series(dtype, using_copy_on_write): result = Series(ser, dtype=dtype) # the shallow copy still shares memory - assert np.shares_memory(ser.values, result.values) + assert np.shares_memory(get_array(ser), get_array(result)) if using_copy_on_write: assert result._mgr.blocks[0].refs.has_reference() @@ -40,13 +40,13 @@ def test_series_from_series(dtype, using_copy_on_write): result.iloc[0] = 0 assert ser.iloc[0] == 1 # mutating triggered a copy-on-write -> no longer shares memory - assert not np.shares_memory(ser.values, result.values) + assert not np.shares_memory(get_array(ser), get_array(result)) else: # mutating shallow copy does mutate original result.iloc[0] = 0 assert ser.iloc[0] == 0 # and still shares memory - assert np.shares_memory(ser.values, result.values) + assert np.shares_memory(get_array(ser), get_array(result)) # the same when modifying the parent result = Series(ser, dtype=dtype) @@ -90,6 +90,38 @@ def test_series_from_series_with_reindex(using_copy_on_write): assert not result._mgr.blocks[0].refs.has_reference() +@pytest.mark.parametrize("fastpath", [False, True]) +@pytest.mark.parametrize("dtype", [None, "int64"]) +@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)]) +@pytest.mark.parametrize( + "arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")] +) +def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr): + if idx is None or dtype is not None: + fastpath = False + ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath) + ser_orig = ser.copy() + data = getattr(arr, "_data", arr) + if using_copy_on_write: + assert not np.shares_memory(get_array(ser), data) + else: + assert np.shares_memory(get_array(ser), data) + + arr[0] = 100 + if using_copy_on_write: + tm.assert_series_equal(ser, ser_orig) + else: + expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype) + tm.assert_series_equal(ser, expected) + + +@pytest.mark.parametrize("copy", [True, False, None]) +def test_series_from_array_different_dtype(using_copy_on_write, copy): + arr = np.array([1, 2, 3], dtype="int64") + ser = Series(arr, dtype="int32", copy=copy) + assert not np.shares_memory(get_array(ser), arr) + + @pytest.mark.parametrize( "idx", [ diff --git a/pandas/tests/extension/base/constructors.py b/pandas/tests/extension/base/constructors.py index 29766ff392296..1f85c89ef38be 100644 --- a/pandas/tests/extension/base/constructors.py +++ b/pandas/tests/extension/base/constructors.py @@ -22,7 +22,7 @@ def test_array_from_scalars(self, data): assert isinstance(result, type(data)) def test_series_constructor(self, data): - result = pd.Series(data) + result = pd.Series(data, copy=False) assert result.dtype == data.dtype assert len(result) == len(data) if hasattr(result._mgr, "blocks"): diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index ca867ffb77296..5e4fdfece1596 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -254,7 +254,7 @@ def test_fillna_copy_frame(self, data_missing): def test_fillna_copy_series(self, data_missing): arr = data_missing.take([1, 1]) - ser = pd.Series(arr) + ser = pd.Series(arr, copy=False) ser_orig = ser.copy() filled_val = ser[0] diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 2aeb2af567ea0..b92df30fc5c7f 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -288,7 +288,7 @@ def test_fillna_copy_frame(self, data_missing, using_copy_on_write): def test_fillna_copy_series(self, data_missing, using_copy_on_write): arr = data_missing.take([1, 1]) - ser = pd.Series(arr) + ser = pd.Series(arr, copy=False) filled_val = ser[0] result = ser.fillna(filled_val) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index f60c38c041fcf..2496766eae034 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1362,7 +1362,7 @@ def check_can_hold_element(self, obj, elem, inplace: bool): def check_series_setitem(self, elem, index: Index, inplace: bool): arr = index._data.copy() - ser = Series(arr) + ser = Series(arr, copy=False) self.check_can_hold_element(ser, elem, inplace) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index fae29c124df71..0d84ecf955700 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2812,7 +2812,7 @@ def __getitem__(self, ix): def dtype(self): return DtypeStub() - series = Series(ExtTypeStub()) + series = Series(ExtTypeStub(), copy=False) res = repr(series) # This line crashed before #33770 was fixed. expected = "0 [False True]\n" + "1 [ True False]\n" + "dtype: DtypeStub" assert res == expected diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index dd6aef04a2e6a..70cb173483692 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -70,7 +70,7 @@ def test_td64_mean(self, box): tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D") tdarr = tdi._data - obj = box(tdarr) + obj = box(tdarr, copy=False) result = obj.mean() expected = np.array(tdarr).mean() diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 636ebfded026d..39cbf2b7bac10 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -578,7 +578,7 @@ def test_setitem_scalar_into_readonly_backing_data(): array = np.zeros(5) array.flags.writeable = False # make the array immutable - series = Series(array) + series = Series(array, copy=False) for n in series.index: msg = "assignment destination is read-only" @@ -593,7 +593,7 @@ def test_setitem_slice_into_readonly_backing_data(): array = np.zeros(5) array.flags.writeable = False # make the array immutable - series = Series(array) + series = Series(array, copy=False) msg = "assignment destination is read-only" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1ec8d990add3a..2d91a4ef6c58e 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -527,7 +527,7 @@ def test_categorical_sideeffects_free(self): # however, copy is False by default # so this WILL change values cat = Categorical(["a", "b", "c", "a"]) - s = Series(cat) + s = Series(cat, copy=False) assert s.values is cat s = s.cat.rename_categories([1, 2, 3]) assert s.values is not cat