From 36afd118a7946d1aeaa5a206fc951a3f12ba723a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 09:44:00 +0200 Subject: [PATCH 1/2] BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array --- pandas/_libs/lib.pyx | 18 ++++++++++++------ pandas/tests/copy_view/test_astype.py | 15 ++++++++++++--- pandas/tests/libs/test_lib.py | 14 ++++++++++++++ 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1a2a0142c52e..8a0a129c2f294 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). @@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array( result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: - already_copied = False + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: share_memory check is needed for rare cases where np.asarray + # returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False elif not copy and not result.flags.writeable: # Weird edge case where result is a view already_copied = False diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index de56d5e4a07ee..8ef4a1ff741d7 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -111,7 +111,7 @@ def test_astype_string_and_object_update_original(dtype, new_dtype): tm.assert_frame_equal(df2, df_orig) -def test_astype_string_copy_on_pickle_roundrip(): +def test_astype_str_copy_on_pickle_roundrip(): # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) @@ -120,14 +120,23 @@ def test_astype_string_copy_on_pickle_roundrip(): tm.assert_series_equal(base, base_copy) +def test_astype_string_copy_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + @td.skip_if_no("pyarrow") -def test_astype_string_read_only_on_pickle_roundrip(): +def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter read-only array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) base_copy = pickle.loads(pickle.dumps(base)) base_copy._values.flags.writeable = False - base_copy.astype("string[pyarrow]") + base_copy.astype(any_string_dtype) tm.assert_series_equal(base, base_copy) diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 8583d8bcc052c..17dae1879f3b8 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -283,3 +285,15 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default + + +def test_ensure_string_array_copy(): + # ensure the original array is not modified in case of copy=False with + # pickle-roundtripped object dtype array + # https://github.com/pandas-dev/pandas/issues/54654 + arr = np.array(["a", None], dtype=object) + arr = pickle.loads(pickle.dumps(arr)) + result = lib.ensure_string_array(arr, copy=False) + assert not np.shares_memory(arr, result) + assert arr[1] is None + assert result[1] is np.nan From cf40333f8f4455e6eaf5554020a17a3ef085fda1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 19:50:15 +0200 Subject: [PATCH 2/2] update --- pandas/tests/copy_view/test_astype.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 8ef4a1ff741d7..80c30f2d0c26e 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -7,7 +7,6 @@ from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 -import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -112,6 +111,7 @@ def test_astype_string_and_object_update_original(dtype, new_dtype): def test_astype_str_copy_on_pickle_roundrip(): + # TODO(infer_string) this test can be removed after 3.0 (once str is the default) # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) @@ -129,7 +129,6 @@ def test_astype_string_copy_on_pickle_roundrip(any_string_dtype): tm.assert_series_equal(base, base_copy) -@td.skip_if_no("pyarrow") def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter read-only array inplace