From bfe51f86a78466627df0548d24e9ed293864cf21 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 1 Sep 2015 12:23:20 -0400 Subject: [PATCH 1/4] COMPAT: remove SettingWithCopy warning, and use copy-on-write where applicable, #10954 --- pandas/core/common.py | 7 +- pandas/core/config_init.py | 9 +- pandas/core/frame.py | 14 +-- pandas/core/generic.py | 156 +++++++++++++------------------- pandas/core/groupby.py | 26 ++++-- pandas/core/series.py | 4 +- pandas/src/reduce.pyx | 2 + pandas/tests/test_frame.py | 59 +++++------- pandas/tests/test_groupby.py | 7 +- pandas/tests/test_indexing.py | 91 ++++++++++--------- pandas/tests/test_multilevel.py | 18 ++-- pandas/tests/test_series.py | 11 +-- pandas/tseries/common.py | 19 ++-- pandas/util/testing.py | 2 +- 14 files changed, 193 insertions(+), 232 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 77536fb391f93..492025426f59a 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -26,15 +26,12 @@ class PandasError(Exception): pass - -class SettingWithCopyError(ValueError): +class SettingImmutableError(ValueError): pass - -class SettingWithCopyWarning(Warning): +class SettingWithCopyError(ValueError): pass - class AmbiguousIndexError(PandasError, KeyError): pass diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 03eaa45582bef..889c41f64055a 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -321,13 +321,12 @@ def use_inf_as_null_cb(key): # user warnings chained_assignment = """ : string - Raise an exception, warn, or no action if trying to use chained assignment, - The default is warn + this option has been deprecated and has no effect """ -with cf.config_prefix('mode'): - cf.register_option('chained_assignment', 'warn', chained_assignment, - validator=is_one_of_factory([None, 'warn', 'raise'])) +cf.register_option('mode.chained_assignment', 'warn', chained_assignment, + validator=is_one_of_factory([None, 'warn', 'raise'])) +cf.deprecate_option('mode.chained_assignment', chained_assignment) # Set up the io.excel specific configuration. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index acf5e69bf05e3..741ed3413728e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2229,7 +2229,7 @@ def __setitem__(self, key, value): self._set_item(key, value) def _setitem_slice(self, key, value): - self._check_setitem_copy() + self._check_copy_on_write() self.ix._setitem_with_indexer(key, value) def _setitem_array(self, key, value): @@ -2240,7 +2240,7 @@ def _setitem_array(self, key, value): (len(key), len(self.index))) key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] - self._check_setitem_copy() + self._check_copy_on_write() self.ix._setitem_with_indexer(indexer, value) else: if isinstance(value, DataFrame): @@ -2250,7 +2250,7 @@ def _setitem_array(self, key, value): self[k1] = value[k2] else: indexer = self.ix._convert_to_indexer(key, axis=1) - self._check_setitem_copy() + self._check_copy_on_write() self.ix._setitem_with_indexer((slice(None), indexer), value) def _setitem_frame(self, key, value): @@ -2260,7 +2260,7 @@ def _setitem_frame(self, key, value): raise TypeError('Must pass DataFrame with boolean values only') self._check_inplace_setting(value) - self._check_setitem_copy() + self._check_copy_on_write() self.where(-key, value, inplace=True) def _ensure_valid_index(self, value): @@ -2296,12 +2296,6 @@ def _set_item(self, key, value): value = self._sanitize_column(key, value) NDFrame._set_item(self, key, value) - # check if we are modifying a copy - # try to set first as we want an invalid - # value exeption to occur first - if len(self): - self._check_setitem_copy() - def insert(self, loc, column, value, allow_duplicates=False): """ Insert column into DataFrame at specified location. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9c170286006f2..1b1a21e976180 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3,6 +3,7 @@ import operator import weakref import gc +import inspect import numpy as np import pandas.lib as lib @@ -21,8 +22,7 @@ from pandas.core.common import (isnull, notnull, is_list_like, _values_from_object, _maybe_promote, _maybe_box_datetimelike, ABCSeries, - SettingWithCopyError, SettingWithCopyWarning, - AbstractMethodError) + AbstractMethodError, SettingWithCopyError) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution, deprecate_kwarg from pandas.core import config @@ -79,12 +79,13 @@ class NDFrame(PandasObject): copy : boolean, default False """ _internal_names = ['_data', '_cacher', '_item_cache', '_cache', - 'is_copy', '_subtyp', '_index', + 'is_copy', '_subtyp', '_index', '_allow_copy_on_write', '_default_kind', '_default_fill_value', '_metadata', '__array_struct__', '__array_interface__'] _internal_names_set = set(_internal_names) _accessors = frozenset([]) _metadata = [] + _allow_copy_on_write = True is_copy = None def __init__(self, data, axes=None, copy=False, dtype=None, @@ -1173,7 +1174,7 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): pass if verify_is_copy: - self._check_setitem_copy(stacklevel=5, t='referant') + self._check_copy_on_write() if clear: self._clear_item_cache() @@ -1202,6 +1203,8 @@ def _slice(self, slobj, axis=0, kind=None): return result def _set_item(self, key, value): + + self._check_copy_on_write() self._data.set(key, value) self._clear_item_cache() @@ -1214,10 +1217,54 @@ def _set_is_copy(self, ref=None, copy=True): else: self.is_copy = None + def _check_copy_on_write(self): + + # we could have a copy-on-write scenario + if self.is_copy and self._allow_copy_on_write: + + # we have an exception + if isinstance(self.is_copy, Exception): + raise self.is_copy + + def get_names_for_obj(__really_unused_name__342424__): + """Returns all named references for self""" + + removals = set(["__really_unused_name__342424__", "__really_unused_name__xxxxx__", "self"]) + refs = gc.get_referrers(__really_unused_name__342424__) + + names = [] + for ref in refs: + if inspect.isframe(ref): + for name, __really_unused_name__xxxxx__ in ref.f_locals.iteritems(): + if __really_unused_name__xxxxx__ is __really_unused_name__342424__: + names.append(name) + + for name, __really_unused_name__xxxxx__ in globals().iteritems(): + if __really_unused_name__xxxxx__ is __really_unused_name__342424__: + names.append(name) + + return set(names) - removals + + # collect garbage + # if we don't have references, then we have a reassignment case + # e.g. df = df.ix[....]; since the reference is gone + # we can just copy and be done + + # otherwise we have chained indexing, raise and error + gc.collect(2) + if self.is_copy() is not None: + names = get_names_for_obj(self) + if not len(names): + raise SettingWithCopyError("chained indexing detected, you can fix this ......") + + # provide copy-on-write + self._data = self._data.copy() + self.is_copy = None + def _check_is_chained_assignment_possible(self): """ check if we are a view, have a cacher, and are of mixed type - if so, then force a setitem_copy check + if so, then force a copy_on_write check should be called just near setting a value @@ -1227,91 +1274,12 @@ def _check_is_chained_assignment_possible(self): if self._is_view and self._is_cached: ref = self._get_cacher() if ref is not None and ref._is_mixed_type: - self._check_setitem_copy(stacklevel=4, t='referant', force=True) + self._check_copy_on_write() return True elif self.is_copy: - self._check_setitem_copy(stacklevel=4, t='referant') + self._check_copy_on_write() return False - def _check_setitem_copy(self, stacklevel=4, t='setting', force=False): - """ - - Parameters - ---------- - stacklevel : integer, default 4 - the level to show of the stack when the error is output - t : string, the type of setting error - force : boolean, default False - if True, then force showing an error - - validate if we are doing a settitem on a chained copy. - - If you call this function, be sure to set the stacklevel such that the - user will see the error *at the level of setting* - - It is technically possible to figure out that we are setting on - a copy even WITH a multi-dtyped pandas object. In other words, some blocks - may be views while other are not. Currently _is_view will ALWAYS return False - for multi-blocks to avoid having to handle this case. - - df = DataFrame(np.arange(0,9), columns=['count']) - df['group'] = 'b' - - # this technically need not raise SettingWithCopy if both are view (which is not - # generally guaranteed but is usually True - # however, this is in general not a good practice and we recommend using .loc - df.iloc[0:5]['group'] = 'a' - - """ - - if force or self.is_copy: - - value = config.get_option('mode.chained_assignment') - if value is None: - return - - # see if the copy is not actually refererd; if so, then disolve - # the copy weakref - try: - gc.collect(2) - if not gc.get_referents(self.is_copy()): - self.is_copy = None - return - except: - pass - - # we might be a false positive - try: - if self.is_copy().shape == self.shape: - self.is_copy = None - return - except: - pass - - # a custom message - if isinstance(self.is_copy, string_types): - t = self.is_copy - - elif t == 'referant': - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame\n\n" - "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy") - - else: - t = ("\n" - "A value is trying to be set on a copy of a slice from a " - "DataFrame.\n" - "Try using .loc[row_indexer,col_indexer] = value instead\n\n" - "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy") - - if value == 'raise': - raise SettingWithCopyError(t) - elif value == 'warn': - warnings.warn(t, SettingWithCopyWarning, stacklevel=stacklevel) - def __delitem__(self, key): """ Delete item @@ -3376,11 +3344,11 @@ def resample(self, rule, how=None, axis=0, fill_method=None, For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '5min' frequency, base could range from 0 through 4. Defaults to 0 - + Examples -------- - + Start by creating a series with 9 one minute timestamps. >>> index = pd.date_range('1/1/2000', periods=9, freq='T') @@ -3409,11 +3377,11 @@ def resample(self, rule, how=None, axis=0, fill_method=None, Downsample the series into 3 minute bins as above, but label each bin using the right edge instead of the left. Please note that the value in the bucket used as the label is not included in the bucket, - which it labels. For example, in the original series the + which it labels. For example, in the original series the bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed - value in the resampled bucket with the label``2000-01-01 00:03:00`` + value in the resampled bucket with the label``2000-01-01 00:03:00`` does not include 3 (if it did, the summed value would be 6, not 3). - To include this value close the right side of the bin interval as + To include this value close the right side of the bin interval as illustrated in the example below this one. >>> series.resample('3T', how='sum', label='right') @@ -3424,7 +3392,7 @@ def resample(self, rule, how=None, axis=0, fill_method=None, Downsample the series into 3 minute bins as above, but close the right side of the bin interval. - + >>> series.resample('3T', how='sum', label='right', closed='right') 2000-01-01 00:00:00 0 2000-01-01 00:03:00 6 @@ -3453,7 +3421,7 @@ def resample(self, rule, how=None, axis=0, fill_method=None, 2000-01-01 00:02:00 2 Freq: 30S, dtype: int64 - Upsample the series into 30 second bins and fill the + Upsample the series into 30 second bins and fill the ``NaN`` values using the ``bfill`` method. >>> series.resample('30S', fill_method='bfill')[0:5] @@ -3468,7 +3436,7 @@ def resample(self, rule, how=None, axis=0, fill_method=None, >>> def custom_resampler(array_like): ... return np.sum(array_like)+5 - + >>> series.resample('3T', how=custom_resampler) 2000-01-01 00:00:00 8 2000-01-01 00:03:00 17 diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 534117b8e9249..5437bfe11bc1d 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -688,13 +688,14 @@ def apply(self, func, *args, **kwargs): def f(g): return func(g, *args, **kwargs) - # ignore SettingWithCopy here in case the user mutates - with option_context('mode.chained_assignment',None): - return self._python_apply_general(f) + return self._python_apply_general(f) + def _python_apply_general(self, f): + keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis) + self._selected_obj._allow_copy_on_write = True return self._wrap_applied_output(keys, values, not_indexed_same=mutated) @@ -3591,6 +3592,15 @@ def sort_idx(self): # Counting sort indexer return _get_group_index_sorter(self.labels, self.ngroups) + + def _set_cow(self, data): + # we may mutate, so don't allow cow + try: + data._allow_copy_on_write=False + except AttributeError: + pass + return data + def __iter__(self): sdata = self._get_sorted_data() @@ -3612,7 +3622,7 @@ def _get_sorted_data(self): return self.data.take(self.sort_idx, axis=self.axis, convert=False) def _chop(self, sdata, slice_obj): - return sdata.iloc[slice_obj] + return self._set_cow(sdata.iloc[slice_obj]) def apply(self, f): raise AbstractMethodError(self) @@ -3625,7 +3635,7 @@ class ArraySplitter(DataSplitter): class SeriesSplitter(DataSplitter): def _chop(self, sdata, slice_obj): - return sdata._get_values(slice_obj).to_dense() + return self._set_cow(sdata._get_values(slice_obj).to_dense()) class FrameSplitter(DataSplitter): @@ -3648,9 +3658,9 @@ def fast_apply(self, f, names): def _chop(self, sdata, slice_obj): if self.axis == 0: - return sdata.iloc[slice_obj] + return self._set_cow(sdata.iloc[slice_obj]) else: - return sdata._slice(slice_obj, axis=1) # ix[:, slice_obj] + return self._set_cow(sdata._slice(slice_obj, axis=1)) # ix[:, slice_obj] class NDFrameSplitter(DataSplitter): @@ -3671,7 +3681,7 @@ def _get_sorted_data(self): return sorted_data def _chop(self, sdata, slice_obj): - return self.factory(sdata.get_slice(slice_obj, axis=self.axis)) + return self._set_cow(self.factory(sdata.get_slice(slice_obj, axis=self.axis))) def get_splitter(data, *args, **kwargs): diff --git a/pandas/core/series.py b/pandas/core/series.py index 2890730956c75..f7621d249049e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -21,7 +21,7 @@ _possibly_convert_platform, _try_sort, is_int64_dtype, ABCSparseArray, _maybe_match_name, - _coerce_to_dtype, SettingWithCopyError, + _coerce_to_dtype, SettingImmutableError, _maybe_box_datetimelike, ABCDataFrame, _dict_compat) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, @@ -635,7 +635,7 @@ def setitem(key, value): try: self._set_with_engine(key, value) return - except (SettingWithCopyError): + except (SettingImmutableError): raise except (KeyError, ValueError): values = self.values diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx index eb736e4569009..3115d2d76602c 100644 --- a/pandas/src/reduce.pyx +++ b/pandas/src/reduce.pyx @@ -488,6 +488,7 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if our low-level mucking is going to cause a segfault if n > 0: chunk = frame.iloc[starts[0]:ends[0]] + chunk._allow_copy_on_write = False shape_before = chunk.shape try: result = f(chunk) @@ -508,6 +509,7 @@ def apply_frame_axis0(object frame, object f, object names, item_cache.clear() # ugh object.__setattr__(slider.dummy, 'name', names[i]) + object.__setattr__(slider.dummy, '_allow_copy_on_write', False) piece = f(slider.dummy) # I'm paying the price for index-sharing, ugh diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 9bdb7f08fe7cf..eb2d53f0db6d3 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -552,12 +552,10 @@ def test_setitem(self): self.frame['col8'] = 'foo' assert((self.frame['col8'] == 'foo').all()) - # this is partially a view (e.g. some blocks are view) - # so raise/warn + # this is copy-on-write smaller = self.frame[:2] - def f(): - smaller['col10'] = ['1', '2'] - self.assertRaises(com.SettingWithCopyError, f) + smaller['col10'] = ['1', '2'] + self.assertEqual(smaller['col10'].dtype, np.object_) self.assertTrue((smaller['col10'] == ['1', '2']).all()) @@ -999,13 +997,11 @@ def test_fancy_getitem_slice_mixed(self): sliced = self.mixed_frame.ix[:, -3:] self.assertEqual(sliced['D'].dtype, np.float64) - # get view with single block - # setting it triggers setting with copy + # this is copy-on-write sliced = self.frame.ix[:, -3:] - def f(): - sliced['C'] = 4. - self.assertRaises(com.SettingWithCopyError, f) - self.assertTrue((self.frame['C'] == 4).all()) + sliced['C'] = 4. + self.assertFalse((self.frame['C'] == 4).all()) + self.assertTrue((sliced['C'] == 4).all()) def test_fancy_setitem_int_labels(self): # integer index defers to label-based indexing @@ -1798,14 +1794,10 @@ def test_irow(self): expected = df.ix[8:14] assert_frame_equal(result, expected) - # verify slice is view - # setting it makes it raise/warn - def f(): - result[2] = 0. - self.assertRaises(com.SettingWithCopyError, f) - exp_col = df[2].copy() - exp_col[4:8] = 0. - assert_series_equal(df[2], exp_col) + # copy-on-write for a slice + result[2] = 0. + self.assertFalse((df[2] == 0).all()) + self.assertTrue((result[2] == 0).all()) # list of integers result = df.iloc[[1, 2, 4, 6]] @@ -1833,12 +1825,10 @@ def test_icol(self): expected = df.ix[:, 8:14] assert_frame_equal(result, expected) - # verify slice is view - # and that we are setting a copy - def f(): - result[8] = 0. - self.assertRaises(com.SettingWithCopyError, f) - self.assertTrue((df[8] == 0).all()) + # we have a slice, but copy-on-write + result[8] = 0. + self.assertFalse((df[8] == 0).all()) + self.assertTrue((result[8] == 0).all()) # list of integers result = df.iloc[:, [1, 2, 4, 6]] @@ -14489,16 +14479,15 @@ def test_idxmax(self): def test_stale_cached_series_bug_473(self): # this is chained, but ok - with option_context('chained_assignment',None): - Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'), - columns=('e', 'f', 'g', 'h')) - repr(Y) - Y['e'] = Y['e'].astype('object') - Y['g']['c'] = np.NaN - repr(Y) - result = Y.sum() - exp = Y['g'].sum() - self.assertTrue(isnull(Y['g']['c'])) + Y = DataFrame(np.random.random((4, 4)), index=('a', 'b', 'c', 'd'), + columns=('e', 'f', 'g', 'h')) + repr(Y) + Y['e'] = Y['e'].astype('object') + Y['g']['c'] = np.NaN + repr(Y) + result = Y.sum() + exp = Y['g'].sum() + self.assertTrue(isnull(Y['g']['c'])) def test_index_namedtuple(self): from collections import namedtuple diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 41703b3b5a3b7..cd770bf449ec7 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2744,10 +2744,9 @@ def f(group): self.assertEqual(result['d'].dtype, np.float64) # this is by definition a mutating operation! - with option_context('mode.chained_assignment',None): - for key, group in grouped: - res = f(group) - assert_frame_equal(res, result.ix[key]) + for key, group in grouped: + res = f(group) + assert_frame_equal(res, result.ix[key]) def test_groupby_wrong_multi_labels(self): from pandas import read_csv diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 6a9d4096ad4b3..83d7fbda45591 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -2851,12 +2851,10 @@ def test_ix_assign_column_mixed(self): assert_frame_equal(df,expected) # ok, but chained assignments are dangerous - # if we turn off chained assignement it will work - with option_context('chained_assignment',None): - df = pd.DataFrame({'a': lrange(4) }) - df['b'] = np.nan - df['b'].ix[[1,3]] = [100,-100] - assert_frame_equal(df,expected) + df = pd.DataFrame({'a': lrange(4) }) + df['b'] = np.nan + df['b'].ix[[1,3]] = [100,-100] + assert_frame_equal(df,expected) def test_ix_get_set_consistency(self): @@ -3672,25 +3670,22 @@ def test_cache_updating(self): def test_slice_consolidate_invalidate_item_cache(self): - # this is chained assignment, but will 'work' - with option_context('chained_assignment',None): - - # #3970 - df = DataFrame({ "aa":lrange(5), "bb":[2.2]*5}) + # #3970 + df = DataFrame({ "aa":lrange(5), "bb":[2.2]*5}) - # Creates a second float block - df["cc"] = 0.0 + # Creates a second float block + df["cc"] = 0.0 - # caches a reference to the 'bb' series - df["bb"] + # caches a reference to the 'bb' series + df["bb"] - # repr machinery triggers consolidation - repr(df) + # repr machinery triggers consolidation + repr(df) - # Assignment to wrong series - df['bb'].iloc[0] = 0.17 - df._clear_item_cache() - self.assertAlmostEqual(df['bb'][0], 0.17) + # Assignment to wrong series + df['bb'].iloc[0] = 0.17 + df._clear_item_cache() + self.assertAlmostEqual(df['bb'][0], 0.17) def test_setitem_cache_updating(self): # GH 5424 @@ -3776,9 +3771,32 @@ def test_setitem_chained_setfault(self): result = df.head() assert_frame_equal(result, expected) - def test_detect_chained_assignment(self): + def test_chain_assignment_yields_copy_on_write(self): + + # 10954 + df = DataFrame({'col1':[1,2], 'col2':[3,4]}) + intermediate = df.loc[1:1,] + intermediate['col1'] = -99 + + # reference is broken + self.assertIsNone(df.is_copy) - pd.set_option('chained_assignment','raise') + # local assignment + expected = DataFrame([[-99,4]],index=[1],columns=['col1','col2']) + assert_frame_equal(intermediate, expected) + + # unchanged + expected = DataFrame({'col1':[1,2], 'col2':[3,4]}) + assert_frame_equal(df, expected) + + # chained assignment + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + + def f(): + df.loc[1:1,]['col1'] = -99 + self.assertRaises(com.SettingWithCopyError, f) + + def test_detect_chained_assignment(self): # work with the chain expected = DataFrame([[-5,1],[-6,3]],columns=list('AB')) @@ -3791,12 +3809,8 @@ def test_detect_chained_assignment(self): # test with the chaining df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) self.assertIsNone(df.is_copy) - def f(): - df['A'][0] = -5 - self.assertRaises(com.SettingWithCopyError, f) - def f(): - df['A'][1] = np.nan - self.assertRaises(com.SettingWithCopyError, f) + df['A'][0] = -5 + df['A'][1] = np.nan self.assertIsNone(df['A'].is_copy) # using a copy (the chain), fails @@ -3821,9 +3835,7 @@ def f(): expected = DataFrame({'A':[111,'bbb','ccc'],'B':[1,2,3]}) df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) - def f(): - df['A'][0] = 111 - self.assertRaises(com.SettingWithCopyError, f) + df['A'][0] = 111 def f(): df.loc[0]['A'] = 111 self.assertRaises(com.SettingWithCopyError, f) @@ -3942,8 +3954,13 @@ def f(): def f(): df.ix[2]['C'] = 'foo' self.assertRaises(com.SettingWithCopyError, f) + df['C'][2] = 'foo' + + def test_detect_chained_assignment2(self): + + df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) def f(): - df['C'][2] = 'foo' + df.loc[0]['A'] = 111 self.assertRaises(com.SettingWithCopyError, f) def test_setting_with_copy_bug(self): @@ -3964,14 +3981,6 @@ def f(): # this should not raise df2['y'] = ['g', 'h', 'i'] - def test_detect_chained_assignment_warnings(self): - - # warnings - with option_context('chained_assignment','warn'): - df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) - with tm.assert_produces_warning(expected_warning=com.SettingWithCopyWarning): - df.loc[0]['A'] = 111 - def test_float64index_slicing_bug(self): # GH 5557, related to slicing a float index ser = {256: 2321.0, 1: 78.0, 2: 2716.0, 3: 0.0, 4: 369.0, 5: 0.0, 6: 269.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 3536.0, 11: 0.0, 12: 24.0, 13: 0.0, 14: 931.0, 15: 0.0, 16: 101.0, 17: 78.0, 18: 9643.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 63761.0, 23: 0.0, 24: 446.0, 25: 0.0, 26: 34773.0, 27: 0.0, 28: 729.0, 29: 78.0, 30: 0.0, 31: 0.0, 32: 3374.0, 33: 0.0, 34: 1391.0, 35: 0.0, 36: 361.0, 37: 0.0, 38: 61808.0, 39: 0.0, 40: 0.0, 41: 0.0, 42: 6677.0, 43: 0.0, 44: 802.0, 45: 0.0, 46: 2691.0, 47: 0.0, 48: 3582.0, 49: 0.0, 50: 734.0, 51: 0.0, 52: 627.0, 53: 70.0, 54: 2584.0, 55: 0.0, 56: 324.0, 57: 0.0, 58: 605.0, 59: 0.0, 60: 0.0, 61: 0.0, 62: 3989.0, 63: 10.0, 64: 42.0, 65: 0.0, 66: 904.0, 67: 0.0, 68: 88.0, 69: 70.0, 70: 8172.0, 71: 0.0, 72: 0.0, 73: 0.0, 74: 64902.0, 75: 0.0, 76: 347.0, 77: 0.0, 78: 36605.0, 79: 0.0, 80: 379.0, 81: 70.0, 82: 0.0, 83: 0.0, 84: 3001.0, 85: 0.0, 86: 1630.0, 87: 7.0, 88: 364.0, 89: 0.0, 90: 67404.0, 91: 9.0, 92: 0.0, 93: 0.0, 94: 7685.0, 95: 0.0, 96: 1017.0, 97: 0.0, 98: 2831.0, 99: 0.0, 100: 2963.0, 101: 0.0, 102: 854.0, 103: 0.0, 104: 0.0, 105: 0.0, 106: 0.0, 107: 0.0, 108: 0.0, 109: 0.0, 110: 0.0, 111: 0.0, 112: 0.0, 113: 0.0, 114: 0.0, 115: 0.0, 116: 0.0, 117: 0.0, 118: 0.0, 119: 0.0, 120: 0.0, 121: 0.0, 122: 0.0, 123: 0.0, 124: 0.0, 125: 0.0, 126: 67744.0, 127: 22.0, 128: 264.0, 129: 0.0, 260: 197.0, 268: 0.0, 265: 0.0, 269: 0.0, 261: 0.0, 266: 1198.0, 267: 0.0, 262: 2629.0, 258: 775.0, 257: 0.0, 263: 0.0, 259: 0.0, 264: 163.0, 250: 10326.0, 251: 0.0, 252: 1228.0, 253: 0.0, 254: 2769.0, 255: 0.0} diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 1bce047f3bf96..0f9f7c94f4ff1 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -539,11 +539,9 @@ def test_xs_level(self): # this is a copy in 0.14 result = self.frame.xs('two', level='second') - # setting this will give a SettingWithCopyError - # as we are trying to write a view - def f(x): - x[:] = 10 - self.assertRaises(com.SettingWithCopyError, f, result) + # this is copy-on-write + result[:] = 10 + self.assertTrue((result.values == 10).all()) def test_xs_level_multiple(self): from pandas import read_table @@ -562,11 +560,9 @@ def test_xs_level_multiple(self): # this is a copy in 0.14 result = df.xs(('a', 4), level=['one', 'four']) - # setting this will give a SettingWithCopyError - # as we are trying to write a view - def f(x): - x[:] = 10 - self.assertRaises(com.SettingWithCopyError, f, result) + # copy-on-write + result[:] = 10 + self.assertTrue((result.values == 10).all()) # GH2107 dates = lrange(20111201, 20111205) @@ -1412,7 +1408,7 @@ def test_frame_getitem_view(self): df['foo', 'four'] = 'foo' df = df.sortlevel(0, axis=1) - # this will work, but will raise/warn as its chained assignment + # chained assignment def f(): df['foo']['one'] = 2 return df diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 86eafdf7ca2c8..fda9039dbb19b 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -204,11 +204,10 @@ def get_dir(s): with tm.assertRaisesRegexp(ValueError, "modifications"): s.dt.hour = 5 - # trying to set a copy - with pd.option_context('chained_assignment','raise'): - def f(): - s.dt.hour[0] = 5 - self.assertRaises(com.SettingWithCopyError, f) + # trying to set an immutable + def f(): + s.dt.hour[0] = 5 + self.assertRaises(com.SettingImmutableError, f) def test_strftime(self): # GH 10086 @@ -4447,7 +4446,6 @@ def test_underlying_data_conversion(self): # GH 3970 # these are chained assignments as well - pd.set_option('chained_assignment',None) df = DataFrame({ "aa":range(5), "bb":[2.2]*5}) df["cc"] = 0.0 ck = [True]*len(df) @@ -4455,7 +4453,6 @@ def test_underlying_data_conversion(self): df_tmp = df.iloc[ck] df["bb"].iloc[0] = .15 self.assertEqual(df['bb'].iloc[0], 0.15) - pd.set_option('chained_assignment','raise') # GH 3217 df = DataFrame(dict(a = [1,3], b = [np.nan, 2])) diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index 9a282bec2e9e4..25bd52f450438 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -9,7 +9,7 @@ from pandas import tslib from pandas.core.common import (_NS_DTYPE, _TD_DTYPE, is_period_arraylike, is_datetime_arraylike, is_integer_dtype, is_list_like, - get_dtype_kinds) + get_dtype_kinds, SettingImmutableError) def is_datetimelike(data): """ return a boolean if we can be successfully converted to a datetimelike """ @@ -76,15 +76,16 @@ def _delegate_property_get(self, name): # return the result as a Series, which is by definition a copy result = Series(result, index=self.index) - # setting this object will show a SettingWithCopyWarning/Error - result.is_copy = ("modifications to a property of a datetimelike object are not " - "supported and are discarded. Change values on the original.") + # setting this object will show a ValueError if accessed + result.is_copy = SettingImmutableError("modifications to a property of a datetimelike object are not " + "supported and are discarded. Change values on the original.") + return result def _delegate_property_set(self, name, value, *args, **kwargs): - raise ValueError("modifications to a property of a datetimelike object are not " - "supported. Change values on the original.") + raise SettingImmutableError("modifications to a property of a datetimelike object are not " + "supported. Change values on the original.") def _delegate_method(self, name, *args, **kwargs): from pandas import Series @@ -97,9 +98,9 @@ def _delegate_method(self, name, *args, **kwargs): result = Series(result, index=self.index) - # setting this object will show a SettingWithCopyWarning/Error - result.is_copy = ("modifications to a method of a datetimelike object are not " - "supported and are discarded. Change values on the original.") + # setting this object will show a SettingImmutableError + result.is_copy = SettingImmutableError("modifications to a method of a datetimelike object are not " + "supported and are discarded. Change values on the original.") return result diff --git a/pandas/util/testing.py b/pandas/util/testing.py index aaa83da036c2f..f1fe9d74af1f3 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -64,7 +64,7 @@ class TestCase(unittest.TestCase): @classmethod def setUpClass(cls): - pd.set_option('chained_assignment', 'raise') + pass @classmethod def tearDownClass(cls): From 3a0b7e096ffa680aa8341405a004d92c310d58d2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 2 Sep 2015 11:00:15 -0400 Subject: [PATCH 2/4] allow chained assignments --- pandas/core/frame.py | 15 +++++++- pandas/core/generic.py | 23 ++++++------ pandas/core/series.py | 13 ++++++- pandas/tests/test_indexing.py | 70 +++++++++++++++++++++-------------- 4 files changed, 79 insertions(+), 42 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 741ed3413728e..a52b4c351f503 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2293,8 +2293,19 @@ def _set_item(self, key, value): """ self._ensure_valid_index(value) - value = self._sanitize_column(key, value) - NDFrame._set_item(self, key, value) + svalue = self._sanitize_column(key, value) + try: + NDFrame._set_item(self, key, svalue) + except com.SettingWithCopyError: + + # if we have a multi-index (which potentially has dropped levels) + # need to raise + if isinstance(self.is_copy().columns, MultiIndex): + raise + + # we have a chained assignment + # assign back to the original + self.is_copy().loc[self.index,key] = value def insert(self, loc, column, value, allow_duplicates=False): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1b1a21e976180..9e5c2db4139c0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1196,10 +1196,9 @@ def _slice(self, slobj, axis=0, kind=None): result = self._constructor(self._data.get_slice(slobj, axis=axis)) result = result.__finalize__(self) - # this could be a view - # but only in a single-dtyped view slicable case - is_copy = axis!=0 or result._is_view - result._set_is_copy(self, copy=is_copy) + # mark this as a copy if we are not axis=0 + is_copy = axis!=0 + result._set_is_copy(self) return result def _set_item(self, key, value): @@ -1235,11 +1234,15 @@ def get_names_for_obj(__really_unused_name__342424__): names = [] for ref in refs: if inspect.isframe(ref): - for name, __really_unused_name__xxxxx__ in ref.f_locals.iteritems(): + for name, __really_unused_name__xxxxx__ in compat.iteritems(ref.f_locals): + if __really_unused_name__xxxxx__ is __really_unused_name__342424__: + names.append(name) + elif isinstance(ref, dict): + for name, __really_unused_name__xxxxx__ in compat.iteritems(ref): if __really_unused_name__xxxxx__ is __really_unused_name__342424__: names.append(name) - for name, __really_unused_name__xxxxx__ in globals().iteritems(): + for name, __really_unused_name__xxxxx__ in compat.iteritems(globals()): if __really_unused_name__xxxxx__ is __really_unused_name__342424__: names.append(name) @@ -1271,9 +1274,9 @@ def _check_is_chained_assignment_possible(self): will return a boolean if it we are a view and are cached, but a single-dtype meaning that the cacher should be updated following setting """ - if self._is_view and self._is_cached: + if self._is_cached: ref = self._get_cacher() - if ref is not None and ref._is_mixed_type: + if ref is not None: self._check_copy_on_write() return True elif self.is_copy: @@ -1482,9 +1485,7 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): result = self.iloc[loc] result.index = new_index - # this could be a view - # but only in a single-dtyped view slicable case - result._set_is_copy(self, copy=not result._is_view) + result._set_is_copy(self) return result _xs = xs diff --git a/pandas/core/series.py b/pandas/core/series.py index f7621d249049e..8dd40e0ed579a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -21,7 +21,8 @@ _possibly_convert_platform, _try_sort, is_int64_dtype, ABCSparseArray, _maybe_match_name, - _coerce_to_dtype, SettingImmutableError, + _coerce_to_dtype, + SettingImmutableError, SettingWithCopyError, _maybe_box_datetimelike, ABCDataFrame, _dict_compat) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, @@ -683,7 +684,15 @@ def setitem(key, value): self._set_with(key, value) # do the setitem - cacher_needs_updating = self._check_is_chained_assignment_possible() + try: + cacher_needs_updating = self._check_is_chained_assignment_possible() + except (SettingWithCopyError): + + # we have a chained assignment + # assign back to the original + self.is_copy().loc[self.name,key] = value + return + setitem(key, value) if cacher_needs_updating: self._maybe_update_cacher() diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 83d7fbda45591..6c1f4f49ee712 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -3780,6 +3780,7 @@ def test_chain_assignment_yields_copy_on_write(self): # reference is broken self.assertIsNone(df.is_copy) + self.assertIsNone(intermediate.is_copy) # local assignment expected = DataFrame([[-99,4]],index=[1],columns=['col1','col2']) @@ -3790,11 +3791,31 @@ def test_chain_assignment_yields_copy_on_write(self): assert_frame_equal(df, expected) # chained assignment + # but one that we can deal with df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + df.loc[1:1,]['col1'] = -99 + expected = DataFrame({'col1':[1,-99], 'col2':[3,4]}) + assert_frame_equal(df, expected) - def f(): - df.loc[1:1,]['col1'] = -99 - self.assertRaises(com.SettingWithCopyError, f) + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + df.loc[1]['col1'] = -99 + assert_frame_equal(df, expected) + + # changed via a scalar accessor + df = DataFrame({'col1':[1,2], 'col2':[3,4]}) + expected = df.copy() + + s2 = df.loc[0] + s2.iloc[0] = -99 + assert_frame_equal(df, expected) + expected = Series([-99,3],index=['col1','col2'],name=0) + assert_series_equal(s2, expected) + + # change dtype + df = pd.DataFrame({'col1':[1,2], 'col2':[3,4]}) + expected = DataFrame({'col1':[1, 2], 'col2':[3,'foo']}) + df.loc[1]['col2'] = 'foo' + assert_frame_equal(df, expected) def test_detect_chained_assignment(self): @@ -3815,9 +3836,8 @@ def test_detect_chained_assignment(self): # using a copy (the chain), fails df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) - def f(): - df.loc[0]['A'] = -5 - self.assertRaises(com.SettingWithCopyError, f) + df.loc[0]['A'] = -5 + self.assertEqual(df.loc[0,'A'], -5) # doc example df = DataFrame({'a' : ['one', 'one', 'two', @@ -3828,18 +3848,20 @@ def f(): 'three', 'two', 'one', 'six'], 'c' : [42,42,2,3,4,42,6]}) - def f(): - indexer = df.a.str.startswith('o') - df[indexer]['c'] = 42 - self.assertRaises(com.SettingWithCopyError, f) + indexer = df.a.str.startswith('o') + df[indexer]['c'] = 42 + assert_frame_equal(df, expected) expected = DataFrame({'A':[111,'bbb','ccc'],'B':[1,2,3]}) df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) df['A'][0] = 111 - def f(): - df.loc[0]['A'] = 111 - self.assertRaises(com.SettingWithCopyError, f) + assert_frame_equal(df, expected) + + df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) + df.loc[0]['A'] = 111 + assert_frame_equal(df,expected) + df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) df.loc[0,'A'] = 111 assert_frame_equal(df,expected) @@ -3940,28 +3962,22 @@ def f(): # from SO: http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0,9), columns=['count']) df['group'] = 'b' - def f(): - df.iloc[0:5]['group'] = 'a' - self.assertRaises(com.SettingWithCopyError, f) + df.iloc[0:5]['group'] = 'a' # mixed type setting # same dtype & changing dtype df = DataFrame(dict(A=date_range('20130101',periods=5),B=np.random.randn(5),C=np.arange(5,dtype='int64'),D=list('abcde'))) + df.ix[2]['D'] = 'foo' - def f(): - df.ix[2]['D'] = 'foo' - self.assertRaises(com.SettingWithCopyError, f) - def f(): - df.ix[2]['C'] = 'foo' - self.assertRaises(com.SettingWithCopyError, f) - df['C'][2] = 'foo' + df = DataFrame(dict(A=date_range('20130101',periods=5),B=np.random.randn(5),C=np.arange(5,dtype='int64'),D=list('abcde'))) + df.ix[2]['C'] = 'foo' - def test_detect_chained_assignment2(self): + df = DataFrame(dict(A=date_range('20130101',periods=5),B=np.random.randn(5),C=np.arange(5,dtype='int64'),D=list('abcde'))) + df['C'][2] = 'foo' df = DataFrame({'A':['aaa','bbb','ccc'],'B':[1,2,3]}) - def f(): - df.loc[0]['A'] = 111 - self.assertRaises(com.SettingWithCopyError, f) + df.loc[0]['A'] = 111 + self.assertEqual(df.loc[0,'A'],111) def test_setting_with_copy_bug(self): From 5d6e3f572db2dc99ce2b3746d90cfe0c04f9d6e8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 2 Sep 2015 18:02:33 -0400 Subject: [PATCH 3/4] is_copy -> _parent --- pandas/core/frame.py | 8 ++--- pandas/core/generic.py | 68 ++++++++++++++++++++--------------- pandas/core/groupby.py | 4 +-- pandas/core/indexing.py | 2 +- pandas/core/ops.py | 2 +- pandas/core/panel.py | 3 +- pandas/core/series.py | 2 +- pandas/src/reduce.pyx | 4 +-- pandas/tests/test_index.py | 4 +-- pandas/tests/test_indexing.py | 30 ++++++++-------- pandas/tests/test_panel.py | 2 +- pandas/tests/test_panel4d.py | 2 +- pandas/tseries/common.py | 4 +-- 13 files changed, 73 insertions(+), 62 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a52b4c351f503..ade174368a759 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1825,7 +1825,7 @@ def _ixs(self, i, axis=0): copy = isinstance(new_values,np.ndarray) and new_values.base is None result = Series(new_values, index=self.columns, name=self.index[i], dtype=new_values.dtype) - result._set_is_copy(self, copy=copy) + result._set_parent(self, copy=copy) return result # icol @@ -1957,7 +1957,7 @@ def _getitem_multilevel(self, key): if isinstance(result, Series): result = self._constructor_sliced(result, index=self.index, name=key) - result._set_is_copy(self) + result._set_parent(self) return result else: return self._get_item_cache(key) @@ -2300,12 +2300,12 @@ def _set_item(self, key, value): # if we have a multi-index (which potentially has dropped levels) # need to raise - if isinstance(self.is_copy().columns, MultiIndex): + if isinstance(self._parent().columns, MultiIndex): raise # we have a chained assignment # assign back to the original - self.is_copy().loc[self.index,key] = value + self._parent().loc[self.index,key] = value def insert(self, loc, column, value, allow_duplicates=False): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9e5c2db4139c0..be49852e4baf8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -78,15 +78,15 @@ class NDFrame(PandasObject): axes : list copy : boolean, default False """ - _internal_names = ['_data', '_cacher', '_item_cache', '_cache', - 'is_copy', '_subtyp', '_index', '_allow_copy_on_write', + _internal_names = ['_data', '_cacher', '_item_cache', '_cache', '_parent', + '_subtyp', '_index', '_parent_copy_on_write', '_default_kind', '_default_fill_value', '_metadata', '__array_struct__', '__array_interface__'] _internal_names_set = set(_internal_names) _accessors = frozenset([]) _metadata = [] - _allow_copy_on_write = True - is_copy = None + _parent_copy_on_write = True + _parent = None def __init__(self, data, axes=None, copy=False, dtype=None, fastpath=False): @@ -101,10 +101,22 @@ def __init__(self, data, axes=None, copy=False, dtype=None, for i, ax in enumerate(axes): data = data.reindex_axis(ax, axis=i) - object.__setattr__(self, 'is_copy', None) + object.__setattr__(self, '_parent', None) object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) + def _get_is_copy(self): + warnings.warn("is_copy is deprecated will be removed in a future release", + FutureWarning) + return None + + def _set_is_copy(self, v): + warnings.warn("is_copy is deprecated will be removed in a future release", + FutureWarning) + pass + + is_copy = property(fget=_get_is_copy, fset=_set_is_copy) + def _validate_dtype(self, dtype): """ validate the passed dtype """ @@ -1092,7 +1104,7 @@ def _get_item_cache(self, item): res._set_as_cached(item, self) # for a chain - res.is_copy = self.is_copy + res._parent = self._parent return res def _set_as_cached(self, item, cacher): @@ -1144,7 +1156,7 @@ def _is_view(self): """ boolean : return if I am a view of another array """ return self._data.is_view - def _maybe_update_cacher(self, clear=False, verify_is_copy=True): + def _maybe_update_cacher(self, clear=False, verify_parent=True): """ see if we need to update our parent cacher @@ -1154,8 +1166,8 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): ---------- clear : boolean, default False clear the item cache - verify_is_copy : boolean, default True - provide is_copy checks + verify_parent : boolean, default True + provide parent checks """ @@ -1173,7 +1185,7 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): except: pass - if verify_is_copy: + if verify_parent: self._check_copy_on_write() if clear: @@ -1197,8 +1209,8 @@ def _slice(self, slobj, axis=0, kind=None): result = result.__finalize__(self) # mark this as a copy if we are not axis=0 - is_copy = axis!=0 - result._set_is_copy(self) + parent = axis!=0 + result._set_parent(self) return result def _set_item(self, key, value): @@ -1207,23 +1219,23 @@ def _set_item(self, key, value): self._data.set(key, value) self._clear_item_cache() - def _set_is_copy(self, ref=None, copy=True): + def _set_parent(self, ref=None, copy=True): if not copy: - self.is_copy = None + self._parent = None else: if ref is not None: - self.is_copy = weakref.ref(ref) + self._parent = weakref.ref(ref) else: - self.is_copy = None + self._parent = None def _check_copy_on_write(self): # we could have a copy-on-write scenario - if self.is_copy and self._allow_copy_on_write: + if self._parent and self._parent_copy_on_write: # we have an exception - if isinstance(self.is_copy, Exception): - raise self.is_copy + if isinstance(self._parent, Exception): + raise self._parent def get_names_for_obj(__really_unused_name__342424__): """Returns all named references for self""" @@ -1255,14 +1267,14 @@ def get_names_for_obj(__really_unused_name__342424__): # otherwise we have chained indexing, raise and error gc.collect(2) - if self.is_copy() is not None: + if self._parent() is not None: names = get_names_for_obj(self) if not len(names): raise SettingWithCopyError("chained indexing detected, you can fix this ......") # provide copy-on-write self._data = self._data.copy() - self.is_copy = None + self._parent = None def _check_is_chained_assignment_possible(self): """ @@ -1279,7 +1291,7 @@ def _check_is_chained_assignment_possible(self): if ref is not None: self._check_copy_on_write() return True - elif self.is_copy: + elif self._parent: self._check_copy_on_write() return False @@ -1342,7 +1354,7 @@ def take(self, indices, axis=0, convert=True, is_copy=True): # maybe set copy if we didn't actually change the index if is_copy: if not result._get_axis(axis).equals(self._get_axis(axis)): - result._set_is_copy(self) + result._set_parent(self) return result @@ -1485,7 +1497,7 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): result = self.iloc[loc] result.index = new_index - result._set_is_copy(self) + result._set_parent(self) return result _xs = xs @@ -1608,14 +1620,14 @@ def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): else: return result - def _update_inplace(self, result, verify_is_copy=True): + def _update_inplace(self, result, verify_parent=True): """ replace self internals with result. Parameters ---------- - verify_is_copy : boolean, default True - provide is_copy checks + verify_parent : boolean, default True + provide parent checks """ # NOTE: This does *not* call __finalize__ and that's an explicit @@ -1624,7 +1636,7 @@ def _update_inplace(self, result, verify_is_copy=True): self._reset_cache() self._clear_item_cache() self._data = getattr(result,'_data',result) - self._maybe_update_cacher(verify_is_copy=verify_is_copy) + self._maybe_update_cacher(verify_parent=verify_parent) def add_prefix(self, prefix): """ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5437bfe11bc1d..61c2c9b4e1474 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -695,7 +695,7 @@ def _python_apply_general(self, f): keys, values, mutated = self.grouper.apply(f, self._selected_obj, self.axis) - self._selected_obj._allow_copy_on_write = True + self._selected_obj._parent_copy_on_write = True return self._wrap_applied_output(keys, values, not_indexed_same=mutated) @@ -3596,7 +3596,7 @@ def sort_idx(self): def _set_cow(self, data): # we may mutate, so don't allow cow try: - data._allow_copy_on_write=False + data._parent_copy_on_write=False except AttributeError: pass return data diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b8ee831cdc12c..9c32a73a875ff 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -273,7 +273,7 @@ def _setitem_with_indexer(self, indexer, value): labels = index.insert(len(index),key) self.obj._data = self.obj.reindex_axis(labels, i)._data self.obj._maybe_update_cacher(clear=True) - self.obj.is_copy=None + self.obj._parent=None nindexer.append(labels.get_loc(key)) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 8e3dd3836855c..aeb218b56e3e6 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -191,7 +191,7 @@ def f(self, other): # this makes sure that we are aligned like the input # we are updating inplace so we want to ignore is_copy self._update_inplace(result.reindex_like(self,copy=False)._data, - verify_is_copy=False) + verify_parent=False) return self return f diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 1293b4034b84e..b676aad7599c5 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -812,8 +812,7 @@ def xs(self, key, axis=1, copy=None): axis_number = self._get_axis_number(axis) new_data = self._data.xs(key, axis=axis_number, copy=False) result = self._construct_return_type(new_data) - copy = new_data.is_mixed_type - result._set_is_copy(self, copy=copy) + result._set_parent(self) return result _xs = xs diff --git a/pandas/core/series.py b/pandas/core/series.py index 8dd40e0ed579a..b7431cad03525 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -690,7 +690,7 @@ def setitem(key, value): # we have a chained assignment # assign back to the original - self.is_copy().loc[self.name,key] = value + self._parent().loc[self.name,key] = value return setitem(key, value) diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx index 3115d2d76602c..5e09c87a629fb 100644 --- a/pandas/src/reduce.pyx +++ b/pandas/src/reduce.pyx @@ -488,7 +488,7 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if our low-level mucking is going to cause a segfault if n > 0: chunk = frame.iloc[starts[0]:ends[0]] - chunk._allow_copy_on_write = False + chunk._parent_copy_on_write = False shape_before = chunk.shape try: result = f(chunk) @@ -509,7 +509,7 @@ def apply_frame_axis0(object frame, object f, object names, item_cache.clear() # ugh object.__setattr__(slider.dummy, 'name', names[i]) - object.__setattr__(slider.dummy, '_allow_copy_on_write', False) + object.__setattr__(slider.dummy, '_parent_copy_on_write', False) piece = f(slider.dummy) # I'm paying the price for index-sharing, ugh diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 30a5716831087..62caa45d6bd82 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -3534,10 +3534,10 @@ def test_set_value_keeps_names(self): columns=['one', 'two', 'three', 'four'], index=idx) df = df.sortlevel() - self.assertIsNone(df.is_copy) + self.assertIsNone(df._parent) self.assertEqual(df.index.names, ('Name', 'Number')) df = df.set_value(('grethe', '4'), 'one', 99.34) - self.assertIsNone(df.is_copy) + self.assertIsNone(df._parent) self.assertEqual(df.index.names, ('Name', 'Number')) def test_names(self): diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index 6c1f4f49ee712..cb7946268e8f0 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -3779,8 +3779,8 @@ def test_chain_assignment_yields_copy_on_write(self): intermediate['col1'] = -99 # reference is broken - self.assertIsNone(df.is_copy) - self.assertIsNone(intermediate.is_copy) + self.assertIsNone(df._parent) + self.assertIsNone(intermediate._parent) # local assignment expected = DataFrame([[-99,4]],index=[1],columns=['col1','col2']) @@ -3822,17 +3822,17 @@ def test_detect_chained_assignment(self): # work with the chain expected = DataFrame([[-5,1],[-6,3]],columns=list('AB')) df = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),dtype='int64') - self.assertIsNone(df.is_copy) + self.assertIsNone(df._parent) df['A'][0] = -5 df['A'][1] = -6 assert_frame_equal(df, expected) # test with the chaining df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) - self.assertIsNone(df.is_copy) + self.assertIsNone(df._parent) df['A'][0] = -5 df['A'][1] = np.nan - self.assertIsNone(df['A'].is_copy) + self.assertIsNone(df['A']._parent) # using a copy (the chain), fails df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) @@ -3843,7 +3843,7 @@ def test_detect_chained_assignment(self): df = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], 'c' : Series(range(7),dtype='int64') }) - self.assertIsNone(df.is_copy) + self.assertIsNone(df._parent) expected = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], 'c' : [42,42,2,3,4,42,6]}) @@ -3868,7 +3868,7 @@ def test_detect_chained_assignment(self): # make sure that is_copy is picked up reconstruction # GH5475 df = DataFrame({"A": [1,2]}) - self.assertIsNone(df.is_copy) + self.assertIsNone(df._parent) with tm.ensure_clean('__tmp__pickle') as path: df.to_pickle(path) df2 = pd.read_pickle(path) @@ -3892,34 +3892,34 @@ def random_text(nobs=100): # always a copy x = df.iloc[[0,1,2]] - self.assertIsNotNone(x.is_copy) + self.assertIsNotNone(x._parent) x = df.iloc[[0,1,2,4]] - self.assertIsNotNone(x.is_copy) + self.assertIsNotNone(x._parent) # explicity copy indexer = df.letters.apply(lambda x : len(x) > 10) df = df.ix[indexer].copy() - self.assertIsNone(df.is_copy) + self.assertIsNone(df._parent) df['letters'] = df['letters'].apply(str.lower) # implicity take df = random_text(100000) indexer = df.letters.apply(lambda x : len(x) > 10) df = df.ix[indexer] - self.assertIsNotNone(df.is_copy) + self.assertIsNotNone(df._parent) df['letters'] = df['letters'].apply(str.lower) # implicity take 2 df = random_text(100000) indexer = df.letters.apply(lambda x : len(x) > 10) df = df.ix[indexer] - self.assertIsNotNone(df.is_copy) + self.assertIsNotNone(df._parent) df.loc[:,'letters'] = df['letters'].apply(str.lower) # should be ok even though it's a copy! - self.assertIsNone(df.is_copy) + self.assertIsNone(df._parent) df['letters'] = df['letters'].apply(str.lower) - self.assertIsNone(df.is_copy) + self.assertIsNone(df._parent) df = random_text(100000) indexer = df.letters.apply(lambda x : len(x) > 10) @@ -3927,7 +3927,7 @@ def random_text(nobs=100): # an identical take, so no copy df = DataFrame({'a' : [1]}).dropna() - self.assertIsNone(df.is_copy) + self.assertIsNone(df._parent) df['a'] += 1 # inplace ops diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 9cdc769dd7d74..72954f8dcb3ba 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -556,7 +556,7 @@ def test_xs(self): # mixed-type yields a copy self.panel['strings'] = 'foo' result = self.panel.xs('D', axis=2) - self.assertIsNotNone(result.is_copy) + self.assertIsNotNone(result._parent) def test_getitem_fancy_labels(self): p = self.panel diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 289f7f134aa27..88e1277edd89b 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -514,7 +514,7 @@ def test_xs(self): # mixed-type self.panel4d['strings'] = 'foo' result = self.panel4d.xs('D', axis=3) - self.assertIsNotNone(result.is_copy) + self.assertIsNotNone(result._parent) def test_getitem_fancy_labels(self): panel4d = self.panel4d diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index 25bd52f450438..2692e38389106 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -77,7 +77,7 @@ def _delegate_property_get(self, name): result = Series(result, index=self.index) # setting this object will show a ValueError if accessed - result.is_copy = SettingImmutableError("modifications to a property of a datetimelike object are not " + result._parent = SettingImmutableError("modifications to a property of a datetimelike object are not " "supported and are discarded. Change values on the original.") @@ -99,7 +99,7 @@ def _delegate_method(self, name, *args, **kwargs): result = Series(result, index=self.index) # setting this object will show a SettingImmutableError - result.is_copy = SettingImmutableError("modifications to a method of a datetimelike object are not " + result._parent = SettingImmutableError("modifications to a method of a datetimelike object are not " "supported and are discarded. Change values on the original.") return result From 2fcd89b4d862894d49228cf3c6ae7235ccbc6a0a Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 2 Sep 2015 21:06:13 -0400 Subject: [PATCH 4/4] make _parent a list of the parents --- pandas/core/frame.py | 7 ++++--- pandas/core/generic.py | 39 +++++++++++++++++------------------ pandas/core/indexing.py | 2 +- pandas/core/series.py | 12 +++++++---- pandas/tests/test_indexing.py | 26 +++++++++++++---------- pandas/tests/test_series.py | 10 ++++----- 6 files changed, 52 insertions(+), 44 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ade174368a759..8ea0321feae1b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2300,12 +2300,13 @@ def _set_item(self, key, value): # if we have a multi-index (which potentially has dropped levels) # need to raise - if isinstance(self._parent().columns, MultiIndex): - raise + for p in self._parent: + if isinstance(getattr(p(),'columns',None), MultiIndex): + raise # we have a chained assignment # assign back to the original - self._parent().loc[self.index,key] = value + self._parent[0]().loc[self.index,key] = value def insert(self, loc, column, value, allow_duplicates=False): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index be49852e4baf8..37d07e5d6611b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -86,7 +86,7 @@ class NDFrame(PandasObject): _accessors = frozenset([]) _metadata = [] _parent_copy_on_write = True - _parent = None + _parent = [] def __init__(self, data, axes=None, copy=False, dtype=None, fastpath=False): @@ -101,7 +101,7 @@ def __init__(self, data, axes=None, copy=False, dtype=None, for i, ax in enumerate(axes): data = data.reindex_axis(ax, axis=i) - object.__setattr__(self, '_parent', None) + object.__setattr__(self, '_parent', []) object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) @@ -1104,7 +1104,7 @@ def _get_item_cache(self, item): res._set_as_cached(item, self) # for a chain - res._parent = self._parent + res._set_parent(self) return res def _set_as_cached(self, item, cacher): @@ -1205,13 +1205,8 @@ def _slice(self, slobj, axis=0, kind=None): """ axis = self._get_block_manager_axis(axis) - result = self._constructor(self._data.get_slice(slobj, axis=axis)) - result = result.__finalize__(self) - - # mark this as a copy if we are not axis=0 - parent = axis!=0 - result._set_parent(self) - return result + data = self._data.get_slice(slobj, axis=axis) + return self._constructor(data)._set_parent(self).__finalize__(self) def _set_item(self, key, value): @@ -1220,13 +1215,10 @@ def _set_item(self, key, value): self._clear_item_cache() def _set_parent(self, ref=None, copy=True): - if not copy: - self._parent = None - else: - if ref is not None: - self._parent = weakref.ref(ref) - else: - self._parent = None + if ref is not None: + self._parent.extend(ref._parent) + self._parent.append(weakref.ref(ref)) + return self def _check_copy_on_write(self): @@ -1266,15 +1258,22 @@ def get_names_for_obj(__really_unused_name__342424__): # we can just copy and be done # otherwise we have chained indexing, raise and error + def error(): + raise SettingWithCopyError("chained indexing detected, you can fix this ......") + gc.collect(2) - if self._parent() is not None: + if len(self._parent) > 1: + error() + + p = self._parent[0]() + if p is not None: names = get_names_for_obj(self) if not len(names): - raise SettingWithCopyError("chained indexing detected, you can fix this ......") + error() # provide copy-on-write self._data = self._data.copy() - self._parent = None + self._parent = [] def _check_is_chained_assignment_possible(self): """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9c32a73a875ff..a641f5126454b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -273,7 +273,7 @@ def _setitem_with_indexer(self, indexer, value): labels = index.insert(len(index),key) self.obj._data = self.obj.reindex_axis(labels, i)._data self.obj._maybe_update_cacher(clear=True) - self.obj._parent=None + self.obj._parent=[] nindexer.append(labels.get_loc(key)) diff --git a/pandas/core/series.py b/pandas/core/series.py index b7431cad03525..030b0c117efc4 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -528,7 +528,7 @@ def __getitem__(self, key): if not self.index.is_unique: result = self._constructor(result, index=[key]*len(result) - ,dtype=self.dtype).__finalize__(self) + ,dtype=self.dtype)._set_parent(self).__finalize__(self) return result except InvalidIndexError: @@ -621,12 +621,12 @@ def _get_values_tuple(self, key): # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) return self._constructor(self.values[indexer], - index=new_index).__finalize__(self) + index=new_index)._set_parent(self).__finalize__(self) def _get_values(self, indexer): try: return self._constructor(self._data.get_slice(indexer), - fastpath=True).__finalize__(self) + fastpath=True)._set_parent(self).__finalize__(self) except Exception: return self.values[indexer] @@ -690,7 +690,11 @@ def setitem(key, value): # we have a chained assignment # assign back to the original - self._parent().loc[self.name,key] = value + obj = self._parent[0]() + if isinstance(obj, Series): + obj.loc[key] = value + else: + obj.loc[self.name,key] = value return setitem(key, value) diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index cb7946268e8f0..cecb137b23df4 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -3776,11 +3776,15 @@ def test_chain_assignment_yields_copy_on_write(self): # 10954 df = DataFrame({'col1':[1,2], 'col2':[3,4]}) intermediate = df.loc[1:1,] + + # refrence created + self.assertFalse(len(df._parent)) + self.assertTrue(len(intermediate._parent) == 1) intermediate['col1'] = -99 # reference is broken - self.assertIsNone(df._parent) - self.assertIsNone(intermediate._parent) + self.assertFalse(len(df._parent)) + self.assertFalse(len(intermediate._parent)) # local assignment expected = DataFrame([[-99,4]],index=[1],columns=['col1','col2']) @@ -3822,17 +3826,17 @@ def test_detect_chained_assignment(self): # work with the chain expected = DataFrame([[-5,1],[-6,3]],columns=list('AB')) df = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),dtype='int64') - self.assertIsNone(df._parent) + self.assertFalse(len(df._parent)) df['A'][0] = -5 df['A'][1] = -6 assert_frame_equal(df, expected) # test with the chaining df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) - self.assertIsNone(df._parent) + self.assertFalse(len(df._parent)) df['A'][0] = -5 df['A'][1] = np.nan - self.assertIsNone(df['A']._parent) + self.assertFalse(len(df['A']._parent)) # using a copy (the chain), fails df = DataFrame({ 'A' : Series(range(2),dtype='int64'), 'B' : np.array(np.arange(2,4),dtype=np.float64)}) @@ -3843,7 +3847,7 @@ def test_detect_chained_assignment(self): df = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], 'c' : Series(range(7),dtype='int64') }) - self.assertIsNone(df._parent) + self.assertFalse(len(df._parent)) expected = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], 'c' : [42,42,2,3,4,42,6]}) @@ -3868,7 +3872,7 @@ def test_detect_chained_assignment(self): # make sure that is_copy is picked up reconstruction # GH5475 df = DataFrame({"A": [1,2]}) - self.assertIsNone(df._parent) + self.assertFalse(len(df._parent)) with tm.ensure_clean('__tmp__pickle') as path: df.to_pickle(path) df2 = pd.read_pickle(path) @@ -3899,7 +3903,7 @@ def random_text(nobs=100): # explicity copy indexer = df.letters.apply(lambda x : len(x) > 10) df = df.ix[indexer].copy() - self.assertIsNone(df._parent) + self.assertFalse(len(df._parent)) df['letters'] = df['letters'].apply(str.lower) # implicity take @@ -3917,9 +3921,9 @@ def random_text(nobs=100): df.loc[:,'letters'] = df['letters'].apply(str.lower) # should be ok even though it's a copy! - self.assertIsNone(df._parent) + self.assertFalse(len(df._parent)) df['letters'] = df['letters'].apply(str.lower) - self.assertIsNone(df._parent) + self.assertFalse(len(df._parent)) df = random_text(100000) indexer = df.letters.apply(lambda x : len(x) > 10) @@ -3927,7 +3931,7 @@ def random_text(nobs=100): # an identical take, so no copy df = DataFrame({'a' : [1]}).dropna() - self.assertIsNone(df._parent) + self.assertFalse(len(df._parent)) df['a'] += 1 # inplace ops diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index fda9039dbb19b..61fbb72d967cc 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -157,7 +157,7 @@ def compare(s, name): result = s.dt.to_pytimedelta() self.assertIsInstance(result,np.ndarray) self.assertTrue(result.dtype == object) - + result = s.dt.total_seconds() self.assertIsInstance(result,pd.Series) self.assertTrue(result.dtype == 'float64') @@ -1235,9 +1235,9 @@ def test_iget(self): expected = s.ix[2:4] assert_series_equal(result, expected) - # test slice is a view + # this is copy-on-write result[:] = 0 - self.assertTrue((s[1:3] == 0).all()) + self.assertTrue((s[1:3] != 0).all()) # list of integers result = s.iloc[[0, 2, 3, 4, 5]] @@ -1473,10 +1473,10 @@ def test_slice(self): self.assertTrue(tm.equalContents(numSliceEnd, np.array(self.series)[-10:])) - # test return view + # copy-on-write sl = self.series[10:20] sl[:] = 0 - self.assertTrue((self.series[10:20] == 0).all()) + self.assertTrue((self.series[10:20] != 0).all()) def test_slice_can_reorder_not_uniquely_indexed(self): s = Series(1, index=['a', 'a', 'b', 'b', 'c'])