From d7b22f360e6edfa7a964a6a67f4f094d139bf7d4 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 20 Jan 2022 10:04:11 -0800 Subject: [PATCH 1/3] REF: mask values in loc.__setitem__ with bool indexer --- pandas/core/indexing.py | 58 +++++++++++++++++++++++++++++++-- pandas/core/internals/blocks.py | 2 +- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 5f9bc142c5836..0678ea95c2f7c 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -53,6 +53,7 @@ is_empty_indexer, is_exact_shape_match, is_list_like_indexer, + is_scalar_indexer, length_of_indexer, ) from pandas.core.indexes.api import ( @@ -669,6 +670,56 @@ def _get_setitem_indexer(self, key): return self._convert_to_indexer(key, axis=0) + @final + def _maybe_mask_setitem_value(self, indexer, value): + """ + If we have obj.iloc[mask] = arraylike and arraylike has the same + length as obj, we treat this as obj.iloc[mask] = arraylike[mask], + similar to Series.__setitem__. + """ + # TODO: why do we only do this for loc, not iloc? + # ser = pd.Series(range(5)) + # mask = np.array([True, False, True, False, True]) + # ser[mask] = ser * 2 # <- works + # ser.loc[mask] = ser * 2 # <- works + # ser.iloc[mask] = ser * 2 # <- raises + + if isinstance(indexer, tuple) and len(indexer) == 2: + pi, icols = indexer + ndim = getattr(value, "ndim", 0) + if ndim == 0: + pass + elif com.is_bool_indexer(pi) and len(value) == len(pi): + newkey = pi.nonzero()[0] + + if is_scalar_indexer(icols, self.ndim - 1) and ndim == 1: + # e.g. test_loc_setitem_boolean_mask_allfalse + value = value[pi] + indexer = (newkey, icols) + + elif ( + isinstance(icols, np.ndarray) + and icols.dtype.kind == "i" + and len(icols) == 1 + ): + if ndim == 1: + value = value[pi] + indexer = (newkey, icols) + # TODO: is it sketchy that icols is an ndarray and value + # is 1D? i.e. should we require value to be 2D with + # value.shape[1] == 1? + + elif ndim == 2 and value.shape[1] == 1: + if isinstance(value, ABCDataFrame): + value = value.iloc[newkey] + else: + value = value[pi] + indexer = (newkey, icols) + elif com.is_bool_indexer(indexer): + indexer = indexer.nonzero()[0] + + return indexer, value + @final def _tupleize_axis_indexer(self, key) -> tuple: """ @@ -727,6 +778,10 @@ def __setitem__(self, key, value): indexer = self._get_setitem_indexer(key) self._has_valid_setitem_indexer(key) + if self.name == "loc": + # TODO: should we do this for iloc too? + indexer, value = self._maybe_mask_setitem_value(indexer, value) + iloc = self if self.name == "iloc" else self.obj.iloc iloc._setitem_with_indexer(indexer, value, self.name) @@ -1299,8 +1354,7 @@ def _convert_to_indexer(self, key, axis: int): if com.is_bool_indexer(key): key = check_bool_indexer(labels, key) - (inds,) = key.nonzero() - return inds + return key else: return self._get_listlike_indexer(key, axis)[1] else: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index de4abdd1ba024..31717cb29007d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -931,7 +931,7 @@ def setitem(self, indexer, value): if is_empty_indexer(indexer): # GH#8669 empty indexers, test_loc_setitem_boolean_mask_allfalse - pass + values[indexer] = value elif is_scalar_indexer(indexer, self.ndim): # setting a single element for each dim and with a rhs that could From 92a7e3aa74b8834715b8a5828cdb9bd8667b4cc8 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 20 Jan 2022 15:08:50 -0800 Subject: [PATCH 2/3] handle reindexing --- pandas/core/indexing.py | 57 +++++++++++++++++++------------ pandas/tests/indexing/test_loc.py | 25 ++++++++++++++ 2 files changed, 61 insertions(+), 21 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0678ea95c2f7c..f1cce30365bb8 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -673,28 +673,33 @@ def _get_setitem_indexer(self, key): @final def _maybe_mask_setitem_value(self, indexer, value): """ - If we have obj.iloc[mask] = arraylike and arraylike has the same - length as obj, we treat this as obj.iloc[mask] = arraylike[mask], + If we have obj.iloc[mask] = series_or_frame and series_or_frame has the + same length as obj, we treat this as obj.iloc[mask] = series_or_frame[mask], similar to Series.__setitem__. + + Note this is only for loc, not iloc. """ - # TODO: why do we only do this for loc, not iloc? - # ser = pd.Series(range(5)) - # mask = np.array([True, False, True, False, True]) - # ser[mask] = ser * 2 # <- works - # ser.loc[mask] = ser * 2 # <- works - # ser.iloc[mask] = ser * 2 # <- raises - if isinstance(indexer, tuple) and len(indexer) == 2: + if ( + isinstance(indexer, tuple) + and len(indexer) == 2 + and isinstance(value, (ABCSeries, ABCDataFrame)) + ): pi, icols = indexer - ndim = getattr(value, "ndim", 0) - if ndim == 0: - pass - elif com.is_bool_indexer(pi) and len(value) == len(pi): + ndim = value.ndim + if com.is_bool_indexer(pi) and len(value) == len(pi): newkey = pi.nonzero()[0] if is_scalar_indexer(icols, self.ndim - 1) and ndim == 1: # e.g. test_loc_setitem_boolean_mask_allfalse - value = value[pi] + if len(newkey) == 0: + # FIXME: kludge for test_loc_setitem_boolean_mask_allfalse + # TODO(GH#45333): may be fixed when deprecation is enforced + + value = value.iloc[:0] + else: + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_series(indexer, value) indexer = (newkey, icols) elif ( @@ -703,17 +708,27 @@ def _maybe_mask_setitem_value(self, indexer, value): and len(icols) == 1 ): if ndim == 1: - value = value[pi] + # We implicitly broadcast, though numpy does not, see + # github.com/pandas-dev/pandas/pull/45501#discussion_r789071825 + if len(newkey) == 0: + # FIXME: kludge for + # test_setitem_loc_only_false_indexer_dtype_changed + # TODO(GH#45333): may be fixed when deprecation is enforced + value = value.iloc[:0] + else: + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_series(indexer, value) indexer = (newkey, icols) - # TODO: is it sketchy that icols is an ndarray and value - # is 1D? i.e. should we require value to be 2D with - # value.shape[1] == 1? elif ndim == 2 and value.shape[1] == 1: - if isinstance(value, ABCDataFrame): - value = value.iloc[newkey] + if len(newkey) == 0: + # FIXME: kludge for + # test_loc_setitem_all_false_boolean_two_blocks + # TODO(GH#45333): may be fixed when deprecation is enforced + value = value.iloc[:0] else: - value = value[pi] + # test_loc_setitem_ndframe_values_alignment + value = self.obj.iloc._align_frame(indexer, value) indexer = (newkey, icols) elif com.is_bool_indexer(indexer): indexer = indexer.nonzero()[0] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 4b751fa7d5e3e..73beb04fca81f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -2446,6 +2446,31 @@ def test_loc_setitem_boolean_and_column(self, float_frame): tm.assert_frame_equal(float_frame, expected) + def test_loc_setitem_ndframe_values_alignment(self): + # GH#45501 + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df.loc[[False, False, True], ["a"]] = DataFrame( + {"a": [10, 20, 30]}, index=[2, 1, 0] + ) + + expected = DataFrame({"a": [1, 2, 10], "b": [4, 5, 6]}) + tm.assert_frame_equal(df, expected) + + # same thing with Series RHS + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df.loc[[False, False, True], ["a"]] = Series([10, 11, 12], index=[2, 1, 0]) + tm.assert_frame_equal(df, expected) + + # same thing but setting "a" instead of ["a"] + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df.loc[[False, False, True], "a"] = Series([10, 11, 12], index=[2, 1, 0]) + tm.assert_frame_equal(df, expected) + + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + ser = df["a"] + ser.loc[[False, False, True]] = Series([10, 11, 12], index=[2, 1, 0]) + tm.assert_frame_equal(df, expected) + class TestLocListlike: @pytest.mark.parametrize("box", [lambda x: x, np.asarray, list]) From 46f0f2201768f70e1f87c76508c833a353c34fb5 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 22 Jan 2022 18:14:41 -0800 Subject: [PATCH 3/3] delay masking --- pandas/core/indexing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index f1cce30365bb8..4a251ae0d93da 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -793,10 +793,6 @@ def __setitem__(self, key, value): indexer = self._get_setitem_indexer(key) self._has_valid_setitem_indexer(key) - if self.name == "loc": - # TODO: should we do this for iloc too? - indexer, value = self._maybe_mask_setitem_value(indexer, value) - iloc = self if self.name == "iloc" else self.obj.iloc iloc._setitem_with_indexer(indexer, value, self.name) @@ -1765,6 +1761,10 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): self._setitem_with_indexer_missing(indexer, value) return + if name == "loc": + # must come after setting of missing + indexer, value = self._maybe_mask_setitem_value(indexer, value) + # align and set the values if take_split_path: # We have to operate column-wise