From 0468cadc7f3a225fc04f7ef6b93c1a7bb0cc5c00 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 23 Jun 2015 00:03:23 -0600 Subject: [PATCH] ENH: add tolerance to get_indexer, get_loc and reindex --- doc/source/basics.rst | 24 +++++++ doc/source/whatsnew/v0.17.0.txt | 16 +++++ pandas/core/frame.py | 17 ++--- pandas/core/generic.py | 42 ++++++++++--- pandas/core/index.py | 105 ++++++++++++++++++++++++------- pandas/tests/test_frame.py | 10 +++ pandas/tests/test_index.py | 107 +++++++++++++++++++++++++++++++- pandas/tests/test_series.py | 7 +++ pandas/tseries/base.py | 8 +++ pandas/tseries/index.py | 13 ++-- pandas/tseries/period.py | 22 ++++--- pandas/tseries/tdi.py | 13 ++-- 12 files changed, 328 insertions(+), 56 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 624e10b431de5..71d16a40f0215 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1100,6 +1100,30 @@ Note that the same result could have been achieved using increasing or descreasing. :meth:`~Series.fillna` and :meth:`~Series.interpolate` will not make any checks on the order of the index. +.. _basics.limits_on_reindex_fill: + +Limits on filling while reindexing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``limit`` and ``tolerance`` arguments provide additional control over +filling while reindexing. Limit specifies the maximum count of consecutive +matches: + +.. ipython:: python + + ts2.reindex(ts.index, method='ffill', limit=1) + +In contrast, tolerance specifies the maximum distance between the index and +indexer values: + +.. ipython:: python + + ts2.reindex(ts.index, method='ffill', tolerance='1 day') + +Notice that when used on a ``DatetimeIndex``, ``TimedeltaIndex`` or +``PeriodIndex``, ``tolerance`` will coerced into a ``Timedelta`` if possible. +This allows you to specify tolerance with appropriate strings. + .. 
_basics.drop: Dropping labels from an axis diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 7e69a8044a305..5eb808e450a8a 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -180,6 +180,22 @@ Other enhancements s.drop_duplicates(keep=False) +- Reindex now has a ``tolerance`` argument that allows for finer control of :ref:`basics.limits_on_reindex_fill`: + + .. ipython:: python + + df = pd.DataFrame({'x': range(5), 't': pd.date_range('2000-01-01', periods=5)}) + df.reindex([0.1, 1.9, 3.5], method='nearest', tolerance=0.2) + + When used on a ``DatetimeIndex``, ``TimedeltaIndex`` or ``PeriodIndex``, ``tolerance`` will be coerced into a ``Timedelta`` if possible. This allows you to specify tolerance with a string: + + .. ipython:: python + + df = df.set_index('t') + df.reindex(pd.to_datetime(['1999-12-31']), method='nearest', tolerance='1 day') + + ``tolerance`` is also exposed by the lower level ``Index.get_indexer`` and ``Index.get_loc`` methods. + .. _whatsnew_0170.api: .. 
_whatsnew_0170.api_breaking: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 062cbe579785c..8f7aee0cb6f15 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2518,33 +2518,36 @@ def lookup(self, row_labels, col_labels): #---------------------------------------------------------------------- # Reindexing and alignment - def _reindex_axes(self, axes, level, limit, method, fill_value, copy): + def _reindex_axes(self, axes, level, limit, tolerance, method, + fill_value, copy): frame = self columns = axes['columns'] if columns is not None: frame = frame._reindex_columns(columns, copy, level, fill_value, - limit) + limit, tolerance) index = axes['index'] if index is not None: frame = frame._reindex_index(index, method, copy, level, - fill_value, limit) + fill_value, limit, tolerance) return frame def _reindex_index(self, new_index, method, copy, level, fill_value=NA, - limit=None): + limit=None, tolerance=None): new_index, indexer = self.index.reindex(new_index, method, level, - limit=limit) + limit=limit, + tolerance=tolerance) return self._reindex_with_indexers({0: [new_index, indexer]}, copy=copy, fill_value=fill_value, allow_dups=False) def _reindex_columns(self, new_columns, copy, level, fill_value=NA, - limit=None): + limit=None, tolerance=None): new_columns, indexer = self.columns.reindex(new_columns, level=level, - limit=limit) + limit=limit, + tolerance=tolerance) return self._reindex_with_indexers({1: [new_columns, indexer]}, copy=copy, fill_value=fill_value, allow_dups=False) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2fc288de438b3..27cb2641034dc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -922,7 +922,7 @@ def to_hdf(self, path_or_buf, key, **kwargs): in the store wherever possible fletcher32 : bool, default False If applying compression use the fletcher32 checksum - dropna : boolean, default False. + dropna : boolean, default False. If true, ALL nan rows will not be written to store. 
""" @@ -1551,7 +1551,8 @@ def select(self, crit, axis=0): return self.reindex(**{axis_name: new_axis}) - def reindex_like(self, other, method=None, copy=True, limit=None): + def reindex_like(self, other, method=None, copy=True, limit=None, + tolerance=None): """ return an object with matching indicies to myself Parameters @@ -1560,7 +1561,12 @@ def reindex_like(self, other, method=None, copy=True, limit=None): method : string or None copy : boolean, default True limit : int, default None - Maximum size gap to forward or backward fill + Maximum number of consecutive labels to fill for inexact matches. + tolerance : optional + Maximum distance between labels of the other object and this + object for inexact matches. + + .. versionadded:: 0.17.0 Notes ----- @@ -1572,7 +1578,8 @@ def reindex_like(self, other, method=None, copy=True, limit=None): reindexed : same as input """ d = other._construct_axes_dict(axes=self._AXIS_ORDERS, - method=method, copy=copy, limit=limit) + method=method, copy=copy, limit=limit, + tolerance=tolerance) return self.reindex(**d) @@ -1736,7 +1743,13 @@ def sort_index(self, axis=0, ascending=True): Value to use for missing values. Defaults to NaN, but can be any "compatible" value limit : int, default None - Maximum size gap to forward or backward fill + Maximum number of consecutive elements to forward or backward fill + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + .. 
versionadded:: 0.17.0 Examples -------- @@ -1758,6 +1771,7 @@ def reindex(self, *args, **kwargs): level = kwargs.pop('level', None) copy = kwargs.pop('copy', True) limit = kwargs.pop('limit', None) + tolerance = kwargs.pop('tolerance', None) fill_value = kwargs.pop('fill_value', np.nan) if kwargs: @@ -1782,10 +1796,11 @@ def reindex(self, *args, **kwargs): pass # perform the reindex on the axes - return self._reindex_axes(axes, level, limit, + return self._reindex_axes(axes, level, limit, tolerance, method, fill_value, copy).__finalize__(self) - def _reindex_axes(self, axes, level, limit, method, fill_value, copy): + def _reindex_axes(self, axes, level, limit, tolerance, method, + fill_value, copy): """ perform the reinxed for all the axes """ obj = self for a in self._AXIS_ORDERS: @@ -1795,7 +1810,8 @@ def _reindex_axes(self, axes, level, limit, method, fill_value, copy): ax = self._get_axis(a) new_index, indexer = ax.reindex( - labels, level=level, limit=limit, method=method) + labels, level=level, limit=limit, tolerance=tolerance, + method=method) axis = self._get_axis_number(a) obj = obj._reindex_with_indexers( @@ -1836,7 +1852,13 @@ def _reindex_multi(self, axes, copy, fill_value): Broadcast across a level, matching Index values on the passed MultiIndex level limit : int, default None - Maximum size gap to forward or backward fill + Maximum number of consecutive elements to forward or backward fill + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations must + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + .. versionadded:: 0.17.0 Examples -------- @@ -2910,7 +2932,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, use the actual numerical values of the index * 'krogh', 'piecewise_polynomial', 'spline', and 'pchip' are all wrappers around the scipy interpolation methods of similar - names. 
These use the actual numerical values of the index. See + names. These use the actual numerical values of the index. See the scipy documentation for more on their behavior: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html diff --git a/pandas/core/index.py b/pandas/core/index.py index 12ad8a590c304..ed89d163bf608 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1602,28 +1602,38 @@ def sym_diff(self, other, result_name=None): attribs['freq'] = None return self._shallow_copy(the_diff, infer=True, **attribs) - def get_loc(self, key, method=None): + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label Parameters ---------- key : label - method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'} + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. * backfill / bfill: use NEXT index value if no exact match * nearest: use the NEAREST index value if no exact match. Tied distances are broken by preferring the larger index value. + tolerance : optional + Maximum distance from index value for inexact matches. The value of + the index at the matching location must satisfy the equation + ``abs(index[loc] - key) <= tolerance``. + + .. 
versionadded:: 0.17.0 Returns ------- loc : int if unique index, possibly slice or mask if not """ if method is None: + if tolerance is not None: + raise ValueError('tolerance argument only valid if using pad, ' + 'backfill or nearest lookups') return self._engine.get_loc(_values_from_object(key)) - indexer = self.get_indexer([key], method=method) + indexer = self.get_indexer([key], method=method, + tolerance=tolerance) if indexer.ndim > 1 or indexer.size > 1: raise TypeError('get_loc requires scalar valued input') loc = indexer.item() @@ -1692,7 +1702,7 @@ def get_level_values(self, level): self._validate_index_level(level) return self - def get_indexer(self, target, method=None, limit=None): + def get_indexer(self, target, method=None, limit=None, tolerance=None): """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the @@ -1701,15 +1711,21 @@ def get_indexer(self, target, method=None, limit=None): Parameters ---------- target : Index - method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'} + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional * default: exact matches only. * pad / ffill: find the PREVIOUS index value if no exact match. * backfill / bfill: use NEXT index value if no exact match * nearest: use the NEAREST index value if no exact match. Tied distances are broken by preferring the larger index value. - limit : int - Maximum number of consecuctive labels in ``target`` to match for + limit : int, optional + Maximum number of consecutive labels in ``target`` to match for inexact matches. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations must + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + .. 
versionadded:: 0.17.0 Examples -------- @@ -1725,36 +1741,54 @@ def get_indexer(self, target, method=None, limit=None): """ method = com._clean_reindex_fill_method(method) target = _ensure_index(target) + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance) pself, ptarget = self._possibly_promote(target) if pself is not self or ptarget is not target: - return pself.get_indexer(ptarget, method=method, limit=limit) + return pself.get_indexer(ptarget, method=method, limit=limit, + tolerance=tolerance) - if not is_dtype_equal(self.dtype,target.dtype): + if not is_dtype_equal(self.dtype, target.dtype): this = self.astype(object) target = target.astype(object) - return this.get_indexer(target, method=method, limit=limit) + return this.get_indexer(target, method=method, limit=limit, + tolerance=tolerance) if not self.is_unique: raise InvalidIndexError('Reindexing only valid with uniquely' ' valued Index objects') if method == 'pad' or method == 'backfill': - indexer = self._get_fill_indexer(target, method, limit) + indexer = self._get_fill_indexer(target, method, limit, tolerance) elif method == 'nearest': - indexer = self._get_nearest_indexer(target, limit) + indexer = self._get_nearest_indexer(target, limit, tolerance) else: + if tolerance is not None: + raise ValueError('tolerance argument only valid if doing pad, ' + 'backfill or nearest reindexing') + if limit is not None: + raise ValueError('limit argument only valid if doing pad, ' + 'backfill or nearest reindexing') + indexer = self._engine.get_indexer(target.values) return com._ensure_platform_int(indexer) - def _get_fill_indexer(self, target, method, limit=None): + def _convert_tolerance(self, tolerance): + # override this method on subclasses + return tolerance + + def _get_fill_indexer(self, target, method, limit=None, tolerance=None): if self.is_monotonic_increasing and target.is_monotonic_increasing: method = (self._engine.get_pad_indexer if method == 'pad' else 
self._engine.get_backfill_indexer) indexer = method(target.values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) + if tolerance is not None: + indexer = self._filter_indexer_tolerance( + target.values, indexer, tolerance) return indexer def _get_fill_indexer_searchsorted(self, target, method, limit=None): @@ -1787,7 +1821,7 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None): indexer[indexer == len(self)] = -1 return indexer - def _get_nearest_indexer(self, target, limit): + def _get_nearest_indexer(self, target, limit, tolerance): """ Get the indexer for the nearest index labels; requires an index with values that can be subtracted from each other (e.g., not strings or @@ -1804,6 +1838,14 @@ def _get_nearest_indexer(self, target, limit): indexer = np.where(op(left_distances, right_distances) | (right_indexer == -1), left_indexer, right_indexer) + if tolerance is not None: + indexer = self._filter_indexer_tolerance( + target, indexer, tolerance) + return indexer + + def _filter_indexer_tolerance(self, target, indexer, tolerance): + distance = abs(self.values[indexer] - target) + indexer = np.where(distance <= tolerance, indexer, -1) return indexer def get_indexer_non_unique(self, target): @@ -1911,7 +1953,8 @@ def _can_reindex(self, indexer): if not self.is_unique and len(indexer): raise ValueError("cannot reindex from a duplicate axis") - def reindex(self, target, method=None, level=None, limit=None): + def reindex(self, target, method=None, level=None, limit=None, + tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -1951,7 +1994,8 @@ def reindex(self, target, method=None, level=None, limit=None): else: if self.is_unique: indexer = self.get_indexer(target, method=method, - limit=limit) + limit=limit, + tolerance=tolerance) else: if method is not None or limit is not None: raise ValueError("cannot reindex a non-unique index " @@ -3098,7 +3142,8 @@ def 
_can_reindex(self, indexer): """ always allow reindexing """ pass - def reindex(self, target, method=None, level=None, limit=None): + def reindex(self, target, method=None, level=None, limit=None, + tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -3167,7 +3212,7 @@ def _reindex_non_unique(self, target): return new_target, indexer, new_indexer - def get_indexer(self, target, method=None, limit=None): + def get_indexer(self, target, method=None, limit=None, tolerance=None): """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the @@ -3416,6 +3461,14 @@ def _maybe_cast_slice_bound(self, label, side, kind): return label + def _convert_tolerance(self, tolerance): + try: + return float(tolerance) + except ValueError: + raise ValueError('tolerance argument for %s must be numeric: %r' + % (type(self).__name__, tolerance)) + + class Int64Index(NumericIndex): """ @@ -3672,7 +3725,7 @@ def __contains__(self, other): except: return False - def get_loc(self, key, method=None): + def get_loc(self, key, method=None, tolerance=None): try: if np.all(np.isnan(key)): nan_idxs = self._nan_idxs @@ -3684,7 +3737,8 @@ def get_loc(self, key, method=None): return nan_idxs except (TypeError, NotImplementedError): pass - return super(Float64Index, self).get_loc(key, method=method) + return super(Float64Index, self).get_loc(key, method=method, + tolerance=tolerance) @property def is_all_dates(self): @@ -4906,7 +4960,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): return new_index, indexer - def get_indexer(self, target, method=None, limit=None): + def get_indexer(self, target, method=None, limit=None, tolerance=None): """ Compute indexer and mask for new index given the current index. 
The indexer should be then used as an input to ndarray.take to align the @@ -4952,6 +5006,9 @@ def get_indexer(self, target, method=None, limit=None): self_index = self._tuple_index if method == 'pad' or method == 'backfill': + if tolerance is not None: + raise NotImplementedError("tolerance not implemented yet " + 'for MultiIndex') indexer = self_index._get_fill_indexer(target, method, limit) elif method == 'nearest': raise NotImplementedError("method='nearest' not implemented yet " @@ -4961,7 +5018,8 @@ def get_indexer(self, target, method=None, limit=None): return com._ensure_platform_int(indexer) - def reindex(self, target, method=None, level=None, limit=None): + def reindex(self, target, method=None, level=None, limit=None, + tolerance=None): """ Create index with target's values (move/add/delete values as necessary) @@ -5000,7 +5058,8 @@ def reindex(self, target, method=None, level=None, limit=None): else: if self.is_unique: indexer = self.get_indexer(target, method=method, - limit=limit) + limit=limit, + tolerance=tolerance) else: raise Exception( "cannot handle a non-unique multi-index!") diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 8c836ae564e28..1a7bfd2d9c88b 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1929,6 +1929,12 @@ def test_reindex_methods(self): actual = df.reindex(target, method=method) assert_frame_equal(expected, actual) + actual = df.reindex_like(df, method=method, tolerance=0) + assert_frame_equal(df, actual) + + actual = df.reindex(target, method=method, tolerance=1) + assert_frame_equal(expected, actual) + e2 = expected[::-1] actual = df.reindex(target[::-1], method=method) assert_frame_equal(e2, actual) @@ -1944,6 +1950,10 @@ def test_reindex_methods(self): actual = df[::-1].reindex(target, method=switched_method) assert_frame_equal(expected, actual) + expected = pd.DataFrame({'x': [0, 1, 1, np.nan]}, index=target) + actual = df.reindex(target, method='nearest', tolerance=0.2) + 
assert_frame_equal(expected, actual) + def test_non_monotonic_reindex_methods(self): dr = pd.date_range('2013-08-01', periods=6, freq='B') data = np.random.randn(6,1) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 9a3576a8fd846..688091d39d7c1 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1220,6 +1220,16 @@ def test_get_indexer(self): r2 = idx2.get_indexer(idx1[::-1], method='backfill') assert_almost_equal(r2, e1[::-1]) + def test_get_indexer_invalid(self): + # GH10411 + idx = Index(np.arange(10)) + + with tm.assertRaisesRegexp(ValueError, 'tolerance argument'): + idx.get_indexer([1, 0], tolerance=1) + + with tm.assertRaisesRegexp(ValueError, 'limit argument'): + idx.get_indexer([1, 0], limit=1) + def test_get_indexer_nearest(self): idx = Index(np.arange(10)) @@ -1228,10 +1238,20 @@ def test_get_indexer_nearest(self): actual = idx.get_indexer([0, 5, 9], method=method) tm.assert_numpy_array_equal(actual, [0, 5, 9]) + actual = idx.get_indexer([0, 5, 9], method=method, tolerance=0) + tm.assert_numpy_array_equal(actual, [0, 5, 9]) + for method, expected in zip(all_methods, [[0, 1, 8], [1, 2, 9], [0, 2, 9]]): actual = idx.get_indexer([0.2, 1.8, 8.5], method=method) tm.assert_numpy_array_equal(actual, expected) + actual = idx.get_indexer([0.2, 1.8, 8.5], method=method, tolerance=1) + tm.assert_numpy_array_equal(actual, expected) + + for method, expected in zip(all_methods, [[0, -1, -1], [-1, 2, -1], [0, 2, -1]]): + actual = idx.get_indexer([0.2, 1.8, 8.5], method=method, tolerance=0.2) + tm.assert_numpy_array_equal(actual, expected) + with tm.assertRaisesRegexp(ValueError, 'limit argument'): idx.get_indexer([1, 0], method='nearest', limit=1) @@ -1261,20 +1281,39 @@ def test_get_indexer_strings(self): with tm.assertRaises(TypeError): idx.get_indexer(['a', 'b', 'c', 'd'], method='nearest') + with tm.assertRaises(TypeError): + idx.get_indexer(['a', 'b', 'c', 'd'], method='pad', tolerance=2) + def test_get_loc(self): 
idx = pd.Index([0, 1, 2]) all_methods = [None, 'pad', 'backfill', 'nearest'] for method in all_methods: self.assertEqual(idx.get_loc(1, method=method), 1) + if method is not None: + self.assertEqual(idx.get_loc(1, method=method, tolerance=0), 1) with tm.assertRaises(TypeError): idx.get_loc([1, 2], method=method) for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: self.assertEqual(idx.get_loc(1.1, method), loc) + for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: + self.assertEqual(idx.get_loc(1.1, method, tolerance=1), loc) + + for method in ['pad', 'backfill', 'nearest']: + with tm.assertRaises(KeyError): + idx.get_loc(1.1, method, tolerance=0.05) + + with tm.assertRaisesRegexp(ValueError, 'must be numeric'): + idx.get_loc(1.1, 'nearest', tolerance='invalid') + with tm.assertRaisesRegexp(ValueError, 'tolerance .* valid if'): + idx.get_loc(1.1, tolerance=1) + idx = pd.Index(['a', 'c']) with tm.assertRaises(TypeError): idx.get_loc('a', method='nearest') + with tm.assertRaises(TypeError): + idx.get_loc('a', method='pad', tolerance='invalid') def test_slice_locs(self): for dtype in [int, float]: @@ -2266,12 +2305,20 @@ def test_get_loc(self): idx = Float64Index([0.0, 1.0, 2.0]) for method in [None, 'pad', 'backfill', 'nearest']: self.assertEqual(idx.get_loc(1, method), 1) + if method is not None: + self.assertEqual(idx.get_loc(1, method, tolerance=0), 1) for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: self.assertEqual(idx.get_loc(1.1, method), loc) + self.assertEqual(idx.get_loc(1.1, method, tolerance=0.9), loc) self.assertRaises(KeyError, idx.get_loc, 'foo') self.assertRaises(KeyError, idx.get_loc, 1.5) + self.assertRaises(KeyError, idx.get_loc, 1.5, + method='pad', tolerance=0.1) + + with tm.assertRaisesRegexp(ValueError, 'must be numeric'): + idx.get_loc(1.4, method='nearest', tolerance='foo') def test_get_loc_na(self): idx = Float64Index([np.nan, 1, 2]) @@ -2838,10 +2885,28 @@ def test_get_loc(self): 
self.assertEqual(idx.get_loc(idx[1], method), 1) self.assertEqual(idx.get_loc(idx[1].to_pydatetime(), method), 1) self.assertEqual(idx.get_loc(str(idx[1]), method), 1) + if method is not None: + self.assertEqual(idx.get_loc(idx[1], method, + tolerance=pd.Timedelta('0 days')), + 1) self.assertEqual(idx.get_loc('2000-01-01', method='nearest'), 0) self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest'), 1) + self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', + tolerance='1 day'), 1) + self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', + tolerance=pd.Timedelta('1D')), 1) + self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', + tolerance=np.timedelta64(1, 'D')), 1) + self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest', + tolerance=timedelta(1)), 1) + with tm.assertRaisesRegexp(ValueError, 'must be convertible'): + idx.get_loc('2000-01-01T12', method='nearest', tolerance='foo') + with tm.assertRaises(KeyError): + idx.get_loc('2000-01-01T03', method='nearest', + tolerance='2 hours') + self.assertEqual(idx.get_loc('2000', method='nearest'), slice(0, 3)) self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 3)) @@ -2878,6 +2943,11 @@ def test_get_indexer(self): tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), [-1, 0, 1]) tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), [0, 1, 2]) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), [0, 1, 1]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, 'nearest', tolerance=pd.Timedelta('1 hour')), + [0, -1, 1]) + with tm.assertRaises(ValueError): + idx.get_indexer(idx[[0]], method='nearest', tolerance='foo') def test_roundtrip_pickle_with_tz(self): @@ -2988,6 +3058,22 @@ def test_get_loc(self): self.assertEqual(idx.get_loc(idx[1].to_timestamp().to_pydatetime(), method), 1) self.assertEqual(idx.get_loc(str(idx[1]), method), 1) + idx = pd.period_range('2000-01-01', periods=5)[::2] + 
self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', + tolerance='1 day'), 1) + self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', + tolerance=pd.Timedelta('1D')), 1) + self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', + tolerance=np.timedelta64(1, 'D')), 1) + self.assertEqual(idx.get_loc('2000-01-02T12', method='nearest', + tolerance=timedelta(1)), 1) + with tm.assertRaisesRegexp(ValueError, 'must be convertible'): + idx.get_loc('2000-01-10', method='nearest', tolerance='foo') + with tm.assertRaisesRegexp(ValueError, 'different freq'): + idx.get_loc('2000-01-10', method='nearest', tolerance='1 hour') + with tm.assertRaises(KeyError): + idx.get_loc('2000-01-10', method='nearest', tolerance='1 day') + def test_get_indexer(self): idx = pd.period_range('2000-01-01', periods=3).asfreq('H', how='start') tm.assert_numpy_array_equal(idx.get_indexer(idx), [0, 1, 2]) @@ -2997,9 +3083,15 @@ def test_get_indexer(self): tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), [-1, 0, 1]) tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), [0, 1, 2]) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), [0, 1, 1]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, 'nearest', tolerance='1 hour'), + [0, -1, 1]) with self.assertRaisesRegexp(ValueError, 'different freq'): - idx.asfreq('D').get_indexer(idx) + idx.get_indexer(target, 'nearest', tolerance='1 minute') + + tm.assert_numpy_array_equal( + idx.get_indexer(target, 'nearest', tolerance='1 day'), [0, 1, 1]) def test_repeat(self): # GH10183 @@ -3029,6 +3121,13 @@ def test_get_loc(self): self.assertEqual(idx.get_loc(idx[1].to_pytimedelta(), method), 1) self.assertEqual(idx.get_loc(str(idx[1]), method), 1) + self.assertEqual(idx.get_loc(idx[1], 'pad', tolerance=pd.Timedelta(0)), 1) + self.assertEqual(idx.get_loc(idx[1], 'pad', tolerance=np.timedelta64(0, 's')), 1) + self.assertEqual(idx.get_loc(idx[1], 'pad', tolerance=timedelta(0)), 1) + + with 
tm.assertRaisesRegexp(ValueError, 'must be convertible'): + idx.get_loc(idx[1], method='nearest', tolerance='foo') + for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: self.assertEqual(idx.get_loc('1 day 1 hour', method), loc) @@ -3040,6 +3139,10 @@ def test_get_indexer(self): tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), [-1, 0, 1]) tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), [0, 1, 2]) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), [0, 1, 1]) + tm.assert_numpy_array_equal( + idx.get_indexer(target, 'nearest', + tolerance=pd.Timedelta('1 hour')), + [0, -1, 1]) def test_numeric_compat(self): @@ -4059,6 +4162,8 @@ def test_get_indexer_nearest(self): midx = MultiIndex.from_tuples([('a', 1), ('b', 2)]) with tm.assertRaises(NotImplementedError): midx.get_indexer(['a'], method='nearest') + with tm.assertRaises(NotImplementedError): + midx.get_indexer(['a'], method='pad', tolerance=2) def test_format(self): self.index.format() diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 36a8600e51725..4fa8aaf34846f 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -6465,6 +6465,13 @@ def test_reindex_nearest(self): actual = s.reindex_like(actual, method='nearest') assert_series_equal(expected, actual) + actual = s.reindex_like(actual, method='nearest', tolerance=1) + assert_series_equal(expected, actual) + + actual = s.reindex(target, method='nearest', tolerance=0.2) + expected = Series([0, 1, np.nan, 2], target) + assert_series_equal(expected, actual) + def test_reindex_backfill(self): pass diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 727852ced25b0..c353e66bc2dbb 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -203,6 +203,14 @@ def asobject(self): from pandas.core.index import Index return Index(self._box_values(self.asi8), name=self.name, dtype=object) + def _convert_tolerance(self, tolerance): + try: + return 
tslib.Timedelta(tolerance).to_timedelta64() + except ValueError: + raise ValueError('tolerance argument for %s must be convertible ' + 'to Timedelta: %r' + % (type(self).__name__, tolerance)) + def _maybe_mask_results(self, result, fill_value=None, convert=None): """ Parameters diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 19ff9a4b19a3e..0525a29ef3fd0 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1270,7 +1270,7 @@ def get_value_maybe_box(self, series, key): values = self._engine.get_value(_values_from_object(series), key) return _maybe_box(self, values, series, key) - def get_loc(self, key, method=None): + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -1278,10 +1278,15 @@ def get_loc(self, key, method=None): ------- loc : int """ + if tolerance is not None: + # try converting tolerance now, so errors don't get swallowed by + # the try/except clauses below + tolerance = self._convert_tolerance(tolerance) + if isinstance(key, datetime): # needed to localize naive datetimes key = Timestamp(key, tz=self.tz) - return Index.get_loc(self, key, method=method) + return Index.get_loc(self, key, method, tolerance) if isinstance(key, time): if method is not None: @@ -1290,7 +1295,7 @@ def get_loc(self, key, method=None): return self.indexer_at_time(key) try: - return Index.get_loc(self, key, method=method) + return Index.get_loc(self, key, method, tolerance) except (KeyError, ValueError, TypeError): try: return self._get_string_slice(key) @@ -1299,7 +1304,7 @@ def get_loc(self, key, method=None): try: stamp = Timestamp(key, tz=self.tz) - return Index.get_loc(self, stamp, method=method) + return Index.get_loc(self, stamp, method, tolerance) except (KeyError, ValueError): raise KeyError(key) diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index e7b229e91cbc8..56d7d45120fdc 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -502,22 +502,26 
@@ def to_timestamp(self, freq=None, how='start'): new_data = period.periodarr_to_dt64arr(new_data.values, base) return DatetimeIndex(new_data, freq='infer', name=self.name) - def _add_delta(self, other): + def _maybe_convert_timedelta(self, other): if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if nanos % offset_nanos == 0: - return self.shift(nanos // offset_nanos) + return nanos // offset_nanos elif isinstance(other, offsets.DateOffset): freqstr = frequencies.get_standard_freq(other) base = frequencies.get_base_alias(freqstr) if base == self.freq: - return self.shift(other.n) + return other.n raise ValueError("Input has different freq from PeriodIndex(freq={0})".format(self.freq)) + def _add_delta(self, other): + ordinal_delta = self._maybe_convert_timedelta(other) + return self.shift(ordinal_delta) + def shift(self, n): """ Specialized shift which produces an PeriodIndex @@ -586,13 +590,13 @@ def get_value(self, series, key): key = Period(key, self.freq).ordinal return _maybe_box(self, self._engine.get_value(s, key), series, key) - def get_indexer(self, target, method=None, limit=None): + def get_indexer(self, target, method=None, limit=None, tolerance=None): if hasattr(target, 'freq') and target.freq != self.freq: raise ValueError('target and index have different freq: ' '(%s, %s)' % (target.freq, self.freq)) - return Index.get_indexer(self, target, method, limit) + return Index.get_indexer(self, target, method, limit, tolerance) - def get_loc(self, key, method=None): + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -614,7 +618,7 @@ def get_loc(self, key, method=None): key = Period(key, self.freq) try: - return Index.get_loc(self, key.ordinal, method=method) + return Index.get_loc(self, key.ordinal, method, 
tolerance) except KeyError: raise KeyError(key) @@ -694,6 +698,10 @@ def _get_string_slice(self, key): return slice(self.searchsorted(t1.ordinal, side='left'), self.searchsorted(t2.ordinal, side='right')) + def _convert_tolerance(self, tolerance): + tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance) + return self._maybe_convert_timedelta(tolerance) + def join(self, other, how='left', level=None, return_indexers=False): """ See Index.join diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index d7172dd304b6b..b0c9d8852f8c9 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -645,7 +645,7 @@ def get_value_maybe_box(self, series, key): values = self._engine.get_value(_values_from_object(series), key) return _maybe_box(self, values, series, key) - def get_loc(self, key, method=None): + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -653,12 +653,17 @@ def get_loc(self, key, method=None): ------- loc : int """ + if tolerance is not None: + # try converting tolerance now, so errors don't get swallowed by + # the try/except clauses below + tolerance = self._convert_tolerance(tolerance) + if _is_convertible_to_td(key): key = Timedelta(key) - return Index.get_loc(self, key, method=method) + return Index.get_loc(self, key, method, tolerance) try: - return Index.get_loc(self, key, method=method) + return Index.get_loc(self, key, method, tolerance) except (KeyError, ValueError, TypeError): try: return self._get_string_slice(key) @@ -667,7 +672,7 @@ def get_loc(self, key, method=None): try: stamp = Timedelta(key) - return Index.get_loc(self, stamp, method=method) + return Index.get_loc(self, stamp, method, tolerance) except (KeyError, ValueError): raise KeyError(key)