Merge pull request #10411 from shoyer/max_distance

ENH: tolerance argument for limiting pad, backfill and nearest neighbor reindexing
pandas-dev · Aug 18, 2015 · 5052900 · 5052900
2 parents 931e0e5 + 0468cad
commit 5052900
Show file tree

Hide file tree

Showing 12 changed files with 328 additions and 56 deletions.
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1100,6 +1100,30 @@ Note that the same result could have been achieved using
 increasing or descreasing. :meth:`~Series.fillna` and :meth:`~Series.interpolate`
 will not make any checks on the order of the index.
 
+.. _basics.limits_on_reindex_fill:
+
+Limits on filling while reindexing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``limit`` and ``tolerance`` arguments provide additional control over
+filling while reindexing. Limit specifies the maximum count of consecutive
+matches:
+
+.. ipython:: python
+
+   ts2.reindex(ts.index, method='ffill', limit=1)
+
+In contrast, tolerance specifies the maximum distance between the index and
+indexer values:
+
+.. ipython:: python
+
+   ts2.reindex(ts.index, method='ffill', tolerance='1 day')
+
+Notice that when used on a ``DatetimeIndex``, ``TimedeltaIndex`` or
+``PeriodIndex``, ``tolerance`` will coerced into a ``Timedelta`` if possible.
+This allows you to specify tolerance with appropriate strings.
+
 .. _basics.drop:
 
 Dropping labels from an axis

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -182,6 +182,22 @@ Other enhancements
    s.drop_duplicates(keep=False)
 
 
+- Reindex now has a ``tolerance`` argument that allows for finer control of :ref:`basics.limits_on_reindex_fill`:
+
+  .. ipython:: python
+
+     df = pd.DataFrame({'x': range(5), 't': pd.date_range('2000-01-01', periods=5)})
+     df.reindex([0.1, 1.9, 3.5], method='nearest', tolerance=0.2)
+
+  When used on a ``DatetimeIndex``, ``TimedeltaIndex`` or ``PeriodIndex``, ``tolerance`` will coerced into a ``Timedelta`` if possible. This allows you to specify tolerance with a string:
+
+  .. ipython:: python
+
+     df = df.set_index('t')
+     df.reindex(pd.to_datetime(['1999-12-31']), method='nearest', tolerance='1 day')
+
+  ``tolerance`` is also exposed by the lower level ``Index.get_indexer`` and ``Index.get_loc`` methods.
+
 .. _whatsnew_0170.api:
 
 .. _whatsnew_0170.api_breaking:

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2518,33 +2518,36 @@ def lookup(self, row_labels, col_labels):
     #----------------------------------------------------------------------
     # Reindexing and alignment
 
-    def _reindex_axes(self, axes, level, limit, method, fill_value, copy):
+    def _reindex_axes(self, axes, level, limit, tolerance, method,
+                      fill_value, copy):
         frame = self
 
         columns = axes['columns']
         if columns is not None:
             frame = frame._reindex_columns(columns, copy, level, fill_value,
-                                           limit)
+                                           limit, tolerance)
 
         index = axes['index']
         if index is not None:
             frame = frame._reindex_index(index, method, copy, level,
-                                         fill_value, limit)
+                                         fill_value, limit, tolerance)
 
         return frame
 
     def _reindex_index(self, new_index, method, copy, level, fill_value=NA,
-                       limit=None):
+                       limit=None, tolerance=None):
         new_index, indexer = self.index.reindex(new_index, method, level,
-                                                limit=limit)
+                                                limit=limit,
+                                                tolerance=tolerance)
         return self._reindex_with_indexers({0: [new_index, indexer]},
                                            copy=copy, fill_value=fill_value,
                                            allow_dups=False)
 
     def _reindex_columns(self, new_columns, copy, level, fill_value=NA,
-                         limit=None):
+                         limit=None, tolerance=None):
         new_columns, indexer = self.columns.reindex(new_columns, level=level,
-                                                    limit=limit)
+                                                    limit=limit,
+                                                    tolerance=tolerance)
         return self._reindex_with_indexers({1: [new_columns, indexer]},
                                            copy=copy, fill_value=fill_value,
                                            allow_dups=False)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -922,7 +922,7 @@ def to_hdf(self, path_or_buf, key, **kwargs):
             in the store wherever possible
         fletcher32 : bool, default False
             If applying compression use the fletcher32 checksum
-        dropna : boolean, default False. 
+        dropna : boolean, default False.
             If true, ALL nan rows will not be written to store.
 
         """
@@ -1551,7 +1551,8 @@ def select(self, crit, axis=0):
 
         return self.reindex(**{axis_name: new_axis})
 
-    def reindex_like(self, other, method=None, copy=True, limit=None):
+    def reindex_like(self, other, method=None, copy=True, limit=None,
+                     tolerance=None):
         """ return an object with matching indicies to myself
 
         Parameters
@@ -1560,7 +1561,12 @@ def reindex_like(self, other, method=None, copy=True, limit=None):
         method : string or None
         copy : boolean, default True
         limit : int, default None
-            Maximum size gap to forward or backward fill
+            Maximum number of consecutive labels to fill for inexact matches.
+        tolerance : optional
+            Maximum distance between labels of the other object and this
+            object for inexact matches.
+
+            .. versionadded:: 0.17.0
 
         Notes
         -----
@@ -1572,7 +1578,8 @@ def reindex_like(self, other, method=None, copy=True, limit=None):
         reindexed : same as input
         """
         d = other._construct_axes_dict(axes=self._AXIS_ORDERS,
-                method=method, copy=copy, limit=limit)
+                method=method, copy=copy, limit=limit,
+                tolerance=tolerance)
 
         return self.reindex(**d)
 
@@ -1736,7 +1743,13 @@ def sort_index(self, axis=0, ascending=True):
             Value to use for missing values. Defaults to NaN, but can be any
             "compatible" value
         limit : int, default None
-            Maximum size gap to forward or backward fill
+            Maximum number of consecutive elements to forward or backward fill
+        tolerance : optional
+            Maximum distance between original and new labels for inexact
+            matches. The values of the index at the matching locations most
+            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+
+            .. versionadded:: 0.17.0
 
         Examples
         --------
@@ -1758,6 +1771,7 @@ def reindex(self, *args, **kwargs):
         level = kwargs.pop('level', None)
         copy = kwargs.pop('copy', True)
         limit = kwargs.pop('limit', None)
+        tolerance = kwargs.pop('tolerance', None)
         fill_value = kwargs.pop('fill_value', np.nan)
 
         if kwargs:
@@ -1782,10 +1796,11 @@ def reindex(self, *args, **kwargs):
                 pass
 
         # perform the reindex on the axes
-        return self._reindex_axes(axes, level, limit,
+        return self._reindex_axes(axes, level, limit, tolerance,
                                   method, fill_value, copy).__finalize__(self)
 
-    def _reindex_axes(self, axes, level, limit, method, fill_value, copy):
+    def _reindex_axes(self, axes, level, limit, tolerance, method,
+                      fill_value, copy):
         """ perform the reinxed for all the axes """
         obj = self
         for a in self._AXIS_ORDERS:
@@ -1795,7 +1810,8 @@ def _reindex_axes(self, axes, level, limit, method, fill_value, copy):
 
             ax = self._get_axis(a)
             new_index, indexer = ax.reindex(
-                labels, level=level, limit=limit, method=method)
+                labels, level=level, limit=limit, tolerance=tolerance,
+                method=method)
 
             axis = self._get_axis_number(a)
             obj = obj._reindex_with_indexers(
@@ -1836,7 +1852,13 @@ def _reindex_multi(self, axes, copy, fill_value):
             Broadcast across a level, matching Index values on the
             passed MultiIndex level
         limit : int, default None
-            Maximum size gap to forward or backward fill
+            Maximum number of consecutive elements to forward or backward fill
+        tolerance : optional
+            Maximum distance between original and new labels for inexact
+            matches. The values of the index at the matching locations most
+            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+
+            .. versionadded:: 0.17.0
 
         Examples
         --------
@@ -2910,7 +2932,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
               use the actual numerical values of the index
             * 'krogh', 'piecewise_polynomial', 'spline', and 'pchip' are all
               wrappers around the scipy interpolation methods of similar
-              names. These use the actual numerical values of the index. See 
+              names. These use the actual numerical values of the index. See
               the scipy documentation for more on their behavior:
               http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation
               http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html