Make common impl. with Index.searchsorted

pandas-dev · Aug 1, 2018 · 0c01d60 · 0c01d60
1 parent 117da18
commit 0c01d60
Show file tree

Hide file tree

Showing 4 changed files with 71 additions and 15 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -496,6 +496,7 @@ Performance Improvements
   both when indexing by label (using .loc) and position(.iloc).
   Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
 - Improved performance of :func:`Series.searchsorted` (:issue:`22034`)
+- Improved performance of :func:`Index.searchsorted` when dtype is uint64, float64 or object (:issue:`22034`)
 - Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
 - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
 - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`,:issue:`21606`)

diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -15,7 +15,8 @@
     is_list_like,
     is_scalar,
     is_extension_type,
-    is_extension_array_dtype)
+    is_extension_array_dtype,
+    ensure_platform_int)
 
 from pandas.util._validators import validate_bool_kwarg
 from pandas.errors import AbstractMethodError
@@ -1230,7 +1231,9 @@ def factorize(self, sort=False, na_sentinel=-1):
     @Appender(_shared_docs['searchsorted'])
     @deprecate_kwarg(old_arg_name='key', new_arg_name='value')
     def searchsorted(self, value, side='left', sorter=None):
-        # needs coercion on the key (DatetimeIndex does already)
+        if sorter is not None:
+            sorter = ensure_platform_int(sorter)
+        value = com.maybe_convert_numeric_dtype(value, dtype=self.dtype)
         return self.values.searchsorted(value, side=side, sorter=sorter)
 
     def drop_duplicates(self, keep='first', inplace=False):

diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -15,7 +15,9 @@
 from pandas import compat
 from pandas.compat import iteritems, PY36, OrderedDict
 from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass
-from pandas.core.dtypes.common import is_integer
+from pandas.core.dtypes.common import (is_integer, is_integer_dtype, is_float,
+                                       is_float_dtype, is_object_dtype,
+                                       is_scalar)
 from pandas.core.dtypes.inference import _iterable_not_string
 from pandas.core.dtypes.missing import isna, isnull, notnull  # noqa
 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
@@ -430,3 +432,47 @@ def _pipe(obj, func, *args, **kwargs):
         return func(*args, **kwargs)
     else:
         return func(obj, *args, **kwargs)
+
+
+def maybe_convert_numeric_dtype(value, dtype):
+    """
+    Convert value to have dtype 'dtype' if 'dtype' is int or float.
+
+    :func:`numpy.searchsorted` is only fast if value is of same dtype
+    as the searched array. Below we ensure that value has the right
+    dtype for giving fast results for arr.searchsorted.
+
+    Parameters
+    ----------
+    value : scalar or list-like
+    dtype : a dtype
+
+    Returns
+    -------
+    return_value : value as array of dtype 'dtype' when the conversion is
+        lossless, else value unchanged (e.g. float 2.1 for an int dtype,
+        since loc of 2.1 should be behind loc of int 2, *not* before it)
+
+    Raises
+    ------
+    ValueError
+        If 'dtype' is an integer dtype and value is out of its bounds.
+    """
+    if is_float_dtype(dtype):
+        value = np.asarray(value, dtype=dtype)
+    elif is_integer_dtype(dtype):
+        # check bounds; np.asarray makes the comparisons elementwise for
+        # plain lists/tuples as well as for scalars and arrays
+        iinfo = np.iinfo(dtype)
+        arr = np.asarray(value)
+        if (arr < iinfo.min).any() or (arr > iinfo.max).any():
+            msg = "Value {} out of bound for dtype {}".format(value, dtype)
+            raise ValueError(msg)
+
+        # convert value
+        if is_integer(value) or is_integer_dtype(arr):
+            value = np.asarray(value, dtype=dtype)
+        elif hasattr(value, 'is_integer') and value.is_integer():
+            # integral floats (2.0) convert losslessly; 2.2 must stay float
+            value = np.asarray(value, dtype=dtype)
+    return value
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -19,6 +19,7 @@
     is_categorical_dtype,
     is_bool,
     is_integer, is_integer_dtype,
+    is_numeric_dtype,
     is_float_dtype,
     is_extension_type,
     is_extension_array_dtype,
@@ -264,7 +265,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
                         raise ValueError(
                             'Length of passed values is {val}, '
                             'index implies {ind}'
-                            .format(val=len(data), ind=len(index)))
+                                .format(val=len(data), ind=len(index)))
                 except TypeError:
                     pass
 
@@ -667,9 +668,9 @@ def __array_prepare__(self, result, context=None):
             obj = context[1][0]
             raise TypeError("{obj} with dtype {dtype} cannot perform "
                             "the numpy op {op}".format(
-                                obj=type(obj).__name__,
-                                dtype=getattr(obj, 'dtype', None),
-                                op=context[0].__name__))
+                obj=type(obj).__name__,
+                dtype=getattr(obj, 'dtype', None),
+                op=context[0].__name__))
         return result
 
     # complex
@@ -2082,14 +2083,19 @@ def __rmatmul__(self, other):
     def searchsorted(self, value, side='left', sorter=None):
         if sorter is not None:
             sorter = ensure_platform_int(sorter)
-        if not is_extension_type(self._values):
-            # numpy searchsorted is only fast if value is of same dtype as the
-            # searched array. Below we ensure that value has the right dtype,
-            # and is not 0-dimensional.
-            value = np.asarray(value, dtype=self._values.dtype)
-            value = value[..., np.newaxis] if value.ndim == 0 else value
-
-        return self._values.searchsorted(value, side=side, sorter=sorter)
+
+        if is_numeric_dtype(self):
+            value = com.maybe_convert_numeric_dtype(value, dtype=self.dtype)
+        elif not (is_object_dtype(self) or is_categorical_dtype(self)):
+            value = Series(value)._values
+
+        result = self._values.searchsorted(value, side=side, sorter=sorter)
+
+        if is_scalar(result):
+            # ensure that a 1-dim array is returned
+            result = np.array([result])
+
+        return result
 
     # -------------------------------------------------------------------
     # Combination