Skip to content

Commit

Permalink
Make common impl. with Index.searchsorted
Browse files Browse the repository at this point in the history
  • Loading branch information
topper-123 committed Aug 11, 2018
1 parent 7f65f38 commit f4961a9
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 13 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,7 @@ Performance Improvements
both when indexing by label (using .loc) and position (using .iloc).
Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
- Improved performance of :func:`Series.searchsorted` (:issue:`22034`)
- Improved performance of :func:`Index.searchsorted` when dtype is uint64, float64 or object (:issue:`22034`)
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`,:issue:`21606`)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1230,8 +1230,8 @@ def factorize(self, sort=False, na_sentinel=-1):
@Appender(_shared_docs['searchsorted'])
@deprecate_kwarg(old_arg_name='key', new_arg_name='value')
def searchsorted(self, value, side='left', sorter=None):
# needs coercion on the key (DatetimeIndex does already)
return self.values.searchsorted(value, side=side, sorter=sorter)
return com.searchsorted(self._values, value,
side=side, sorter=sorter)

def drop_duplicates(self, keep='first', inplace=False):
inplace = validate_bool_kwarg(inplace, 'inplace')
Expand Down
46 changes: 45 additions & 1 deletion pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from pandas import compat
from pandas.compat import iteritems, PY36, OrderedDict
from pandas.core.dtypes.generic import ABCSeries, ABCIndex, ABCIndexClass
from pandas.core.dtypes.common import is_integer
from pandas.core.dtypes.common import (is_integer, is_integer_dtype,
is_scalar, ensure_platform_int)
from pandas.core.dtypes.inference import _iterable_not_string
from pandas.core.dtypes.missing import isna, isnull, notnull # noqa
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
Expand Down Expand Up @@ -430,3 +431,46 @@ def _pipe(obj, func, *args, **kwargs):
return func(*args, **kwargs)
else:
return func(obj, *args, **kwargs)


def searchsorted_integer(arr, value, side="left", sorter=None):
    """
    Find insertion indices for ``value`` in an integer-dtype array.

    ``arr.searchsorted`` is fastest when ``value`` has the same dtype as
    ``arr``, so integer-valued input (scalars and list-likes alike) is cast
    to ``arr.dtype`` before the search is delegated.

    Parameters
    ----------
    arr : numpy.ndarray
        Array to search; its dtype must be accepted by ``np.iinfo``.
    value : scalar or array-like
        Value(s) to locate in ``arr``.
    side : {'left', 'right'}, default 'left'
        Passed through to ``arr.searchsorted``.
    sorter : array-like, optional
        Passed through to ``arr.searchsorted`` after coercion with
        ``ensure_platform_int``.

    Returns
    -------
    The result of ``arr.searchsorted`` for the (possibly cast) ``value``.

    Raises
    ------
    ValueError
        If any element of ``value`` is below the minimum or above the
        maximum representable in ``arr.dtype``.
    """
    dtype = arr.dtype
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    # Reject values that cannot be represented in arr's dtype; casting them
    # below would otherwise wrap around or fail.
    iinfo = np.iinfo(dtype)
    value_arr = np.array([value]) if is_scalar(value) else np.array(value)
    if (value_arr < iinfo.min).any() or (value_arr > iinfo.max).any():
        msg = "Value {} out of bound for dtype {}".format(value, dtype)
        raise ValueError(msg)

    # Cast value to arr's dtype for better searchsorted speed.
    # ``is_integer_dtype(value)`` alone misses plain lists/tuples of ints
    # (they carry no dtype), so the materialised ``value_arr`` is checked too.
    if (is_integer(value) or is_integer_dtype(value)
            or is_integer_dtype(value_arr)):
        value = np.asarray(value, dtype=dtype)
    elif hasattr(value, 'is_integer') and value.is_integer():
        # float 2.0 should be converted to int 2
        # but float 2.2 should *not* be converted to int 2
        value = np.asarray(value, dtype=dtype)

    return arr.searchsorted(value, side=side, sorter=sorter)


def searchsorted(arr, value, side="left", sorter=None):
    """
    Call ``arr.searchsorted(value)`` with adjustments for integer dtypes.

    :func:`numpy.searchsorted` is only fast when ``value`` has the same dtype
    as the searched array; otherwise numpy recasts ``arr`` to a higher dtype,
    which causes a slowdown. Integer-dtype arrays are therefore routed
    through ``searchsorted_integer``; every other dtype is searched directly.

    See :meth:`Index.searchsorted` for details on parameters and return value.
    """
    # Coerce a supplied sorter with ensure_platform_int before it is used.
    sorter = None if sorter is None else ensure_platform_int(sorter)

    if not is_integer_dtype(arr):
        return arr.searchsorted(value, side=side, sorter=sorter)
    return searchsorted_integer(arr, value, side=side, sorter=sorter)
28 changes: 18 additions & 10 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2080,16 +2080,24 @@ def __rmatmul__(self, other):
@Appender(base._shared_docs['searchsorted'])
@deprecate_kwarg(old_arg_name='v', new_arg_name='value')
def searchsorted(self, value, side='left', sorter=None):
if sorter is not None:
sorter = ensure_platform_int(sorter)
if not is_extension_type(self._values):
# numpy searchsorted is only fast if value is of same dtype as the
# searched array. Below we ensure that value has the right dtype,
# and is not 0-dimensional.
value = np.asarray(value, dtype=self._values.dtype)
value = value[..., np.newaxis] if value.ndim == 0 else value

return self._values.searchsorted(value, side=side, sorter=sorter)
simple_types = (is_integer_dtype, is_float_dtype, is_object_dtype,
is_categorical_dtype)

if any(is_dtype(self) for is_dtype in simple_types):
result = com.searchsorted(self._values, value,
side=side, sorter=sorter)
else:
# e.g. self is datetimelike and value is a pd.Timestamp
if sorter is not None:
sorter = ensure_platform_int(sorter)
value = Series(value)._values
result = self._values.searchsorted(value, side=side, sorter=sorter)

if is_scalar(result):
# ensure that a 1-dim array is returned
result = np.array([result])

return result

# -------------------------------------------------------------------
# Combination
Expand Down

0 comments on commit f4961a9

Please sign in to comment.