Skip to content

Commit

Permalink
Merge branch 'master' into 33200-groupby-quantile
Browse files Browse the repository at this point in the history
  • Loading branch information
mabelvj committed Apr 26, 2020
2 parents 5832ba9 + 0db2286 commit 662c102
Show file tree
Hide file tree
Showing 174 changed files with 3,963 additions and 2,756 deletions.
10 changes: 7 additions & 3 deletions asv_bench/benchmarks/arithmetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def time_series_op_with_fill_value_no_nas(self):
self.ser.add(self.ser, fill_value=4)


class MixedFrameWithSeriesAxis0:
class MixedFrameWithSeriesAxis:
params = [
[
"eq",
Expand All @@ -78,7 +78,7 @@ class MixedFrameWithSeriesAxis0:
"gt",
"add",
"sub",
"div",
"truediv",
"floordiv",
"mul",
"pow",
Expand All @@ -87,15 +87,19 @@ class MixedFrameWithSeriesAxis0:
param_names = ["opname"]

def setup(self, opname):
arr = np.arange(10 ** 6).reshape(100, -1)
arr = np.arange(10 ** 6).reshape(1000, -1)
df = DataFrame(arr)
df["C"] = 1.0
self.df = df
self.ser = df[0]
self.row = df.iloc[0]

def time_frame_op_with_series_axis0(self, opname):
getattr(self.df, opname)(self.ser, axis=0)

def time_frame_op_with_series_axis1(self, opname):
getattr(operator, opname)(self.df, self.ser)


class Ops:

Expand Down
58 changes: 58 additions & 0 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -660,4 +660,62 @@ def function(values):
self.grouper.transform(function, engine="cython")


class AggEngine:
    """Benchmark a user-defined aggregation under the numba and cython engines.

    Four timings cover the cross product of {Series, DataFrame} groupby
    aggregation x {numba, cython} execution engine, all using the same
    alternating add/multiply reduction.
    """

    def setup(self):
        # 100 distinct string keys repeated n_repeats times -> 100 groups
        # of 1000 integer values each.
        n_repeats = 10 ** 3
        keys = [str(i) for i in range(100)] * n_repeats
        vals = list(range(100)) * n_repeats
        frame = DataFrame({0: keys, 1: vals}, columns=[0, 1])
        self.grouper = frame.groupby(0)

    def time_series_numba(self):
        # numba engine requires the (values, index) signature.
        def agg_func(values, index):
            acc = 0
            for pos, val in enumerate(values):
                acc += val + 5 if pos % 2 else val * 2
            return acc

        self.grouper[1].agg(agg_func, engine="numba")

    def time_series_cython(self):
        def agg_func(values):
            acc = 0
            for pos, val in enumerate(values):
                acc += val + 5 if pos % 2 else val * 2
            return acc

        self.grouper[1].agg(agg_func, engine="cython")

    def time_dataframe_numba(self):
        # numba engine requires the (values, index) signature.
        def agg_func(values, index):
            acc = 0
            for pos, val in enumerate(values):
                acc += val + 5 if pos % 2 else val * 2
            return acc

        self.grouper.agg(agg_func, engine="numba")

    def time_dataframe_cython(self):
        def agg_func(values):
            acc = 0
            for pos, val in enumerate(values):
                acc += val + 5 if pos % 2 else val * 2
            return acc

        self.grouper.agg(agg_func, engine="cython")


from .pandas_vb_common import setup # noqa: F401 isort:skip
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/stat_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ class FrameOps:
param_names = ["op", "dtype", "axis"]

def setup(self, op, dtype, axis):
if op == "mad" and dtype == "Int64" and axis == 1:
# GH-33036
if op == "mad" and dtype == "Int64":
# GH-33036, GH#33600
raise NotImplementedError
values = np.random.randn(100000, 4)
if dtype == "Int64":
Expand Down
8 changes: 7 additions & 1 deletion ci/code_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,13 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then
# Check for imports from pandas._testing instead of `import pandas._testing as tm`
invgrep -R --include="*.py*" -E "from pandas._testing import" pandas/tests
RET=$(($RET + $?)) ; echo $MSG "DONE"
invgrep -R --include="*.py*" -E "from pandas.util import testing as tm" pandas/tests
invgrep -R --include="*.py*" -E "from pandas import _testing as tm" pandas/tests
RET=$(($RET + $?)) ; echo $MSG "DONE"

# No direct imports from conftest
invgrep -R --include="*.py*" -E "conftest import" pandas/tests
RET=$(($RET + $?)) ; echo $MSG "DONE"
invgrep -R --include="*.py*" -E "import conftest" pandas/tests
RET=$(($RET + $?)) ; echo $MSG "DONE"

MSG='Check for use of exec' ; echo $MSG
Expand Down
2 changes: 1 addition & 1 deletion doc/source/getting_started/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,7 @@ data set, a sliding window of the data or grouped by categories. The latter is a
<div class="card-body">

Change the structure of your data table in multiple ways. You can :func:`~pandas.melt` your data table from wide to long/tidy form or :func:`~pandas.pivot`
from long to wide format. With aggregations built-in, a pivot table is created with a sinlge command.
from long to wide format. With aggregations built-in, a pivot table is created with a single command.

.. image:: ../_static/schemas/07_melt.svg
:align: center
Expand Down
20 changes: 12 additions & 8 deletions doc/source/user_guide/computation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -318,8 +318,8 @@ We provide a number of common statistical functions:
:meth:`~Rolling.kurt`, Sample kurtosis (4th moment)
:meth:`~Rolling.quantile`, Sample quantile (value at %)
:meth:`~Rolling.apply`, Generic apply
:meth:`~Rolling.cov`, Unbiased covariance (binary)
:meth:`~Rolling.corr`, Correlation (binary)
:meth:`~Rolling.cov`, Sample covariance (binary)
:meth:`~Rolling.corr`, Sample correlation (binary)

.. _computation.window_variance.caveats:

Expand All @@ -341,6 +341,8 @@ We provide a number of common statistical functions:
sample variance under the circumstances would result in a biased estimator
of the variable we are trying to determine.

The same caveats apply to using any supported statistical sample methods.

.. _stats.rolling_apply:

Rolling apply
Expand Down Expand Up @@ -380,8 +382,8 @@ and their default values are set to ``False``, ``True`` and ``False`` respective
.. note::

In terms of performance, **the first time a function is run using the Numba engine will be slow**
as Numba will have some function compilation overhead. However, ``rolling`` objects will cache
the function and subsequent calls will be fast. In general, the Numba engine is performant with
as Numba will have some function compilation overhead. However, the compiled functions are cached,
and subsequent calls will be fast. In general, the Numba engine is performant with
a larger amount of data points (e.g. 1+ million).

.. code-block:: ipython
Expand Down Expand Up @@ -870,12 +872,12 @@ Method summary
:meth:`~Expanding.max`, Maximum
:meth:`~Expanding.std`, Sample standard deviation
:meth:`~Expanding.var`, Sample variance
:meth:`~Expanding.skew`, Unbiased skewness (3rd moment)
:meth:`~Expanding.kurt`, Unbiased kurtosis (4th moment)
:meth:`~Expanding.skew`, Sample skewness (3rd moment)
:meth:`~Expanding.kurt`, Sample kurtosis (4th moment)
:meth:`~Expanding.quantile`, Sample quantile (value at %)
:meth:`~Expanding.apply`, Generic apply
:meth:`~Expanding.cov`, Unbiased covariance (binary)
:meth:`~Expanding.corr`, Correlation (binary)
:meth:`~Expanding.cov`, Sample covariance (binary)
:meth:`~Expanding.corr`, Sample correlation (binary)

.. note::

Expand All @@ -884,6 +886,8 @@ Method summary
windows. See :ref:`this section <computation.window_variance.caveats>` for more
information.

The same caveats apply to using any supported statistical sample methods.

.. currentmodule:: pandas

Aside from not having a ``window`` parameter, these functions have the same
Expand Down
27 changes: 0 additions & 27 deletions doc/source/user_guide/cookbook.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1333,33 +1333,6 @@ Values can be set to NaT using np.nan, similar to datetime
y[1] = np.nan
y
Aliasing axis names
-------------------

To globally provide aliases for axis names, one can define these 2 functions:

.. ipython:: python
def set_axis_alias(cls, axis, alias):
if axis not in cls._AXIS_NUMBERS:
raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias))
cls._AXIS_ALIASES[alias] = axis
.. ipython:: python
def clear_axis_alias(cls, axis, alias):
if axis not in cls._AXIS_NUMBERS:
raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias))
cls._AXIS_ALIASES.pop(alias, None)
.. ipython:: python
set_axis_alias(pd.DataFrame, 'columns', 'myaxis2')
df2 = pd.DataFrame(np.random.randn(3, 2), columns=['c1', 'c2'],
index=['i1', 'i2', 'i3'])
df2.sum(axis='myaxis2')
clear_axis_alias(pd.DataFrame, 'columns', 'myaxis2')
Creating example data
---------------------

Expand Down
67 changes: 67 additions & 0 deletions doc/source/user_guide/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1021,6 +1021,73 @@ that is itself a series, and possibly upcast the result to a DataFrame:
the output as well as set the indices.


Numba Accelerated Routines
--------------------------

.. versionadded:: 1.1

If `Numba <https://numba.pydata.org/>`__ is installed as an optional dependency, the ``transform`` and
``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments. The ``engine_kwargs``
argument is a dictionary of keyword arguments that will be passed into the
`numba.jit decorator <https://numba.pydata.org/numba-doc/latest/reference/jit-compilation.html#numba.jit>`__.
These keyword arguments will be applied to the passed function. Currently only ``nogil``, ``nopython``,
and ``parallel`` are supported, and their default values are set to ``False``, ``True`` and ``False`` respectively.

The function signature must start with ``values, index`` **exactly**, as the data belonging to each group
will be passed into ``values``, and the group index will be passed into ``index``.

.. warning::

When using ``engine='numba'``, there will be no "fall back" behavior internally. The group
data and group index will be passed as numpy arrays to the JITed user defined function, and no
alternative execution attempts will be tried.

.. note::

In terms of performance, **the first time a function is run using the Numba engine will be slow**
as Numba will have some function compilation overhead. However, the compiled functions are cached,
and subsequent calls will be fast. In general, the Numba engine is performant with
a larger amount of data points (e.g. 1+ million).

.. code-block:: ipython
In [1]: N = 10 ** 3
In [2]: data = {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}
In [3]: df = pd.DataFrame(data, columns=[0, 1])
In [4]: def f_numba(values, index):
...: total = 0
...: for i, value in enumerate(values):
...: if i % 2:
...: total += value + 5
...: else:
...: total += value * 2
...: return total
...:
In [5]: def f_cython(values):
...: total = 0
...: for i, value in enumerate(values):
...: if i % 2:
...: total += value + 5
...: else:
...: total += value * 2
...: return total
...:
In [6]: groupby = df.groupby(0)
# Run the first time, compilation time will affect performance
In [7]: %timeit -r 1 -n 1 groupby.aggregate(f_numba, engine='numba') # noqa: E225
2.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
# Function is cached and performance will improve
In [8]: %timeit groupby.aggregate(f_numba, engine='numba')
4.93 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In [9]: %timeit groupby.aggregate(f_cython, engine='cython')
18.6 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Other useful features
---------------------

Expand Down
Loading

0 comments on commit 662c102

Please sign in to comment.