Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add groupby & resample benchmarks #5922

Merged
merged 12 commits into from
Nov 8, 2021
81 changes: 70 additions & 11 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,98 @@
import numpy as np
import pandas as pd

import xarray as xr

from . import parameterized, requires_dask
from . import _skip_slow, parameterized, requires_dask


class GroupBy:
    """Benchmarks for ``groupby`` on small in-memory xarray Datasets."""

    def setup(self, *args, **kwargs):
        """Build the 1-D and 2-D datasets shared by every benchmark.

        ``a`` holds two distinct values, each repeated ``n`` times, so grouping
        by it yields a small number of groups.  ``b`` holds ``2 * n`` distinct
        values, so grouping by it yields one group per element.
        """
        self.n = 100
        self.ds1d = xr.Dataset(
            {
                "a": xr.DataArray(np.r_[np.repeat(1, self.n), np.repeat(2, self.n)]),
                "b": xr.DataArray(np.arange(2 * self.n)),
            }
        )
        # Same variables with an extra dimension "z" added via expand_dims.
        self.ds2d = self.ds1d.expand_dims(z=10)

    @parameterized(["ndim"], [(1, 2)])
    def time_init(self, ndim):
        """Time construction of the groupby object only (no aggregation)."""
        getattr(self, f"ds{ndim}d").groupby("b")

    @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
    def time_agg_small_num_groups(self, method, ndim):
        """Time ``method`` after grouping by ``a`` (two distinct values)."""
        ds = getattr(self, f"ds{ndim}d")
        getattr(ds.groupby("a"), method)()

    @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
    def time_agg_large_num_groups(self, method, ndim):
        """Time ``method`` after grouping by ``b`` (all values distinct)."""
        ds = getattr(self, f"ds{ndim}d")
        getattr(ds.groupby("b"), method)()


class GroupByDask(GroupBy):
    """Run the ``GroupBy`` benchmarks on chunked versions of the datasets."""

    def setup(self, *args, **kwargs):
        """Create the base datasets, subsample them, and chunk them."""
        requires_dask()
        super().setup(**kwargs)
        # Keep every second element along dim_0, then split into chunks.
        every_other = slice(None, None, 2)
        self.ds1d = self.ds1d.sel(dim_0=every_other).chunk({"dim_0": 50})
        self.ds2d = self.ds2d.sel(dim_0=every_other).chunk({"dim_0": 50, "z": 5})


class GroupByPandasDataFrame(GroupBy):
    """Run groupby tests using pandas DataFrame."""

    def setup(self, *args, **kwargs):
        """Create the base datasets and convert the 1-D one to a DataFrame."""
        # Skip testing in CI as it won't ever change in a commit:
        _skip_slow()

        super().setup(**kwargs)
        # Only ds1d is converted; ds2d is left as created by GroupBy.setup.
        self.ds1d = self.ds1d.to_dataframe()


class GroupByDaskDataFrame(GroupBy):
    """Run groupby tests using dask DataFrame."""

    def setup(self, *args, **kwargs):
        """Create the base datasets, chunk the 1-D one, and convert it."""
        # Skip testing in CI as it won't ever change in a commit:
        _skip_slow()

        requires_dask()
        super().setup(**kwargs)
        # NOTE(review): this calls to_dataframe() on the chunked dataset;
        # confirm the result is actually dask-backed as the class name implies.
        # Only ds1d is converted; ds2d is left as created by GroupBy.setup.
        self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe()


class Resample:
    """Benchmarks for ``resample`` on an hourly time series."""

    def setup(self, *args, **kwargs):
        """Build 1-D and 2-D datasets indexed by ``365 * 24`` hourly stamps."""
        # NOTE(review): newer pandas releases may deprecate the uppercase
        # "H"/"M" frequency aliases used in this class -- confirm against the
        # pandas version the benchmarks are pinned to before changing them.
        self.ds1d = xr.Dataset(
            {
                "b": ("time", np.arange(365.0 * 24)),
            },
            coords={"time": pd.date_range("2001-01-01", freq="H", periods=365 * 24)},
        )
        # Same variable with an extra dimension "z" added via expand_dims.
        self.ds2d = self.ds1d.expand_dims(z=10)

    @parameterized(["ndim"], [(1, 2)])
    def time_init(self, ndim):
        """Time construction of the resample object only (no aggregation)."""
        getattr(self, f"ds{ndim}d").resample(time="D")

    @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
    def time_agg_small_num_groups(self, method, ndim):
        """Time ``method`` with coarse ``"3M"`` bins (few groups)."""
        ds = getattr(self, f"ds{ndim}d")
        getattr(ds.resample(time="3M"), method)()

    @parameterized(["method", "ndim"], [("sum", "mean"), (1, 2)])
    def time_agg_large_num_groups(self, method, ndim):
        """Time ``method`` with fine ``"48H"`` bins (many groups)."""
        ds = getattr(self, f"ds{ndim}d")
        getattr(ds.resample(time="48H"), method)()


class ResampleDask(Resample):
    """Run the ``Resample`` benchmarks on chunked versions of the datasets."""

    def setup(self, *args, **kwargs):
        """Create the base datasets and chunk them along ``time`` (and ``z``)."""
        requires_dask()
        super().setup(**kwargs)
        # Resample.setup defines only ds1d/ds2d with a "time" dimension, so
        # those are the attributes (and the dimension) that get chunked here.
        self.ds1d = self.ds1d.chunk({"time": 50})
        self.ds2d = self.ds2d.chunk({"time": 50, "z": 4})