From 511d36cd0a77a9dac49cb36c05484a0eb7013e79 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Mon, 21 Mar 2022 13:21:16 +0530 Subject: [PATCH] [skip-ci] Add benchmarks for groupby math (#6390) --- asv_bench/benchmarks/groupby.py | 44 +++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 46d6293cc98..fa93ce9e8b5 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -16,6 +16,8 @@ def setup(self, *args, **kwargs): } ) self.ds2d = self.ds1d.expand_dims(z=10) + self.ds1d_mean = self.ds1d.groupby("b").mean() + self.ds2d_mean = self.ds2d.groupby("b").mean() @parameterized(["ndim"], [(1, 2)]) def time_init(self, ndim): @@ -31,6 +33,18 @@ def time_agg_large_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") getattr(ds.groupby("b"), method)() + def time_groupby_binary_op_1d(self): + self.ds1d - self.ds1d_mean + + def time_groupby_binary_op_2d(self): + self.ds2d - self.ds2d_mean + + def peakmem_groupby_binary_op_1d(self): + self.ds1d - self.ds1d_mean + + def peakmem_groupby_binary_op_2d(self): + self.ds2d - self.ds2d_mean + class GroupByDask(GroupBy): def setup(self, *args, **kwargs): @@ -40,6 +54,8 @@ def setup(self, *args, **kwargs): self.ds2d = self.ds2d.sel(dim_0=slice(None, None, 2)).chunk( {"dim_0": 50, "z": 5} ) + self.ds1d_mean = self.ds1d.groupby("b").mean() + self.ds2d_mean = self.ds2d.groupby("b").mean() class GroupByPandasDataFrame(GroupBy): @@ -51,6 +67,13 @@ def setup(self, *args, **kwargs): super().setup(**kwargs) self.ds1d = self.ds1d.to_dataframe() + self.ds1d_mean = self.ds1d.groupby("b").mean() + + def time_groupby_binary_op_2d(self): + raise NotImplementedError + + def peakmem_groupby_binary_op_2d(self): + raise NotImplementedError class GroupByDaskDataFrame(GroupBy): @@ -63,6 +86,13 @@ def setup(self, *args, **kwargs): requires_dask() super().setup(**kwargs) self.ds1d = self.ds1d.chunk({"dim_0": 50}).to_dataframe() + self.ds1d_mean = self.ds1d.groupby("b").mean() + + def time_groupby_binary_op_2d(self): + raise NotImplementedError + + def peakmem_groupby_binary_op_2d(self): + raise NotImplementedError class Resample: @@ -74,6 +104,8 @@ def setup(self, *args, **kwargs): coords={"time": pd.date_range("2001-01-01", freq="H", periods=365 * 24)}, ) self.ds2d = self.ds1d.expand_dims(z=10) + self.ds1d_mean = self.ds1d.resample(time="48H").mean() + self.ds2d_mean = self.ds2d.resample(time="48H").mean() @parameterized(["ndim"], [(1, 2)]) def time_init(self, ndim): @@ -89,6 +121,18 @@ def time_agg_large_num_groups(self, method, ndim): ds = getattr(self, f"ds{ndim}d") getattr(ds.resample(time="48H"), method)() + def time_groupby_binary_op_1d(self): + self.ds1d - self.ds1d_mean + + def time_groupby_binary_op_2d(self): + self.ds2d - self.ds2d_mean + + def peakmem_groupby_binary_op_1d(self): + self.ds1d - self.ds1d_mean + + def peakmem_groupby_binary_op_2d(self): + self.ds2d - self.ds2d_mean + class ResampleDask(Resample): def setup(self, *args, **kwargs):