Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rolling benchmarks #1

Merged
merged 5 commits into from
Jan 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,25 @@
## Description

Benchmarks for `xclim` using the [`pyperf`](https://pyperf.readthedocs.io/en/latest/) package or the [`memory_profiler`](https://pypi.org/project/memory-profiler/) package.
# Usage
## Running benchmarks

### Single benchmark

To run a single benchmark. In the terminal, the following will run the benchmark for xclim and indice `tx_mean`.
To run a single `pyperf` benchmark: in the terminal, the following will run the benchmark for xclim and the indice `tx_mean`.

`python2 bench_xclim.py -o ./output/bench_xclim.json`
`python bench_xclim.py -o ./output/bench_xclim.json`

### Rolling benchmarks

Provided the installed xclim version contains `xc.utils._rolling` and the installed xarray version is `<=0.14.1`, the benchmark can be run with:

```
python ../scripts/bench_rolling.py gendata
python ../scripts/bench_rolling.py -c -l -s xclim
```

See the documentation `python ../scripts/bench_rolling.py --help` for all options.

### All benchmarks

Expand All @@ -17,4 +29,4 @@ To run all benchmarks, launch the bash script.

To consult a single benchmark. In the terminal:

`python -m perf stats bench.json`
`python -m pyperf stats bench.json`
16 changes: 16 additions & 0 deletions bash_scripts/run_rolling.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Profile the memory usage of three rolling implementations with mprof,
# then plot the usage recorded for each of them.
# Launch this from the output directory: every path below is relative to it.
nthreads=32
maxmem=40GB

declare -a exps=("xclim" "xrdefault" "xrnocounts")

# Generate the test data: 120 files, each 350 time steps on a 200 x 150 grid.
python ../scripts/bench_rolling.py gendata -n 350 200 150 120

# One profiled run per experiment.
for exp in "${exps[@]}"; do
    mprof run -C ../scripts/bench_rolling.py -cls -N "$nthreads" -m "$maxmem" "$exp"
done

python ../scripts/bench_rolling.py plot
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
130 changes: 130 additions & 0 deletions scripts/bench_rolling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# Rolling benchmarks
# Comparing xarray's defaults with custom implementations
# Also comparing different usages of dask.
import glob
import argparse
import xclim as xc
import numpy as np
import xarray as xr
import datetime as dt
from distributed import Client
# File-name template of the generated test data. The placeholder receives the
# zero-padded file index when writing and '*' when opening all files at once.
testfile = 'testdata_t{}.nc'
# File-name template of a benchmark result. The placeholders receive, in order,
# the exp name, a '_lazy' suffix (or '') and a dask-client suffix (or '').
outfile = 'testout_{}{}{}.nc'


def read_mprofile(filename):
    """Parse a memory-profile text file into a plottable series.

    Lines starting with ``CMDLINE`` provide the series name (the last
    whitespace-separated token of the line). Lines starting with ``MEM`` must
    hold exactly three fields: the tag, a memory value and a timestamp in
    seconds. Every other line is ignored.

    Parameters
    ----------
    filename : str
        Path of the profile file to read.

    Returns
    -------
    name : str
        Last token of the last ``CMDLINE`` line or, when the file has none,
        the part of ``filename`` that precedes its first dot.
    times : list of float
        Seconds elapsed since the first ``MEM`` sample.
    mem : list of float
        Memory values, in file order.

    Raises
    ------
    ValueError
        If a ``MEM`` line does not have exactly three fields or holds a
        non-numeric value.
    """
    name = filename.split('.')[0]
    stamps = []
    mem = []
    with open(filename, 'r') as f:
        for line in f:
            if line.startswith('CMDLINE'):
                name = line.strip().split()[-1]
            elif line.startswith('MEM'):
                _, m, t = line.strip().split()
                stamps.append(float(t))
                mem.append(float(m))
    # Subtract the raw timestamps instead of naive local datetimes: the latter
    # are off by an hour when the samples straddle a daylight-saving change.
    times = [t - stamps[0] for t in stamps]
    return name, times, mem


def xclim_custom(data, dim, window, func):
    """Run xclim's private rolling helper on ``data``.

    The arguments are forwarded to ``xc.utils._rolling``; ``func`` is passed
    as its ``mode`` keyword. The helper's result is returned unchanged.
    """
    rolling_kwargs = {'dim': dim, 'window': window, 'mode': func}
    return xc.utils._rolling(data, **rolling_kwargs)


def xr_default(data, dim, window, func, lazy=False, skipna=None):
    """Reduce a rolling window of ``data`` with the rolling object's own method.

    ``func`` is the name of the method looked up on the object returned by
    ``data.rolling``; ``lazy`` is forwarded as ``allow_lazy`` and ``skipna``
    as-is.
    """
    rolled = data.rolling(dim={dim: window})
    reducer = getattr(rolled, func)
    return reducer(allow_lazy=lazy, skipna=skipna)


def xr_nocounts(data, dim, window, func, lazy=False, skipna=None):
    """Reduce a rolling window of ``data`` through an explicit window dimension.

    The rolling object is expanded with ``construct('window_dim')`` and the
    new dimension is then collapsed with ``reduce``. ``func`` is handed to
    ``reduce`` as its first argument; ``lazy`` is forwarded as ``allow_lazy``
    and ``skipna`` as-is.
    """
    windowed = data.rolling(dim={dim: window}).construct('window_dim')
    return windowed.reduce(func, dim='window_dim', allow_lazy=lazy, skipna=skipna)


def resolve_data_shape(chunk_size):
    """Turn the ``-n/--chunk-size`` values into ``(Nt, Nx, Ny, Nc)``.

    Parameters
    ----------
    chunk_size : sequence or scalar
        One to four values convertible with ``int``:

        * 1 value  -> ``Nt = Nx = Ny = value`` and ``Nc = 20``
        * 2 values -> ``Nt, Nc`` with ``Nx = Ny = Nt``
        * 3 values -> ``Nt, Nx, Nc`` with ``Ny = Nx``
        * 4 values -> ``Nt, Nx, Ny, Nc``

        A bare scalar is treated like a single value.

    Returns
    -------
    tuple of int
        ``(Nt, Nx, Ny, Nc)``: the three array sizes and the number of files.

    Raises
    ------
    ValueError
        If a value is not an integer or if the number of values is not
        between 1 and 4.
    """
    if not isinstance(chunk_size, (list, tuple)):
        chunk_size = [chunk_size]
    sizes = [int(value) for value in chunk_size]

    if len(sizes) == 1:
        # argparse always hands over a list, so the single-value case has to
        # be recognised by length, not by type.
        nt = nx = ny = sizes[0]
        nc = 20
    elif len(sizes) == 2:
        nt, nc = sizes
        nx = ny = nt
    elif len(sizes) == 3:
        nt, nx, nc = sizes
        ny = nx
    elif len(sizes) == 4:
        nt, nx, ny, nc = sizes
    else:
        raise ValueError(f'expected 1 to 4 size values, got {len(sizes)}')
    return nt, nx, ny, nc


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Profile memory for rolling functions')
    parser.add_argument('-f', '--func', type=str, default='mean', help='which function to run')
    parser.add_argument('-l', '--lazy', action='store_true', help='whether to allow lazy (xr)')
    parser.add_argument('-c', '--with-client', action='store_true', help='whether to use a dask client')
    parser.add_argument('-N', '--nthreads', default=10, type=int, help='When using a dask client, number of threads per worker')
    parser.add_argument('-m', '--max-mem', default='2GB', help='When using a dask client, memory limit')
    parser.add_argument('-s', '--skipna', action='store_true', help='If specified, passes skipna=True')
    parser.add_argument('exp', type=str, help='which exp to run')
    parser.add_argument('-i', '--files', default='*.dat', nargs='*', help='Dat files to plot')
    parser.add_argument('-n', '--chunk-size', default=[500, 100, 100, 20], nargs='*', help='Size of the random data to generate. 1, 2, 3 or 4 values for t, x, y and nchunks.')
    args = parser.parse_args()

    if args.exp == 'gendata':
        # Write Nc files of random data, each covering its own time range.
        try:
            Nt, Nx, Ny, Nc = resolve_data_shape(args.chunk_size)
        except ValueError as err:
            parser.error(str(err))

        for i in range(Nc):
            print(f'Generating data {i+1:02d}/{Nc}')
            data = xr.DataArray(data=np.random.random((Nt, Nx, Ny)),
                                dims=('time', 'x', 'y'),
                                coords={'time': np.arange(Nt * i, Nt * (i + 1)),
                                        'x': np.arange(Nx), 'y': np.arange(Ny)},
                                name='data')
            data.to_netcdf(testfile.format(f'{i:02d}'))

    elif args.exp == 'plot':
        # Plot memory against elapsed time for every profile file.
        import matplotlib.pyplot as plt
        try:
            plt.style.use('dark_background')
        except OSError:
            # The style is cosmetic only; carry on with the default one.
            pass

        # args.files is a plain string when the default is used and a list
        # when -i is given; wildcard patterns are expanded in both cases.
        patterns = args.files if isinstance(args.files, list) else [args.files]
        files = []
        for pattern in patterns:
            if '*' in pattern:
                files.extend(glob.glob(pattern))
            else:
                files.append(pattern)

        colors = dict(zip(['xclim', 'xrpure', 'xrdefault', 'xrnocounts'],
                          plt.rcParams['axes.prop_cycle'].by_key()['color']))
        fig, ax = plt.subplots(figsize=(10, 5))
        for path in files:
            name, times, mem = read_mprofile(path)
            # Profiles with an unrecognised name fall back to matplotlib's
            # automatic colour instead of raising KeyError.
            style = {'color': colors[name]} if name in colors else {}
            ax.plot(times, mem, label=name, **style)
        ax.legend()
        ax.set_xlabel('Computation time [s]')
        ax.set_ylabel('Memory usage [MiB]')
        ax.set_title('Memory usage of different rolling methods')
        plt.show()

    else:
        # Run one rolling experiment and write its result to a netCDF file.
        experiments = ('xclim', 'xrpure', 'xrdefault', 'xrnocounts')
        if args.exp not in experiments:
            # Fail early with a usage message rather than with a NameError
            # once the result is written.
            parser.error(f"unknown exp '{args.exp}'; expected one of: gendata, plot, "
                         + ', '.join(experiments))

        client = None
        if args.with_client:
            client = Client(n_workers=1, threads_per_worker=args.nthreads, memory_limit=args.max_mem)

        try:
            with xr.open_mfdataset(testfile.format('*'), combine='by_coords', chunks={}) as ds:
                func = getattr(xr.core.duck_array_ops, args.func)
                print(f'Running rolling with exp: {args.exp}')
                if args.exp == 'xclim':
                    out = xclim_custom(ds.data, 'time', 5, args.func)
                elif args.exp == 'xrpure':
                    out = xr_default(ds.data, 'time', 5, args.func)
                elif args.exp == 'xrdefault':
                    out = xr_default(ds.data, 'time', 5, args.func, lazy=args.lazy, skipna=args.skipna)
                else:  # 'xrnocounts'
                    out = xr_nocounts(ds.data, 'time', 5, func, lazy=args.lazy, skipna=args.skipna)

                print('Writing to file')
                r = out.to_netcdf(outfile.format(args.exp,
                                                 '_lazy' if args.lazy else '',
                                                 f'_dskcli_{args.nthreads:02d}_{args.max_mem}' if args.with_client else ''),
                                  compute=False)
                r.compute()
        finally:
            # Release the dask client even when the experiment fails.
            if client is not None:
                client.close()
2 changes: 1 addition & 1 deletion scripts/bench_txmean_icclim.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Imports
import perf
import pyperf as perf
import icclim

def func():
Expand Down
2 changes: 1 addition & 1 deletion scripts/bench_txmean_xclim.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

import perf
import pyperf as perf
import time
import xclim as xc
import xarray as xr
Expand Down
6 changes: 4 additions & 2 deletions scripts/quantile_bench.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
# Imports
import perf
import pyperf as perf
# import icclim
import numpy as np

x = np.array(np.random.rand(1000))
P = np.linspace(0.01, 0.99, 50)


def bench_argsort():
    """Benchmark body: double argsort of ``x`` divided by its length.

    The result is discarded and nothing is returned.
    """
    # np.quantile(x, P)
    # Evaluate the expression once only: the second copy was the pre-change
    # line of the diff, not a deliberate repetition.
    x.argsort().argsort() / len(x)


def bench_quantile():
np.quantile(x, P)
Expand Down