-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbenchmark_matrix.py
executable file
·108 lines (94 loc) · 3.83 KB
/
benchmark_matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env python3
'''
Prints a shell script that repeatedly runs `benchmark.py` with a matrix of
options. Usage:
./benchmark_matrix.py > bench-script
sudo ./isolate.sh bash -f bench-script >> bench-log.json
This follows an <shuffled batch 1> <shuffled batch 2> ... repeat pattern
with many short-runtime jobs. The aim here is to get some sense of variance
across runs, while time-averaging away effects on individual runs from CPU
heat management, OS background activities, etc. In `mmap` mode, keeping
the runs short also reduces page cache effects on `SMALL_CLONE` runs.
TODO: Maybe extend the matrix to cover non-direct IO (drop caches, use a
single reader, drop `--norandommap` and `--loop`).
TODO: For each unique set of `fio` options it'd be cool to also
automatically benchmark comparatives as in README.md#Benchmarks.
'''
import argparse
import json
import random
import shlex
import sys
from pathlib import Path
# (fio jobs, ublk queues) pairs to sweep. My CPU has 12 HT cores. In my
# ad-hoc testing, even at 12 fio jobs, we barely pushed 2 cores worth of
# `ublk` load. For 1 job, adding an extra queue did seem to slightly help
# with async IO. On the other hand, 12 queues seemed to add slight overhead.
JOBS__NUM_QUEUES: list[tuple[int, int]] = [
(1, 2),
(2, 2),
(6, 3),
(12, 3),
]
# Total clone sizes fed to `btrfs.total_clone_size` below.
SMALL_CLONE: str = '2G' # Expect page-cache effects with `mmap`
BIG_CLONE: str = '200G' # Not much page-cache, the benchmark host had 32GiB RAM
# (seed mode, virtual data size, total clone size) triples to sweep.
SEED__VIRT_SZ__CLONE_SZ: tuple[tuple[str, str, str], ...] = (
# Smaller "virtual data" is silly, larger means very slow formatting due
# to my unoptimized implementation.
('fallocate', '275G', SMALL_CLONE),
('fallocate', '275G', BIG_CLONE),
# Overlap on a "virtual data" size with `fallocate` to get
# apples-to-apples. Also try a huge 4E address space since that makes
# blob allocation qualitatively easier.
('mega-extent', '275G', SMALL_CLONE),
('mega-extent', '275G', BIG_CLONE),
('mega-extent', '4E', SMALL_CLONE),
('mega-extent', '4E', BIG_CLONE),
)
def gen_opts_matrix():
    '''Yield one `benchmark.py --json-opts` dict per point in the matrix.'''
    # Omitting `sync` since it's just a tad worse than `psync` due to the
    # extra syscall (about 10% in my quick-and-dirty check).
    engine_depth_pairs = (
        ('io_uring', 4),
        ('io_uring', 16),
        ('mmap', 8), # iodepth not used
        ('psync', 8), # iodepth not used
    )
    for seed_mode, virt_sz, clone_sz in SEED__VIRT_SZ__CLONE_SZ:
        for io_engine, io_depth in engine_depth_pairs:
            for num_jobs, num_queues in JOBS__NUM_QUEUES:
                opts = {
                    'btrfs.virtual_data_size': virt_sz,
                    'btrfs.total_clone_size': clone_sz,
                    'btrfs.seed': seed_mode,
                    # Short so we can get many runs, but not too short since
                    # the `fallocate` mode has ~4s formatting overhead @
                    # 275G. Note that higher values increase page-cache
                    # effects for `mmap` + SMALL_CLONE, but the analysis in
                    # `benchmark_matrix_summarize.py` splits these out.
                    'fio.runtime': 10,
                    'fio.direct': 1,
                    'fio.depth': io_depth,
                    'fio.engine': io_engine,
                    'fio.jobs': num_jobs,
                    'ublk.num_queues': num_queues,
                }
                yield opts
def main():
    '''Parse args and print one `benchmark.py` command line per run.'''
    my_dir = Path(sys.argv[0]).resolve().parent
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # BUGFIX: `type=int` is required -- without it, a command-line
    # `--repeat N` arrives as a string and `range(args.repeat)` below
    # raises `TypeError`. (The default of 30 was only ever an int.)
    p.add_argument(
        '--repeat',
        type=int,
        default=30,
        help='Number of shuffled passes over the full option matrix',
    )
    args = p.parse_args()
    opts_matrix = list(gen_opts_matrix())
    for _ in range(args.repeat):
        # Reshuffle every pass so per-run noise is not correlated with a
        # fixed position in the matrix.
        random.shuffle(opts_matrix)
        for opts in opts_matrix:
            cmd = [
                str(my_dir / 'benchmark.py'),
                '--json-opts',
                json.dumps(opts),
            ]
            print(' '.join(shlex.quote(c) for c in cmd))
if __name__ == '__main__':
    main()