diff --git a/README.md b/README.md index 185974b9fc0d..0cef3f5430f5 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ pair of .c and .py files, and some are directories of files. - tools/[llcstat](tools/llcstat.py): Summarize CPU cache references and misses by process. [Examples](tools/llcstat_example.txt). - tools/[memleak](tools/memleak.py): Display outstanding memory allocations to find memory leaks. [Examples](tools/memleak_example.txt). - tools/[mysqld_qslower](tools/mysqld_qslower.py): Trace MySQL server queries slower than a threshold. [Examples](tools/mysqld_qslower_example.txt). +- tools/[nvmecmd](tools/nvmecmd.py): Observe NVMe commands [Examples](tools/nvmecmd_example.txt). - tools/[offcputime](tools/offcputime.py): Summarize off-CPU time by kernel stack trace. [Examples](tools/offcputime_example.txt). - tools/[offwaketime](tools/offwaketime.py): Summarize blocked time by kernel off-CPU stack and waker stack. [Examples](tools/offwaketime_example.txt). - tools/[oomkill](tools/oomkill.py): Trace the out-of-memory (OOM) killer. [Examples](tools/oomkill_example.txt). diff --git a/man/man8/nvmecmd.8 b/man/man8/nvmecmd.8 new file mode 100644 index 000000000000..fc755865c13d --- /dev/null +++ b/man/man8/nvmecmd.8 @@ -0,0 +1,106 @@ +.TH nvmecmd 8 "2023-11-06 "USER COMMANDS" +.SH NAME +nvmecmd \- Observes NVMe commands and alignment. +.SH SYNOPSIS +.B nvmecmd.py [\-h] [\-d DISK] [\-o OPS] [\--debug] [\--trace] +.B [\--interval INTERVAL] +.SH DESCRIPTION +nvmecmd observes and traces NVMe commands. The program attaches kprobe on +`nvme_setup_cmd` by default to capture NVMe commands issued to any device. If +disk and/or operation filters are used, the program will then skip capturing +for that particular event. If tracing option is passed, then all captured events +will be printed in a table with the following columns, sorted from left to +right: + +- DISK: Prints the NVMe node (e.g. 'nvme0n9'). + +- OPS: Prints the NVMe operation (read/write). + +- LEN: Prints the length in bytes. + +- LBA: Prints the Logical Block Address (LBA). + +- PID: Prints the process ID. + +- COMM: Prints the process name (command). + +- ALGN: Prints the maximum alignment possible in power-of-2 bytes. Example: +An alignment value of 16384 (16k) indicates the command is aligned in size and +LBA to 4k, 8k and 16k. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc +.SH OPTIONS +.TP +\-h, --help +show this help message and exit +.TP +\-d DISK, --disk DISK +If set, the BPF will add a disk name filter to skip NVMe commands that don't +match the given NVMe node. +Example: nvme0n9 +.TP +\-o OPS, --ops OPS +If set, the BPF will add a operation filter to skip NVMe commands that don't +match the given operation. A full list of the operation values can be found at +the 'enum req_op' in the kernel header 'include/linux/blk_types.h'. +.TP +\--debug +Prints BPF code before capturing. +.TP +\--trace +Prints NVMe captured commands in a table form. + +Header: DISK OPS LEN LBA PID COMM ALGN. +.TP +\--interval INTERVAL +Specifies the maximum event polling event interval. +.SH EXAMPLES +.TP +Observe all NVMe commands and print a power-of-2 histogram with the block and \ +alignment sizes at the end. +# +.B nvmecmd +.TP +Observe all commands issued to the 9th NVMe node and print a power-of-2 \ +histogram with the block and alignment sizes at the end. +# +.B nvmecmd --disk nvme9n1 +.TP +Observe and trace all write commands issued to the 9th NVMe node. And print a \ +power-of-2 histogram with the block and alignment sizes at the end. +# +.B nvmecmd --disk nvme9n1 --ops write --trace +.TP +Print eBPF program before observe starts. Observe and trace all write \ +command issued to the 9th NVMe node. And print a power-of-2 histogram with the \ +block and alignment sizes at the end. +# +.B nvmecmd --disk nvme9n1 --ops write --debug +.TP +Observe and trace all write commands issued to the 9th NVMe node. Poll NVMe \ +events from the data ring buffer every 100 ms. And print a power-of-2 \ +histogram with the block and alignment sizes at the end. +# +.B nvmecmd --disk nvme9n1 --ops write --interval 0.1 +.SH OVERHEAD +This traces all NVMe commands issued to any device. The overhead of this can be +high if the volume of the commands is high. To reduce overhead, add filters +such as disk ('--disk') and/or operation ('--ops'). You can also increase the +polling interval ('--interval') when tracing ('--trace') or if possible, just +disable tracing completely. You should only run this on a process where the +slowdown is acceptable. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Daniel Gomez diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py index 847e04c4d819..9d9ce8b08cb2 100755 --- a/tests/python/test_tools_smoke.py +++ b/tests/python/test_tools_smoke.py @@ -261,6 +261,10 @@ def test_nfsdist(self): else: pass + @skipUnless(kernel_version_ge(4,19), "requires kernel >= 4.19") + def test_nvmecmd(self): + self.run_with_duration("nvmecmd.py") + @skipUnless(kernel_version_ge(4,6), "requires kernel >= 4.6") @mayFail("This fails on github actions environment, and needs to be fixed") def test_offcputime(self): diff --git a/tools/nvmecmd.py b/tools/nvmecmd.py new file mode 100755 index 000000000000..f61a0868b6a7 --- /dev/null +++ b/tools/nvmecmd.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python +# SPDX-License-Identifier: Apache-2.0 +# +# nvmecmd NVMe command observability tool. +# +# Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 06-Nov-2023 Daniel Gomez Created this. +from __future__ import ( + absolute_import, division, unicode_literals, print_function +) +from bcc import BPF +import argparse +import time + +examples = """examples: + nvmecmd # Observe all NVMe commands + nvmecmd --disk nvme9n1 # Observe all commands on 9th NVMe node + nvmecmd --ops read # Observe read commands on all NVMe + nvmecmd --ops write # Observe write commands on all NVMe + nvmecmd --ops write --disk nvme9n1 # Observe write commands on 9th NVMe node + nvmecmd --debug # Print eBPF program before observe + nvmecmd --trace # Print NVMe captured events + nvmecmd --interval 0.1 # Poll data ring buffer every 100 ms +""" + +parser = argparse.ArgumentParser( + description="NVMe commands observer tool", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples, +) +parser.add_argument( + "-d", + "--disk", + type=str, + help="capture commands for this NVMe disk node only" +) +parser.add_argument( + "-o", + "--ops", + type=str, + help="capture this command operation only" +) +parser.add_argument("--debug", action="store_true", help="debug") +parser.add_argument( + "--trace", + action="store_true", + help="trace NVMe captured commands" +) +parser.add_argument( + "--interval", + type=float, + help="polling interval" +) + +args = parser.parse_args() + +# define BPF program +bpf_text = """ +#include +#include +#include +#include + +struct nvme_ns { + struct list_head list; + + struct nvme_ctrl *ctrl; + struct request_queue *queue; + struct gendisk *disk; +#ifdef CONFIG_NVME_MULTIPATH + enum nvme_ana_state ana_state; + u32 ana_grpid; +#endif + struct list_head siblings; + struct kref kref; + struct nvme_ns_head *head; + + int lba_shift; + // [...] +}; + +struct data_t { + u32 pid; + char comm[TASK_COMM_LEN]; + char disk[DISK_NAME_LEN]; + u32 op; + u32 len; + u32 lba; + u32 algn; +}; + +BPF_HISTOGRAM(block_len, u32, 64); +BPF_HISTOGRAM(algn, u32, 64); +BPF_ARRAY(counts, u64, 1); +BPF_RINGBUF_OUTPUT(events, 8); + +/* local strcmp function, max length 16 to protect instruction loops */ +#define CMPMAX 16 + +static int local_strcmp(const char *cs, const char *ct) +{ + int len = 0; + unsigned char c1, c2; + + while (len++ < CMPMAX) { + c1 = *cs++; + c2 = *ct++; + if (c1 != c2) + return c1 < c2 ? -1 : 1; + if (!c1) + break; + } + return 0; +} +""" + +bpf_text_disk_filter = "" +if args.disk: + bpf_text_disk_filter = """ + if (local_strcmp(req->q->disk->disk_name, "{disk}")) + return; + """.format( + disk=args.disk + ) + +bpf_text_ops_filter = "" +# Operation dictionary. Full list of operations at Linux kernel +# 'include/linux/blk_types.h' header file. +nvme_ops = { + 0: "read", + 1: "write", + "read": 0, + "write": 1, +} +if args.ops: + operation = nvme_ops[args.ops] + bpf_text_ops_filter = """ + if ((req->cmd_flags & 0xff) != {ops}) + return; + """.format( + ops=operation + ) + +bpf_text += """ +void kprobe__nvme_setup_cmd(struct pt_regs *ctx, struct nvme_ns *ns, + struct request *req) +{{ + struct data_t data = {{}}; + u32 max_algn_size = 4096, algn_size = 4096; + u32 lba_len = algn_size / 4096; + bool is_algn = false; + u8 i; + + {disk_filter} + {ops_filter} + + data.pid = bpf_get_current_pid_tgid() >> 32; + bpf_get_current_comm(&data.comm, sizeof(data.comm)); + bpf_probe_read_kernel(&data.disk, sizeof(data.disk), + req->q->disk->disk_name); + data.op = req->cmd_flags & 0xff; + data.len = req->__data_len; + data.lba = req->__sector >> (ns->lba_shift - SECTOR_SHIFT); + + for (i=0; i<8; i++) {{ + is_algn = !(data.len % algn_size) && !(data.lba % lba_len); + if (is_algn) {{ + max_algn_size = algn_size; + }} + algn_size = algn_size << 1; + lba_len = algn_size / 4096; + }} + data.algn = max_algn_size; + + events.ringbuf_output(&data, sizeof(data), 0); + block_len.increment(bpf_log2l(req->__data_len)); + algn.increment(bpf_log2l(max_algn_size)); +}} +""".format( + disk_filter=bpf_text_disk_filter, ops_filter=bpf_text_ops_filter +) + +if args.debug: + print(args) + print(bpf_text) + +bpf = BPF(text=bpf_text) +if args.trace: + print("Tracing NVMe commands... Hit Ctrl-C to end.") + print( + "%-10s %-8s %-8s %-10s %-10s %-16s %-8s" + % ("DISK", "OPS", "LEN", "LBA", "PID", "COMM", "ALGN") + ) + + +def capture_event(ctx, data, size): + event = bpf["events"].event(data) + if args.trace: + print_event(event) + + +def print_event(event): + print( + "%-10s %-8s %-8s %-10s %-10s %-16s %-8s" + % ( + event.disk.decode("utf-8", "replace"), + nvme_ops[event.op], + event.len, + event.lba, + event.pid, + event.comm.decode("utf-8", "replace"), + event.algn, + ), + ) + + +bpf["events"].open_ring_buffer(capture_event) +block_len = bpf["block_len"] +algn = bpf["algn"] +while 1: + try: + bpf.ring_buffer_poll(30) + if args.interval: + time.sleep(abs(args.interval)) + except KeyboardInterrupt: + bpf.ring_buffer_consume() + print() + block_len.print_log2_hist( + "Block size", "operation", section_print_fn=bytes.decode + ) + block_len.clear() + print() + algn.print_log2_hist("Algn size", "operation", + section_print_fn=bytes.decode) + algn.clear() + break +exit() diff --git a/tools/nvmecmd_example.txt b/tools/nvmecmd_example.txt new file mode 100644 index 000000000000..b2e4434aca4c --- /dev/null +++ b/tools/nvmecmd_example.txt @@ -0,0 +1,200 @@ +Demonstrations of nvmecmd, the Linux eBPF/bcc version. + +1. fio test with different block sizes. + +The nvmecmd tool records all NVMe write commands issued to the nvme0n2 device and +produces two histograms representing the block sizes of the transmitted data, and +the alignment of the commands. + +- fio test commands: + +fio -bs=4k -iodepth=1 -rw=write -ioengine=sync -size=4k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 40 +fio -bs=8k -iodepth=1 -rw=write -ioengine=sync -size=8k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 60 +fio -bs=16k -iodepth=1 -rw=write -ioengine=sync -size=16k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 80 +fio -bs=32k -iodepth=1 -rw=write -ioengine=sync -size=32k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 100 +fio -bs=64k -iodepth=1 -rw=write -ioengine=sync -size=64k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 100 +fio -bs=128k -iodepth=1 -rw=write -ioengine=sync -size=128k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 80 +fio -bs=256k -iodepth=1 -rw=write -ioengine=sync -size=256k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 60 +fio -bs=512k -iodepth=1 -rw=write -ioengine=sync -size=512k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 40 + +- nvmecmd capture command: + +tools/nvmecmd.py --disk nvme0n2 --ops write +^C + Block size : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 0 | | + 1024 -> 2047 : 0 | | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 40 |**************** | + 8192 -> 16383 : 60 |************************ | + 16384 -> 32767 : 80 |******************************** | + 32768 -> 65535 : 100 |****************************************| + 65536 -> 131071 : 100 |****************************************| + 131072 -> 262143 : 80 |******************************** | + 262144 -> 524287 : 60 |************************ | + 524288 -> 1048575 : 40 |**************** | + + Algn size : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 0 | | + 1024 -> 2047 : 0 | | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 40 |**************** | + 8192 -> 16383 : 60 |************************ | + 16384 -> 32767 : 80 |******************************** | + 32768 -> 65535 : 100 |****************************************| + 65536 -> 131071 : 100 |****************************************| + 131072 -> 262143 : 80 |******************************** | + 262144 -> 524287 : 60 |************************ | + 524288 -> 1048575 : 40 |**************** | + +2. fio test with different block sizes with an offset: + +The nvmecmd tool records all NVMe write commands issued to the nvme0n2 device and +produces two histograms representing the block sizes of the transmitted data, and +the alignment of the commands. + +- fio test commands: + +fio -bs=4k -iodepth=1 -rw=write -ioengine=sync -size=4k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 40 -offset=4096 +fio -bs=8k -iodepth=1 -rw=write -ioengine=sync -size=8k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 60 -offset=4096 +fio -bs=16k -iodepth=1 -rw=write -ioengine=sync -size=16k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 80 -offset=4096 +fio -bs=32k -iodepth=1 -rw=write -ioengine=sync -size=32k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 100 -offset=4096 +fio -bs=64k -iodepth=1 -rw=write -ioengine=sync -size=64k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 100 -offset=4096 +fio -bs=128k -iodepth=1 -rw=write -ioengine=sync -size=128k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 80 -offset=4096 +fio -bs=256k -iodepth=1 -rw=write -ioengine=sync -size=256k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 60 -offset=4096 +fio -bs=512k -iodepth=1 -rw=write -ioengine=sync -size=512k -name=sync -direct=1 -filename=/dev/nvme0n2 -loop 40 -offset=4096 + +- nvmecmd capture command: + +tools/nvmecmd.py --disk nvme0n2 --ops write +^C + Block size : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 0 | | + 1024 -> 2047 : 0 | | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 40 |**************** | + 8192 -> 16383 : 60 |************************ | + 16384 -> 32767 : 80 |******************************** | + 32768 -> 65535 : 100 |****************************************| + 65536 -> 131071 : 100 |****************************************| + 131072 -> 262143 : 80 |******************************** | + 262144 -> 524287 : 60 |************************ | + 524288 -> 1048575 : 40 |**************** | + + Algn size : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 0 | | + 1024 -> 2047 : 0 | | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 560 |****************************************| + +3. PostgreSQL workload using sybench: + +The nvmecmd tool records and traces all NVMe write commands issued to the +nvme0n2 device and produces two histograms representing the block sizes of the +transmitted data, and the alignment of the commands. + +- nvmecmd capture command: + +tools/nvmecmd.py --disk nvme0n2 --ops write --trace +Tracing NVMe commands... Hit Ctrl-C to end. +DISK OPS LEN LBA PID COMM ALGN +... +nvme0n1 write 16384 5592372 1402 postgres 16384 +nvme0n1 write 4096 8131612 1406 postgres 4096 +nvme0n1 write 4096 8476298 1407 postgres 4096 +nvme0n1 write 4096 5562083 1404 postgres 4096 +nvme0n1 write 4096 5562081 1404 postgres 4096 +nvme0n1 write 8192 5740024 977 postgres 8192 +nvme0n1 write 8192 8476258 1407 postgres 8192 +nvme0n1 write 16384 2809028 1400 postgres 16384 +nvme0n1 write 8192 9393492 1402 postgres 8192 +nvme0n1 write 16384 1777248 1401 postgres 16384 +nvme0n1 write 8192 11376510 1404 postgres 8192 +nvme0n1 write 4096 11376508 1404 postgres 4096 +nvme0n1 write 8192 1022036 1405 postgres 8192 +nvme0n1 write 4096 5755970 977 postgres 4096 +nvme0n1 write 16384 1777252 1401 postgres 16384 +nvme0n1 write 4096 9393589 1402 postgres 4096 +nvme0n1 write 4096 2809039 1400 postgres 4096 +nvme0n2 write 8192 9637351 1403 postgres 4096 +nvme0n1 write 4096 5822235 1406 postgres 4096 +... +^C + Block size : count distribution + 0 -> 1 : 560 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 0 | | + 1024 -> 2047 : 0 | | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 325687 |****************************************| + 8192 -> 16383 : 92798 |*********** | + 16384 -> 32767 : 14121 |* | + 32768 -> 65535 : 745 | | + 65536 -> 131071 : 496 | | + 131072 -> 262143 : 640 | | + 262144 -> 524287 : 6365 | | + 524288 -> 1048575 : 497 | | + + Algn size : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 0 | | + 1024 -> 2047 : 0 | | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 392148 |****************************************| + 8192 -> 16383 : 36459 |*** | + 16384 -> 32767 : 12781 |* | + 32768 -> 65535 : 491 | | + 65536 -> 131071 : 49 | | + 131072 -> 262143 : 33 | | + 262144 -> 524287 : 10 | |