diff --git a/deploy.yml b/deploy.yml index b3db9b55b..b03e4215c 100644 --- a/deploy.yml +++ b/deploy.yml @@ -151,3 +151,10 @@ become: true roles: - { role: firewalld, when: enable_firewalld is defined and enable_firewalld } + +- name: deploying perf-tools + hosts: all + tags: + - always + roles: + - perf_tools diff --git a/roles/perf_tools/tasks/main.yml b/roles/perf_tools/tasks/main.yml new file mode 100644 index 000000000..f7c8c81a3 --- /dev/null +++ b/roles/perf_tools/tasks/main.yml @@ -0,0 +1,12 @@ +--- +# tasks file for perf-tools + +- name: Deploy iosnoop and funcslower + copy: + src: "{{ playbook_dir }}/scripts/{{ item }}" + dest: "{{ deploy_dir }}/scripts/" + mode: 0755 + backup: yes + with_items: + - iosnoop + - funcslower diff --git a/scripts/funcslower b/scripts/funcslower new file mode 100644 index 000000000..86c5caf7e --- /dev/null +++ b/scripts/funcslower @@ -0,0 +1,247 @@ +#!/bin/bash +# +# funcslower - trace kernel functions slower than a threshold (microseconds). +# Uses Linux ftrace. +# +# This uses the Linux ftrace function graph profiler to time kernel functions +# and filter them based on a latency threshold. This is a proof of concept using +# Linux ftrace capabilities on older kernels. +# +# USAGE: funcslower [-aChHPt] [-p PID] [-d secs] funcstring latency_us +# +# Run "funcslower -h" for full usage. +# +# REQUIREMENTS: FTRACE function graph, which you may already have available +# and enabled in recent kernels. And awk. +# +# The output format is the same as the ftrace function graph trace format, +# described in the kernel source under Documentation/trace/ftrace.txt. +# Note that the output may be shuffled when different CPU buffers are read; +# check the CPU column for changes, or include timestamps (-t) and post sort. +# +# WARNING: This uses dynamic tracing of kernel functions, and could cause +# kernel panics or freezes. Test, and know what you are doing, before use. +# +# OVERHEADS: Timing and filtering is performed in-kernel context, costing +# lower overheads than post-processing in user space. If you trace frequent +# events (eg, pick a common function and a low threshold), you might want to +# try the "-d secs" option, which buffers events in-kernel instead of printing +# them live. +# +# From perf-tools: https://github.com/brendangregg/perf-tools +# +# COPYRIGHT: Copyright (c) 2014 Brendan Gregg. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# (http://www.gnu.org/copyleft/gpl.html) +# +# 12-Jul-2014 Brendan Gregg Created this. + +### default variables +tracing=/sys/kernel/debug/tracing +flock=/var/tmp/.ftrace-lock +opt_duration=0; duration=; opt_pid=0; pid=; opt_tid=0; tid= +pidtext=; opt_headers=0; opt_proc=0; opt_time=0; opt_cpu=0 +trap ':' INT QUIT TERM PIPE HUP # sends execution to end tracing section + +function usage { + cat <<-END >&2 + USAGE: funcslower [-aChHPt] [-p PID] [-L TID] [-d secs] funcstring latency_us + -a # all info (same as -HPt) + -C # measure on-CPU time only + -d seconds # trace duration, and use buffers + -h # this usage message + -H # include column headers + -p PID # trace when this pid is on-CPU + -L TID # trace when this thread is on-CPU + -P # show process names & PIDs + -t # show timestamps + eg, + funcslower vfs_read 10000 # trace vfs_read() slower than 10 ms + See the man page and example file for more info. +END + exit +} + +function warn { + if ! eval "$@"; then + echo >&2 "WARNING: command failed \"$@\"" + fi +} + +function end { + # disable tracing + echo 2>/dev/null + echo "Ending tracing..." 2>/dev/null + cd $tracing + + (( opt_time )) && warn "echo nofuncgraph-abstime > trace_options" + (( opt_proc )) && warn "echo nofuncgraph-proc > trace_options" + (( opt_cpu )) && warn "echo sleep-time > trace_options" + + warn "echo nop > current_tracer" + (( opt_pid )) && warn "echo > set_ftrace_pid" + warn "echo > set_ftrace_filter" + warn "echo > set_graph_function" + warn "echo 0 > tracing_thresh" + warn "echo > trace" + + (( wroteflock )) && warn "rm $flock" +} + +function die { + echo >&2 "$@" + exit 1 +} + +function edie { + # die with a quiet end() + echo >&2 "$@" + exec >/dev/null 2>&1 + end + exit 1 +} + +### process options +while getopts aCd:hHp:L:Pt opt +do + case $opt in + a) opt_headers=1; opt_proc=1; opt_time=1 ;; + C) opt_cpu=1; ;; + d) opt_duration=1; duration=$OPTARG ;; + p) opt_pid=1; pid=$OPTARG ;; + L) opt_tid=1; tid=$OPTARG ;; + H) opt_headers=1; ;; + P) opt_proc=1; ;; + t) opt_time=1; ;; + h|?) usage ;; + esac +done +shift $(( $OPTIND - 1 )) + +### option logic +(( $# < 2 )) && usage +(( opt_pid && opt_tid )) && edie "ERROR: You can use -p or -L but not both." +funcs="$1" +shift +thresh=$1 +(( opt_pid )) && pidtext=" for PID $pid" +(( opt_tid )) && pidtext=" for TID $tid" +printf "Tracing \"$funcs\"$pidtext slower than $thresh us" +if (( opt_duration )); then + echo " for $duration seconds..." +else + echo "... Ctrl-C to end." +fi + +## select awk +if (( opt_duration )); then + [[ -x /usr/bin/mawk ]] && awk=mawk || awk=awk +else + # workarounds for mawk/gawk fflush behavior + if [[ -x /usr/bin/gawk ]]; then + awk=gawk + elif [[ -x /usr/bin/mawk ]]; then + awk="mawk -W interactive" + else + awk=awk + fi +fi + +### check permissions +cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE? + debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)" + +### ftrace lock +[[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $(cat $flock) $flock" +echo $$ > $flock || die "ERROR: unable to write $flock." +wroteflock=1 + +### setup and commence tracing +sysctl -q kernel.ftrace_enabled=1 # doesn't set exit status +read mode < current_tracer +[[ "$mode" != "nop" ]] && edie "ERROR: ftrace active (current_tracer=$mode)" +if ! echo $thresh > tracing_thresh; then + edie "ERROR: setting tracing_thresh to $thresh. Exiting." +fi +if (( opt_pid )); then + echo '' > set_ftrace_pid + # ftrace expects kernel pids, which are thread ids + for tid in /proc/$pid/task/*; do + if ! echo ${tid##*/} >> set_ftrace_pid; then + edie "ERROR: setting -p $pid (PID exist?). Exiting." + fi + done +fi +if (( opt_tid )); then + if ! echo $tid > set_ftrace_pid; then + edie "ERROR: setting -L $tid (TID exist?). Exiting." + fi +fi +if ! echo "$funcs" > set_ftrace_filter; then + edie "ERROR: enabling \"$funcs\" filter. Function exist? Exiting." +fi +if ! echo "$funcs" > set_graph_function; then + edie "ERROR: enabling \"$funcs\" graph. Exiting." +fi +if ! echo function_graph > current_tracer; then + edie "ERROR: setting current_tracer to \"function_graph\". Exiting." +fi +if (( opt_cpu )); then + if ! echo nosleep-time > trace_options; then + edie "ERROR: setting -C (nosleep-time). Exiting." + fi +fi +# the following must be done after setting current_tracer +if (( opt_time )); then + if ! echo funcgraph-abstime > trace_options; then + edie "ERROR: setting -t (funcgraph-abstime). Exiting." + fi +fi +if (( opt_proc )); then + if ! echo funcgraph-proc > trace_options; then + edie "ERROR: setting -P (funcgraph-proc). Exiting." + fi +fi + +### setup output filter +cat=cat +if (( opt_proc )); then + # remove proc change entries, since PID is included. example: + # ------------------------------------------ + # 0) supervi-1699 => supervi-1693 + # ------------------------------------------ + # + cat=$awk' "/(^ ---|^$)/ || \$3 == \"=>\" { next } { print \$0 }"' +fi + +### print trace buffer +warn "echo > trace" +if (( opt_duration )); then + sleep $duration + if (( opt_headers )); then + $cat trace + else + $cat trace | grep -v '^#' + fi +else + # trace_pipe lack headers, so fetch them from trace + (( opt_headers )) && cat trace + eval $cat trace_pipe +fi + +### end tracing +end diff --git a/scripts/iosnoop b/scripts/iosnoop new file mode 100644 index 000000000..43b353635 --- /dev/null +++ b/scripts/iosnoop @@ -0,0 +1,289 @@ +#!/bin/bash +# +# iosnoop - trace block device I/O. +# Written using Linux ftrace. +# +# This traces disk I/O at the block device interface, using the block: +# tracepoints. This can help characterize the I/O requested for the storage +# devices and their resulting performance. I/O completions can also be studied +# event-by-event for debugging disk and controller I/O scheduling issues. +# +# USAGE: ./iosnoop [-hQst] [-d device] [-i iotype] [-p pid] [-n name] [duration] +# +# Run "iosnoop -h" for full usage. +# +# REQUIREMENTS: FTRACE CONFIG, block:block_rq_* tracepoints (you may +# already have these on recent kernels). +# +# OVERHEAD: By default, iosnoop works without buffering, printing I/O events +# as they happen (uses trace_pipe), context switching and consuming CPU to do +# so. This has a limit of about 10,000 IOPS (depending on your platform), at +# which point iosnoop will be consuming 1 CPU. The duration mode uses buffering, +# and can handle much higher IOPS rates, however, the buffer has a limit of +# about 50,000 I/O, after which events will be dropped. You can tune this with +# bufsize_kb, which is per-CPU. Also note that the "-n" option is currently +# post-filtered, so all events are traced. +# +# This was written as a proof of concept for ftrace. It would be better written +# using perf_events (after some capabilities are added), which has a better +# buffering policy, or a tracer such as SystemTap or ktap. +# +# From perf-tools: https://github.com/brendangregg/perf-tools +# +# See the iosnoop(8) man page (in perf-tools) for more info. +# +# COPYRIGHT: Copyright (c) 2014 Brendan Gregg. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# (http://www.gnu.org/copyleft/gpl.html) +# +# 12-Jul-2014 Brendan Gregg Created this. + +### default variables +tracing=/sys/kernel/debug/tracing +flock=/var/tmp/.ftrace-lock +bufsize_kb=4096 +opt_duration=0; duration=; opt_name=0; name=; opt_pid=0; pid=; ftext= +opt_start=0; opt_end=0; opt_device=0; device=; opt_iotype=0; iotype= +opt_queue=0 +trap ':' INT QUIT TERM PIPE HUP # sends execution to end tracing section + +function usage { + cat <<-END >&2 + USAGE: iosnoop [-hQst] [-d device] [-i iotype] [-p PID] [-n name] + [duration] + -d device # device string (eg, "202,1) + -i iotype # match type (eg, '*R*' for all reads) + -n name # process name to match on I/O issue + -p PID # PID to match on I/O issue + -Q # use queue insert as start time + -s # include start time of I/O (s) + -t # include completion time of I/O (s) + -h # this usage message + duration # duration seconds, and use buffers + eg, + iosnoop # watch block I/O live (unbuffered) + iosnoop 1 # trace 1 sec (buffered) + iosnoop -Q # include queueing time in LATms + iosnoop -ts # include start and end timestamps + iosnoop -i '*R*' # trace reads + iosnoop -p 91 # show I/O issued when PID 91 is on-CPU + iosnoop -Qp 91 # show I/O queued by PID 91, queue time + See the man page and example file for more info. +END + exit +} + +function warn { + if ! eval "$@"; then + echo >&2 "WARNING: command failed \"$@\"" + fi +} + +function end { + # disable tracing + echo 2>/dev/null + echo "Ending tracing..." 2>/dev/null + cd $tracing + warn "echo 0 > events/block/$b_start/enable" + warn "echo 0 > events/block/block_rq_complete/enable" + if (( opt_device || opt_iotype || opt_pid )); then + warn "echo 0 > events/block/$b_start/filter" + warn "echo 0 > events/block/block_rq_complete/filter" + fi + warn "echo > trace" + (( wroteflock )) && warn "rm $flock" +} + +function die { + echo >&2 "$@" + exit 1 +} + +function edie { + # die with a quiet end() + echo >&2 "$@" + exec >/dev/null 2>&1 + end + exit 1 +} + +### process options +while getopts d:hi:n:p:Qst opt +do + case $opt in + d) opt_device=1; device=$OPTARG ;; + i) opt_iotype=1; iotype=$OPTARG ;; + n) opt_name=1; name=$OPTARG ;; + p) opt_pid=1; pid=$OPTARG ;; + Q) opt_queue=1 ;; + s) opt_start=1 ;; + t) opt_end=1 ;; + h|?) usage ;; + esac +done +shift $(( $OPTIND - 1 )) +if (( $# )); then + opt_duration=1 + duration=$1 + shift +fi +if (( opt_device )); then + major=${device%,*} + minor=${device#*,} + dev=$(( (major << 20) + minor )) +fi + +### option logic +(( opt_pid && opt_name )) && die "ERROR: use either -p or -n." +(( opt_pid )) && ftext=" issued by PID $pid" +(( opt_name )) && ftext=" issued by process name \"$name\"" +if (( opt_duration )); then + echo "Tracing block I/O$ftext for $duration seconds (buffered)..." +else + echo "Tracing block I/O$ftext. Ctrl-C to end." +fi +if (( opt_queue )); then + b_start=block_rq_insert +else + b_start=block_rq_issue +fi + +### select awk +(( opt_duration )) && use=mawk || use=gawk # workaround for mawk fflush() +[[ -x /usr/bin/$use ]] && awk=$use || awk=awk +wroteflock=1 + +### check permissions +cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE? + debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)" + +### ftrace lock +[[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $(cat $flock) $flock" +echo $$ > $flock || die "ERROR: unable to write $flock." + +### setup and begin tracing +echo nop > current_tracer +warn "echo $bufsize_kb > buffer_size_kb" +filter= +if (( opt_iotype )); then + filter="rwbs ~ \"$iotype\"" +fi +if (( opt_device )); then + [[ "$filter" != "" ]] && filter="$filter && " + filter="${filter}dev == $dev" +fi +filter_i=$filter +if (( opt_pid )); then + [[ "$filter_i" != "" ]] && filter_i="$filter_i && " + filter_i="${filter_i}common_pid == $pid" + [[ "$filter" == "" ]] && filter=0 +fi +if (( opt_iotype || opt_device || opt_pid )); then + if ! echo "$filter_i" > events/block/$b_start/filter || \ + ! echo "$filter" > events/block/block_rq_complete/filter + then + edie "ERROR: setting -d or -t filter. Exiting." + fi +fi +if ! echo 1 > events/block/$b_start/enable || \ + ! echo 1 > events/block/block_rq_complete/enable; then + edie "ERROR: enabling block I/O tracepoints. Exiting." +fi +(( opt_start )) && printf "%-15s " "STARTs" +(( opt_end )) && printf "%-15s " "ENDs" +printf "%-12.12s %-6s %-4s %-8s %-12s %-6s %8s\n" \ + "COMM" "PID" "TYPE" "DEV" "BLOCK" "BYTES" "LATms" + +# +# Determine output format. It may be one of the following (newest first): +# TASK-PID CPU# |||| TIMESTAMP FUNCTION +# TASK-PID CPU# TIMESTAMP FUNCTION +# To differentiate between them, the number of header fields is counted, +# and an offset set, to skip the extra column when needed. +# +offset=$($awk 'BEGIN { o = 0; } + $1 == "#" && $2 ~ /TASK/ && NF == 6 { o = 1; } + $2 ~ /TASK/ { print o; exit }' trace) + +### print trace buffer +warn "echo > trace" +( if (( opt_duration )); then + # wait then dump buffer + sleep $duration + cat trace +else + # print buffer live + cat trace_pipe +fi ) | $awk -v o=$offset -v opt_name=$opt_name -v name=$name \ + -v opt_duration=$opt_duration -v opt_start=$opt_start -v opt_end=$opt_end \ + -v b_start=$b_start ' + # common fields + $1 != "#" { + # task name can contain dashes + comm = pid = $1 + sub(/-[0-9][0-9]*/, "", comm) + sub(/.*-/, "", pid) + time = $(3+o); sub(":", "", time) + dev = $(5+o) + } + # block I/O request + $1 != "#" && $0 ~ b_start { + if (opt_name && match(comm, name) == 0) + next + # + # example: (fields1..4+o) 202,1 W 0 () 12862264 + 8 [tar] + # The cmd field "()" might contain multiple words (hex), + # hence stepping from the right (NF-3). + # + loc = $(NF-3) + starts[dev, loc] = time + comms[dev, loc] = comm + pids[dev, loc] = pid + next + } + # block I/O completion + $1 != "#" && $0 ~ /rq_complete/ { + # + # example: (fields1..4+o) 202,1 W () 12862256 + 8 [0] + # + dir = $(6+o) + loc = $(NF-3) + nsec = $(NF-1) + if (starts[dev, loc] > 0) { + latency = sprintf("%.2f", + 1000 * (time - starts[dev, loc])) + comm = comms[dev, loc] + pid = pids[dev, loc] + if (opt_start) + printf "%-15s ", starts[dev, loc] + if (opt_end) + printf "%-15s ", time + printf "%-12.12s %-6s %-4s %-8s %-12s %-6s %8s\n", + comm, pid, dir, dev, loc, nsec * 512, latency + if (!opt_duration) + fflush() + delete starts[dev, loc] + delete comms[dev, loc] + delete pids[dev, loc] + } + next + } + $0 ~ /LOST.*EVENTS/ { print "WARNING: " $0 > "/dev/stderr" } +' + +### end tracing +end