Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

appbench: Detailed per-app performance analysis [WIP] #615

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions src/apps/keyed_ipv6_tunnel/tunnel.lua
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,11 @@ end

function SimpleKeyedTunnel:push()
-- encapsulation path

local l_in = self.input.decapsulated
local l_out = self.output.encapsulated
assert(l_in and l_out)

while not link.empty(l_in) and not link.full(l_out) do
while (l_in and l_out) and not link.empty(l_in) and not link.full(l_out) do
local p = link.receive(l_in)
packet.prepend(p, self.header, HEADER_SIZE)
local plength = ffi.cast(plength_ctype, p.data + LENGTH_OFFSET)
Expand All @@ -189,8 +189,7 @@ function SimpleKeyedTunnel:push()
-- decapsulation path
l_in = self.input.encapsulated
l_out = self.output.decapsulated
assert(l_in and l_out)
while not link.empty(l_in) and not link.full(l_out) do
while (l_in and l_out) and not link.empty(l_in) and not link.full(l_out) do
local p = link.receive(l_in)
-- match next header, cookie, src/dst addresses
local drop = true
Expand Down
25 changes: 14 additions & 11 deletions src/lib/pmu.lua
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ function setup (patterns)
local EN = bit.lshift(1, 22)
writemsr(0, 0x186+n, bit.bor(0x10000, USR, EN, code))
end
enabled = {"instructions", "cycles", "ref-cycles"}
enabled = {"instructions", "cycles", "ref_cycles"}
for i = 1, #set do table.insert(enabled, set[i]) end
return ndropped
end
Expand All @@ -226,11 +226,15 @@ function writemsr (cpu, msr, value)
end

-- API function (see above)
function report (set, aux)
function report (tab, aux)
aux = aux or {}
local names = lib.array_copy(enabled)
local values = {}
for i = 0, #names-1 do table.insert(values, tonumber(set[i])) end
local data = {}
for k,v in pairs(tab) do table.insert(data, {k=k,v=v}) end
-- Sort fixed-purpose counters to come first in definite order
local fixed = {cycles='0', ref_cycles='1', instructions='2'}
table.sort(data, function(x,y)
return (fixed[x.k] or x.k) < (fixed[y.k] or y.k)
end)
local auxnames, auxvalues = {}, {}
for k,v in pairs(aux) do
table.insert(auxnames,k)
Expand All @@ -244,14 +248,13 @@ function report (set, aux)
print()
-- include aux values in results
for i = 1, #auxnames do
table.insert(names, auxnames[i])
table.insert(values, auxvalues[i])
table.insert(data, {k=auxnames[i], v=auxvalues[i]})
end
-- print values
for i = 1, #names do
io.write(("%-40s %14s"):format(names[i], core.lib.comma_value(values[i])))
for i = 1, #data do
io.write(("%-40s %14s"):format(data[i].k, core.lib.comma_value(data[i].v)))
for j = 1, #auxnames do
io.write(("%12.3f"):format(tonumber(values[i]/auxvalues[j])))
io.write(("%12.3f"):format(tonumber(data[i].v/auxvalues[j])))
end
print()
end
Expand All @@ -274,7 +277,7 @@ function profile (f, events, aux, quiet)
switch_to(set)
local res = f()
switch_to(nil)
if not quiet then report(set, aux) end
if not quiet then report(to_table(set), aux) end
return res
end

Expand Down
4 changes: 4 additions & 0 deletions src/lib/pmu_x86.dasl
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,15 @@ local dasm = require("dasm")

local gen = {}

-- Table keeping machine code alive to the GC.
local anchor = {}

-- Utility: assemble code and optionally dump disassembly.
function assemble (name, prototype, generator)
local Dst = dasm.new(actions)
generator(Dst)
local mcode, size = Dst:build()
table.insert(anchor, mcode)
if debug then
print("mcode dump: "..name)
dasm.dump(mcode, size)
Expand Down
71 changes: 71 additions & 0 deletions src/program/snabbmark/snabbmark.lua
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ function run (args)
nfvconfig(unpack(args))
elseif command == 'solarflare' and #args >= 2 and #args <= 3 then
solarflare(unpack(args))
elseif command == 'appbench' then
appbench(unpack(args))
else
print(usage)
main.exit(1)
Expand Down Expand Up @@ -227,3 +229,72 @@ function solarflare (npackets, packet_size, timeout)
main.exit(1)
end
end

-- snabbmark appbench
--
-- Benchmark an individual app and print detailed measurements of its
-- per-packet performance and behavior.
--
-- The benchmark is done like this:
--
-- 1. Process very many packets in a loop.
--
-- 2. Use CPU performance counters (PMU) to precisely measure performance.
--
-- 3. Isolate the performance impact of the app by comparing an app
-- network with the app vs without it.

-- PMU event names handed to pmu.measure() by appbench for both the
-- reference run and the production run: loads retired by the cache
-- level that served them, plus mispredicted branches.
-- NOTE(review): the trailing '$' on the last entry presumably anchors
-- the name when lib.pmu treats these strings as patterns -- confirm.
events = {
   "mem_load_uops_retired.l1_hit",
   "mem_load_uops_retired.l2_hit",
   "mem_load_uops_retired.l3_hit",
   "mem_load_uops_retired.l3_miss",
   "br_misp_retired.all_branches$",
}

-- Benchmark one app and print PMU counter reports for it.
--
-- Arguments (all strings, as passed from the command line by run()):
--   mod          - name of the Lua module to require().
--   app          - name of the app class exported by that module.
--   configstring - optional Lua expression, evaluated with
--                  core.lib.load_string, whose value is passed to the
--                  app as its configuration.
--   inlink       - optional name of the app's input link (default 'rx').
--   outlink      - optional name of the app's output link (default 'tx').
--
-- Three reports are printed via pmu.report: a reference run of
-- source->sink, a production run of source->app->sink, and the
-- per-counter difference (production minus reference).
function appbench (mod, app, configstring, inlink, outlink)
   -- Both names are concatenated and dereferenced below; reject a bare
   -- "appbench" invocation the same way run() rejects bad arguments
   -- instead of failing with a nil-concatenation error.
   if not (mod and app) then
      print(usage)
      main.exit(1)
   end
   print("module: " .. mod)
   print("app: " .. app)
   print("config: " .. (configstring or ''))
   print("inlink: " .. (inlink or '[default: rx]'))
   print("outlink: " .. (outlink or '[default: tx]'))
   inlink = inlink or 'rx'
   outlink = outlink or 'tx'
   local cfg = configstring and core.lib.load_string(configstring)()
   print(mod, app, cfg)
   -- Resolve the app class up front so that a misspelled module or app
   -- name fails immediately rather than after the long reference run.
   local app_class = require(mod)[app]
   assert(app_class,
          ("module %q does not export an app named %q"):format(mod, app))
   local pmu = require("lib.pmu")
   local npackets = 100e6
   -- Workload shared by both runs: drive the engine in short slices
   -- until the app named "source" has transmitted npackets on its tx
   -- link. engine.app_table is looked up on every iteration, so the
   -- same closure works after the engine has been reconfigured.
   local run = function ()
      while link.stats(engine.app_table.source.output.tx).txpackets < npackets do
         engine.main({duration = 0.01, no_report = true})
      end
   end
   -- First test the "null app network".
   local c0 = config.new()
   config.app(c0, "source", basic_apps.Source)
   config.app(c0, "sink", basic_apps.Sink)
   config.link(c0, "source.tx->sink.rx")
   engine.configure(c0)
   print("starting reference run...")
   local _, t0 = pmu.measure(run, events)
   print("reference result:")
   pmu.report(t0, {packet = npackets})
   -- Switch to an empty configuration before building the network
   -- that contains the app under test.
   engine.configure(config.new())
   local c1 = config.new()
   config.app(c1, "source", basic_apps.Source)
   config.app(c1, "app", app_class, cfg)
   config.app(c1, "sink", basic_apps.Sink)
   config.link(c1, "source.tx->app."..inlink)
   config.link(c1, "app."..outlink.."->sink.rx")
   engine.configure(c1)
   print("\nstarting production run...")
   local _, t1 = pmu.measure(run, events)
   print("production result:")
   pmu.report(t1, {packet = npackets})
   -- Per-counter cost attributed to the app: production minus reference.
   local tdelta = {}
   for k, v in pairs(t1) do tdelta[k] = v - t0[k] end
   print("\ndifference from reference to production:")
   pmu.report(tdelta, {packet = npackets})
end