diff --git a/src/apps/keyed_ipv6_tunnel/tunnel.lua b/src/apps/keyed_ipv6_tunnel/tunnel.lua
index 9bf92eafa5..39779cfedd 100644
--- a/src/apps/keyed_ipv6_tunnel/tunnel.lua
+++ b/src/apps/keyed_ipv6_tunnel/tunnel.lua
@@ -174,11 +174,11 @@ end
 
 function SimpleKeyedTunnel:push()
    -- encapsulation path
+
    local l_in = self.input.decapsulated
    local l_out = self.output.encapsulated
-   assert(l_in and l_out)
 
-   while not link.empty(l_in) and not link.full(l_out) do
+   while (l_in and l_out) and not link.empty(l_in) and not link.full(l_out) do
       local p = link.receive(l_in)
       packet.prepend(p, self.header, HEADER_SIZE)
       local plength = ffi.cast(plength_ctype, p.data + LENGTH_OFFSET)
@@ -189,8 +189,7 @@ function SimpleKeyedTunnel:push()
    -- decapsulation path
    l_in = self.input.encapsulated
    l_out = self.output.decapsulated
-   assert(l_in and l_out)
-   while not link.empty(l_in) and not link.full(l_out) do
+   while (l_in and l_out) and not link.empty(l_in) and not link.full(l_out) do
       local p = link.receive(l_in)
       -- match next header, cookie, src/dst addresses
       local drop = true
diff --git a/src/lib/pmu.lua b/src/lib/pmu.lua
index b7394c14e1..a905f707cc 100644
--- a/src/lib/pmu.lua
+++ b/src/lib/pmu.lua
@@ -209,7 +209,7 @@ function setup (patterns)
       local EN = bit.lshift(1, 22)
       writemsr(0, 0x186+n, bit.bor(0x10000, USR, EN, code))
    end
-   enabled = {"instructions", "cycles", "ref-cycles"}
+   enabled = {"instructions", "cycles", "ref_cycles"}
    for i = 1, #set do table.insert(enabled, set[i]) end
    return ndropped
 end
@@ -226,11 +226,15 @@ function writemsr (cpu, msr, value)
 end
 
 -- API function (see above)
-function report (set, aux)
+function report (tab, aux)
    aux = aux or {}
-   local names = lib.array_copy(enabled)
-   local values = {}
-   for i = 0, #names-1 do table.insert(values, tonumber(set[i])) end
+   local data = {}
+   for k,v in pairs(tab) do table.insert(data, {k=k,v=v}) end
+   -- Sort fixed-purpose counters to come first in definite order
+   local fixed = {cycles='0', ref_cycles='1', instructions='2'}
+   table.sort(data, function(x,y)
+                       return (fixed[x.k] or x.k) < (fixed[y.k] or y.k)
+                    end)
    local auxnames, auxvalues = {}, {}
    for k,v in pairs(aux) do
       table.insert(auxnames,k)
@@ -244,14 +248,13 @@ function report (set, aux)
    print()
    -- include aux values in results
    for i = 1, #auxnames do
-      table.insert(names, auxnames[i])
-      table.insert(values, auxvalues[i])
+      table.insert(data, {k=auxnames[i], v=auxvalues[i]})
    end
    -- print values
-   for i = 1, #names do
-      io.write(("%-40s %14s"):format(names[i], core.lib.comma_value(values[i])))
+   for i = 1, #data do
+      io.write(("%-40s %14s"):format(data[i].k, core.lib.comma_value(data[i].v)))
       for j = 1, #auxnames do
-         io.write(("%12.3f"):format(tonumber(values[i]/auxvalues[j])))
+         io.write(("%12.3f"):format(tonumber(data[i].v/auxvalues[j])))
       end
       print()
    end
@@ -274,7 +277,7 @@ function profile (f, events, aux, quiet)
    switch_to(set)
    local res = f()
    switch_to(nil)
-   if not quiet then report(set, aux) end
+   if not quiet then report(to_table(set), aux) end
    return res
 end
 
diff --git a/src/lib/pmu_x86.dasl b/src/lib/pmu_x86.dasl
index 6171f46999..f8861056c7 100644
--- a/src/lib/pmu_x86.dasl
+++ b/src/lib/pmu_x86.dasl
@@ -24,11 +24,15 @@ local dasm = require("dasm")
 
 local gen = {}
 
+-- Anchor table: holds references to generated machine code so the GC keeps it alive.
+local anchor = {}
+
 -- Utility: assemble code and optionally dump disassembly.
 function assemble (name, prototype, generator)
    local Dst = dasm.new(actions)
    generator(Dst)
    local mcode, size = Dst:build()
+   table.insert(anchor, mcode)
    if debug then
       print("mcode dump: "..name)
       dasm.dump(mcode, size)
diff --git a/src/program/snabbmark/snabbmark.lua b/src/program/snabbmark/snabbmark.lua
index 13eb2f813a..9ed76aae53 100644
--- a/src/program/snabbmark/snabbmark.lua
+++ b/src/program/snabbmark/snabbmark.lua
@@ -17,6 +17,8 @@ function run (args)
       nfvconfig(unpack(args))
    elseif command == 'solarflare' and #args >= 2 and #args <= 3 then
       solarflare(unpack(args))
+   elseif command == 'appbench' and #args >= 2 and #args <= 5 then
+      appbench(unpack(args))
    else
       print(usage)
       main.exit(1)
@@ -227,3 +229,82 @@ function solarflare (npackets, packet_size, timeout)
       main.exit(1)
    end
 end
+
+-- snabbmark appbench
+--
+-- Benchmark an individual app and print detailed measurements of its
+-- per-packet performance and behavior.
+--
+-- The benchmark is done like this:
+--
+-- 1. Process very many packets in a loop.
+--
+-- 2. Use CPU performance counters (PMU) to precisely measure performance.
+--
+-- 3. Isolate the performance impact of the app by comparing an app
+-- network with the app vs without it.
+--
+-- Arguments:
+--   mod          - module name passed to require().
+--   app          - key of the app class within that module.
+--   configstring - optional; loaded with core.lib.load_string and called
+--                  to produce the app configuration.
+--   inlink       - optional name of the app's input link (default 'rx').
+--   outlink      - optional name of the app's output link (default 'tx').
+
+events = {"mem_load_uops_retired.l1_hit",
+          "mem_load_uops_retired.l2_hit",
+          "mem_load_uops_retired.l3_hit",
+          "mem_load_uops_retired.l3_miss",
+          "br_misp_retired.all_branches$"}
+
+function appbench (mod, app, configstring, inlink, outlink)
+   print("module: " .. mod)
+   print("app: " .. app)
+   print("config: " .. (configstring or ''))
+   print("inlink: " .. (inlink or '[default: rx]'))
+   print("outlink: " .. (outlink or '[default: tx]'))
+   inlink = inlink or 'rx'
+   outlink = outlink or 'tx'
+   local cfg = configstring and core.lib.load_string(configstring)()
+   print(mod, app, cfg)
+   local pmu = require("lib.pmu")
+   -- First test the "null app network".
+   local c0 = config.new()
+   config.app(c0, "source", basic_apps.Source)
+   config.app(c0, "sink", basic_apps.Sink)
+   config.link(c0, "source.tx->sink.rx")
+   engine.configure(c0)
+   local npackets = 100e6
+   print("starting reference run...")
+   -- Drive the engine in short slices until the source's tx link has
+   -- counted npackets packets. The source app is looked up on every call,
+   -- so the same closure serves both the reference and production runs.
+   local run = function ()
+      while link.stats(engine.app_table.source.output.tx).txpackets < npackets do
+         engine.main({duration = 0.01, no_report = true})
+      end
+   end
+   local _, t0 = pmu.measure(run, events)
+   print("reference result:")
+   pmu.report(t0, {packet = npackets})
+   -- Switch to an empty configuration before building the production network.
+   engine.configure(config.new())
+   local c1 = config.new()
+   config.app(c1, "source", basic_apps.Source)
+   config.app(c1, "app", require(mod)[app], cfg)
+   config.app(c1, "sink", basic_apps.Sink)
+   config.link(c1, "source.tx->app."..inlink)
+   config.link(c1, "app."..outlink.."->sink.rx")
+   engine.configure(c1)
+   print("\nstarting production run...")
+   local _, t1 = pmu.measure(run, events)
+   print("production result:")
+   pmu.report(t1, {packet = npackets})
+   -- Per-counter difference: production minus reference.
+   local tdelta = {}
+   for k, v in pairs(t1) do tdelta[k] = v - t0[k] end
+   print("\ndifference from reference to production:")
+   pmu.report(tdelta, {packet = npackets})
+end
+