Skip to content

Commit

Permalink
Reduce overhead of sampling profiler by having only one thread do it (#…
Browse files Browse the repository at this point in the history
…6433)

* Reduce overhead of sampling profiler by having only one thread do it

* Use const ref

* One line per member
  • Loading branch information
abadams authored Dec 2, 2021
1 parent 479d839 commit 5cf9ae5
Show file tree
Hide file tree
Showing 2 changed files with 198 additions and 71 deletions.
213 changes: 165 additions & 48 deletions src/Profiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
#include <string>

#include "CodeGen_Internal.h"
#include "ExprUsesVar.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "Profiling.h"
#include "Scope.h"
#include "Simplify.h"
#include "Substitute.h"
#include "UniquifyVariableNames.h"
#include "Util.h"

namespace Halide {
Expand All @@ -21,6 +23,46 @@ using std::vector;

namespace {

Stmt incr_active_threads(const Expr &profiler_state) {
return Evaluate::make(Call::make(Int(32), "halide_profiler_incr_active_threads",
{profiler_state}, Call::Extern));
}

Stmt decr_active_threads(const Expr &profiler_state) {
return Evaluate::make(Call::make(Int(32), "halide_profiler_decr_active_threads",
{profiler_state}, Call::Extern));
}

Stmt acquire_sampling_token(const Expr &shared_token, const Expr &local_token) {
return Evaluate::make(Call::make(Int(32), "halide_profiler_acquire_sampling_token",
{shared_token, local_token}, Call::Extern));
}

Stmt release_sampling_token(const Expr &shared_token, const Expr &local_token) {
return Evaluate::make(Call::make(Int(32), "halide_profiler_release_sampling_token",
{shared_token, local_token}, Call::Extern));
}

Stmt activate_thread(const Stmt &s, const Expr &profiler_state) {
return Block::make({incr_active_threads(profiler_state),
s,
decr_active_threads(profiler_state)});
}

Stmt suspend_thread(const Stmt &s, const Expr &profiler_state) {
return Block::make({decr_active_threads(profiler_state),
s,
incr_active_threads(profiler_state)});
}

Stmt claim_sampling_token(const Stmt &s, const Expr &shared_token, const Expr &local_token) {
return LetStmt::make(local_token.as<Variable>()->name,
Call::make(Handle(), Call::alloca, {Int(32).bytes()}, Call::Intrinsic),
Block::make({acquire_sampling_token(shared_token, local_token),
s,
release_sampling_token(shared_token, local_token)}));
}

class InjectProfiling : public IRMutator {

public:
Expand All @@ -30,6 +72,10 @@ class InjectProfiling : public IRMutator {

string pipeline_name;

bool in_fork = false;
bool in_parallel = false;
bool in_leaf_task = false;

InjectProfiling(const string &pipeline_name)
: pipeline_name(pipeline_name) {
stack.push_back(get_func_id("overhead"));
Expand All @@ -41,6 +87,8 @@ class InjectProfiling : public IRMutator {
profiler_pipeline_state = Variable::make(Handle(), "profiler_pipeline_state");
profiler_state = Variable::make(Handle(), "profiler_state");
profiler_token = Variable::make(Int(32), "profiler_token");
profiler_local_sampling_token = Variable::make(Handle(), "profiler_local_sampling_token");
profiler_shared_sampling_token = Variable::make(Handle(), "profiler_shared_sampling_token");
}

map<int, uint64_t> func_stack_current; // map from func id -> current stack allocation
Expand All @@ -53,6 +101,13 @@ class InjectProfiling : public IRMutator {
Expr profiler_pipeline_state;
Expr profiler_state;
Expr profiler_token;
Expr profiler_local_sampling_token;
Expr profiler_shared_sampling_token;

// May need to be set to -1 at the start of control flow blocks
// that have multiple incoming edges, if all sources don't have
// the same most_recently_set_func.
int most_recently_set_func = -1;

struct AllocSize {
bool on_stack;
Expand Down Expand Up @@ -83,10 +138,17 @@ class InjectProfiling : public IRMutator {
return idx;
}

Stmt set_current_func(const Expr &id) {
Stmt set_current_func(int id) {
if (most_recently_set_func == id) {
return Evaluate::make(0);
}
most_recently_set_func = id;
Expr last_arg = in_leaf_task ? profiler_local_sampling_token : reinterpret(Handle(), cast<uint64_t>(0));
// This call gets inlined and becomes a single store instruction.
return Evaluate::make(Call::make(Int(32), "halide_profiler_set_current_func",
{profiler_state, profiler_token, id}, Call::Extern));
Stmt s = Evaluate::make(Call::make(Int(32), "halide_profiler_set_current_func",
{profiler_state, profiler_token, id, last_arg}, Call::Extern));

return s;
}

Expr compute_allocation_size(const vector<Expr> &extents,
Expand Down Expand Up @@ -147,7 +209,20 @@ class InjectProfiling : public IRMutator {
<< "; peak: " << func_stack_peak[idx] << "\n";
}

vector<Stmt> tasks;
bool track_heap_allocation = !is_const_zero(size) && !on_stack && profiling_memory;
if (track_heap_allocation) {
debug(3) << " Allocation on heap: " << op->name
<< "(" << size << ") in pipeline "
<< pipeline_name << "\n";

tasks.push_back(set_current_func(malloc_id));
tasks.push_back(Evaluate::make(Call::make(Int(32), "halide_profiler_memory_allocate",
{profiler_pipeline_state, idx, size}, Call::Extern)));
}

Stmt body = mutate(op->body);

Expr new_expr;
Stmt stmt;
if (op->new_expr.defined()) {
Expand All @@ -163,22 +238,9 @@ class InjectProfiling : public IRMutator {
new_extents, condition, body, new_expr, op->free_function);
}

if (!is_const_zero(size) && !on_stack && profiling_memory) {
debug(3) << " Allocation on heap: " << op->name
<< "(" << size << ") in pipeline "
<< pipeline_name << "\n";

vector<Stmt> tasks{
set_current_func(malloc_id),
Evaluate::make(Call::make(Int(32), "halide_profiler_memory_allocate",
{profiler_pipeline_state, idx, size}, Call::Extern)),
stmt,
set_current_func(stack.back())};
tasks.push_back(stmt);

stmt = Block::make(tasks);
}

return stmt;
return Block::make(tasks);
}

Stmt visit(const Free *op) override {
Expand Down Expand Up @@ -222,48 +284,43 @@ class InjectProfiling : public IRMutator {
if (op->is_producer) {
idx = get_func_id(op->name);
stack.push_back(idx);
body = mutate(op->body);
Stmt set_current = set_current_func(idx);
body = Block::make(set_current, mutate(op->body));
stack.pop_back();
} else {
body = mutate(op->body);
// At the beginning of the consume step, set the current task
// back to the outer one.
idx = stack.back();
Stmt set_current = set_current_func(stack.back());
body = Block::make(set_current, mutate(op->body));
}

body = Block::make(set_current_func(idx), body);

return ProducerConsumer::make(op->name, op->is_producer, body);
}

Stmt incr_active_threads() {
return Evaluate::make(Call::make(Int(32), "halide_profiler_incr_active_threads",
{profiler_state}, Call::Extern));
}

Stmt decr_active_threads() {
return Evaluate::make(Call::make(Int(32), "halide_profiler_decr_active_threads",
{profiler_state}, Call::Extern));
}

Stmt visit_parallel_task(const Stmt &s) {
Stmt visit_parallel_task(Stmt s) {
int old = most_recently_set_func;
if (const Fork *f = s.as<Fork>()) {
return Fork::make(visit_parallel_task(f->first), visit_parallel_task(f->rest));
s = Fork::make(visit_parallel_task(f->first), visit_parallel_task(f->rest));
} else if (const Acquire *a = s.as<Acquire>()) {
return Acquire::make(a->semaphore, a->count, visit_parallel_task(a->body));
s = Acquire::make(a->semaphore, a->count, visit_parallel_task(a->body));
} else {
return Block::make({incr_active_threads(), mutate(s), decr_active_threads()});
s = activate_thread(mutate(s), profiler_state);
}
if (most_recently_set_func != old) {
most_recently_set_func = -1;
}
return s;
}

Stmt visit(const Acquire *op) override {
Stmt s = visit_parallel_task(op);
return Block::make({decr_active_threads(), s, incr_active_threads()});
return suspend_thread(s, profiler_state);
}

Stmt visit(const Fork *op) override {
ScopedValue<bool> bind(in_fork, true);
Stmt s = visit_parallel_task(op);
return Block::make({decr_active_threads(), s, incr_active_threads()});
return suspend_thread(s, profiler_state);
}

Stmt visit(const For *op) override {
Expand All @@ -276,9 +333,40 @@ class InjectProfiling : public IRMutator {
bool update_active_threads = (op->device_api == DeviceAPI::Hexagon ||
op->is_unordered_parallel());

ScopedValue<bool> bind_in_parallel(in_parallel, in_parallel || op->is_unordered_parallel());

bool leaf_task = false;
if (update_active_threads) {
body = Block::make({incr_active_threads(), body, decr_active_threads()});
body = activate_thread(body, profiler_state);

class ContainsParallelOrBlockingNode : public IRVisitor {
using IRVisitor::visit;
void visit(const For *op) override {
result |= (op->is_unordered_parallel() ||
op->device_api != DeviceAPI::None);
IRVisitor::visit(op);
}
void visit(const Fork *op) override {
result = true;
}
void visit(const Acquire *op) override {
result = true;
}

public:
bool result = false;
} contains_parallel_or_blocking_node;

body.accept(&contains_parallel_or_blocking_node);
leaf_task = !contains_parallel_or_blocking_node.result;

if (leaf_task) {
body = claim_sampling_token(body, profiler_shared_sampling_token, profiler_local_sampling_token);
}
}
ScopedValue<bool> bind_leaf_task(in_leaf_task, in_leaf_task || leaf_task);

int old = most_recently_set_func;

// We profile by storing a token to global memory, so don't enter GPU loops
if (op->device_api == DeviceAPI::Hexagon) {
Expand All @@ -304,13 +392,36 @@ class InjectProfiling : public IRMutator {
body = op->body;
}

if (old != most_recently_set_func) {
most_recently_set_func = -1;
}

Stmt stmt = For::make(op->name, op->min, op->extent, op->for_type, op->device_api, body);

if (update_active_threads) {
stmt = Block::make({decr_active_threads(), stmt, incr_active_threads()});
stmt = suspend_thread(stmt, profiler_state);
}

return stmt;
}

Stmt visit(const IfThenElse *op) override {
int old = most_recently_set_func;
Expr condition = mutate(op->condition);
Stmt then_case = mutate(op->then_case);
int func_computed_in_then = most_recently_set_func;
most_recently_set_func = old;
Stmt else_case = mutate(op->else_case);
if (most_recently_set_func != func_computed_in_then) {
most_recently_set_func = -1;
}
if (condition.same_as(op->condition) &&
then_case.same_as(op->then_case) &&
else_case.same_as(op->else_case)) {
return op;
}
return IfThenElse::make(std::move(condition), std::move(then_case), std::move(else_case));
}
};

} // namespace
Expand Down Expand Up @@ -346,13 +457,16 @@ Stmt inject_profiling(Stmt s, const string &pipeline_name) {
}

Expr profiler_state = Variable::make(Handle(), "profiler_state");
Stmt incr_active_threads =
Evaluate::make(Call::make(Int(32), "halide_profiler_incr_active_threads",
{profiler_state}, Call::Extern));
Stmt decr_active_threads =
Evaluate::make(Call::make(Int(32), "halide_profiler_decr_active_threads",
{profiler_state}, Call::Extern));
s = Block::make({incr_active_threads, s, decr_active_threads});

s = activate_thread(s, profiler_state);

// Initialize the shared sampling token
Expr shared_sampling_token_var = Variable::make(Handle(), "profiler_shared_sampling_token");
Expr init_sampling_token =
Call::make(Int(32), "halide_profiler_init_sampling_token", {shared_sampling_token_var, 0}, Call::Extern);
s = Block::make({Evaluate::make(init_sampling_token), s});
s = LetStmt::make("profiler_shared_sampling_token",
Call::make(Handle(), Call::alloca, {Int(32).bytes()}, Call::Intrinsic), s);

s = LetStmt::make("profiler_pipeline_state", get_pipeline_state, s);
s = LetStmt::make("profiler_state", get_state, s);
Expand Down Expand Up @@ -383,6 +497,9 @@ Stmt inject_profiling(Stmt s, const string &pipeline_name) {
MemoryType::Auto, {num_funcs}, const_true(), s);
s = Block::make(Evaluate::make(stop_profiler), s);

// We have nested definitions of the sampling token
s = uniquify_variable_names(s);

return s;
}

Expand Down
Loading

0 comments on commit 5cf9ae5

Please sign in to comment.