Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ORT ETW dynamic logging that improves ORT diagnosability & performance #18882

Merged
merged 22 commits into from
Jan 11, 2024
Merged
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
f7d1fd5
Initial version of dynamic ETW logger
ivberg Nov 21, 2023
5c9af58
Also fix logger created with CreateEnvWithCustomLogger API
ivberg Nov 21, 2023
b9e95b5
Add support to Telemetry provider to expose if enabled, level, keywor…
ivberg Nov 30, 2023
421249d
Support CompositeSink having seperate min severity so that ETWSink ca…
ivberg Nov 30, 2023
1231a32
Support writing QNN profiling events to ETW if the provider is enabled
ivberg Nov 30, 2023
ffec546
Fix forgotten override we removed after refactor
ivberg Dec 1, 2023
f40530e
Take the min of the severity as we don't know which one is more verbose
ivberg Dec 1, 2023
fe208ee
LintRunner
ivberg Dec 1, 2023
4efea8b
move per inference very verbose log statements to VLOGs
ivberg Dec 14, 2023
0a92598
Mark all ETW events with appropiate keyword and level. Cleanup
ivberg Dec 14, 2023
e56d951
Fine-tune QNN Profiling and logging to be aware of common ETW levels …
ivberg Dec 15, 2023
7e67f71
Introduce appropiate ETW levels of some existing "Telemetry" events f…
ivberg Dec 15, 2023
281d624
Change ORT.wprp to level 5
ivberg Dec 19, 2023
d7f4d5a
LintRunner
ivberg Dec 20, 2023
17fad57
Fixes for PR cross-platform build, cpplint
ivberg Dec 20, 2023
e083dcc
Lintrunner
ivberg Dec 20, 2023
e757042
More PR fixes lintcpp and Linux build
ivberg Dec 20, 2023
dc2fb4d
More PR fixes for lintcpp and Linux build
ivberg Dec 20, 2023
b070134
Small changes for PR comments
ivberg Jan 8, 2024
80f08b5
Lintrunner strikes again
ivberg Jan 8, 2024
be85e9d
Address PR comments with renames and comments
ivberg Jan 8, 2024
7e387c6
Rename ORTTraceLoggingKeyword and more lint NITs
ivberg Jan 9, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions include/onnxruntime/core/common/logging/logging.h
Original file line number Diff line number Diff line change
@@ -75,6 +75,21 @@ struct Category {
// TODO: What other high level categories are meaningful? Model? Optimizer? Execution?
};

/// <summary>
/// ORT TraceLogging keywords for categories of dynamic logging enablement
/// </summary>
enum class ORTTraceLoggingKeyword : uint64_t {
Session = 0x1, // ORT Session TraceLoggingWrite
Logs = 0x2, // LOGS() Macro ORT logs. Pair with an appropriate level depending on detail required
Reserved1 = 0x4, // Reserved if we want to add some specific sub-categories instead of just LOGS() or other uses
Reserved2 = 0x8,
Reserved3 = 0x10,
Reserved4 = 0x20,
Reserved5 = 0x40,
Reserved6 = 0x80,
Profiling = 0x100 // Enables profiling. At higher levels >5 can impact inference performance
};

class ISink;
class Logger;
class Capture;
@@ -333,5 +348,17 @@ unsigned int GetThreadId();
*/
unsigned int GetProcessId();

/**
If the ONNXRuntimeTraceLoggingProvider ETW Provider is enabled, then adds to the existing logger.
*/
std::unique_ptr<ISink> EnhanceLoggerWithEtw(std::unique_ptr<ISink> existingLogger, logging::Severity originalSeverity,
ivberg marked this conversation as resolved.
Show resolved Hide resolved
logging::Severity etwSeverity);

/**
If the ONNXRuntimeTraceLoggingProvider ETW Provider is enabled, then can override the logging level.
But this overrided level only applies to the ETW sink. The original logger(s) retain their original logging level
*/
Severity OverrideLevelWithEtw(Severity originalSeverity);

} // namespace logging
} // namespace onnxruntime
33 changes: 33 additions & 0 deletions onnxruntime/core/common/logging/logging.cc
Original file line number Diff line number Diff line change
@@ -12,6 +12,8 @@

#ifdef _WIN32
#include <Windows.h>
#include "core/platform/windows/logging/etw_sink.h"
#include "core/common/logging/sinks/composite_sink.h"
#else
#include <unistd.h>
#if defined(__MACH__) || defined(__wasm__) || defined(_AIX)
@@ -243,5 +245,36 @@ unsigned int GetProcessId() {
#endif
}

std::unique_ptr<ISink> EnhanceLoggerWithEtw(std::unique_ptr<ISink> existingLogger, logging::Severity originalSeverity,
logging::Severity etwSeverity) {
#ifdef _WIN32
auto& manager = EtwRegistrationManager::Instance();
if (manager.IsEnabled()) {
auto compositeSink = std::make_unique<CompositeSink>();
compositeSink->AddSink(std::move(existingLogger), originalSeverity);
compositeSink->AddSink(std::make_unique<EtwSink>(), etwSeverity);
return compositeSink;
} else {
return existingLogger;
}
#else
// On non-Windows platforms, just return the existing logger
(void)originalSeverity;
(void)etwSeverity;
return existingLogger;
#endif // _WIN32
}

Severity OverrideLevelWithEtw(Severity originalSeverity) {
#ifdef _WIN32
auto& manager = logging::EtwRegistrationManager::Instance();
if (manager.IsEnabled() &&
(manager.Keyword() & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) {
return manager.MapLevelToSeverity();
}
#endif // _WIN32
return originalSeverity;
}

} // namespace logging
} // namespace onnxruntime
23 changes: 18 additions & 5 deletions onnxruntime/core/common/logging/sinks/composite_sink.h
Original file line number Diff line number Diff line change
@@ -5,6 +5,8 @@

#include <string>
#include <vector>
#include <utility>
#include <memory>

#include "core/common/logging/isink.h"
#include "core/common/logging/logging.h"
@@ -27,20 +29,31 @@ class CompositeSink : public ISink {
/// Adds a sink. Takes ownership of the sink (so pass unique_ptr by value).
/// </summary>
/// <param name="sink">The sink.</param>
/// <param name="severity">The min severity to send a message to that sink</param>
/// <returns>This instance to allow chaining.</returns>
CompositeSink& AddSink(std::unique_ptr<ISink> sink) {
sinks_.push_back(std::move(sink));
CompositeSink& AddSink(std::unique_ptr<ISink> sink, logging::Severity severity) {
sinks_with_severity_.emplace_back(std::move(sink), severity);
return *this;
}

/// <summary>
/// Gets a const reference to the collection of sinks and min severity for that sink
/// </summary>
/// <returns>A const reference to the vector pair of unique_ptr to ISink and severity.</returns>
const std::vector<std::pair<std::unique_ptr<ISink>, logging::Severity>>& GetSinks() const {
return sinks_with_severity_;
}

private:
void SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) override {
for (auto& sink : sinks_) {
sink->Send(timestamp, logger_id, message);
for (auto& sink_pair : sinks_with_severity_) {
if (message.Severity() >= sink_pair.second) {
sink_pair.first->Send(timestamp, logger_id, message);
}
}
}

std::vector<std::unique_ptr<ISink>> sinks_;
std::vector<std::pair<std::unique_ptr<ISink>, logging::Severity>> sinks_with_severity_;
};
} // namespace logging
} // namespace onnxruntime
3 changes: 3 additions & 0 deletions onnxruntime/core/framework/execution_providers.h
Original file line number Diff line number Diff line change
@@ -13,6 +13,7 @@
#include "core/graph/graph_viewer.h"
#include "core/common/logging/logging.h"
#ifdef _WIN32
#include <winmeta.h>
#include "core/platform/tracing.h"
#endif

@@ -47,6 +48,8 @@ class ExecutionProviders {
TraceLoggingWrite(
telemetry_provider_handle,
"ProviderOptions",
TraceLoggingKeyword(static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Session)),
TraceLoggingLevel(WINEVENT_LEVEL_INFO),
TraceLoggingString(provider_id.c_str(), "ProviderId"),
TraceLoggingString(config_pair.first.c_str(), "Key"),
TraceLoggingString(config_pair.second.c_str(), "Value"));
6 changes: 3 additions & 3 deletions onnxruntime/core/framework/sequential_executor.cc
Original file line number Diff line number Diff line change
@@ -181,7 +181,7 @@ class SessionScope {
}

auto& logger = session_state_.Logger();
LOGS(logger, VERBOSE) << "Begin execution";
VLOGS(logger, 0) << "Begin execution";
const SequentialExecutionPlan& seq_exec_plan = *session_state_.GetExecutionPlan();
const auto& exec_plan_vec = seq_exec_plan.execution_plan;
VLOGS(logger, 1) << "Size of execution plan vector: " << exec_plan_vec.size();
@@ -515,7 +515,7 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx,
return Status(status.Category(), status.Code(), msg_string);
}
ctx.RecycleNodeInputs(idx);
LOGS(logger, VERBOSE) << "stream " << stream_idx << " launch kernel with idx " << idx;
VLOGS(logger, 0) << "stream " << stream_idx << " launch kernel with idx " << idx;
return Status::OK();
}

@@ -531,7 +531,7 @@ onnxruntime::Status ExecuteThePlan(const SessionState& session_state, gsl::span<
const bool only_execute_path_to_fetches,
bool single_thread_mode) {
auto* execution_plan = session_state.GetExecutionPlan();
LOGS(logger, VERBOSE) << "Number of streams: " << execution_plan->execution_plan.size();
VLOGS(logger, 0) << "Number of streams: " << execution_plan->execution_plan.size();
int32_t valid_streams = 0;
for (auto& stream : execution_plan->execution_plan) {
if (stream && stream->steps_.size() > 0)
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/stream_execution_context.cc
Original file line number Diff line number Diff line change
@@ -168,7 +168,7 @@ void StreamExecutionContext::RecycleNodeInputs(onnxruntime::NodeIndex node_index
for (auto idx : execution_plan->node_release_list[node_index]) {
if (--release_plan_[idx] == 0) {
ORT_ENFORCE(frame_.ReleaseMLValue(static_cast<int>(execution_plan->release_actions[idx].value_index)).IsOK());
LOGS(*logger_, VERBOSE) << "ort value " << execution_plan->release_actions[idx].value_index << " released";
VLOGS(*logger_, 0) << "ort value " << execution_plan->release_actions[idx].value_index << " released";
}
}
}
15 changes: 15 additions & 0 deletions onnxruntime/core/platform/telemetry.cc
Original file line number Diff line number Diff line change
@@ -12,6 +12,21 @@ void LogRuntimeError(uint32_t sessionId, const common::Status& status, const cha
env.GetTelemetryProvider().LogRuntimeError(sessionId, status, file, function, line);
}

bool Telemetry::IsEnabled() const {
return false;
}

// Get the current logging level
// The Level defined as uchar is coming from the ETW Enable callback in TraceLoggingRegisterEx.
unsigned char Telemetry::Level() const {
ivberg marked this conversation as resolved.
Show resolved Hide resolved
return 0;
}

// Get the current keyword
uint64_t Telemetry::Keyword() const {
return 0;
}

void Telemetry::EnableTelemetryEvents() const {
}

8 changes: 8 additions & 0 deletions onnxruntime/core/platform/telemetry.h
Original file line number Diff line number Diff line change
@@ -38,6 +38,14 @@ class Telemetry {
virtual void DisableTelemetryEvents() const;
virtual void SetLanguageProjection(uint32_t projection) const;

virtual bool IsEnabled() const;

// Get the current logging level
virtual unsigned char Level() const;

// Get the current keyword
virtual uint64_t Keyword() const;

virtual void LogProcessInfo() const;

virtual void LogSessionCreationStart() const;
128 changes: 97 additions & 31 deletions onnxruntime/core/platform/windows/logging/etw_sink.cc
Original file line number Diff line number Diff line change
@@ -58,52 +58,116 @@ TRACELOGGING_DEFINE_PROVIDER(etw_provider_handle, "ONNXRuntimeTraceLoggingProvid
#pragma warning(pop)
#endif

// Class to unregister ETW provider at shutdown.
// We expect one static instance to be created for the lifetime of the program.
class EtwRegistrationManager {
public:
static EtwRegistrationManager& Register() {
const HRESULT etw_status = ::TraceLoggingRegister(etw_provider_handle);

if (FAILED(etw_status)) {
ORT_THROW("ETW registration failed. Logging will be broken: " + std::to_string(etw_status));
}
EtwRegistrationManager& EtwRegistrationManager::Instance() {
static EtwRegistrationManager instance;
instance.LazyInitialize();
return instance;
}

// return an instance that is just used to unregister as the program exits
static EtwRegistrationManager instance(etw_status);
return instance;
}
bool EtwRegistrationManager::IsEnabled() const {
std::lock_guard<OrtMutex> lock(provider_change_mutex_);
return is_enabled_;
}

UCHAR EtwRegistrationManager::Level() const {
std::lock_guard<OrtMutex> lock(provider_change_mutex_);
return level_;
}

const HRESULT Status() const noexcept {
return etw_status_;
Severity EtwRegistrationManager::MapLevelToSeverity() {
switch (level_) {
case TRACE_LEVEL_NONE:
return Severity::kFATAL; // There is no none severity option
case TRACE_LEVEL_VERBOSE:
return Severity::kVERBOSE;
case TRACE_LEVEL_INFORMATION:
return Severity::kINFO;
case TRACE_LEVEL_WARNING:
return Severity::kWARNING;
case TRACE_LEVEL_ERROR:
return Severity::kERROR;
case TRACE_LEVEL_CRITICAL:
return Severity::kFATAL;
default:
return Severity::kVERBOSE;
}
}

ULONGLONG EtwRegistrationManager::Keyword() const {
std::lock_guard<OrtMutex> lock(provider_change_mutex_);
return keyword_;
}

~EtwRegistrationManager() {
::TraceLoggingUnregister(etw_provider_handle);
HRESULT EtwRegistrationManager::Status() const {
return etw_status_;
}

void EtwRegistrationManager::RegisterInternalCallback(const EtwInternalCallback& callback) {
std::lock_guard<OrtMutex> lock(callbacks_mutex_);
callbacks_.push_back(callback);
}

void NTAPI EtwRegistrationManager::ORT_TL_EtwEnableCallback(
_In_ LPCGUID SourceId,
_In_ ULONG IsEnabled,
_In_ UCHAR Level,
_In_ ULONGLONG MatchAnyKeyword,
_In_ ULONGLONG MatchAllKeyword,
_In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData,
_In_opt_ PVOID CallbackContext) {
auto& manager = EtwRegistrationManager::Instance();
{
std::lock_guard<OrtMutex> lock(manager.provider_change_mutex_);
manager.is_enabled_ = (IsEnabled != 0);
manager.level_ = Level;
manager.keyword_ = MatchAnyKeyword;
}
manager.InvokeCallbacks(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext);
}

EtwRegistrationManager::~EtwRegistrationManager() {
::TraceLoggingUnregister(etw_provider_handle);
}

private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(EtwRegistrationManager);
EtwRegistrationManager::EtwRegistrationManager() {
}

EtwRegistrationManager(const HRESULT status) noexcept : etw_status_{status} {}
const HRESULT etw_status_;
};
void EtwRegistrationManager::LazyInitialize() {
if (!initialized_) {
std::lock_guard<OrtMutex> lock(init_mutex_);
if (!initialized_) { // Double-check locking pattern
initialized_ = true;
etw_status_ = ::TraceLoggingRegisterEx(etw_provider_handle, ORT_TL_EtwEnableCallback, nullptr);
if (FAILED(etw_status_)) {
ORT_THROW("ETW registration failed. Logging will be broken: " + std::to_string(etw_status_));
}
}
}
}

void EtwRegistrationManager::InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword,
ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData,
PVOID CallbackContext) {
std::lock_guard<OrtMutex> lock(callbacks_mutex_);
for (const auto& callback : callbacks_) {
callback(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext);
}
}

void EtwSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) {
UNREFERENCED_PARAMETER(timestamp);

// register on first usage
static EtwRegistrationManager& etw_manager = EtwRegistrationManager::Register();
static EtwRegistrationManager& etw_manager = EtwRegistrationManager::Instance();

// do something (not that meaningful) with etw_manager so it doesn't get optimized out
// as we want an instance around to do the unregister
if (FAILED(etw_manager.Status())) {
return;
}

// Do we want to output Verbose level messages via ETW at any point it time?
// TODO: Validate if this filtering makes sense.
if (message.Severity() <= Severity::kVERBOSE || message.DataType() == DataType::USER) {
if (message.DataType() == DataType::USER) {
return;
}

@@ -114,11 +178,13 @@ void EtwSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id,
// TraceLoggingWrite requires (painfully) a compile time constant for the TraceLoggingLevel,
// forcing us to use an ugly macro for the call.
#define ETW_EVENT_NAME "ONNXRuntimeLogEvent"
#define TRACE_LOG_WRITE(level) \
TraceLoggingWrite(etw_provider_handle, ETW_EVENT_NAME, TraceLoggingLevel(level), \
TraceLoggingString(logger_id.c_str(), "logger"), \
TraceLoggingString(message.Category(), "category"), \
TraceLoggingString(message.Location().ToString().c_str(), "location"), \
#define TRACE_LOG_WRITE(level) \
TraceLoggingWrite(etw_provider_handle, ETW_EVENT_NAME, \
TraceLoggingKeyword(static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)), \
TraceLoggingLevel(level), \
TraceLoggingString(logger_id.c_str(), "logger"), \
TraceLoggingString(message.Category(), "category"), \
TraceLoggingString(message.Location().ToString().c_str(), "location"), \
TraceLoggingString(message.Message().c_str(), "message"))

const auto severity{message.Severity()};
Loading
Loading