diff --git a/syncd/Makefile.am b/syncd/Makefile.am index 1ae9315a353f..3082d142b936 100644 --- a/syncd/Makefile.am +++ b/syncd/Makefile.am @@ -16,6 +16,7 @@ endif noinst_LIBRARIES = libSyncd.a libSyncdRequestShutdown.a libSyncd_a_SOURCES = \ + WatchdogScope.cpp \ SaiSwitchInterface.cpp \ ZeroMQSelectableChannel.cpp \ RedisSelectableChannel.cpp \ diff --git a/syncd/Syncd.cpp b/syncd/Syncd.cpp index 931c10ea9de6..db2ca0efd2ab 100644 --- a/syncd/Syncd.cpp +++ b/syncd/Syncd.cpp @@ -14,7 +14,7 @@ #include "RedisSelectableChannel.h" #include "ZeroMQSelectableChannel.h" #include "PerformanceIntervalTimer.h" -#include "TimerWatchdog.h" +#include "WatchdogScope.h" #include "sairediscommon.h" @@ -50,7 +50,8 @@ Syncd::Syncd( m_asicInitViewMode(false), // by default we are in APPLY view mode m_vendorSai(vendorSai), m_veryFirstRun(false), - m_enableSyncMode(false) + m_enableSyncMode(false), + m_timerWatchdog(30 * 1000000) // watch for executions over 30 seconds { SWSS_LOG_ENTER(); @@ -317,6 +318,8 @@ sai_status_t Syncd::processSingleEvent( return SAI_STATUS_SUCCESS; } + WatchdogScope ws(m_timerWatchdog, op + ":" + key); + if (op == REDIS_ASIC_STATE_COMMAND_CREATE) return processQuadEvent(SAI_COMMON_API_CREATE, kco); @@ -1956,6 +1959,8 @@ void Syncd::processFlexCounterGroupEvent( // TODO must be moved to go via ASIC c auto& op = kfvOp(kco); auto& values = kfvFieldsValues(kco); + WatchdogScope ws(m_timerWatchdog, op + ":" + groupName); + if (op == SET_COMMAND) { m_manager->addCounterPlugin(groupName, values); @@ -1984,6 +1989,8 @@ void Syncd::processFlexCounterEvent( // TODO must be moved to go via ASIC channe auto& key = kfvKey(kco); auto& op = kfvOp(kco); + WatchdogScope ws(m_timerWatchdog, op + ":" + key); + auto delimiter = key.find_first_of(":"); if (delimiter == std::string::npos) @@ -4372,11 +4379,9 @@ void Syncd::run() runMainLoop = false; } - TimerWatchdog twd(30 * 1000000); // watch for executions over 30 seconds + m_timerWatchdog.setCallback(timerWatchdogCallback); - twd.setCallback(timerWatchdogCallback); - - while(runMainLoop) + while (runMainLoop) { try { @@ -4384,8 +4389,6 @@ void Syncd::run() int result = s->select(&sel); - twd.setStartTime(); - if (sel == m_restartQuery.get()) { /* @@ -4405,6 +4408,8 @@ void Syncd::run() SWSS_LOG_NOTICE("drained queue"); + WatchdogScope ws(m_timerWatchdog, "restart query"); + shutdownType = handleRestartQuery(*m_restartQuery); if (shutdownType != SYNCD_RESTART_TYPE_PRE_SHUTDOWN) @@ -4474,8 +4479,6 @@ void Syncd::run() { SWSS_LOG_ERROR("select failed: %d", result); } - - twd.setEndTime(); } catch(const std::exception &e) { @@ -4492,11 +4495,11 @@ void Syncd::run() // make sure that if second exception will arise, then we break the loop m_commandLineOptions->m_disableExitSleep = true; - - twd.setEndTime(); } } + WatchdogScope ws(m_timerWatchdog, "shutting down syncd"); + if (shutdownType == SYNCD_RESTART_TYPE_WARM) { const char *warmBootWriteFile = profileGetValue(0, SAI_KEY_WARM_BOOT_WRITE_FILE); @@ -4572,6 +4575,8 @@ syncd_restart_type_t Syncd::handleRestartQuery( restartQuery.pop(op, data, values); + m_timerWatchdog.setEventName(op + ":" + data); + SWSS_LOG_NOTICE("received %s switch shutdown event", op.c_str()); return RequestShutdownCommandLineOptions::stringToRestartType(op); diff --git a/syncd/Syncd.h b/syncd/Syncd.h index dc3c5b996277..4cea8fac5abe 100644 --- a/syncd/Syncd.h +++ b/syncd/Syncd.h @@ -17,6 +17,7 @@ #include "BreakConfig.h" #include "NotificationProducerBase.h" #include "SelectableChannel.h" +#include "TimerWatchdog.h" #include "meta/SaiAttributeList.h" @@ -489,5 +490,7 @@ namespace syncd std::shared_ptr m_contextConfig; std::shared_ptr m_breakConfig; + + TimerWatchdog m_timerWatchdog; }; } diff --git a/syncd/TimerWatchdog.cpp b/syncd/TimerWatchdog.cpp index c51588b9306a..6f50ca2a32f9 100644 --- a/syncd/TimerWatchdog.cpp +++ b/syncd/TimerWatchdog.cpp @@ -4,6 +4,8 @@ #include +#define MUTEX std::lock_guard _lock(m_mutex); + using namespace syncd; TimerWatchdog::TimerWatchdog( @@ -35,6 +37,7 @@ TimerWatchdog::~TimerWatchdog() void TimerWatchdog::setStartTime() { + MUTEX; SWSS_LOG_ENTER(); do @@ -42,11 +45,11 @@ void TimerWatchdog::setStartTime() m_startTimestamp = getTimeSinceEpoch(); } while (m_startTimestamp <= m_endTimestamp); // make sure new start time is always past last end time - } void TimerWatchdog::setEndTime() { + MUTEX; SWSS_LOG_ENTER(); do @@ -54,11 +57,34 @@ void TimerWatchdog::setEndTime() m_endTimestamp = getTimeSinceEpoch(); } while (m_endTimestamp <= m_startTimestamp); // make sure new end time is always past last start time + + auto span = m_endTimestamp - m_startTimestamp; + + if (span > m_warnTimespan) + { + SWSS_LOG_ERROR("event '%s' took %ld ms to execute", m_eventName.c_str(), span/1000); + } + else if (span > 50*1000) + { + SWSS_LOG_WARN("event '%s' took %ld ms to execute", m_eventName.c_str(), span/1000); + } + + m_eventName = ""; +} + +void TimerWatchdog::setEventName( + _In_ const std::string& eventName) +{ + MUTEX; + SWSS_LOG_ENTER(); + + m_eventName = eventName; } void TimerWatchdog::setCallback( _In_ std::function callback) { + MUTEX; SWSS_LOG_ENTER(); m_callback = callback; @@ -74,6 +100,8 @@ void TimerWatchdog::threadFunction() { std::this_thread::sleep_for(std::chrono::seconds(1)); + MUTEX; // to protect event name + // we make local copies, since executing functions can be so fast that // when we will read second time start timestamp it can be different // than previous one @@ -94,7 +122,7 @@ void TimerWatchdog::threadFunction() span = now - start; // this must be always non negative - SWSS_LOG_NOTICE("time span %ld ms", span/1000); + SWSS_LOG_NOTICE("time span %ld ms for '%s'", span/1000, m_eventName.c_str()); // TODO remove this if (span < 0) SWSS_LOG_THROW("negative span 'now - start': %ld - %ld", now, start); @@ -105,12 +133,7 @@ void TimerWatchdog::threadFunction() // function probably hanged - SWSS_LOG_WARN("timespan WD exceeded %ld ms", span/1000); - - auto callback = m_callback; - - if (callback) - callback(span); + SWSS_LOG_ERROR("time span WD exceeded %ld ms for %s", span/1000, m_eventName.c_str()); } continue; diff --git a/syncd/TimerWatchdog.h b/syncd/TimerWatchdog.h index 4599c247f5dd..1768da4ad2df 100644 --- a/syncd/TimerWatchdog.h +++ b/syncd/TimerWatchdog.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace syncd { @@ -23,6 +24,9 @@ namespace syncd void setEndTime(); + void setEventName( + _In_ const std::string& eventName); + void setCallback( _In_ std::function callback); @@ -51,5 +55,9 @@ namespace syncd std::shared_ptr m_thread; std::function m_callback; + + std::string m_eventName; + + std::mutex m_mutex; }; } diff --git a/syncd/WatchdogScope.cpp b/syncd/WatchdogScope.cpp new file mode 100644 index 000000000000..2fa90bd5042b --- /dev/null +++ b/syncd/WatchdogScope.cpp @@ -0,0 +1,22 @@ +#include "WatchdogScope.h" + +using namespace syncd; + +WatchdogScope::WatchdogScope( + _In_ TimerWatchdog& tw, + _In_ const std::string eventName): + m_tw(tw) +{ + // SWSS_LOG_ENTER(); // disabled + + m_tw.setStartTime(); + + m_tw.setEventName(eventName); +} + +WatchdogScope::~WatchdogScope() +{ + // SWSS_LOG_ENTER(); // disabled + + m_tw.setEndTime(); +} diff --git a/syncd/WatchdogScope.h b/syncd/WatchdogScope.h new file mode 100644 index 000000000000..13c67d5d96be --- /dev/null +++ b/syncd/WatchdogScope.h @@ -0,0 +1,21 @@ +#pragma once + +#include "TimerWatchdog.h" + +namespace syncd +{ + class WatchdogScope + { + public: + + WatchdogScope( + _In_ TimerWatchdog& tw, + _In_ const std::string eventName); + + ~WatchdogScope(); + + private: + + TimerWatchdog& m_tw; + }; +}