From 002bb1d92fdc04435ac2d269cebf82996ab036da Mon Sep 17 00:00:00 2001 From: judyjoseph <53951155+judyjoseph@users.noreply.github.com> Date: Thu, 16 Sep 2021 19:20:29 -0700 Subject: [PATCH] [tlm teamd] Add retry mechanism before logging the ERR in get_dumps. (#1629) Fix https://github.com/Azure/sonic-buildimage/issues/6632 There has been cases when the get_dumps API in tlm_teamd process is not able to get the right data and logs an error message. The issue occurs very rarely and it is due to the race condition between teammgrd/teamsyncd/tlm_teamd when a Portchannel is removed. In the teamd telemetry module there are two places where the get_dumps() is called. 1. When the portchannel object is add/removed. [https://github.com/Azure/sonic-swss/blob/master/tlm_teamd/main.cpp#L101] 2. On timeout of 1 sec. [https://github.com/Azure/sonic-swss/blob/master/tlm_teamd/main.cpp#L108] In case of timeout call for get_dumps(), there could be an inconsistent state where the portchannel/teamd process is getting removed by teammgrd but the STATE table update to remove the lag interface is still not received by the tlm_teamd module. Seen below on a bad case where the get_dumps() call from TIMEOUT handler throws an ERR message - as the remove_lag message is not yet received. On a good case ``` Feb 7 02:03:27.576078 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' has been removed. Feb 7 02:03:28.453829 vlab-01 INFO teamd#supervisord 2021-02-07 02:03:28,451 INFO reaped unknown pid 4747 (exit status 0) Feb 7 02:03:28.458616 vlab-01 NOTICE teamd#teammgrd: :- removeLag: Stop port channel PortChannel999 ``` On a bad case ``` Feb 7 02:03:33.037401 vlab-01 ERR teamd#tlm_teamd: :- get_dump: Can't get dump for LAG 'PortChannel999'. Skipping Feb 7 02:03:33.046179 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' has been removed. Feb 7 02:03:33.997639 vlab-01 INFO teamd#supervisord 2021-02-07 02:03:33,996 INFO reaped unknown pid 4775 (exit status 0) Feb 7 02:03:34.040126 vlab-01 NOTICE teamd#teammgrd: :- removeLag: Stop port channel PortChannel999 ``` **How I did it** Add retry mechanism before logging the ERR in get_dumps API(). The number of retries is set as 3. So that if the same error repeats 3 times - it is logged, other wise it is considered a transient condition - not an error. Additionally added a **to_retry** flag to get_dumps() API so that the caller can decide whether to use the retry mechanism or not. **How I verified it** Verified that the error message is no more seen in the syslog. Confirmed by running ~ 200 times portchannel creation (which had reproduced the issue earlier on VS testbed). The new NOTICE message added in remove_lag shows that we had indeed hit the original issue earlier and clearing flags here. ``` admin@vlab-01:/var/log$ sudo zgrep -i "get dump for LAG" syslog*; sudo zgrep -i "clearing it" syslog* syslog.1:Feb 8 06:41:54.995716 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' had errored while getting dump, clearing it syslog.2.gz:Feb 8 06:31:32.360135 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' had errored while getting dump, clearing it syslog.2.gz:Feb 8 06:36:16.617283 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' had errored while getting dump, clearing it syslog.2.gz:Feb 8 06:37:56.906306 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' had errored while getting dump, clearing it syslog.3.gz:Feb 8 06:25:44.442474 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' had errored while getting dump, clearing it syslog.3.gz:Feb 8 06:27:02.539413 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' had errored while getting dump, clearing it syslog.3.gz:Feb 8 06:27:42.785533 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' had errored while getting dump, clearing it syslog.3.gz:Feb 8 06:29:33.510933 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' had errored while getting dump, clearing it syslog.5.gz:Feb 8 06:08:03.643106 vlab-01 NOTICE teamd#tlm_teamd: :- remove_lag: The LAG 'PortChannel999' had errored while getting dump, clearing it ``` --- tlm_teamd/main.cpp | 7 ++++-- tlm_teamd/teamdctl_mgr.cpp | 51 +++++++++++++++++++++++++++++++++++--- tlm_teamd/teamdctl_mgr.h | 6 +++-- 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/tlm_teamd/main.cpp b/tlm_teamd/main.cpp index 3f00c569406c..3a464e8a67b9 100644 --- a/tlm_teamd/main.cpp +++ b/tlm_teamd/main.cpp @@ -98,7 +98,7 @@ int main() if (res == swss::Select::OBJECT) { update_interfaces(sst_lag, teamdctl_mgr); - values_store.update(teamdctl_mgr.get_dumps()); + values_store.update(teamdctl_mgr.get_dumps(false)); } else if (res == swss::Select::ERROR) { @@ -108,7 +108,10 @@ int main() else if (res == swss::Select::TIMEOUT) { teamdctl_mgr.process_add_queue(); - values_store.update(teamdctl_mgr.get_dumps()); + // In the case of lag removal, there is a scenario where the select::TIMEOUT + // occurs, it triggers get_dumps incorrectly for resource which was in process of + // getting deleted. The fix here is to retry and check if this is a real failure. + values_store.update(teamdctl_mgr.get_dumps(true)); } else { diff --git a/tlm_teamd/teamdctl_mgr.cpp b/tlm_teamd/teamdctl_mgr.cpp index 3ea374c797d4..6a7a36f7400f 100644 --- a/tlm_teamd/teamdctl_mgr.cpp +++ b/tlm_teamd/teamdctl_mgr.cpp @@ -5,6 +5,8 @@ #include "teamdctl_mgr.h" +#define MAX_RETRY 3 + /// /// Custom function for libteamdctl logger. IT is empty to prevent libteamdctl to spam us with the error messages /// @param tdc teamdctl descriptor @@ -136,6 +138,15 @@ bool TeamdCtlMgr::remove_lag(const std::string & lag_name) { SWSS_LOG_WARN("The LAG '%s' hasn't been added. Can't remove it", lag_name.c_str()); } + + // If this lag interface errored last time, remove the entry. + // This is needed as here in this remove API, we do m_handlers.erase(lag_name). + if (m_lags_err_retry.find(lag_name) != m_lags_err_retry.end()) + { + SWSS_LOG_NOTICE("The LAG '%s' had errored while getting dump, removing it", lag_name.c_str()); + m_lags_err_retry.erase(lag_name); + } + return true; } @@ -163,11 +174,12 @@ void TeamdCtlMgr::process_add_queue() /// /// Get json dump from teamd for LAG interface with name lag_name /// @param lag_name a name for LAG interface +/// @param to_retry is the flag used to do retry or not. /// @return a pair. First element of the pair is true, if the method is successful /// false otherwise. If the first element is true, the second element has a dump /// otherwise the second element is an empty string /// -TeamdCtlDump TeamdCtlMgr::get_dump(const std::string & lag_name) +TeamdCtlDump TeamdCtlMgr::get_dump(const std::string & lag_name, bool to_retry) { TeamdCtlDump res = { false, "" }; if (has_key(lag_name)) @@ -178,10 +190,41 @@ TeamdCtlDump TeamdCtlMgr::get_dump(const std::string & lag_name) if (r == 0) { res = { true, std::string(dump) }; + + // If this lag interface errored last time, remove the entry + if (m_lags_err_retry.find(lag_name) != m_lags_err_retry.end()) + { + SWSS_LOG_NOTICE("The LAG '%s' had errored in get_dump earlier, removing it", lag_name.c_str()); + m_lags_err_retry.erase(lag_name); + } } else { - SWSS_LOG_ERROR("Can't get dump for LAG '%s'. Skipping", lag_name.c_str()); + // In case of failure and retry flag is set, check if it fails for MAX_RETRY times. + if (to_retry) + { + if (m_lags_err_retry.find(lag_name) != m_lags_err_retry.end()) + { + if (m_lags_err_retry[lag_name] == MAX_RETRY) + { + SWSS_LOG_ERROR("Can't get dump for LAG '%s'. Skipping", lag_name.c_str()); + m_lags_err_retry.erase(lag_name); + } + else + m_lags_err_retry[lag_name]++; + } + else + { + + // This time a different lag interface errored out. + m_lags_err_retry[lag_name] = 1; + } + } + else + { + // No need to retry if the flag is not set. + SWSS_LOG_ERROR("Can't get dump for LAG '%s'. Skipping", lag_name.c_str()); + } } } else @@ -196,14 +239,14 @@ TeamdCtlDump TeamdCtlMgr::get_dump(const std::string & lag_name) /// Get dumps for all registered LAG interfaces /// @return vector of pairs. Each pair first value is a name of LAG, second value is a dump /// -TeamdCtlDumps TeamdCtlMgr::get_dumps() +TeamdCtlDumps TeamdCtlMgr::get_dumps(bool to_retry) { TeamdCtlDumps res; for (const auto & p: m_handlers) { const auto & lag_name = p.first; - const auto & result = get_dump(lag_name); + const auto & result = get_dump(lag_name, to_retry); const auto & status = result.first; const auto & dump = result.second; if (status) diff --git a/tlm_teamd/teamdctl_mgr.h b/tlm_teamd/teamdctl_mgr.h index 73d6bb3b7e28..f6a5dd204f52 100644 --- a/tlm_teamd/teamdctl_mgr.h +++ b/tlm_teamd/teamdctl_mgr.h @@ -18,8 +18,9 @@ class TeamdCtlMgr bool add_lag(const std::string & lag_name); bool remove_lag(const std::string & lag_name); void process_add_queue(); - TeamdCtlDump get_dump(const std::string & lag_name); - TeamdCtlDumps get_dumps(); + // Retry logic added to prevent incorrect error reporting in dump API's + TeamdCtlDump get_dump(const std::string & lag_name, bool to_retry); + TeamdCtlDumps get_dumps(bool to_retry); private: bool has_key(const std::string & lag_name) const; @@ -27,6 +28,7 @@ class TeamdCtlMgr std::unordered_map m_handlers; std::unordered_map m_lags_to_add; + std::unordered_map m_lags_err_retry; const int max_attempts_to_add = 10; };