Skip to content

Commit 26a8a12

Browse files
authored
Prevent other notification event storms to keep enqueue unchecked and drained all memory that leads to crashing the switch router (#968)
1 parent 0cb253a commit 26a8a12

File tree

4 files changed

+131
-7
lines changed

4 files changed

+131
-7
lines changed

syncd/NotificationQueue.cpp

+45-5
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,13 @@ using namespace syncd;
88
#define MUTEX std::lock_guard<std::mutex> _lock(m_mutex);
99

1010
NotificationQueue::NotificationQueue(
11-
_In_ size_t queueLimit):
11+
_In_ size_t queueLimit,
12+
_In_ size_t consecutiveThresholdLimit):
1213
m_queueSizeLimit(queueLimit),
13-
m_dropCount(0)
14+
m_thresholdLimit(consecutiveThresholdLimit),
15+
m_dropCount(0),
16+
m_lastEventCount(0),
17+
m_lastEvent(SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT)
1418
{
1519
SWSS_LOG_ENTER();
1620

@@ -30,16 +34,52 @@ bool NotificationQueue::enqueue(
3034
MUTEX;
3135

3236
SWSS_LOG_ENTER();
37+
bool candidateToDrop = false;
38+
std::string currentEvent;
3339

3440
/*
3541
* If the queue exceeds the limit, then drop all further FDB events This is
3642
* a temporary solution to handle high memory usage by syncd and the
3743
* notification queue keeps growing. The permanent solution would be to
3844
* make this stateful so that only the *latest* event is published.
45+
*
46+
* We have also seen other notification storms that can also cause this queue issue
47+
* So the new scheme is to keep the last notification event and its consecutive count
48+
* If threshold limit reached and the consecutive count also reached then this notification
49+
* will also be dropped regardless of its event type to protect the device from crashing due to
50+
* running out of memory
3951
*/
4052
auto queueSize = m_queue.size();
53+
currentEvent = kfvKey(item);
54+
if (currentEvent == m_lastEvent)
55+
{
56+
m_lastEventCount++;
57+
}
58+
else
59+
{
60+
m_lastEventCount = 1;
61+
m_lastEvent = currentEvent;
62+
}
63+
if (queueSize >= m_queueSizeLimit)
64+
{
65+
/* Too many queued up already check if notification fits condition to e dropped
66+
* 1. All FDB events should be dropped at this point.
67+
* 2. All other notification events will start to drop if it reached the consecutive threshold limit
68+
*/
69+
if (currentEvent == SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT)
70+
{
71+
candidateToDrop = true;
72+
}
73+
else
74+
{
75+
if (m_lastEventCount >= m_thresholdLimit)
76+
{
77+
candidateToDrop = true;
78+
}
79+
}
80+
}
4181

42-
if (queueSize < m_queueSizeLimit || kfvKey(item) != SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT) // TODO use enum instead of strings
82+
if (!candidateToDrop)
4383
{
4484
m_queue.push(item);
4585

@@ -51,9 +91,9 @@ bool NotificationQueue::enqueue(
5191
if (!(m_dropCount % NOTIFICATION_QUEUE_DROP_COUNT_INDICATOR))
5292
{
5393
SWSS_LOG_NOTICE(
54-
"Too many messages in queue (%zu), dropped %zu FDB events!",
94+
"Too many messages in queue (%zu), dropped (%zu), lastEventCount (%zu) Dropping %s !",
5595
queueSize,
56-
m_dropCount);
96+
m_dropCount, m_lastEventCount, m_lastEvent.c_str());
5797
}
5898

5999
return false;

syncd/NotificationQueue.h

+16-1
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,17 @@ extern "C" {
1616
* Value based on typical L2 deployment with 256k MAC entries and
1717
* some extra buffer for other events like port-state, q-deadlock etc
1818
*
19+
* Note: We recently found a case where SAI continuously sending switch notification events
20+
* that also caused the queue to keep growing and eventually exhaust all system memory and crashed.
21+
* So a new detection/limit scheme is being implemented by keeping track of the last Event
22+
* and if the current Event matches the last Event, then the last Event Count will get
23+
* incremented and this count will also be used as part of the equation to ensure this
24+
* notification should also be dropped if the queue size limit has already reached and not
25+
* just dropping FDB events only.
1926
* TODO: move to config, also this limit only applies to fdb notifications
2027
*/
2128
#define DEFAULT_NOTIFICATION_QUEUE_SIZE_LIMIT (300000)
29+
#define DEFAULT_NOTIFICATION_CONSECUTIVE_THRESHOLD (1000)
2230

2331
namespace syncd
2432
{
@@ -27,7 +35,8 @@ namespace syncd
2735
public:
2836

2937
NotificationQueue(
30-
_In_ size_t limit = DEFAULT_NOTIFICATION_QUEUE_SIZE_LIMIT);
38+
_In_ size_t limit = DEFAULT_NOTIFICATION_QUEUE_SIZE_LIMIT,
39+
_In_ size_t consecutiveThresholdLimit = DEFAULT_NOTIFICATION_CONSECUTIVE_THRESHOLD);
3140

3241
virtual ~NotificationQueue();
3342

@@ -49,6 +58,12 @@ namespace syncd
4958

5059
size_t m_queueSizeLimit;
5160

61+
size_t m_thresholdLimit;
62+
5263
size_t m_dropCount;
64+
65+
size_t m_lastEventCount;
66+
67+
std::string m_lastEvent;
5368
};
5469
}

unittest/syncd/Makefile.am

+2-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ tests_SOURCES = main.cpp \
99
MockableSaiInterface.cpp \
1010
MockHelper.cpp \
1111
TestCommandLineOptions.cpp \
12-
TestFlexCounter.cpp
12+
TestFlexCounter.cpp \
13+
TestNotificationQueue.cpp
1314

1415
tests_CXXFLAGS = $(DBGFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS_COMMON)
1516
tests_LDADD = $(LDADD_GTEST) $(top_srcdir)/syncd/libSyncd.a -lhiredis -lswsscommon -lpthread -L$(top_srcdir)/lib/.libs -lsairedis -L$(top_srcdir)/meta/.libs -lsaimetadata -lsaimeta -lzmq $(CODE_COVERAGE_LIBS)
+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#include "NotificationQueue.h"
2+
3+
#include "sairediscommon.h"
4+
5+
#include <gtest/gtest.h>
6+
7+
using namespace syncd;
8+
9+
static std::string fdbData =
10+
"[{\"fdb_entry\":\"{\\\"bvid\\\":\\\"oid:0x260000000005be\\\",\\\"mac\\\":\\\"52:54:00:86:DD:7A\\\",\\\"switch_id\\\":\\\"oid:0x21000000000000\\\"}\","
11+
"\"fdb_event\":\"SAI_FDB_EVENT_LEARNED\","
12+
"\"list\":[{\"id\":\"SAI_FDB_ENTRY_ATTR_TYPE\",\"value\":\"SAI_FDB_ENTRY_TYPE_DYNAMIC\"},{\"id\":\"SAI_FDB_ENTRY_ATTR_BRIDGE_PORT_ID\",\"value\":\"oid:0x3a000000000660\"}]}]";
13+
14+
static std::string sscData = "{\"status\":\"SAI_SWITCH_OPER_STATUS_UP\",\"switch_id\":\"oid:0x2100000000\"}";
15+
16+
TEST(NotificationQueue, EnqueueLimitTest)
17+
{
18+
bool status;
19+
int i;
20+
std::vector<swss::FieldValueTuple> fdbEntry;
21+
std::vector<swss::FieldValueTuple> sscEntry;
22+
23+
// Set up a queue with limit at 5 and threshold at 3 where after this is reached event starts dropping
24+
syncd::NotificationQueue testQ(5, 3);
25+
26+
// Try queue up 5 fake FDB event and expect them to be added successfully
27+
swss::KeyOpFieldsValuesTuple fdbItem(SAI_SWITCH_NOTIFICATION_NAME_FDB_EVENT, fdbData, fdbEntry);
28+
for (i = 0; i < 5; ++i)
29+
{
30+
status = testQ.enqueue(fdbItem);
31+
EXPECT_EQ(status, true);
32+
}
33+
34+
// On the 6th fake FDB event expect it to be dropped right away
35+
status = testQ.enqueue(fdbItem);
36+
EXPECT_EQ(status, false);
37+
38+
// Add 2 switch state change events expect both are accepted as consecutive limit not yet reached
39+
swss::KeyOpFieldsValuesTuple sscItem(SAI_SWITCH_NOTIFICATION_NAME_SWITCH_STATE_CHANGE, sscData, sscEntry);
40+
for (i = 0; i < 2; ++i)
41+
{
42+
status = testQ.enqueue(sscItem);
43+
EXPECT_EQ(status, true);
44+
}
45+
46+
// On the 3rd consecutive switch state change event expect it to be dropped
47+
status = testQ.enqueue(sscItem);
48+
EXPECT_EQ(status, false);
49+
50+
// Add a fake FDB event to cause the consecutive event signature to change while this FDB event is dropped
51+
status = testQ.enqueue(fdbItem);
52+
EXPECT_EQ(status, false);
53+
54+
// Add 2 switch state change events expect both are accepted as consecutive limit not yet reached
55+
for (i = 0; i < 2; ++i)
56+
{
57+
status = testQ.enqueue(sscItem);
58+
EXPECT_EQ(status, true);
59+
}
60+
61+
// Add 2 switch state change events expect both are dropped as consecutive limit has already reached
62+
for (i = 0; i < 2; ++i)
63+
{
64+
status = testQ.enqueue(sscItem);
65+
EXPECT_EQ(status, false);
66+
}
67+
}
68+

0 commit comments

Comments
 (0)