Skip to content

Commit d685e35

Browse files
kcudniklguohan
authored andcommitted
Add support for fdb_event MOVE and check fdb event oids (sonic-net#420)
* Add support for fdb_event MOVE and check fdb event oids * bring back ntf_queue constructor * Log also entire fdb_entry
1 parent 2b91013 commit d685e35

File tree

5 files changed

+229
-5
lines changed

5 files changed

+229
-5
lines changed

meta/sai_meta.cpp

+95
Original file line numberDiff line numberDiff line change
@@ -6015,6 +6015,45 @@ void meta_sai_on_fdb_flush_event_consolidated(
60156015
}
60166016
}
60176017

6018+
void meta_fdb_event_snoop_oid(
6019+
_In_ sai_object_id_t oid)
6020+
{
6021+
SWSS_LOG_ENTER();
6022+
6023+
if (oid == SAI_NULL_OBJECT_ID)
6024+
return;
6025+
6026+
if (object_reference_exists(oid))
6027+
return;
6028+
6029+
sai_object_type_t ot = sai_object_type_query(oid);
6030+
6031+
if (ot == SAI_OBJECT_TYPE_NULL)
6032+
{
6033+
SWSS_LOG_ERROR("failed to get object type on fdb_event oid: 0x%lx", oid);
6034+
return;
6035+
}
6036+
6037+
sai_object_meta_key_t key = { .objecttype = ot, .objectkey = { .key = { .object_id = oid } } };
6038+
6039+
object_reference_insert(oid);
6040+
6041+
if (!object_exists(key))
6042+
create_object(key);
6043+
6044+
/*
6045+
* In normal operation orch agent should query or create all bridge, vlan
6046+
* and bridge port, so we should not get this message. Let's put it as
6047+
* warning for better visibility. Most likely if this happen there is a
6048+
* vendor bug in SAI and we should also see warnings or errors reported
6049+
* from syncd in logs.
6050+
*/
6051+
6052+
SWSS_LOG_WARN("fdb_entry oid (snoop): %s: %s",
6053+
sai_serialize_object_type(ot).c_str(),
6054+
sai_serialize_object_id(oid).c_str());
6055+
}
6056+
60186057
void meta_sai_on_fdb_event_single(
60196058
_In_ const sai_fdb_event_notification_data_t& data)
60206059
{
@@ -6024,6 +6063,35 @@ void meta_sai_on_fdb_event_single(
60246063

60256064
std::string key_fdb = sai_serialize_object_meta_key(meta_key_fdb);
60266065

6066+
/*
6067+
* Because we could receive fdb event's before orch agent will query or
6068+
* create bridge/vlan/bridge port we should snoop here new OIDs and put
6069+
* them in local DB.
6070+
*
6071+
* Unfortunately we don't have a way to check whether those OIDs are correct
6072+
* or whether there maybe some bug in vendor SAI and for example is sending
6073+
* invalid OIDs in those event's. Also sai_object_type_query can return
6074+
* valid object type for OID, but this does not guarantee that this OID is
6075+
* valid, for example one of existing bridge ports that orch agent didn't
6076+
* query yet.
6077+
*/
6078+
6079+
meta_fdb_event_snoop_oid(data.fdb_entry.bv_id);
6080+
6081+
for (uint32_t i = 0; i < data.attr_count; i++)
6082+
{
6083+
auto meta = sai_metadata_get_attr_metadata(SAI_OBJECT_TYPE_FDB_ENTRY, data.attr[i].id);
6084+
6085+
if (meta == NULL)
6086+
{
6087+
SWSS_LOG_ERROR("failed to get metadata for fdb_entry attr.id = %d", data.attr[i].id);
6088+
continue;
6089+
}
6090+
6091+
if (meta->attrvaluetype == SAI_ATTR_VALUE_TYPE_OBJECT_ID)
6092+
meta_fdb_event_snoop_oid(data.attr[i].value.oid);
6093+
}
6094+
60276095
switch (data.event_type)
60286096
{
60296097
case SAI_FDB_EVENT_LEARNED:
@@ -6096,6 +6164,33 @@ void meta_sai_on_fdb_event_single(
60966164

60976165
break;
60986166

6167+
case SAI_FDB_EVENT_MOVE:
6168+
6169+
if (!object_exists(key_fdb))
6170+
{
6171+
SWSS_LOG_WARN("object key %s doesn't exist but received FDB MOVE event", key_fdb.c_str());
6172+
break;
6173+
}
6174+
6175+
// on MOVE event, just update attributes on existing entry
6176+
6177+
for (uint32_t i = 0; i < data.attr_count; i++)
6178+
{
6179+
const sai_attribute_t& attr = data.attr[i];
6180+
6181+
sai_status_t status = meta_generic_validation_set(meta_key_fdb, &attr);
6182+
6183+
if (status != SAI_STATUS_SUCCESS)
6184+
{
6185+
SWSS_LOG_ERROR("object key %s FDB MOVE event, SET validateion failed on attr.id = %d", key_fdb.c_str(), attr.id);
6186+
continue;
6187+
}
6188+
6189+
meta_generic_validation_post_set(meta_key_fdb, &attr);
6190+
}
6191+
6192+
break;
6193+
60996194
default:
61006195

61016196
SWSS_LOG_ERROR("got FDB_ENTRY notification with unknown event_type %d, bug?", data.event_type);

saiplayer/saiplayer.cpp

+45-2
Original file line numberDiff line numberDiff line change
@@ -1079,6 +1079,8 @@ void processBulk(
10791079
}
10801080
}
10811081

1082+
bool g_sleep = false;
1083+
10821084
int replay(int argc, char **argv)
10831085
{
10841086
//swss::Logger::getInstance().setMinPrio(swss::Logger::SWSS_DEBUG);
@@ -1246,17 +1248,51 @@ int replay(int argc, char **argv)
12461248
catch (const std::exception &e)
12471249
{
12481250
SWSS_LOG_NOTICE("line: %s", line.c_str());
1249-
SWSS_LOG_NOTICE("resp: %s", response.c_str());
1251+
SWSS_LOG_NOTICE("resp (expected): %s", response.c_str());
1252+
SWSS_LOG_NOTICE("got: %s", sai_serialize_status(status).c_str());
1253+
1254+
if (api == SAI_COMMON_API_GET && (status == SAI_STATUS_SUCCESS || status == SAI_STATUS_BUFFER_OVERFLOW))
1255+
{
1256+
// log each get parameter
1257+
for (uint32_t i = 0; i < attr_count; ++i)
1258+
{
1259+
auto meta = sai_metadata_get_attr_metadata(object_type, attr_list[i].id);
1260+
1261+
auto val = sai_serialize_attr_value(*meta, attr_list[i]);
1262+
1263+
SWSS_LOG_NOTICE(" - %s:%s", meta->attridname, val.c_str());
1264+
}
1265+
}
12501266

12511267
exit(EXIT_FAILURE);
12521268
}
1269+
1270+
if (api == SAI_COMMON_API_GET && (status == SAI_STATUS_SUCCESS || status == SAI_STATUS_BUFFER_OVERFLOW))
1271+
{
1272+
// log each get parameter
1273+
for (uint32_t i = 0; i < attr_count; ++i)
1274+
{
1275+
auto meta = sai_metadata_get_attr_metadata(object_type, attr_list[i].id);
1276+
1277+
auto val = sai_serialize_attr_value(*meta, attr_list[i]);
1278+
1279+
SWSS_LOG_NOTICE(" - %s:%s", meta->attridname, val.c_str());
1280+
}
1281+
}
12531282
}
12541283
}
12551284

12561285
infile.close();
12571286

12581287
SWSS_LOG_NOTICE("finished replaying %s with SUCCESS", filename);
12591288

1289+
if (g_sleep)
1290+
{
1291+
fprintf(stderr, "Reply SUCCESS, sleeping, watching for notifications\n");
1292+
1293+
sleep(-1);
1294+
}
1295+
12601296
return 0;
12611297
}
12621298

@@ -1273,6 +1309,8 @@ void printUsage()
12731309
std::cout << " Enable temporary view between init and apply" << std::endl << std::endl;
12741310
std::cout << " -i --inspectAsic:" << std::endl;
12751311
std::cout << " Inspect ASIC by ASIC DB" << std::endl << std::endl;
1312+
std::cout << " -s --sleep:" << std::endl;
1313+
std::cout << " Sleep after success reply, to notice any switch notifications" << std::endl << std::endl;
12761314
std::cout << " -h --help:" << std::endl;
12771315
std::cout << " Print out this message" << std::endl << std::endl;
12781316
}
@@ -1293,10 +1331,11 @@ int handleCmdLine(int argc, char **argv)
12931331
{ "skipNotifySyncd", no_argument, 0, 'C' },
12941332
{ "enableDebug", no_argument, 0, 'd' },
12951333
{ "inspectAsic", no_argument, 0, 'i' },
1334+
{ "sleep", no_argument, 0, 's' },
12961335
{ 0, 0, 0, 0 }
12971336
};
12981337

1299-
const char* const optstring = "hCdui";
1338+
const char* const optstring = "hCduis";
13001339

13011340
int c = getopt_long(argc, argv, optstring, long_options, 0);
13021341

@@ -1321,6 +1360,10 @@ int handleCmdLine(int argc, char **argv)
13211360
g_inspectAsic = true;
13221361
break;
13231362

1363+
case 's':
1364+
g_sleep = true;
1365+
break;
1366+
13241367
case 'h':
13251368
printUsage();
13261369
exit(EXIT_SUCCESS);

syncd/syncd.cpp

+28
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,34 @@ sai_object_id_t translate_rid_to_vid(
448448
return vid;
449449
}
450450

451+
/**
452+
* @brief Check if RID exists on the ASIC DB.
453+
*
454+
* @param rid Real object id to check.
455+
*
456+
* @return True if exists or SAI_NULL_OBJECT_ID, otherwise false.
457+
*/
458+
bool check_rid_exists(
459+
_In_ sai_object_id_t rid)
460+
{
461+
SWSS_LOG_ENTER();
462+
463+
if (rid == SAI_NULL_OBJECT_ID)
464+
return true;
465+
466+
if (local_rid_to_vid.find(rid) != local_rid_to_vid.end())
467+
return true;
468+
469+
std::string str_rid = sai_serialize_object_id(rid);
470+
471+
auto pvid = g_redisClient->hget(RIDTOVID, str_rid);
472+
473+
if (pvid != NULL)
474+
return true;
475+
476+
return false;
477+
}
478+
451479
void translate_list_rid_to_vid(
452480
_In_ sai_object_list_t &element,
453481
_In_ sai_object_id_t switch_id)

syncd/syncd.h

+3
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,9 @@ sai_object_id_t translate_rid_to_vid(
110110
_In_ sai_object_id_t rid,
111111
_In_ sai_object_id_t switch_vid);
112112

113+
bool check_rid_exists(
114+
_In_ sai_object_id_t rid);
115+
113116
void translate_rid_to_vid_list(
114117
_In_ sai_object_type_t object_type,
115118
_In_ sai_object_id_t switch_vid,

syncd/syncd_notifications.cpp

+58-3
Original file line numberDiff line numberDiff line change
@@ -240,18 +240,73 @@ void redisPutFdbEntryToAsicView(
240240
g_redisClient->hset(key, strAttrId, strAttrValue);
241241
}
242242

243+
void check_fdb_event_notification_data(
244+
_In_ const sai_fdb_event_notification_data_t& data)
245+
{
246+
SWSS_LOG_ENTER();
247+
248+
/*
249+
* Any new RID value spotted in fdb notification can happen for 2 reasons:
250+
*
251+
* - a bug is present on the vendor SAI, all RID's are already in local or
252+
* REDIS ASIC DB but vendor SAI returned new or invalid RID
253+
*
254+
* - orch agent didn't query yet bridge ID/vlan ID and already
255+
* started to receive fdb notifications in which case warn message
256+
* could be ignored.
257+
*
258+
* If vendor SAI will return invalid RID, then this will later on lead to
259+
* inconsistent DB state and possible failure on apply view after cold or
260+
* warm boot.
261+
*
262+
* On switch init we do discover phase, and we should discover all objects
263+
* so we should not get any of those messages if SAI is in consistent
264+
* state.
265+
*/
266+
267+
if (!check_rid_exists(data.fdb_entry.bv_id))
268+
SWSS_LOG_ERROR("bv_id RID 0x%lx is not present on local ASIC DB: %s", data.fdb_entry.bv_id,
269+
sai_serialize_fdb_entry(data.fdb_entry).c_str());
270+
271+
if (!check_rid_exists(data.fdb_entry.switch_id) || data.fdb_entry.switch_id == SAI_NULL_OBJECT_ID)
272+
SWSS_LOG_ERROR("switch_id RID 0x%lx is not present on local ASIC DB: %s", data.fdb_entry.bv_id,
273+
sai_serialize_fdb_entry(data.fdb_entry).c_str());
274+
275+
for (uint32_t i = 0; i < data.attr_count; i++)
276+
{
277+
const sai_attribute_t& attr = data.attr[i];
278+
279+
auto meta = sai_metadata_get_attr_metadata(SAI_OBJECT_TYPE_FDB_ENTRY, attr.id);
280+
281+
if (meta == NULL)
282+
{
283+
SWSS_LOG_ERROR("unable to get metadata for fdb_entry attr.id = %d", attr.id);
284+
continue;
285+
}
286+
287+
// skip non oid attributes
288+
if (meta->attrvaluetype != SAI_ATTR_VALUE_TYPE_OBJECT_ID)
289+
continue;
290+
291+
if (!check_rid_exists(attr.value.oid))
292+
SWSS_LOG_WARN("RID 0x%lx on %s is not present on local ASIC DB", attr.value.oid, meta->attridname);
293+
}
294+
}
295+
243296
void process_on_fdb_event(
244297
_In_ uint32_t count,
245298
_In_ sai_fdb_event_notification_data_t *data)
246299
{
247300
SWSS_LOG_ENTER();
248301

249-
SWSS_LOG_DEBUG("fdb event count: %d", count);
302+
SWSS_LOG_INFO("fdb event count: %u", count);
250303

251304
for (uint32_t i = 0; i < count; i++)
252305
{
253306
sai_fdb_event_notification_data_t *fdb = &data[i];
254307

308+
check_fdb_event_notification_data(*fdb);
309+
255310
SWSS_LOG_DEBUG("fdb %u: type: %d", i, fdb->event_type);
256311

257312
fdb->fdb_entry.switch_id = translate_rid_to_vid(fdb->fdb_entry.switch_id, SAI_NULL_OBJECT_ID);
@@ -536,7 +591,7 @@ bool ntf_queue_t::enqueue(
536591

537592
if (!(log_count % 1000))
538593
{
539-
SWSS_LOG_NOTICE("Too many messages in queue (%ld), dropped %ld FDB events!",
594+
SWSS_LOG_NOTICE("Too many messages in queue (%zu), dropped %u FDB events!",
540595
queueStats(), (log_count+1));
541596
}
542597

@@ -782,6 +837,6 @@ void check_notifications_pointers(
782837
* Here we translated pointer, just log it.
783838
*/
784839

785-
SWSS_LOG_INFO("%s: %lp (orch) => %lp (syncd)", meta->attridname, prev, attr.value.ptr);
840+
SWSS_LOG_INFO("%s: 0x%lX (orch) => 0x%lX (syncd)", meta->attridname, (uint64_t)prev, (uint64_t)attr.value.ptr);
786841
}
787842
}

0 commit comments

Comments
 (0)