From 34670c476a2a828ed58d574fc8800a0822202197 Mon Sep 17 00:00:00 2001
From: Donald Sharp
Date: Mon, 17 Jun 2024 11:05:28 -0400
Subject: [PATCH 1/4] zebra: Use the ctx queue counters

The ctx queue data structures already have a counter associated with
them.  Let's just use them instead.

Signed-off-by: Donald Sharp
---
 zebra/zebra_dplane.c | 33 ++++++++-------------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index 085166651055..eb22da2164d2 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -483,10 +483,8 @@ struct zebra_dplane_provider {
 	int (*dp_fini)(struct zebra_dplane_provider *prov, bool early_p);
 
 	_Atomic uint32_t dp_in_counter;
-	_Atomic uint32_t dp_in_queued;
 	_Atomic uint32_t dp_in_max;
 	_Atomic uint32_t dp_out_counter;
-	_Atomic uint32_t dp_out_queued;
 	_Atomic uint32_t dp_out_max;
 
 	_Atomic uint32_t dp_error_counter;
@@ -6137,17 +6135,19 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed)
 
 	/* Show counters, useful info from each registered provider */
 	while (prov) {
+		dplane_provider_lock(prov);
+		in_q = dplane_ctx_queue_count(&prov->dp_ctx_in_list);
+		out_q = dplane_ctx_queue_count(&prov->dp_ctx_out_list);
+		dplane_provider_unlock(prov);
 		in = atomic_load_explicit(&prov->dp_in_counter,
 					  memory_order_relaxed);
-		in_q = atomic_load_explicit(&prov->dp_in_queued,
-					    memory_order_relaxed);
+
 		in_max = atomic_load_explicit(&prov->dp_in_max,
 					      memory_order_relaxed);
 		out = atomic_load_explicit(&prov->dp_out_counter,
 					   memory_order_relaxed);
-		out_q = atomic_load_explicit(&prov->dp_out_queued,
-					     memory_order_relaxed);
+
 		out_max = atomic_load_explicit(&prov->dp_out_max,
 					       memory_order_relaxed);
@@ -6299,10 +6299,6 @@ struct zebra_dplane_ctx *dplane_provider_dequeue_in_ctx(
 	dplane_provider_lock(prov);
 
 	ctx = dplane_ctx_list_pop(&(prov->dp_ctx_in_list));
-	if (ctx) {
-		atomic_fetch_sub_explicit(&prov->dp_in_queued, 1,
-					  memory_order_relaxed);
-	}
 
 	dplane_provider_unlock(prov);
@@ -6330,10 +6326,6 @@ int dplane_provider_dequeue_in_list(struct zebra_dplane_provider *prov,
 			break;
 	}
 
-	if (ret > 0)
-		atomic_fetch_sub_explicit(&prov->dp_in_queued, ret,
-					  memory_order_relaxed);
-
 	dplane_provider_unlock(prov);
 
 	return ret;
@@ -6358,10 +6350,7 @@ void dplane_provider_enqueue_out_ctx(struct zebra_dplane_provider *prov,
 	dplane_ctx_list_add_tail(&(prov->dp_ctx_out_list), ctx);
 
 	/* Maintain out-queue counters */
-	atomic_fetch_add_explicit(&(prov->dp_out_queued), 1,
-				  memory_order_relaxed);
-	curr = atomic_load_explicit(&prov->dp_out_queued,
-				    memory_order_relaxed);
+	curr = dplane_ctx_queue_count(&prov->dp_ctx_out_list);
 	high = atomic_load_explicit(&prov->dp_out_max,
 				    memory_order_relaxed);
 	if (curr > high)
@@ -6383,9 +6372,6 @@ dplane_provider_dequeue_out_ctx(struct zebra_dplane_provider *prov)
 	if (!ctx)
 		return NULL;
 
-	atomic_fetch_sub_explicit(&(prov->dp_out_queued), 1,
-				  memory_order_relaxed);
-
 	return ctx;
 }
@@ -7422,10 +7408,7 @@ static void dplane_thread_loop(struct event *event)
 		atomic_fetch_add_explicit(&prov->dp_in_counter, counter,
 					  memory_order_relaxed);
-		atomic_fetch_add_explicit(&prov->dp_in_queued, counter,
-					  memory_order_relaxed);
-		curr = atomic_load_explicit(&prov->dp_in_queued,
-					    memory_order_relaxed);
+		curr = dplane_ctx_queue_count(&prov->dp_ctx_in_list);
 		high = atomic_load_explicit(&prov->dp_in_max,
 					    memory_order_relaxed);
 		if (curr > high)
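
To make the intent of patch 1 concrete, here is a minimal, self-contained
sketch (editor's illustration, not FRR code) of the pattern it converges on:
the queue's own element count, read under the same lock that protects the
queue, is the single source of truth, so no separate atomic "queued" counter
has to be kept in sync.  The struct names and helpers below (ctx_queue,
queue_push, queue_count, ...) are stand-ins for the dplane's typesafe ctx
list and provider lock, not the real API.

/* Sketch only: a locked queue whose embedded count replaces a side counter. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct ctx {
	struct ctx *next;
};

struct ctx_queue {
	pthread_mutex_t lock;
	struct ctx *head, *tail;
	uint64_t count; /* single source of truth for queue depth */
	uint64_t max;   /* high-water mark, updated at enqueue time */
};

static void queue_push(struct ctx_queue *q, struct ctx *c)
{
	pthread_mutex_lock(&q->lock);
	c->next = NULL;
	if (q->tail)
		q->tail->next = c;
	else
		q->head = c;
	q->tail = c;
	q->count++;
	if (q->count > q->max)
		q->max = q->count;
	pthread_mutex_unlock(&q->lock);
}

static struct ctx *queue_pop(struct ctx_queue *q)
{
	struct ctx *c;

	pthread_mutex_lock(&q->lock);
	c = q->head;
	if (c) {
		q->head = c->next;
		if (!q->head)
			q->tail = NULL;
		q->count--; /* no separate atomic to keep in sync */
	}
	pthread_mutex_unlock(&q->lock);
	return c;
}

/* What the show helper does: sample the depth under the lock. */
static uint64_t queue_count(struct ctx_queue *q)
{
	uint64_t n;

	pthread_mutex_lock(&q->lock);
	n = q->count;
	pthread_mutex_unlock(&q->lock);
	return n;
}

int main(void)
{
	struct ctx_queue q = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct ctx a, b;

	queue_push(&q, &a);
	queue_push(&q, &b);
	queue_pop(&q);
	printf("depth %llu, max %llu\n",
	       (unsigned long long)queue_count(&q), (unsigned long long)q.max);
	return 0;
}
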
From 3af381b5021d0de1cb3fc9187d70f224764c64b9 Mon Sep 17 00:00:00 2001
From: Donald Sharp
Date: Wed, 12 Jun 2024 14:14:48 -0400
Subject: [PATCH 2/4] zebra: Modify dplane loop to allow backpressure to
 filter up

Currently when the dplane_thread_loop is run, it moves contexts from
the dg_update_list and puts the contexts on the input queue of the
first provider.  This provider is given a chance to run and then the
items on its output queue are pulled off and placed on the input queue
of the next provider.  Rinse/Repeat down through the entire list of
providers.

Now imagine that we have a list of multiple providers and the last
provider is getting backed up.  Contexts will end up sticking in the
input queue of the `slow` provider, and that queue can grow without
bounds.  This is a real problem when an interface is flapping and an
upper-level protocol is sending a continuous stream of route updates
to reflect the change in ECMP.  You can end up with a very large
backlog of contexts.  This is bad because zebra can easily grow to a
very large memory footprint and on restricted systems you can run out
of memory.

Fortunately for us, the MetaQ already participates in this process by
not doing more route processing until the dg_update_list goes below
the working limit of dg_updates_per_cycle.  Thus, if FRR modifies the
behavior of this loop to not move more contexts onto the input queue
when either the input queue or the output queue of the next provider
has reached this limit, FRR will naturally start handling backpressure
for the dplane context system and memory will not grow out of control.

Signed-off-by: Donald Sharp
---
 zebra/zebra_dplane.c | 114 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 94 insertions(+), 20 deletions(-)

diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index eb22da2164d2..df86a0803390 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -7317,10 +7317,10 @@ static void dplane_thread_loop(struct event *event)
 {
 	struct dplane_ctx_list_head work_list;
 	struct dplane_ctx_list_head error_list;
-	struct zebra_dplane_provider *prov;
+	struct zebra_dplane_provider *prov, *next_prov;
 	struct zebra_dplane_ctx *ctx;
 	int limit, counter, error_counter;
-	uint64_t curr, high;
+	uint64_t curr, out_curr, high;
 	bool reschedule = false;
 
 	/* Capture work limit per cycle */
@@ -7344,18 +7344,48 @@ static void dplane_thread_loop(struct event *event)
 	/* Locate initial registered provider */
 	prov = dplane_prov_list_first(&zdplane_info.dg_providers);
 
-	/* Move new work from incoming list to temp list */
-	for (counter = 0; counter < limit; counter++) {
-		ctx = dplane_ctx_list_pop(&zdplane_info.dg_update_list);
-		if (ctx) {
-			ctx->zd_provider = prov->dp_id;
+	curr = dplane_ctx_queue_count(&prov->dp_ctx_in_list);
+	out_curr = dplane_ctx_queue_count(&prov->dp_ctx_out_list);
 
-			dplane_ctx_list_add_tail(&work_list, ctx);
-		} else {
-			break;
+	if (curr >= (uint64_t)limit) {
+		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+			zlog_debug("%s: Current first provider(%s) Input queue is %" PRIu64
+				   ", holding off work",
+				   __func__, prov->dp_name, curr);
+		counter = 0;
+	} else if (out_curr >= (uint64_t)limit) {
+		if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+			zlog_debug("%s: Current first provider(%s) Output queue is %" PRIu64
+				   ", holding off work",
+				   __func__, prov->dp_name, out_curr);
+		counter = 0;
+	} else {
+		int tlimit;
+		/*
+		 * Let's limit the work to what can be put on the
+		 * in or out queue without going over
+		 */
+		tlimit = limit - MAX(curr, out_curr);
+		/* Move new work from incoming list to temp list */
+		for (counter = 0; counter < tlimit; counter++) {
+			ctx = dplane_ctx_list_pop(&zdplane_info.dg_update_list);
+			if (ctx) {
+				ctx->zd_provider = prov->dp_id;
+
+				dplane_ctx_list_add_tail(&work_list, ctx);
+			} else {
+				break;
+			}
 		}
 	}
 
+	/*
+	 * If there is anything still on the two input queues reschedule
+	 */
+	if (dplane_ctx_queue_count(&prov->dp_ctx_in_list) > 0 ||
+	    dplane_ctx_queue_count(&zdplane_info.dg_update_list) > 0)
+		reschedule = true;
+
 	DPLANE_UNLOCK();
 
 	atomic_fetch_sub_explicit(&zdplane_info.dg_routes_queued, counter,
@@ -7374,8 +7404,9 @@ static void dplane_thread_loop(struct event *event)
 	 * items.
 	 */
 	if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
-		zlog_debug("dplane enqueues %d new work to provider '%s'",
-			   counter, dplane_provider_get_name(prov));
+		zlog_debug("dplane enqueues %d new work to provider '%s' curr is %" PRIu64,
+			   counter, dplane_provider_get_name(prov),
+			   curr);
 
 	/* Capture current provider id in each context; check for
 	 * error status.
@@ -7433,18 +7464,61 @@ static void dplane_thread_loop(struct event *event)
 		if (!zdplane_info.dg_run)
 			break;
 
+		/* Locate next provider */
+		next_prov = dplane_prov_list_next(&zdplane_info.dg_providers,
+						  prov);
+		if (next_prov) {
+			curr = dplane_ctx_queue_count(
+				&next_prov->dp_ctx_in_list);
+			out_curr = dplane_ctx_queue_count(
+				&next_prov->dp_ctx_out_list);
+		} else
+			out_curr = curr = 0;
+
 		/* Dequeue completed work from the provider */
 		dplane_provider_lock(prov);
 
-		while (counter < limit) {
-			ctx = dplane_provider_dequeue_out_ctx(prov);
-			if (ctx) {
-				dplane_ctx_list_add_tail(&work_list, ctx);
-				counter++;
-			} else
-				break;
+		if (curr >= (uint64_t)limit) {
+			if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+				zlog_debug("%s: Next Provider(%s) Input queue is %" PRIu64
+					   ", holding off work",
+					   __func__, next_prov->dp_name, curr);
+			counter = 0;
+		} else if (out_curr >= (uint64_t)limit) {
+			if (IS_ZEBRA_DEBUG_DPLANE_DETAIL)
+				zlog_debug("%s: Next Provider(%s) Output queue is %" PRIu64
+					   ", holding off work",
+					   __func__, next_prov->dp_name,
+					   out_curr);
+			counter = 0;
+		} else {
+			int tlimit;
+
+			/*
+			 * Let's limit the work to what can be put on the
+			 * in or out queue without going over
+			 */
+			tlimit = limit - MAX(curr, out_curr);
+			while (counter < tlimit) {
+				ctx = dplane_provider_dequeue_out_ctx(prov);
+				if (ctx) {
+					dplane_ctx_list_add_tail(&work_list,
+								 ctx);
+					counter++;
+				} else
+					break;
+			}
 		}
 
+		/*
+		 * Let's check if there are still any items on the
+		 * input or output queues of the current provider
+		 * if so then we know we need to reschedule.
+		 */
+		if (dplane_ctx_queue_count(&prov->dp_ctx_in_list) > 0 ||
+		    dplane_ctx_queue_count(&prov->dp_ctx_out_list) > 0)
+			reschedule = true;
+
 		dplane_provider_unlock(prov);
 
 		if (counter >= limit)
@@ -7460,7 +7534,7 @@ static void dplane_thread_loop(struct event *event)
 		}
 
 		/* Locate next provider */
-		prov = dplane_prov_list_next(&zdplane_info.dg_providers, prov);
+		prov = next_prov;
 	}
 
 	/*
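
As an aside, the throttle arithmetic patch 2 adds to dplane_thread_loop() can
be summarized in a few lines.  The sketch below is illustrative only:
cycle_take() is a made-up helper name and the limit value in the asserts is
arbitrary; it simply mirrors the `tlimit = limit - MAX(curr, out_curr)` logic
and the two hold-off cases from the hunk above.

#include <assert.h>
#include <stdint.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* How many contexts may be moved toward the next stage this cycle. */
static int cycle_take(int limit, uint64_t in_q, uint64_t out_q)
{
	/* Hold off entirely when the downstream stage is at the limit. */
	if (in_q >= (uint64_t)limit || out_q >= (uint64_t)limit)
		return 0;

	/* Otherwise leave room for whatever is already queued. */
	return limit - (int)MAX(in_q, out_q);
}

int main(void)
{
	/* An arbitrary work limit of 200, purely for the example. */
	assert(cycle_take(200, 0, 0) == 200);   /* idle: full cycle        */
	assert(cycle_take(200, 150, 10) == 50); /* partially backed up     */
	assert(cycle_take(200, 10, 150) == 50); /* output side counts too  */
	assert(cycle_take(200, 200, 0) == 0);   /* at the limit: hold off  */
	return 0;
}
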
From 8926ac198487dd6c782a998e2dde15f7267e69a3 Mon Sep 17 00:00:00 2001
From: Donald Sharp
Date: Wed, 12 Jun 2024 15:16:08 -0400
Subject: [PATCH 3/4] zebra: Limit queue depth in dplane_fpm_nl

The dplane providers have a concept of input queues and output queues.
These queues are chained together during normal operation.  The code
in zebra also has a feedback mechanism where the MetaQ will not run
when the first input queue is backed up.

Having the dplane_fpm_nl code grab all contexts when it is backed up
prevents this system from behaving appropriately.

Modify the code to not add to the dplane_fpm_nl's internal queue when
it is already full.  This will allow the backpressure to work
appropriately in zebra proper.

Signed-off-by: Donald Sharp
---
 zebra/dplane_fpm_nl.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/zebra/dplane_fpm_nl.c b/zebra/dplane_fpm_nl.c
index eb968bcd372a..1d2f9e695f30 100644
--- a/zebra/dplane_fpm_nl.c
+++ b/zebra/dplane_fpm_nl.c
@@ -1678,6 +1678,25 @@ static int fpm_nl_process(struct zebra_dplane_provider *prov)
 
 	fnc = dplane_provider_get_data(prov);
 	limit = dplane_provider_get_work_limit(prov);
+
+	frr_with_mutex (&fnc->ctxqueue_mutex) {
+		cur_queue = dplane_ctx_queue_count(&fnc->ctxqueue);
+	}
+
+	if (cur_queue >= (uint64_t)limit) {
+		if (IS_ZEBRA_DEBUG_FPM)
+			zlog_debug("%s: Already at a limit(%" PRIu64
+				   ") of internal work, hold off",
+				   __func__, cur_queue);
+		limit = 0;
+	} else if (cur_queue != 0) {
+		if (IS_ZEBRA_DEBUG_FPM)
+			zlog_debug("%s: current queue is %" PRIu64
+				   ", limiting to lesser amount of %" PRIu64,
+				   __func__, cur_queue, limit - cur_queue);
+		limit -= cur_queue;
+	}
+
 	for (counter = 0; counter < limit; counter++) {
 		ctx = dplane_provider_dequeue_in_ctx(prov);
 		if (ctx == NULL)
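
The clamp patch 3 applies inside the FPM provider is the same idea seen from
the provider's own side: only accept as much new work as the internal queue
can still absorb, so contexts back up in zebra's queues, where the MetaQ
feedback mechanism can see them.  The helper below is a hypothetical
stand-in written for illustration, not the dplane_fpm_nl code itself.

#include <assert.h>
#include <stdint.h>

/* How many contexts to accept this pass, given the internal backlog. */
static int fpm_take(int limit, uint64_t cur_queue)
{
	if (cur_queue >= (uint64_t)limit)
		return 0;                  /* already full: hold off        */
	return limit - (int)cur_queue;     /* only top the queue back up    */
}

int main(void)
{
	/* Arbitrary limit of 128 for the example. */
	assert(fpm_take(128, 0) == 128);
	assert(fpm_take(128, 100) == 28);
	assert(fpm_take(128, 128) == 0);
	assert(fpm_take(128, 500) == 0);
	return 0;
}
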
From 98b11de9f60c16e61a581b03a97294563eb9f673 Mon Sep 17 00:00:00 2001
From: Donald Sharp
Date: Mon, 17 Jun 2024 10:42:41 -0400
Subject: [PATCH 4/4] zebra: Modify show `zebra dplane providers` to give more
 data

The `show zebra dplane providers` command was omitting the input and
output queues to the dplane itself.  It would be nice to have this
insight as well.

New output:

r1# show zebra dplane providers
dataplane Incoming Queue from Zebra: 100
Zebra dataplane providers:
  Kernel (1): in: 6, q: 0, q_max: 3, out: 6, q: 14, q_max: 3
  dplane_fpm_nl (2): in: 6, q: 10, q_max: 3, out: 6, q: 0, q_max: 3
dataplane Outgoing Queue to Zebra: 43
r1#

Signed-off-by: Donald Sharp
---
 zebra/rib.h          |  1 +
 zebra/zebra_dplane.c | 18 +++++++++++++-----
 zebra/zebra_rib.c    | 11 +++++++++++
 3 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/zebra/rib.h b/zebra/rib.h
index cd6efbfb36dd..3095a9d67dc8 100644
--- a/zebra/rib.h
+++ b/zebra/rib.h
@@ -631,6 +631,7 @@ extern int rib_add_gr_run(afi_t afi, vrf_id_t vrf_id, uint8_t proto,
 			  uint8_t instance, time_t restart_time);
 
 extern void zebra_vty_init(void);
+extern uint32_t zebra_rib_dplane_results_count(void);
 
 extern pid_t pid;
diff --git a/zebra/zebra_dplane.c b/zebra/zebra_dplane.c
index df86a0803390..75147e713649 100644
--- a/zebra/zebra_dplane.c
+++ b/zebra/zebra_dplane.c
@@ -6127,12 +6127,14 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed)
 	struct zebra_dplane_provider *prov;
 	uint64_t in, in_q, in_max, out, out_q, out_max;
 
-	vty_out(vty, "Zebra dataplane providers:\n");
-
 	DPLANE_LOCK();
 	prov = dplane_prov_list_first(&zdplane_info.dg_providers);
+	in = dplane_ctx_queue_count(&zdplane_info.dg_update_list);
 	DPLANE_UNLOCK();
 
+	vty_out(vty, "dataplane Incoming Queue from Zebra: %" PRIu64 "\n", in);
+	vty_out(vty, "Zebra dataplane providers:\n");
+
 	/* Show counters, useful info from each registered provider */
 	while (prov) {
 		dplane_provider_lock(prov);
@@ -6151,13 +6153,19 @@ int dplane_show_provs_helper(struct vty *vty, bool detailed)
 		out_max = atomic_load_explicit(&prov->dp_out_max,
 					       memory_order_relaxed);
 
-		vty_out(vty, "%s (%u): in: %"PRIu64", q: %"PRIu64", q_max: %"PRIu64", out: %"PRIu64", q: %"PRIu64", q_max: %"PRIu64"\n",
-			prov->dp_name, prov->dp_id, in, in_q, in_max,
-			out, out_q, out_max);
+		vty_out(vty,
+			"  %s (%u): in: %" PRIu64 ", q: %" PRIu64
+			", q_max: %" PRIu64 ", out: %" PRIu64 ", q: %" PRIu64
+			", q_max: %" PRIu64 "\n",
+			prov->dp_name, prov->dp_id, in, in_q, in_max, out,
+			out_q, out_max);
 
 		prov = dplane_prov_list_next(&zdplane_info.dg_providers, prov);
 	}
 
+	out = zebra_rib_dplane_results_count();
+	vty_out(vty, "dataplane Outgoing Queue to Zebra: %" PRIu64 "\n", out);
+
 	return CMD_SUCCESS;
 }
diff --git a/zebra/zebra_rib.c b/zebra/zebra_rib.c
index d53b27a38708..075cc2ffb4a5 100644
--- a/zebra/zebra_rib.c
+++ b/zebra/zebra_rib.c
@@ -5108,6 +5108,17 @@ static int rib_dplane_results(struct dplane_ctx_list_head *ctxlist)
 	return 0;
 }
 
+uint32_t zebra_rib_dplane_results_count(void)
+{
+	uint32_t count;
+
+	frr_with_mutex (&dplane_mutex) {
+		count = dplane_ctx_queue_count(&rib_dplane_q);
+	}
+
+	return count;
+}
+
 /*
  * Ensure there are no empty slots in the route_info array.
  * Every route type in zebra should be present there.
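
For readers of the new output, the per-provider line added by patch 4 packs
three kinds of numbers per direction: a cumulative event counter (in/out),
the current queue depth sampled from the ctx lists (q), and the high-water
mark (q_max).  The stand-alone printf sketch below mirrors that layout with
made-up numbers shaped like the commit message example; it is an editor's
illustration, not zebra code.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

struct prov_stats {
	const char *name;
	unsigned int id;
	uint64_t in, in_q, in_max;   /* enqueued total, current depth, peak */
	uint64_t out, out_q, out_max;
};

/* Same column order as the vty output added by the patch. */
static void show_provider(const struct prov_stats *p)
{
	printf("  %s (%u): in: %" PRIu64 ", q: %" PRIu64 ", q_max: %" PRIu64
	       ", out: %" PRIu64 ", q: %" PRIu64 ", q_max: %" PRIu64 "\n",
	       p->name, p->id, p->in, p->in_q, p->in_max, p->out, p->out_q,
	       p->out_max);
}

int main(void)
{
	/* Values below are invented, shaped like the commit message example. */
	struct prov_stats kernel = { "Kernel", 1, 6, 0, 3, 6, 14, 3 };
	struct prov_stats fpm = { "dplane_fpm_nl", 2, 6, 10, 3, 6, 0, 3 };

	printf("Zebra dataplane providers:\n");
	show_provider(&kernel);
	show_provider(&fpm);
	return 0;
}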