From d7ac4c4d8840d14c50924bffe2497b09c7f8b795 Mon Sep 17 00:00:00 2001 From: Donald Sharp Date: Tue, 2 Aug 2022 13:57:18 -0400 Subject: [PATCH] zebra: Introduce early route processing on the MetaQ Currently if an operator does this operation: sharpd@eva ~/frr8> sudo ip nexthop add id 5000 via 192.168.119.44 dev enp39s0 ; sudo ip route add 10.0.0.1 nhid 5000 2022/06/30 08:52:40 ZEBRA: [ZHQK5-J9M1R] proto2zebra: Please add this protocol(0) to proper rt_netlink.c handling 2022/06/30 08:52:40 ZEBRA: [PS16P-365FK][EC 4043309076] Zebra failed to find the nexthop hash entry for id=5000 in a route entry sharpd@eva ~/frr8> vtysh -c "show ip route 10.0.0.1" Routing entry for 0.0.0.0/0 Known via "kernel", distance 0, metric 100, best Last update 00:01:58 ago * 192.168.119.1, via enp39s0 The route is dropped by zebra with no warnings. This is not good, but unlikely to happen at this point in time. In order to fix this issue route processing from inputs needs to happen after nexthop group processing from inputs. This was not possible because nexthop groups are placed on the metaQ. As such the above nexthop group creation is placed on the metaQ for processing in META_QUEUE_NHG. Then the route is read in and processed immediately. The nexthop group is not found ( not processed yet!) and the route is dropped in zebra. Modify the code to have early route processing of validity on the MetaQ. This preserves the order of operations. Signed-off-by: Donald Sharp --- zebra/rib.h | 17 +- zebra/zapi_msg.c | 7 +- zebra/zebra_rib.c | 1239 ++++++++++++++++++++++++++------------------- 3 files changed, 719 insertions(+), 544 deletions(-) diff --git a/zebra/rib.h b/zebra/rib.h index 9f34399653a5..2e8b05f9dea3 100644 --- a/zebra/rib.h +++ b/zebra/rib.h @@ -178,16 +178,17 @@ struct route_entry { /* meta-queue structure: * sub-queue 0: nexthop group objects * sub-queue 1: EVPN/VxLAN objects - * sub-queue 2: Early Label Processing - * sub-queue 2: connected - * sub-queue 3: kernel - * sub-queue 4: static - * sub-queue 5: RIP, RIPng, OSPF, OSPF6, IS-IS, EIGRP, NHRP - * sub-queue 6: iBGP, eBGP - * sub-queue 7: any other origin (if any) typically those that + * sub-queue 2: Early Route Processing + * sub-queue 3: Early Label Processing + * sub-queue 4: connected + * sub-queue 5: kernel + * sub-queue 6: static + * sub-queue 7: RIP, RIPng, OSPF, OSPF6, IS-IS, EIGRP, NHRP + * sub-queue 8: iBGP, eBGP + * sub-queue 9: any other origin (if any) typically those that * don't generate routes */ -#define MQ_SIZE 9 +#define MQ_SIZE 10 struct meta_queue { struct list *subq[MQ_SIZE]; uint32_t size; /* sum of lengths of all subqueues */ diff --git a/zebra/zapi_msg.c b/zebra/zapi_msg.c index a578395ef8cf..b68dbb38b6ba 100644 --- a/zebra/zapi_msg.c +++ b/zebra/zapi_msg.c @@ -2034,7 +2034,7 @@ static void zread_route_add(ZAPI_HANDLER_ARGS) struct nhg_backup_info *bnhg = NULL; int ret; vrf_id_t vrf_id; - struct nhg_hash_entry nhe; + struct nhg_hash_entry nhe, *n = NULL; s = msg; if (zapi_route_decode(s, &api) < 0) { @@ -2161,9 +2161,10 @@ static void zread_route_add(ZAPI_HANDLER_ARGS) zebra_nhe_init(&nhe, afi, ng->nexthop); nhe.nhg.nexthop = ng->nexthop; nhe.backup_info = bnhg; + n = zebra_nhe_copy(&nhe, 0); } - ret = rib_add_multipath_nhe(afi, api.safi, &api.prefix, src_p, - re, &nhe, false); + ret = rib_add_multipath_nhe(afi, api.safi, &api.prefix, src_p, re, n, + false); /* * rib_add_multipath_nhe only fails in a couple spots diff --git a/zebra/zebra_rib.c b/zebra/zebra_rib.c index 14e25d16d163..9596783e2b8a 100644 --- a/zebra/zebra_rib.c +++ b/zebra/zebra_rib.c @@ -81,6 +81,7 @@ DEFINE_HOOK(rib_update, (struct route_node * rn, const char *reason), enum meta_queue_indexes { META_QUEUE_NHG, META_QUEUE_EVPN, + META_QUEUE_EARLY_ROUTE, META_QUEUE_EARLY_LABEL, META_QUEUE_CONNECTED, META_QUEUE_KERNEL, @@ -194,6 +195,9 @@ struct wq_label_wrapper { int afi; }; +static void rib_addnode(struct route_node *rn, struct route_entry *re, + int process); + /* %pRN is already a printer for route_nodes that just prints the prefix */ #ifdef _FRR_ATTRIBUTE_PRINTFRR #pragma FRR printfrr_ext "%pZN" (struct route_node *) @@ -206,6 +210,8 @@ static const char *subqueue2str(enum meta_queue_indexes index) return "NHG Objects"; case META_QUEUE_EVPN: return "EVPN/VxLan Objects"; + case META_QUEUE_EARLY_ROUTE: + return "Early Route Processing"; case META_QUEUE_EARLY_LABEL: return "Early Label Handling"; case META_QUEUE_CONNECTED: @@ -2556,178 +2562,636 @@ static void process_subq_route(struct listnode *lnode, uint8_t qindex) route_unlock_node(rnode); } -/* - * Examine the specified subqueue; process one entry and return 1 if - * there is a node, return 0 otherwise. - */ -static unsigned int process_subq(struct list *subq, - enum meta_queue_indexes qindex) +static void rib_re_nhg_free(struct route_entry *re) { - struct listnode *lnode = listhead(subq); + if (re->nhe && re->nhe_id) { + assert(re->nhe->id == re->nhe_id); + route_entry_update_nhe(re, NULL); + } else if (re->nhe && re->nhe->nhg.nexthop) + nexthops_free(re->nhe->nhg.nexthop); - if (!lnode) - return 0; + nexthops_free(re->fib_ng.nexthop); +} - switch (qindex) { - case META_QUEUE_EVPN: - process_subq_evpn(lnode); - break; - case META_QUEUE_NHG: - process_subq_nhg(lnode); - break; - case META_QUEUE_EARLY_LABEL: - process_subq_early_label(lnode); - break; - case META_QUEUE_CONNECTED: - case META_QUEUE_KERNEL: - case META_QUEUE_STATIC: - case META_QUEUE_NOTBGP: - case META_QUEUE_BGP: - case META_QUEUE_OTHER: - process_subq_route(lnode, qindex); - break; - } +struct zebra_early_route { + afi_t afi; + safi_t safi; + struct prefix p; + struct prefix_ipv6 src_p; + bool src_p_provided; + struct route_entry *re; + struct nhg_hash_entry *re_nhe; + bool startup; + bool deletion; + bool fromkernel; +}; - list_delete_node(subq, lnode); +static void early_route_memory_free(struct zebra_early_route *ere) +{ + if (ere->re_nhe) + zebra_nhg_free(ere->re_nhe); - return 1; + XFREE(MTYPE_RE, ere->re); + XFREE(MTYPE_WQ_WRAPPER, ere); } -/* Dispatch the meta queue by picking and processing the next node from - * a non-empty sub-queue with lowest priority. wq is equal to zebra->ribq and - * data is pointed to the meta queue structure. - */ -static wq_item_status meta_queue_process(struct work_queue *dummy, void *data) +static void process_subq_early_route_add(struct zebra_early_route *ere) { - struct meta_queue *mq = data; - unsigned i; - uint32_t queue_len, queue_limit; + struct route_entry *re = ere->re; + struct route_table *table; + struct nhg_hash_entry *nhe = NULL; + struct route_node *rn; + struct route_entry *same = NULL, *first_same = NULL; + int same_count = 0; + rib_dest_t *dest; - /* Ensure there's room for more dataplane updates */ - queue_limit = dplane_get_in_queue_limit(); - queue_len = dplane_get_in_queue_len(); - if (queue_len > queue_limit) { - if (IS_ZEBRA_DEBUG_RIB_DETAILED) - zlog_debug("rib queue: dplane queue len %u, limit %u, retrying", - queue_len, queue_limit); + /* Lookup table. */ + table = zebra_vrf_get_table_with_table_id(ere->afi, ere->safi, + re->vrf_id, re->table); + if (!table) { + early_route_memory_free(ere); + return; + } - /* Ensure that the meta-queue is actually enqueued */ - if (work_queue_empty(zrouter.ribq)) - work_queue_add(zrouter.ribq, zrouter.mq); + if (re->nhe_id > 0) { + nhe = zebra_nhg_lookup_id(re->nhe_id); - return WQ_QUEUE_BLOCKED; - } + if (!nhe) { + /* + * We've received from the kernel a nexthop id + * that we don't have saved yet. More than likely + * it has not been processed and is on the + * queue to be processed. Let's stop what we + * are doing and cause the meta q to be processed + * storing this for later. + * + * This is being done this way because zebra + * runs with the assumption t + */ + flog_err( + EC_ZEBRA_TABLE_LOOKUP_FAILED, + "Zebra failed to find the nexthop hash entry for id=%u in a route entry %pFX", + re->nhe_id, &ere->p); - for (i = 0; i < MQ_SIZE; i++) - if (process_subq(mq->subq[i], i)) { - mq->size--; - break; + early_route_memory_free(ere); + return; } - return mq->size ? WQ_REQUEUE : WQ_SUCCESS; -} + } else { + /* Lookup nhe from route information */ + nhe = zebra_nhg_rib_find_nhe(ere->re_nhe, ere->afi); + if (!nhe) { + char buf2[PREFIX_STRLEN] = ""; + flog_err( + EC_ZEBRA_TABLE_LOOKUP_FAILED, + "Zebra failed to find or create a nexthop hash entry for %pFX%s%s", + &ere->p, ere->src_p_provided ? " from " : "", + ere->src_p_provided + ? prefix2str(&ere->src_p, buf2, + sizeof(buf2)) + : ""); -/* - * Look into the RN and queue it into the highest priority queue - * at this point in time for processing. - * - * We will enqueue a route node only once per invocation. - * - * There are two possibilities here that should be kept in mind. - * If the original invocation has not been pulled off for processing - * yet, A subsuquent invocation can have a route entry with a better - * meta queue index value and we can have a situation where - * we might have the same node enqueued 2 times. Not necessarily - * an optimal situation but it should be ok. - * - * The other possibility is that the original invocation has not - * been pulled off for processing yet, A subsusquent invocation - * doesn't have a route_entry with a better meta-queue and the - * original metaqueue index value will win and we'll end up with - * the route node enqueued once. - */ -static int rib_meta_queue_add(struct meta_queue *mq, void *data) -{ - struct route_node *rn = NULL; - struct route_entry *re = NULL, *curr_re = NULL; - uint8_t qindex = MQ_SIZE, curr_qindex = MQ_SIZE; + early_route_memory_free(ere); + return; + } + } - rn = (struct route_node *)data; + /* + * Attach the re to the nhe's nexthop group. + * + * TODO: This will need to change when we start getting IDs from upper + * level protocols, as the refcnt might be wrong, since it checks + * if old_id != new_id. + */ + route_entry_update_nhe(re, nhe); - RNODE_FOREACH_RE (rn, curr_re) { - curr_qindex = route_info[curr_re->type].meta_q_map; + /* Make it sure prefixlen is applied to the prefix. */ + apply_mask(&ere->p); + if (ere->src_p_provided) + apply_mask_ipv6(&ere->src_p); - if (curr_qindex <= qindex) { - re = curr_re; - qindex = curr_qindex; + /* Set default distance by route type. */ + if (re->distance == 0) + re->distance = route_distance(re->type); + + /* Lookup route node.*/ + rn = srcdest_rnode_get(table, &ere->p, + ere->src_p_provided ? &ere->src_p : NULL); + + /* + * If same type of route are installed, treat it as a implicit + * withdraw. If the user has specified the No route replace semantics + * for the install don't do a route replace. + */ + RNODE_FOREACH_RE (rn, same) { + if (CHECK_FLAG(same->status, ROUTE_ENTRY_REMOVED)) { + same_count++; + continue; } - } - if (!re) - return -1; + /* Compare various route_entry properties */ + if (rib_compare_routes(re, same)) { + same_count++; - /* Invariant: at this point we always have rn->info set. */ - if (CHECK_FLAG(rib_dest_from_rnode(rn)->flags, - RIB_ROUTE_QUEUED(qindex))) { - if (IS_ZEBRA_DEBUG_RIB_DETAILED) - rnode_debug(rn, re->vrf_id, - "rn %p is already queued in sub-queue %s", - (void *)rn, subqueue2str(qindex)); - return -1; + if (first_same == NULL) + first_same = same; + } } - SET_FLAG(rib_dest_from_rnode(rn)->flags, RIB_ROUTE_QUEUED(qindex)); - listnode_add(mq->subq[qindex], rn); - route_lock_node(rn); - mq->size++; - - if (IS_ZEBRA_DEBUG_RIB_DETAILED) - rnode_debug(rn, re->vrf_id, "queued rn %p into sub-queue %s", - (void *)rn, subqueue2str(qindex)); + same = first_same; - return 0; -} + if (!ere->startup && (re->flags & ZEBRA_FLAG_SELFROUTE) && + zrouter.asic_offloaded) { + if (!same) { + if (IS_ZEBRA_DEBUG_RIB) + zlog_debug( + "prefix: %pRN is a self route where we do not have an entry for it. Dropping this update, it's useless", + rn); + /* + * We are not on startup, this is a self route + * and we have asic offload. Which means + * we are getting a callback for a entry + * that was already deleted to the kernel + * but an earlier response was just handed + * back. Drop it on the floor + */ + early_route_memory_free(ere); + return; + } + } -static int early_label_meta_queue_add(struct meta_queue *mq, void *data) -{ - listnode_add(mq->subq[META_QUEUE_EARLY_LABEL], data); - mq->size++; - return 0; -} + /* If this route is kernel/connected route, notify the dataplane. */ + if (RIB_SYSTEM_ROUTE(re)) { + /* Notify dataplane */ + dplane_sys_route_add(rn, re); + } -static int rib_meta_queue_nhg_ctx_add(struct meta_queue *mq, void *data) -{ - struct nhg_ctx *ctx = NULL; - uint8_t qindex = META_QUEUE_NHG; - struct wq_nhg_wrapper *w; + /* Link new re to node.*/ + if (IS_ZEBRA_DEBUG_RIB) { + rnode_debug( + rn, re->vrf_id, + "Inserting route rn %p, re %p (%s) existing %p, same_count %d", + rn, re, zebra_route_string(re->type), same, same_count); - ctx = (struct nhg_ctx *)data; + if (IS_ZEBRA_DEBUG_RIB_DETAILED) + route_entry_dump( + &ere->p, + ere->src_p_provided ? &ere->src_p : NULL, re); + } - if (!ctx) - return -1; + SET_FLAG(re->status, ROUTE_ENTRY_CHANGED); + rib_addnode(rn, re, 1); - w = XCALLOC(MTYPE_WQ_WRAPPER, sizeof(struct wq_nhg_wrapper)); + /* Free implicit route.*/ + if (same) + rib_delnode(rn, same); - w->type = WQ_NHG_WRAPPER_TYPE_CTX; - w->u.ctx = ctx; + /* See if we can remove some RE entries that are queued for + * removal, but won't be considered in rib processing. + */ + dest = rib_dest_from_rnode(rn); + RNODE_FOREACH_RE_SAFE (rn, re, same) { + if (CHECK_FLAG(re->status, ROUTE_ENTRY_REMOVED)) { + /* If the route was used earlier, must retain it. */ + if (dest && re == dest->selected_fib) + continue; - listnode_add(mq->subq[qindex], w); - mq->size++; + if (IS_ZEBRA_DEBUG_RIB) + rnode_debug(rn, re->vrf_id, + "rn %p, removing unneeded re %p", + rn, re); - if (IS_ZEBRA_DEBUG_RIB_DETAILED) - zlog_debug("NHG Context id=%u queued into sub-queue %s", - ctx->id, subqueue2str(qindex)); + rib_unlink(rn, re); + } + } - return 0; + route_unlock_node(rn); + if (ere->re_nhe) + zebra_nhg_free(ere->re_nhe); + XFREE(MTYPE_WQ_WRAPPER, ere); } -static int rib_meta_queue_nhg_add(struct meta_queue *mq, void *data) +static void process_subq_early_route_delete(struct zebra_early_route *ere) { - struct nhg_hash_entry *nhe = NULL; - uint8_t qindex = META_QUEUE_NHG; - struct wq_nhg_wrapper *w; + struct route_table *table; + struct route_node *rn; + struct route_entry *re; + struct route_entry *fib = NULL; + struct route_entry *same = NULL; + struct nexthop *rtnh; + char buf2[INET6_ADDRSTRLEN]; + rib_dest_t *dest; - nhe = (struct nhg_hash_entry *)data; + if (ere->src_p_provided) + assert(!ere->src_p.prefixlen || ere->afi == AFI_IP6); + + /* Lookup table. */ + table = zebra_vrf_lookup_table_with_table_id( + ere->afi, ere->safi, ere->re->vrf_id, ere->re->table); + if (!table) { + early_route_memory_free(ere); + return; + } + + /* Apply mask. */ + apply_mask(&ere->p); + if (ere->src_p_provided) + apply_mask_ipv6(&ere->src_p); + + /* Lookup route node. */ + rn = srcdest_rnode_lookup(table, &ere->p, + ere->src_p_provided ? &ere->src_p : NULL); + if (!rn) { + if (IS_ZEBRA_DEBUG_RIB) { + char src_buf[PREFIX_STRLEN]; + struct vrf *vrf = vrf_lookup_by_id(ere->re->vrf_id); + + if (ere->src_p_provided && ere->src_p.prefixlen) + prefix2str(&ere->src_p, src_buf, + sizeof(src_buf)); + else + src_buf[0] = '\0'; + + zlog_debug("%s[%d]:%pRN%s%s doesn't exist in rib", + vrf->name, ere->re->table, rn, + (src_buf[0] != '\0') ? " from " : "", + src_buf); + } + early_route_memory_free(ere); + return; + } + + dest = rib_dest_from_rnode(rn); + fib = dest->selected_fib; + + struct nexthop *nh = NULL; + + if (ere->re->nhe) + nh = ere->re->nhe->nhg.nexthop; + + /* Lookup same type route. */ + RNODE_FOREACH_RE (rn, re) { + if (CHECK_FLAG(re->status, ROUTE_ENTRY_REMOVED)) + continue; + + if (re->type != ere->re->type) + continue; + if (re->instance != ere->re->instance) + continue; + if (CHECK_FLAG(re->flags, ZEBRA_FLAG_RR_USE_DISTANCE) && + ere->re->distance != re->distance) + continue; + + if (re->type == ZEBRA_ROUTE_KERNEL && + re->metric != ere->re->metric) + continue; + if (re->type == ZEBRA_ROUTE_CONNECT && (rtnh = nh) && + rtnh->type == NEXTHOP_TYPE_IFINDEX && nh) { + if (rtnh->ifindex != nh->ifindex) + continue; + same = re; + break; + } + + /* Make sure that the route found has the same gateway. */ + if (ere->re->nhe_id && re->nhe_id == ere->re->nhe_id) { + same = re; + break; + } + + if (nh == NULL) { + same = re; + break; + } + for (ALL_NEXTHOPS(re->nhe->nhg, rtnh)) { + /* + * No guarantee all kernel send nh with labels + * on delete. + */ + if (nexthop_same_no_labels(rtnh, nh)) { + same = re; + break; + } + } + + if (same) + break; + } + /* + * If same type of route can't be found and this message is from + * kernel. + */ + if (!same) { + /* + * In the past(HA!) we could get here because + * we were receiving a route delete from the + * kernel and we're not marking the proto + * as coming from it's appropriate originator. + * Now that we are properly noticing the fact + * that the kernel has deleted our route we + * are not going to get called in this path + * I am going to leave this here because + * this might still work this way on non-linux + * platforms as well as some weird state I have + * not properly thought of yet. + * If we can show that this code path is + * dead then we can remove it. + */ + if (fib && CHECK_FLAG(ere->re->flags, ZEBRA_FLAG_SELFROUTE)) { + if (IS_ZEBRA_DEBUG_RIB) { + rnode_debug( + rn, ere->re->vrf_id, + "rn %p, re %p (%s) was deleted from kernel, adding", + rn, fib, zebra_route_string(fib->type)); + } + if (zrouter.allow_delete || + CHECK_FLAG(dest->flags, RIB_ROUTE_ANY_QUEUED)) { + UNSET_FLAG(fib->status, ROUTE_ENTRY_INSTALLED); + /* Unset flags. */ + for (rtnh = fib->nhe->nhg.nexthop; rtnh; + rtnh = rtnh->next) + UNSET_FLAG(rtnh->flags, + NEXTHOP_FLAG_FIB); + + /* + * This is a non FRR route + * as such we should mark + * it as deleted + */ + dest->selected_fib = NULL; + } else { + /* + * This means someone else, other than Zebra, + * has deleted a Zebra router from the kernel. + * We will add it back + */ + rib_install_kernel(rn, fib, NULL); + } + } else { + if (IS_ZEBRA_DEBUG_RIB) { + if (nh) + rnode_debug( + rn, ere->re->vrf_id, + "via %s ifindex %d type %d doesn't exist in rib", + inet_ntop(afi2family(ere->afi), + &nh->gate, buf2, + sizeof(buf2)), + nh->ifindex, ere->re->type); + else + rnode_debug( + rn, ere->re->vrf_id, + "type %d doesn't exist in rib", + ere->re->type); + } + route_unlock_node(rn); + early_route_memory_free(ere); + return; + } + } + + if (same) { + struct nexthop *tmp_nh; + + if (ere->fromkernel && + CHECK_FLAG(ere->re->flags, ZEBRA_FLAG_SELFROUTE) && + !zrouter.allow_delete) { + rib_install_kernel(rn, same, NULL); + route_unlock_node(rn); + + early_route_memory_free(ere); + return; + } + + /* Special handling for IPv4 or IPv6 routes sourced from + * EVPN - the nexthop (and associated MAC) need to be + * uninstalled if no more refs. + */ + for (ALL_NEXTHOPS(re->nhe->nhg, tmp_nh)) { + struct ipaddr vtep_ip; + + if (CHECK_FLAG(tmp_nh->flags, NEXTHOP_FLAG_EVPN)) { + memset(&vtep_ip, 0, sizeof(struct ipaddr)); + if (ere->afi == AFI_IP) { + vtep_ip.ipa_type = IPADDR_V4; + memcpy(&(vtep_ip.ipaddr_v4), + &(tmp_nh->gate.ipv4), + sizeof(struct in_addr)); + } else { + vtep_ip.ipa_type = IPADDR_V6; + memcpy(&(vtep_ip.ipaddr_v6), + &(tmp_nh->gate.ipv6), + sizeof(struct in6_addr)); + } + zebra_rib_queue_evpn_route_del( + re->vrf_id, &vtep_ip, &ere->p); + } + } + + /* Notify dplane if system route changes */ + if (RIB_SYSTEM_ROUTE(re)) + dplane_sys_route_del(rn, same); + + rib_delnode(rn, same); + } + + route_unlock_node(rn); + + early_route_memory_free(ere); +} + +/* + * When FRR receives a route we need to match the route up to + * nexthop groups. That we also may have just received + * place the data on this queue so that this work of finding + * the nexthop group entries for the route entry is always + * done after the nexthop group has had a chance to be processed + */ +static void process_subq_early_route(struct listnode *lnode) +{ + struct zebra_early_route *ere = listgetdata(lnode); + + if (ere->deletion) + process_subq_early_route_delete(ere); + else + process_subq_early_route_add(ere); +} + +/* + * Examine the specified subqueue; process one entry and return 1 if + * there is a node, return 0 otherwise. + */ +static unsigned int process_subq(struct list *subq, + enum meta_queue_indexes qindex) +{ + struct listnode *lnode = listhead(subq); + + if (!lnode) + return 0; + + switch (qindex) { + case META_QUEUE_EVPN: + process_subq_evpn(lnode); + break; + case META_QUEUE_NHG: + process_subq_nhg(lnode); + break; + case META_QUEUE_EARLY_ROUTE: + process_subq_early_route(lnode); + break; + case META_QUEUE_EARLY_LABEL: + process_subq_early_label(lnode); + break; + case META_QUEUE_CONNECTED: + case META_QUEUE_KERNEL: + case META_QUEUE_STATIC: + case META_QUEUE_NOTBGP: + case META_QUEUE_BGP: + case META_QUEUE_OTHER: + process_subq_route(lnode, qindex); + break; + } + + list_delete_node(subq, lnode); + + return 1; +} + +/* Dispatch the meta queue by picking and processing the next node from + * a non-empty sub-queue with lowest priority. wq is equal to zebra->ribq and + * data is pointed to the meta queue structure. + */ +static wq_item_status meta_queue_process(struct work_queue *dummy, void *data) +{ + struct meta_queue *mq = data; + unsigned i; + uint32_t queue_len, queue_limit; + + /* Ensure there's room for more dataplane updates */ + queue_limit = dplane_get_in_queue_limit(); + queue_len = dplane_get_in_queue_len(); + if (queue_len > queue_limit) { + if (IS_ZEBRA_DEBUG_RIB_DETAILED) + zlog_debug( + "rib queue: dplane queue len %u, limit %u, retrying", + queue_len, queue_limit); + + /* Ensure that the meta-queue is actually enqueued */ + if (work_queue_empty(zrouter.ribq)) + work_queue_add(zrouter.ribq, zrouter.mq); + + return WQ_QUEUE_BLOCKED; + } + + for (i = 0; i < MQ_SIZE; i++) + if (process_subq(mq->subq[i], i)) { + mq->size--; + break; + } + return mq->size ? WQ_REQUEUE : WQ_SUCCESS; +} + + +/* + * Look into the RN and queue it into the highest priority queue + * at this point in time for processing. + * + * We will enqueue a route node only once per invocation. + * + * There are two possibilities here that should be kept in mind. + * If the original invocation has not been pulled off for processing + * yet, A subsuquent invocation can have a route entry with a better + * meta queue index value and we can have a situation where + * we might have the same node enqueued 2 times. Not necessarily + * an optimal situation but it should be ok. + * + * The other possibility is that the original invocation has not + * been pulled off for processing yet, A subsusquent invocation + * doesn't have a route_entry with a better meta-queue and the + * original metaqueue index value will win and we'll end up with + * the route node enqueued once. + */ +static int rib_meta_queue_add(struct meta_queue *mq, void *data) +{ + struct route_node *rn = NULL; + struct route_entry *re = NULL, *curr_re = NULL; + uint8_t qindex = MQ_SIZE, curr_qindex = MQ_SIZE; + + rn = (struct route_node *)data; + + RNODE_FOREACH_RE (rn, curr_re) { + curr_qindex = route_info[curr_re->type].meta_q_map; + + if (curr_qindex <= qindex) { + re = curr_re; + qindex = curr_qindex; + } + } + + if (!re) + return -1; + + /* Invariant: at this point we always have rn->info set. */ + if (CHECK_FLAG(rib_dest_from_rnode(rn)->flags, + RIB_ROUTE_QUEUED(qindex))) { + if (IS_ZEBRA_DEBUG_RIB_DETAILED) + rnode_debug(rn, re->vrf_id, + "rn %p is already queued in sub-queue %s", + (void *)rn, subqueue2str(qindex)); + return -1; + } + + SET_FLAG(rib_dest_from_rnode(rn)->flags, RIB_ROUTE_QUEUED(qindex)); + listnode_add(mq->subq[qindex], rn); + route_lock_node(rn); + mq->size++; + + if (IS_ZEBRA_DEBUG_RIB_DETAILED) + rnode_debug(rn, re->vrf_id, "queued rn %p into sub-queue %s", + (void *)rn, subqueue2str(qindex)); + + return 0; +} + +static int early_label_meta_queue_add(struct meta_queue *mq, void *data) +{ + listnode_add(mq->subq[META_QUEUE_EARLY_LABEL], data); + mq->size++; + return 0; +} + +static int rib_meta_queue_nhg_ctx_add(struct meta_queue *mq, void *data) +{ + struct nhg_ctx *ctx = NULL; + uint8_t qindex = META_QUEUE_NHG; + struct wq_nhg_wrapper *w; + + ctx = (struct nhg_ctx *)data; + + if (!ctx) + return -1; + + w = XCALLOC(MTYPE_WQ_WRAPPER, sizeof(struct wq_nhg_wrapper)); + + w->type = WQ_NHG_WRAPPER_TYPE_CTX; + w->u.ctx = ctx; + + listnode_add(mq->subq[qindex], w); + mq->size++; + + if (IS_ZEBRA_DEBUG_RIB_DETAILED) + zlog_debug("NHG Context id=%u queued into sub-queue %s", + ctx->id, subqueue2str(qindex)); + + return 0; +} + +static int rib_meta_queue_nhg_add(struct meta_queue *mq, void *data) +{ + struct nhg_hash_entry *nhe = NULL; + uint8_t qindex = META_QUEUE_NHG; + struct wq_nhg_wrapper *w; + + nhe = (struct nhg_hash_entry *)data; if (!nhe) return -1; @@ -3056,7 +3520,6 @@ int zebra_rib_queue_evpn_rem_vtep_del(vrf_id_t vrf_id, vni_t vni, return mq_add_handler(w, rib_meta_queue_evpn_add); } - /* Create new meta queue. A destructor function doesn't seem to be necessary here. */ @@ -3174,6 +3637,22 @@ static void rib_meta_queue_free(struct meta_queue *mq, struct list *l, } } +static void early_route_meta_queue_free(struct meta_queue *mq, struct list *l, + struct zebra_vrf *zvrf) +{ + struct zebra_early_route *zer; + struct listnode *node, *nnode; + + for (ALL_LIST_ELEMENTS(l, node, nnode, zer)) { + if (zvrf && zer->re->vrf_id != zvrf->vrf->vrf_id) + continue; + + XFREE(MTYPE_RE, zer); + node->data = NULL; + list_delete_node(l, node); + mq->size--; + } +} void meta_queue_free(struct meta_queue *mq, struct zebra_vrf *zvrf) { @@ -3188,6 +3667,9 @@ void meta_queue_free(struct meta_queue *mq, struct zebra_vrf *zvrf) case META_QUEUE_EVPN: evpn_meta_queue_free(mq, mq->subq[i], zvrf); break; + case META_QUEUE_EARLY_ROUTE: + early_route_meta_queue_free(mq, mq->subq[i], zvrf); + break; case META_QUEUE_EARLY_LABEL: early_label_meta_queue_free(mq, mq->subq[i], zvrf); break; @@ -3334,17 +3816,6 @@ static void rib_addnode(struct route_node *rn, rib_link(rn, re, process); } -static void rib_re_nhg_free(struct route_entry *re) -{ - if (re->nhe && re->nhe_id) { - assert(re->nhe->id == re->nhe_id); - route_entry_update_nhe(re, NULL); - } else if (re->nhe && re->nhe->nhg.nexthop) - nexthops_free(re->nhe->nhg.nexthop); - - nexthops_free(re->fib_ng.nexthop); -} - /* * rib_unlink * @@ -3539,188 +4010,67 @@ void _route_entry_dump(const char *func, union prefixconstptr pp, for (ALL_NEXTHOPS(re->nhe->nhg, nexthop)) _route_entry_dump_nh(re, straddr, nexthop); - if (zebra_nhg_get_backup_nhg(re->nhe)) { - zlog_debug("%s: backup nexthops:", straddr); - - nhg = zebra_nhg_get_backup_nhg(re->nhe); - for (ALL_NEXTHOPS_PTR(nhg, nexthop)) - _route_entry_dump_nh(re, straddr, nexthop); - } - - zlog_debug("%s: dump complete", straddr); -} - -/* - * Internal route-add implementation; there are a couple of different public - * signatures. Callers in this path are responsible for the memory they - * allocate: if they allocate a nexthop_group or backup nexthop info, they - * must free those objects. If this returns < 0, an error has occurred and the - * route_entry 're' has not been captured; the caller should free that also. - * - * -1 -> error - * 0 -> Add - * 1 -> update - */ -int rib_add_multipath_nhe(afi_t afi, safi_t safi, struct prefix *p, - struct prefix_ipv6 *src_p, struct route_entry *re, - struct nhg_hash_entry *re_nhe, bool startup) -{ - struct nhg_hash_entry *nhe = NULL; - struct route_table *table; - struct route_node *rn; - struct route_entry *same = NULL, *first_same = NULL; - int ret = 0; - int same_count = 0; - rib_dest_t *dest; - - if (!re || !re_nhe) - return -1; - - assert(!src_p || !src_p->prefixlen || afi == AFI_IP6); - - /* Lookup table. */ - table = zebra_vrf_get_table_with_table_id(afi, safi, re->vrf_id, - re->table); - if (!table) - return -1; - - if (re->nhe_id > 0) { - nhe = zebra_nhg_lookup_id(re->nhe_id); - - if (!nhe) { - flog_err( - EC_ZEBRA_TABLE_LOOKUP_FAILED, - "Zebra failed to find the nexthop hash entry for id=%u in a route entry", - re->nhe_id); - - return -1; - } - } else { - /* Lookup nhe from route information */ - nhe = zebra_nhg_rib_find_nhe(re_nhe, afi); - if (!nhe) { - char buf2[PREFIX_STRLEN] = ""; - - flog_err( - EC_ZEBRA_TABLE_LOOKUP_FAILED, - "Zebra failed to find or create a nexthop hash entry for %pFX%s%s", - p, src_p ? " from " : "", - src_p ? prefix2str(src_p, buf2, sizeof(buf2)) - : ""); - - return -1; - } - } - - /* - * Attach the re to the nhe's nexthop group. - * - * TODO: This will need to change when we start getting IDs from upper - * level protocols, as the refcnt might be wrong, since it checks - * if old_id != new_id. - */ - route_entry_update_nhe(re, nhe); - - /* Make it sure prefixlen is applied to the prefix. */ - apply_mask(p); - if (src_p) - apply_mask_ipv6(src_p); - - /* Set default distance by route type. */ - if (re->distance == 0) - re->distance = route_distance(re->type); - - /* Lookup route node.*/ - rn = srcdest_rnode_get(table, p, src_p); - - /* - * If same type of route are installed, treat it as a implicit - * withdraw. If the user has specified the No route replace semantics - * for the install don't do a route replace. - */ - RNODE_FOREACH_RE (rn, same) { - if (CHECK_FLAG(same->status, ROUTE_ENTRY_REMOVED)) { - same_count++; - continue; - } - - /* Compare various route_entry properties */ - if (rib_compare_routes(re, same)) { - same_count++; - - if (first_same == NULL) - first_same = same; - } - } - - same = first_same; - - if (!startup && - (re->flags & ZEBRA_FLAG_SELFROUTE) && zrouter.asic_offloaded) { - if (!same) { - if (IS_ZEBRA_DEBUG_RIB) - zlog_debug("prefix: %pRN is a self route where we do not have an entry for it. Dropping this update, it's useless", rn); - /* - * We are not on startup, this is a self route - * and we have asic offload. Which means - * we are getting a callback for a entry - * that was already deleted to the kernel - * but an earlier response was just handed - * back. Drop it on the floor - */ - rib_re_nhg_free(re); + if (zebra_nhg_get_backup_nhg(re->nhe)) { + zlog_debug("%s: backup nexthops:", straddr); - XFREE(MTYPE_RE, re); - return ret; - } + nhg = zebra_nhg_get_backup_nhg(re->nhe); + for (ALL_NEXTHOPS_PTR(nhg, nexthop)) + _route_entry_dump_nh(re, straddr, nexthop); } - /* If this route is kernel/connected route, notify the dataplane. */ - if (RIB_SYSTEM_ROUTE(re)) { - /* Notify dataplane */ - dplane_sys_route_add(rn, re); - } + zlog_debug("%s: dump complete", straddr); +} - /* Link new re to node.*/ - if (IS_ZEBRA_DEBUG_RIB) { - rnode_debug(rn, re->vrf_id, - "Inserting route rn %p, re %p (%s) existing %p, same_count %d", - rn, re, zebra_route_string(re->type), same, - same_count); +static int rib_meta_queue_early_route_add(struct meta_queue *mq, void *data) +{ + struct zebra_early_route *ere = data; - if (IS_ZEBRA_DEBUG_RIB_DETAILED) - route_entry_dump(p, src_p, re); - } + listnode_add(mq->subq[META_QUEUE_EARLY_ROUTE], data); + mq->size++; - SET_FLAG(re->status, ROUTE_ENTRY_CHANGED); - rib_addnode(rn, re, 1); + if (IS_ZEBRA_DEBUG_RIB_DETAILED) + zlog_debug( + "Route %pFX(%u) queued for processing into sub-queue %s", + &ere->p, ere->re->vrf_id, + subqueue2str(META_QUEUE_EARLY_ROUTE)); - /* Free implicit route.*/ - if (same) { - ret = 1; - rib_delnode(rn, same); - } + return 0; +} - /* See if we can remove some RE entries that are queued for - * removal, but won't be considered in rib processing. - */ - dest = rib_dest_from_rnode(rn); - RNODE_FOREACH_RE_SAFE (rn, re, same) { - if (CHECK_FLAG(re->status, ROUTE_ENTRY_REMOVED)) { - /* If the route was used earlier, must retain it. */ - if (dest && re == dest->selected_fib) - continue; +/* + * Internal route-add implementation; there are a couple of different public + * signatures. Callers in this path are responsible for the memory they + * allocate: if they allocate a nexthop_group or backup nexthop info, they + * must free those objects. If this returns < 0, an error has occurred and the + * route_entry 're' has not been captured; the caller should free that also. + * + * -1 -> error + * 0 -> Add + * 1 -> update + */ +int rib_add_multipath_nhe(afi_t afi, safi_t safi, struct prefix *p, + struct prefix_ipv6 *src_p, struct route_entry *re, + struct nhg_hash_entry *re_nhe, bool startup) +{ + struct zebra_early_route *ere; - if (IS_ZEBRA_DEBUG_RIB) - rnode_debug(rn, re->vrf_id, "rn %p, removing unneeded re %p", - rn, re); + if (!re) + return -1; - rib_unlink(rn, re); - } - } + assert(!src_p || !src_p->prefixlen || afi == AFI_IP6); - route_unlock_node(rn); - return ret; + ere = XCALLOC(MTYPE_WQ_WRAPPER, sizeof(*ere)); + ere->afi = afi; + ere->safi = safi; + ere->p = *p; + if (src_p) + ere->src_p = *src_p; + ere->src_p_provided = !!src_p; + ere->re = re; + ere->re_nhe = re_nhe; + ere->startup = startup; + + return mq_add_handler(ere, rib_meta_queue_early_route_add); } /* @@ -3731,7 +4081,7 @@ int rib_add_multipath(afi_t afi, safi_t safi, struct prefix *p, struct nexthop_group *ng, bool startup) { int ret; - struct nhg_hash_entry nhe; + struct nhg_hash_entry nhe, *n; if (!re) return -1; @@ -3749,10 +4099,8 @@ int rib_add_multipath(afi_t afi, safi_t safi, struct prefix *p, else if (re->nhe_id > 0) nhe.id = re->nhe_id; - ret = rib_add_multipath_nhe(afi, safi, p, src_p, re, &nhe, startup); - - /* In this path, the callers expect memory to be freed. */ - nexthop_group_delete(&ng); + n = zebra_nhe_copy(&nhe, 0); + ret = rib_add_multipath_nhe(afi, safi, p, src_p, re, n, startup); /* In error cases, free the route also */ if (ret < 0) @@ -3767,212 +4115,40 @@ void rib_delete(afi_t afi, safi_t safi, vrf_id_t vrf_id, int type, uint32_t nhe_id, uint32_t table_id, uint32_t metric, uint8_t distance, bool fromkernel) { - struct route_table *table; - struct route_node *rn; - struct route_entry *re; - struct route_entry *fib = NULL; - struct route_entry *same = NULL; - struct nexthop *rtnh; - char buf2[INET6_ADDRSTRLEN]; - rib_dest_t *dest; - - assert(!src_p || !src_p->prefixlen || afi == AFI_IP6); - - /* Lookup table. */ - table = zebra_vrf_lookup_table_with_table_id(afi, safi, vrf_id, - table_id); - if (!table) - return; - - /* Apply mask. */ - apply_mask(p); - if (src_p) - apply_mask_ipv6(src_p); - - /* Lookup route node. */ - rn = srcdest_rnode_lookup(table, p, src_p); - if (!rn) { - if (IS_ZEBRA_DEBUG_RIB) { - char src_buf[PREFIX_STRLEN]; - struct vrf *vrf = vrf_lookup_by_id(vrf_id); - - if (src_p && src_p->prefixlen) - prefix2str(src_p, src_buf, sizeof(src_buf)); - else - src_buf[0] = '\0'; - - zlog_debug("%s[%d]:%pRN%s%s doesn't exist in rib", - vrf->name, table_id, rn, - (src_buf[0] != '\0') ? " from " : "", - src_buf); - } - return; - } - - dest = rib_dest_from_rnode(rn); - fib = dest->selected_fib; - - /* Lookup same type route. */ - RNODE_FOREACH_RE (rn, re) { - if (CHECK_FLAG(re->status, ROUTE_ENTRY_REMOVED)) - continue; - - if (re->type != type) - continue; - if (re->instance != instance) - continue; - if (CHECK_FLAG(re->flags, ZEBRA_FLAG_RR_USE_DISTANCE) && - distance != re->distance) - continue; - - if (re->type == ZEBRA_ROUTE_KERNEL && re->metric != metric) - continue; - if (re->type == ZEBRA_ROUTE_CONNECT && - (rtnh = re->nhe->nhg.nexthop) - && rtnh->type == NEXTHOP_TYPE_IFINDEX && nh) { - if (rtnh->ifindex != nh->ifindex) - continue; - same = re; - break; - } - - /* Make sure that the route found has the same gateway. */ - if (nhe_id && re->nhe_id == nhe_id) { - same = re; - break; - } - - if (nh == NULL) { - same = re; - break; - } - for (ALL_NEXTHOPS(re->nhe->nhg, rtnh)) { - /* - * No guarantee all kernel send nh with labels - * on delete. - */ - if (nexthop_same_no_labels(rtnh, nh)) { - same = re; - break; - } - } - - if (same) - break; - } - /* If same type of route can't be found and this message is from - kernel. */ - if (!same) { - /* - * In the past(HA!) we could get here because - * we were receiving a route delete from the - * kernel and we're not marking the proto - * as coming from it's appropriate originator. - * Now that we are properly noticing the fact - * that the kernel has deleted our route we - * are not going to get called in this path - * I am going to leave this here because - * this might still work this way on non-linux - * platforms as well as some weird state I have - * not properly thought of yet. - * If we can show that this code path is - * dead then we can remove it. - */ - if (fib && CHECK_FLAG(flags, ZEBRA_FLAG_SELFROUTE)) { - if (IS_ZEBRA_DEBUG_RIB) { - rnode_debug(rn, vrf_id, - "rn %p, re %p (%s) was deleted from kernel, adding", - rn, fib, - zebra_route_string(fib->type)); - } - if (zrouter.allow_delete || - CHECK_FLAG(dest->flags, RIB_ROUTE_ANY_QUEUED)) { - UNSET_FLAG(fib->status, ROUTE_ENTRY_INSTALLED); - /* Unset flags. */ - for (rtnh = fib->nhe->nhg.nexthop; rtnh; - rtnh = rtnh->next) - UNSET_FLAG(rtnh->flags, - NEXTHOP_FLAG_FIB); - - /* - * This is a non FRR route - * as such we should mark - * it as deleted - */ - dest->selected_fib = NULL; - } else { - /* This means someone else, other than Zebra, - * has deleted - * a Zebra router from the kernel. We will add - * it back */ - rib_install_kernel(rn, fib, NULL); - } - } else { - if (IS_ZEBRA_DEBUG_RIB) { - if (nh) - rnode_debug( - rn, vrf_id, - "via %s ifindex %d type %d doesn't exist in rib", - inet_ntop(afi2family(afi), - &nh->gate, buf2, - sizeof(buf2)), - nh->ifindex, type); - else - rnode_debug( - rn, vrf_id, - "type %d doesn't exist in rib", - type); - } - route_unlock_node(rn); - return; - } - } - - if (same) { - struct nexthop *tmp_nh; - - if (fromkernel && CHECK_FLAG(flags, ZEBRA_FLAG_SELFROUTE) && - !zrouter.allow_delete) { - rib_install_kernel(rn, same, NULL); - route_unlock_node(rn); - - return; - } - - /* Special handling for IPv4 or IPv6 routes sourced from - * EVPN - the nexthop (and associated MAC) need to be - * uninstalled if no more refs. - */ - for (ALL_NEXTHOPS(re->nhe->nhg, tmp_nh)) { - struct ipaddr vtep_ip; - - if (CHECK_FLAG(tmp_nh->flags, NEXTHOP_FLAG_EVPN)) { - memset(&vtep_ip, 0, sizeof(struct ipaddr)); - if (afi == AFI_IP) { - vtep_ip.ipa_type = IPADDR_V4; - memcpy(&(vtep_ip.ipaddr_v4), - &(tmp_nh->gate.ipv4), - sizeof(struct in_addr)); - } else { - vtep_ip.ipa_type = IPADDR_V6; - memcpy(&(vtep_ip.ipaddr_v6), - &(tmp_nh->gate.ipv6), - sizeof(struct in6_addr)); - } - zebra_rib_queue_evpn_route_del(re->vrf_id, - &vtep_ip, p); - } - } + struct zebra_early_route *ere; + struct route_entry *re = NULL; + struct nhg_hash_entry *nhe = NULL; - /* Notify dplane if system route changes */ - if (RIB_SYSTEM_ROUTE(re)) - dplane_sys_route_del(rn, same); + re = XCALLOC(MTYPE_RE, sizeof(struct route_entry)); + re->type = type; + re->instance = instance; + re->distance = distance; + re->flags = flags; + re->metric = metric; + re->table = table_id; + re->vrf_id = vrf_id; + re->uptime = monotime(NULL); + re->nhe_id = nhe_id; - rib_delnode(rn, same); + if (nh) { + nhe = zebra_nhg_alloc(); + nhe->nhg.nexthop = nexthop_dup(nh, NULL); } - route_unlock_node(rn); - return; + ere = XCALLOC(MTYPE_WQ_WRAPPER, sizeof(*ere)); + ere->afi = afi; + ere->safi = safi; + ere->p = *p; + if (src_p) + ere->src_p = *src_p; + ere->src_p_provided = !!src_p; + ere->re = re; + ere->re_nhe = nhe; + ere->startup = false; + ere->deletion = true; + ere->fromkernel = fromkernel; + + mq_add_handler(ere, rib_meta_queue_early_route_add); } @@ -3983,8 +4159,8 @@ int rib_add(afi_t afi, safi_t safi, vrf_id_t vrf_id, int type, uint8_t distance, route_tag_t tag, bool startup) { struct route_entry *re = NULL; - struct nexthop *nexthop = NULL; - struct nexthop_group *ng = NULL; + struct nexthop nexthop = {}; + struct nexthop_group ng = {}; /* Allocate new route_entry structure. */ re = XCALLOC(MTYPE_RE, sizeof(struct route_entry)); @@ -4004,15 +4180,12 @@ int rib_add(afi_t afi, safi_t safi, vrf_id_t vrf_id, int type, * we'll use that. Otherwise, pass the nexthop along directly. */ if (!nhe_id) { - ng = nexthop_group_new(); - /* Add nexthop. */ - nexthop = nexthop_new(); - *nexthop = *nh; - nexthop_group_add_sorted(ng, nexthop); + nexthop = *nh; + nexthop_group_add_sorted(&ng, &nexthop); } - return rib_add_multipath(afi, safi, p, src_p, re, ng, startup); + return rib_add_multipath(afi, safi, p, src_p, re, &ng, startup); } static const char *rib_update_event2str(enum rib_update_event event)