Skip to content

Commit

Permalink
[202305] Revert bgp suppress fib pending (#3093)
Browse files Browse the repository at this point in the history
What I did

Revert the feature.

Why I did it

Revert bgp suppress FIB functionality due to found FRR memory consumption issues and bugs.

How I verified it

Basic sanity check on t1-lag, regression in progress.
  • Loading branch information
stepanblyschak authored Dec 30, 2023
1 parent fc4b333 commit d4e0317
Show file tree
Hide file tree
Showing 8 changed files with 10 additions and 344 deletions.
16 changes: 2 additions & 14 deletions config/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -1767,7 +1767,7 @@ def load_minigraph(db, no_service_restart, traffic_shift_away, override_config,
cfggen_namespace_option = ['-n', str(namespace)]
clicommon.run_command([db_migrator, '-o', 'set_version'] + cfggen_namespace_option)

# Keep device isolated with TSA
# Keep device isolated with TSA
if traffic_shift_away:
clicommon.run_command(["TSA"], display_cmd=True)
if override_config:
Expand Down Expand Up @@ -2052,21 +2052,9 @@ def synchronous_mode(sync_mode):
config reload -y \n
Option 2. systemctl restart swss""" % sync_mode)

#
# 'suppress-fib-pending' command ('config suppress-fib-pending ...')
#
@config.command('suppress-fib-pending')
@click.argument('state', metavar='<enabled|disabled>', required=True, type=click.Choice(['enabled', 'disabled']))
@clicommon.pass_db
def suppress_pending_fib(db, state):
''' Enable or disable pending FIB suppression. Once enabled, BGP will not advertise routes that are not yet installed in the hardware '''

config_db = db.cfgdb
config_db.mod_entry('DEVICE_METADATA' , 'localhost', {"suppress-fib-pending" : state})

#
# 'yang_config_validation' command ('config yang_config_validation ...')
#
#
@config.command('yang_config_validation')
@click.argument('yang_config_validation', metavar='<enable|disable>', required=True)
def yang_config_validation(yang_config_validation):
Expand Down
38 changes: 0 additions & 38 deletions doc/Command-Reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -2434,26 +2434,6 @@ This command displays the routing policy that takes precedence over the other ro
Exit routemap
```
**show suppress-fib-pending**
This command is used to show the status of suppress pending FIB feature.
When enabled, BGP will not advertise routes which aren't yet offloaded.
- Usage:
```
show suppress-fib-pending
```
- Examples:
```
admin@sonic:~$ show suppress-fib-pending
Enabled
```
```
admin@sonic:~$ show suppress-fib-pending
Disabled
```
Go Back To [Beginning of the document](#) or [Beginning of this section](#bgp)
### BGP config commands
Expand Down Expand Up @@ -2546,24 +2526,6 @@ This command is used to remove particular IPv4 or IPv6 BGP neighbor configuratio
admin@sonic:~$ sudo config bgp remove neighbor SONIC02SPINE
```
**config suppress-fib-pending**
This command is used to enable or disable announcements of routes not yet installed in the HW.
Once enabled, BGP will not advertise routes which aren't yet offloaded.
- Usage:
```
config suppress-fib-pending <enabled|disabled>
```
- Examples:
```
admin@sonic:~$ sudo config suppress-fib-pending enabled
```
```
admin@sonic:~$ sudo config suppress-fib-pending disabled
```
Go Back To [Beginning of the document](#) or [Beginning of this section](#bgp)
## Console
Expand Down
105 changes: 3 additions & 102 deletions scripts/route_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,8 @@
import time
import signal
import traceback
import subprocess

from ipaddress import ip_network

from swsscommon import swsscommon
from utilities_common import chassis

Expand All @@ -73,9 +72,6 @@

PRINT_MSG_LEN_MAX = 1000

FRR_CHECK_RETRIES = 3
FRR_WAIT_TIME = 15

class Level(Enum):
ERR = 'ERR'
INFO = 'INFO'
Expand Down Expand Up @@ -148,7 +144,7 @@ def add_prefix(ip):
ip = ip + PREFIX_SEPARATOR + "32"
else:
ip = ip + PREFIX_SEPARATOR + "128"
return str(ip_network(ip))
return ip


def add_prefix_ifnot(ip):
Expand Down Expand Up @@ -324,31 +320,6 @@ def get_route_entries():
return (selector, subs, sorted(rt))


def is_suppress_fib_pending_enabled():
"""
Returns True if FIB suppression is enabled, False otherwise
"""
cfg_db = swsscommon.ConfigDBConnector()
cfg_db.connect()

state = cfg_db.get_entry('DEVICE_METADATA', 'localhost').get('suppress-fib-pending')

return state == 'enabled'


def get_frr_routes():
"""
Read routes from zebra through CLI command
:return frr routes dictionary
"""

output = subprocess.check_output('show ip route json', shell=True)
routes = json.loads(output)
output = subprocess.check_output('show ipv6 route json', shell=True)
routes.update(json.loads(output))
return routes


def get_interfaces():
"""
helper to read interface table from APPL-DB.
Expand Down Expand Up @@ -525,61 +496,6 @@ def filter_out_standalone_tunnel_routes(routes):
return updated_routes


def check_frr_pending_routes():
"""
Check FRR routes for offload flag presence by executing "show ip route json"
Returns a list of routes that have no offload flag.
"""

missed_rt = []

retries = FRR_CHECK_RETRIES
for i in range(retries):
missed_rt = []
frr_routes = get_frr_routes()

for _, entries in frr_routes.items():
for entry in entries:
if entry['protocol'] != 'bgp':
continue

# TODO: Also handle VRF routes. Currently this script does not check for VRF routes so it would be incorrect for us
# to assume they are installed in ASIC_DB, so we don't handle them.
if entry['vrfName'] != 'default':
continue

if not entry.get('offloaded', False):
missed_rt.append(entry)

if not missed_rt:
break

time.sleep(FRR_WAIT_TIME)

return missed_rt


def mitigate_installed_not_offloaded_frr_routes(missed_frr_rt, rt_appl):
"""
Mitigate installed but not offloaded FRR routes.
In case route exists in APPL_DB, this function will manually send a notification to fpmsyncd
to trigger the flow that sends offload flag to zebra.
It is designed to mitigate a problem when orchagent fails to send notification about installed route to fpmsyncd
or fpmsyncd not being able to read the notification or in case zebra fails to receive offload update due to variety of reasons.
All of the above mentioned cases must be considered as a bug, but even in that case we will report an error in the log but
given that this script ensures the route is installed in the hardware it will automitigate such a bug.
"""
db = swsscommon.DBConnector('APPL_STATE_DB', 0)
response_producer = swsscommon.NotificationProducer(db, f'{APPL_DB_NAME}_{swsscommon.APP_ROUTE_TABLE_NAME}_RESPONSE_CHANNEL')
for entry in [entry for entry in missed_frr_rt if entry['prefix'] in rt_appl]:
fvs = swsscommon.FieldValuePairs([('err_str', 'SWSS_RC_SUCCESS'), ('protocol', entry['protocol'])])
response_producer.send('SWSS_RC_SUCCESS', entry['prefix'], fvs)

print_message(syslog.LOG_ERR, f'Mitigated route {entry["prefix"]}', write_to_stdout=False)


def get_soc_ips(config_db):
mux_table = config_db.get_table('MUX_CABLE')
soc_ips = []
Expand Down Expand Up @@ -614,7 +530,7 @@ def filter_out_soc_ip_routes(routes):

if not soc_ips:
return routes

updated_routes = []
for route in routes:
if route not in soc_ips:
Expand Down Expand Up @@ -681,16 +597,12 @@ def check_routes():
If there are still some unjustifiable diffs, between APPL & ASIC DB,
related to routes report failure, else all good.
If there are FRR routes that aren't marked offloaded but all APPL & ASIC DB
routes are in sync report failure and perform a mitigation action.
:return (0, None) on sucess, else (-1, results) where results holds
the unjustifiable entries.
"""
intf_appl_miss = []
rt_appl_miss = []
rt_asic_miss = []
rt_frr_miss = []

results = {}
adds = []
Expand Down Expand Up @@ -744,22 +656,11 @@ def check_routes():
if rt_asic_miss:
results["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss

rt_frr_miss = check_frr_pending_routes()

if rt_frr_miss:
results["missed_FRR_routes"] = rt_frr_miss

if results:
print_message(syslog.LOG_WARNING, "Failure results: {", json.dumps(results, indent=4), "}")
print_message(syslog.LOG_WARNING, "Failed. Look at reported mismatches above")
print_message(syslog.LOG_WARNING, "add: ", json.dumps(adds, indent=4))
print_message(syslog.LOG_WARNING, "del: ", json.dumps(deletes, indent=4))

if rt_frr_miss and not rt_appl_miss and not rt_asic_miss:
print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR but all routes in APPL_DB and ASIC_DB are in sync")
if is_suppress_fib_pending_enabled():
mitigate_installed_not_offloaded_frr_routes(rt_frr_miss, rt_appl)

return -1, results
else:
print_message(syslog.LOG_INFO, "All good!")
Expand Down
11 changes: 0 additions & 11 deletions show/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2124,17 +2124,6 @@ def peer(db, peer_ip):
click.echo(tabulate(bfd_body, bfd_headers))


# 'suppress-fib-pending' subcommand ("show suppress-fib-pending")
@cli.command('suppress-fib-pending')
@clicommon.pass_db
def suppress_pending_fib(db):
""" Show the status of suppress pending FIB feature """

field_values = db.cfgdb.get_entry('DEVICE_METADATA', 'localhost')
state = field_values.get('suppress-fib-pending', 'disabled').title()
click.echo(state)


# Load plugins and register them
helper = util_base.UtilHelper()
helper.load_and_register_plugins(plugins, cli)
Expand Down
3 changes: 1 addition & 2 deletions tests/mock_tables/config_db.json
Original file line number Diff line number Diff line change
Expand Up @@ -881,8 +881,7 @@
"mac": "1d:34:db:16:a6:00",
"platform": "x86_64-mlnx_msn3800-r0",
"peer_switch": "sonic-switch",
"type": "ToRRouter",
"suppress-fib-pending": "enabled"
"type": "ToRRouter"
},
"SNMP_COMMUNITY|msft": {
"TYPE": "RO"
Expand Down
27 changes: 4 additions & 23 deletions tests/route_check_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import time
from sonic_py_common import device_info
from unittest.mock import MagicMock, patch
from tests.route_check_test_data import APPL_DB, ARGS, ASIC_DB, CONFIG_DB, DEFAULT_CONFIG_DB, APPL_STATE_DB, DESCR, OP_DEL, OP_SET, PRE, RESULT, RET, TEST_DATA, UPD, FRR_ROUTES
from tests.route_check_test_data import APPL_DB, ARGS, ASIC_DB, CONFIG_DB, DEFAULT_CONFIG_DB, APPL_STATE_DB, DESCR, OP_DEL, OP_SET, PRE, RESULT, RET, TEST_DATA, UPD

import pytest

Expand Down Expand Up @@ -182,7 +182,7 @@ def pop(self):

print("state={} k={} op={} v={}".format(self.state, k, op, str(v)))
return (k, op, v)


def getDbConnector(self):
return self.dbconn
Expand Down Expand Up @@ -240,7 +240,6 @@ def setup(self):

def init(self):
route_check.UNIT_TESTING = 1
route_check.FRR_WAIT_TIME = 0

@pytest.fixture
def force_hang(self):
Expand All @@ -260,8 +259,7 @@ def mock_dbs(self):
patch("route_check.swsscommon.Table") as mock_table, \
patch("route_check.swsscommon.Select") as mock_sel, \
patch("route_check.swsscommon.SubscriberStateTable") as mock_subs, \
patch("route_check.swsscommon.ConfigDBConnector", return_value=mock_config_db), \
patch("route_check.swsscommon.NotificationProducer"):
patch("route_check.swsscommon.ConfigDBConnector", return_value=mock_config_db):
device_info.get_platform = MagicMock(return_value='unittest')
set_mock(mock_table, mock_conn, mock_sel, mock_subs, mock_config_db)
yield
Expand All @@ -275,16 +273,7 @@ def test_route_check(self, mock_dbs, test_num):
set_test_case_data(ct_data)
logger.info("Running test case {}: {}".format(test_num, ct_data[DESCR]))

with patch('sys.argv', ct_data[ARGS].split()), \
patch('route_check.subprocess.check_output') as mock_check_output:

routes = ct_data.get(FRR_ROUTES, {})

def side_effect(*args, **kwargs):
return json.dumps(routes)

mock_check_output.side_effect = side_effect

with patch('sys.argv', ct_data[ARGS].split()):
ret, res = route_check.main()
expect_ret = ct_data[RET] if RET in ct_data else 0
expect_res = ct_data[RESULT] if RESULT in ct_data else None
Expand Down Expand Up @@ -322,11 +311,3 @@ def test_logging(self):
assert len(msg) == 5
msg = route_check.print_message(syslog.LOG_ERR, "a", "b", "c", "d", "e", "f")
assert len(msg) == 5

def test_mitigate_routes(self, mock_dbs):
missed_frr_rt = [ { 'prefix': '192.168.0.1', 'protocol': 'bgp' } ]
rt_appl = [ '192.168.0.1' ]
with patch('sys.stdout', new_callable=StringIO) as mock_stdout:
route_check.mitigate_installed_not_offloaded_frr_routes(missed_frr_rt, rt_appl)
# Verify that the stdout are suppressed in this function
assert not mock_stdout.getvalue()
Loading

0 comments on commit d4e0317

Please sign in to comment.