Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[reboot-history] Add reboot history to state db #5933

Merged
merged 5 commits into from
Nov 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -418,16 +418,6 @@ sudo cp $IMAGE_CONFIGS/pcie-check/pcie-check.service $FILESYSTEM_ROOT_USR_LIB_SY
echo "pcie-check.service" | sudo tee -a $GENERATED_SERVICE_FILE
sudo cp $IMAGE_CONFIGS/pcie-check/pcie-check.sh $FILESYSTEM_ROOT/usr/bin/

# Copy systemd timer configuration
# It implements delayed start of services
sudo cp $BUILD_TEMPLATES/process-reboot-cause.timer $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable process-reboot-cause.timer

# Copy process-reboot-cause service files
sudo cp $IMAGE_CONFIGS/process-reboot-cause/process-reboot-cause.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
echo "process-reboot-cause.service" | sudo tee -a $GENERATED_SERVICE_FILE
sudo cp $IMAGE_CONFIGS/process-reboot-cause/process-reboot-cause $FILESYSTEM_ROOT/usr/bin/

## Install package without starting service
## ref: https://wiki.debian.org/chroot
sudo tee -a $FILESYSTEM_ROOT/usr/sbin/policy-rc.d > /dev/null <<EOF
Expand Down
3 changes: 3 additions & 0 deletions src/sonic-host-services-data/debian/rules
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,6 @@ build:
override_dh_installsystemd:
dh_installsystemd --no-start --name=caclmgrd
dh_installsystemd --no-start --name=procdockerstatsd
dh_installsystemd --no-start --name=determine-reboot-cause
dh_installsystemd --no-start --name=process-reboot-cause

Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[Unit]
Description=Reboot cause determination service
Requires=rc-local.service
After=rc-local.service

[Service]
Type=simple
ExecStart=/usr/local/bin/determine-reboot-cause

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
@@ -1,28 +1,32 @@
#!/usr/bin/env python
#!/usr/bin/env python3
#
# process-reboot-cause
# determine-reboot-cause
#
# Program designed to run once, soon after system boot, which will
# determine the cause of the previous reboot and store it to disk.
#

try:
import datetime
import json
import os
import pwd
import re
import sys

from sonic_py_common import device_info, logger

except ImportError as err:
raise ImportError("%s - required module not found" % str(err))

VERSION = "1.0"

SYSLOG_IDENTIFIER = "process-reboot-cause"
SYSLOG_IDENTIFIER = "determine-reboot-cause"

REBOOT_CAUSE_DIR = "/host/reboot-cause/"
REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "reboot-cause.txt"
PREVIOUS_REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "previous-reboot-cause.txt"
REBOOT_CAUSE_HISTORY_DIR = "/host/reboot-cause/history/"
REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "reboot-cause.txt")
PREVIOUS_REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "previous-reboot-cause.json")
FIRST_BOOT_PLATFORM_FILE = "/tmp/notify_firstboot_to_platform"
REBOOT_TYPE_KEXEC_FILE = "/proc/cmdline"
# The following SONIC_BOOT_TYPEs come from the warm/fast reboot script which is in sonic-utilities
Expand All @@ -45,7 +49,7 @@ sonic_logger = logger.Logger(SYSLOG_IDENTIFIER)
# ============================= Functions =============================
def parse_warmfast_reboot_from_proc_cmdline():
if os.path.isfile(REBOOT_TYPE_KEXEC_FILE):
with open(REBOOT_TYPE_KEXEC_FILE, "r") as cause_file:
with open(REBOOT_TYPE_KEXEC_FILE) as cause_file:
cause_file_kexec = cause_file.readline()
m = re.search(REBOOT_TYPE_KEXEC_PATTERN_WARM, cause_file_kexec)
if m and m.group(1):
Expand All @@ -56,69 +60,100 @@ def parse_warmfast_reboot_from_proc_cmdline():
return None


def find_software_reboot_cause():
software_reboot_cause = REBOOT_CAUSE_UNKNOWN

def find_software_reboot_cause_from_reboot_cause_file():
software_reboot_cause = None
if os.path.isfile(REBOOT_CAUSE_FILE):
with open(REBOOT_CAUSE_FILE, "r") as cause_file:
with open(REBOOT_CAUSE_FILE) as cause_file:
software_reboot_cause = cause_file.readline().rstrip('\n')
sonic_logger.log_info("{} indicates the reboot cause: {}".format(REBOOT_CAUSE_FILE, software_reboot_cause))
else:
sonic_logger.log_info("Reboot cause file {} not found".format(REBOOT_CAUSE_FILE))
return software_reboot_cause

if os.path.isfile(FIRST_BOOT_PLATFORM_FILE):
if software_reboot_cause == REBOOT_CAUSE_UNKNOWN:
version_info = device_info.get_sonic_version_info()
build_version = version_info['build_version'] if version_info else "unknown"
software_reboot_cause += " (First boot of SONiC version {})".format(build_version)
os.remove(FIRST_BOOT_PLATFORM_FILE)

def find_first_boot_version():
    """Build the first-boot suffix that is appended to a reboot cause.

    Returns:
        A string of the form " (First boot of SONiC version <build>)", where
        <build> is the 'build_version' entry of
        device_info.get_sonic_version_info(), or "unknown" when that call
        returns a falsy value.
    """
    sonic_version = device_info.get_sonic_version_info()
    version_label = sonic_version['build_version'] if sonic_version else "unknown"
    return " (First boot of SONiC version {})".format(version_label)


def find_software_reboot_cause():
    """Return the software-recorded cause of the previous reboot.

    The cause is read from the reboot-cause file. When no cause was recorded
    the result is REBOOT_CAUSE_UNKNOWN; if, in addition, the first-boot marker
    file exists, the first-boot version suffix is appended and the marker is
    removed.

    Returns:
        The reboot cause as a string (never None).
    """
    software_reboot_cause = find_software_reboot_cause_from_reboot_cause_file()

    # The reboot-cause file reader may hand back None when the file is absent.
    # Normalise that to REBOOT_CAUSE_UNKNOWN so the first-boot check below can
    # trigger and so callers always receive a string they can run through
    # string/regex operations.
    if software_reboot_cause is None:
        software_reboot_cause = REBOOT_CAUSE_UNKNOWN

    if software_reboot_cause == REBOOT_CAUSE_UNKNOWN and os.path.isfile(FIRST_BOOT_PLATFORM_FILE):
        software_reboot_cause += find_first_boot_version()
        # The marker is one-shot: drop it once the first boot has been reported.
        os.remove(FIRST_BOOT_PLATFORM_FILE)

    return software_reboot_cause


def find_proc_cmdline_reboot_cause():
    """Look up the reboot type recorded in /proc/cmdline and log the outcome.

    Returns:
        Whatever parse_warmfast_reboot_from_proc_cmdline() returned; a falsy
        value means no reboot type was found there.
    """
    reboot_type = parse_warmfast_reboot_from_proc_cmdline()

    if not reboot_type:
        log_message = "No reboot cause found from /proc/cmdline"
    else:
        log_message = "/proc/cmdline indicates reboot type: {}".format(reboot_type)
    sonic_logger.log_info(log_message)

    return reboot_type


def find_hardware_reboot_cause():
hardware_reboot_cause = None
return proc_cmdline_reboot_cause

def get_reboot_cause_from_platform():
    """Ask the platform API for the hardware cause of the previous reboot.

    Returns:
        A (major, minor) pair. When the sonic_platform package can be
        imported this is the value of chassis.get_reboot_cause(). When it
        cannot, ("Non-Hardware", "N/A") is returned so that callers which
        unpack two values keep working and fall back to the software cause.
    """
    # Until all platform vendors have provided sonic_platform packages,
    # if there is no sonic_platform package installed, we only provide
    # software-related reboot causes.
    try:
        import sonic_platform
        platform = sonic_platform.platform.Platform()
        chassis = platform.get_chassis()
        return chassis.get_reboot_cause()
    except ImportError:
        sonic_logger.log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.")
        # Falling off the end here would return None and break the caller's
        # two-value unpacking. "Non-Hardware" is the major cause the caller
        # treats as "no hardware cause found".
        return "Non-Hardware", "N/A"

platform = sonic_platform.platform.Platform()

chassis = platform.get_chassis()
def find_hardware_reboot_cause():
hardware_reboot_cause = None

hardware_reboot_cause_major, hardware_reboot_cause_minor = chassis.get_reboot_cause()
REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other"
REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware"

if hardware_reboot_cause_major == chassis.REBOOT_CAUSE_NON_HARDWARE:
# The reboot was not caused by hardware. If there is a REBOOT_CAUSE_FILE, it will
# contain any software-related reboot info. We will use it as the previous cause.
pass
elif hardware_reboot_cause_major == chassis.REBOOT_CAUSE_HARDWARE_OTHER:
hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
else:
hardware_reboot_cause = hardware_reboot_cause_major
except ImportError as err:
sonic_logger.log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.")
hardware_reboot_cause_major, hardware_reboot_cause_minor = get_reboot_cause_from_platform()
sonic_logger.log_info("Platform api returns reboot cause {}, {}".format(hardware_reboot_cause_major, hardware_reboot_cause_minor))

if hardware_reboot_cause_major == REBOOT_CAUSE_NON_HARDWARE:
# The reboot was not caused by hardware. If there is a REBOOT_CAUSE_FILE, it will
# contain any software-related reboot info. We will use it as the previous cause.
pass
elif hardware_reboot_cause_major == REBOOT_CAUSE_HARDWARE_OTHER:
hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
else:
hardware_reboot_cause = hardware_reboot_cause_major

if hardware_reboot_cause:
sonic_logger.log_info("Platform api indicates reboot cause {}".format(hardware_reboot_cause))
else:
sonic_logger.log_info("No reboot cause found from platform api")

return hardware_reboot_cause
return hardware_reboot_cause, hardware_reboot_cause_minor

def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):
    """Assemble the reboot-cause record that is stored as JSON.

    Args:
        previous_reboot_cause: Reboot cause string. If it has the form
            "User issued '<cmd>' command [User: <user>, Time: <time>]" the
            three parts are split into the 'cause', 'user' and 'time' fields.
        comment: Additional information, or None when there is none.
        gen_time: Timestamp string identifying this record.

    Returns:
        A dict with the keys 'gen_time', 'cause', 'user', 'time' and
        'comment'. 'user' and 'time' are "N/A" unless extracted from a
        user-issued cause; 'comment' is "N/A" when comment is None.
    """
    cause_record = {
        'gen_time': gen_time,
        'cause': previous_reboot_cause,
        'user': "N/A",
        'time': "N/A",
        'comment': "N/A" if comment is None else comment,
    }

    # Match with "User issued '{}' command [User: {}, Time: {}]"
    user_issued = re.search(r'User issued \'(.*)\' command \[User: (.*), Time: (.*)\]', previous_reboot_cause)
    if user_issued is None:
        return cause_record

    cause_record['cause'], cause_record['user'], cause_record['time'] = user_issued.groups()
    return cause_record


def main():
Expand All @@ -139,23 +174,23 @@ def main():
if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE):
os.remove(PREVIOUS_REBOOT_CAUSE_FILE)

# Set a default previous reboot cause
previous_reboot_cause = REBOOT_CAUSE_UNKNOWN
hardware_reboot_cause = None
additional_reboot_info = None

# 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline
proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause()

# 2. Check if the previous reboot was caused by hardware
# If yes, the hardware reboot cause will be treated as the reboot cause
hardware_reboot_cause = find_hardware_reboot_cause()
(hardware_reboot_cause, additional_reboot_info) = find_hardware_reboot_cause()

# 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related
# reboot info. We will use it as the previous cause.
software_reboot_cause = find_software_reboot_cause()

# The main decision logic of the reboot cause:
# If there is a reboot cause indicated by /proc/cmdline, it should be warmreboot/fastreboot
# the software_reboot_cause which is the content of /host/reboot-cause/reboot-cause.txt
# the software_reboot_cause which is the content of /host/reboot-cause/reboot-cause.txt
# will be treated as the reboot cause
# Elif there is a reboot cause indicated by platform API,
# the hardware_reboot_cause will be treated as the reboot cause
Expand All @@ -167,12 +202,26 @@ def main():
else:
previous_reboot_cause = software_reboot_cause

# Write the previous reboot cause to PREVIOUS_REBOOT_CAUSE_FILE
with open(PREVIOUS_REBOOT_CAUSE_FILE, "w") as prev_cause_file:
prev_cause_file.write(previous_reboot_cause)
# Current time
reboot_cause_gen_time = str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))

# Save the previous cause info into its history file as json format
reboot_cause_dict = get_reboot_cause_dict(previous_reboot_cause, additional_reboot_info, reboot_cause_gen_time)

# Create reboot-cause-#time#.json under history directory
REBOOT_CAUSE_HISTORY_FILE = os.path.join(REBOOT_CAUSE_HISTORY_DIR, "reboot-cause-{}.json".format(reboot_cause_gen_time))

# Create REBOOT_CAUSE_HISTORY_DIR if it doesn't exist
if not os.path.exists(REBOOT_CAUSE_HISTORY_DIR):
os.makedirs(REBOOT_CAUSE_HISTORY_DIR)

# Write the previous reboot cause to REBOOT_CAUSE_HISTORY_FILE as a JSON format
with open(REBOOT_CAUSE_HISTORY_FILE, "w") as reboot_cause_history_file:
json.dump(reboot_cause_dict, reboot_cause_history_file)

# Create a symbolic link to previous-reboot-cause.json file
os.symlink(REBOOT_CAUSE_HISTORY_FILE, PREVIOUS_REBOOT_CAUSE_FILE)

# Also log the previous reboot cause to the syslog
sonic_logger.log_info("Previous reboot cause: {}".format(previous_reboot_cause))

# Remove the old REBOOT_CAUSE_FILE
if os.path.exists(REBOOT_CAUSE_FILE):
Expand Down
100 changes: 100 additions & 0 deletions src/sonic-host-services/scripts/process-reboot-cause
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
#
# process-reboot-cause
#
# Program designed to read the previous reboot-cause file and log the most recent reboot cause,
# then read the saved reboot-cause history files and store their reboot causes in the state-db.
#

try:
import json
import os
import pwd
import sys

import swsssdk
from sonic_py_common import logger
except ImportError as err:
raise ImportError("%s - required module not found" % str(err))

# Version string of this script.
VERSION = "1.0"

# Identifier handed to the logger created below.
SYSLOG_IDENTIFIER = "process-reboot-cause"

# Directory holding the reboot-cause files.
REBOOT_CAUSE_DIR = "/host/reboot-cause/"
# Directory whose entries are listed and loaded as JSON history records.
REBOOT_CAUSE_HISTORY_DIR = "/host/reboot-cause/history/"
# JSON file that main() reads to log the most recent reboot cause.
PREVIOUS_REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "previous-reboot-cause.json")
# Despite the name this is a str.format() template, not a regular expression:
# main() fills it with (cause, user, time).
USER_ISSUED_REBOOT_CAUSE_REGEX ="User issued \'{}\' command [User: {}, Time: {}]"

# Default reboot cause used by main() when no previous-cause file exists.
REBOOT_CAUSE_UNKNOWN = "Unknown"
# Table name used as the prefix of the state-db key "<table>|<gen_time>".
REBOOT_CAUSE_TABLE_NAME = "REBOOT_CAUSE"

# Host passed to the SonicV2Connector constructor.
REDIS_HOSTIP = "127.0.0.1"
# Module-level placeholder. read_reboot_cause_files_and_save_state_db() binds
# a local variable of the same name (no `global` statement), so this stays None.
state_db = None

# Global logger class instance
sonic_logger = logger.Logger(SYSLOG_IDENTIFIER)


# ============================= Functions =============================
def read_reboot_cause_files_and_save_state_db():
    """Publish the newest reboot-cause history records to STATE_DB and prune the rest.

    Every entry under REBOOT_CAUSE_HISTORY_DIR is ordered by modification
    time, newest first. The newest 10 entries that are regular files are
    loaded as JSON and written to STATE_DB under the key
    "REBOOT_CAUSE|<gen_time>" with the fields 'cause', 'time', 'user' and
    'comment'. Entries beyond the newest 10 are deleted from disk.

    A history file that cannot be read, is not valid JSON, or lacks one of
    the expected fields is logged and skipped, so a single damaged record
    does not prevent the remaining records from being published.
    """
    max_history_entries = 10

    # Connect State DB
    state_db = swsssdk.SonicV2Connector(host=REDIS_HOSTIP)
    state_db.connect(state_db.STATE_DB)

    # Sort the previous reboot cause files by modification time, newest first
    history_files = sorted(
        (os.path.join(REBOOT_CAUSE_HISTORY_DIR, name) for name in os.listdir(REBOOT_CAUSE_HISTORY_DIR)),
        key=os.path.getmtime,
        reverse=True,
    )

    # Read each of the newest history files and update the state db with its reboot cause information
    for history_file in history_files[:max_history_entries]:
        if not os.path.isfile(history_file):
            continue

        try:
            with open(history_file, "r") as cause_file:
                data = json.load(cause_file)
            # Resolve the key and every field before touching the database so
            # that an incomplete record never produces a half-written entry.
            _hash = '{}|{}'.format(REBOOT_CAUSE_TABLE_NAME, data['gen_time'])
            fields = [(name, data[name]) for name in ('cause', 'time', 'user', 'comment')]
        except (OSError, ValueError, KeyError, TypeError) as err:
            sonic_logger.log_warning("Skipping unreadable reboot cause file {}: {}".format(history_file, err))
            continue

        for name, value in fields:
            state_db.set(state_db.STATE_DB, _hash, name, value)

    # Keep only the newest entries on disk
    for stale_file in history_files[max_history_entries:]:
        os.remove(stale_file)


def main():
    """Log the most recent reboot cause and publish the reboot history to the state db.

    Exits with an error message unless run as root. Reads
    PREVIOUS_REBOOT_CAUSE_FILE (when it exists) to build the "Previous reboot
    cause" syslog message, then, when REBOOT_CAUSE_HISTORY_DIR exists, stores
    the saved history records in the state db.
    """
    # Configure logger to log all messages INFO level and higher
    sonic_logger.set_min_log_priority_info()

    sonic_logger.log_info("Starting up...")

    if os.geteuid() != 0:
        sonic_logger.log_error("User {} does not have permission to execute".format(pwd.getpwuid(os.getuid()).pw_name))
        sys.exit("This utility must be run as root")

    # Set a default previous reboot cause
    previous_reboot_cause = REBOOT_CAUSE_UNKNOWN

    # Read the most recent reboot cause file and log data to syslog
    if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE):
        with open(PREVIOUS_REBOOT_CAUSE_FILE, "r") as last_cause_file:
            data = json.load(last_cause_file)

        # determine-reboot-cause stores the placeholder "N/A" in 'user' when the
        # reboot was not issued by a user. That placeholder is a non-empty
        # string, so it must be checked explicitly; otherwise every reboot is
        # reported as "User issued ...".
        user = data['user']
        if user and user != "N/A":
            previous_reboot_cause = USER_ISSUED_REBOOT_CAUSE_REGEX.format(data['cause'], user, data['time'])
        else:
            previous_reboot_cause = "{}".format(data['cause'])

    # Log the last reboot cause to the syslog
    sonic_logger.log_info("Previous reboot cause: {}".format(previous_reboot_cause))

    if os.path.exists(REBOOT_CAUSE_HISTORY_DIR):
        # Read the saved reboot-cause history files and save up to 10 entries to the state db
        read_reboot_cause_files_and_save_state_db()


# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions src/sonic-host-services/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
scripts = [
'scripts/caclmgrd',
'scripts/procdockerstatsd',
'scripts/determine-reboot-cause',
'scripts/process-reboot-cause',
],
install_requires = [
'sonic-py-common',
Expand Down
Loading