From 09a525fcbe6eb1d6ad26b2eb99013e52cabb915d Mon Sep 17 00:00:00 2001 From: kk-inoue-esol <76925382+kk-inoue-esol@users.noreply.github.com> Date: Fri, 25 Mar 2022 18:28:09 +0900 Subject: [PATCH] feat(system_monitor): change method of CPU usage monitoring (#557) * feat(lidar_detection): changing default input topic name of lidar detection nodes (#433) Signed-off-by: kk-inoue-esol * feat(system_monitor): change method of CPU usage monitoring Signed-off-by: kk-inoue-esol Co-authored-by: Taichi Higashide --- .../config/cpu_monitor.param.yaml | 3 +- system/system_monitor/docs/ros_parameters.md | 20 ++++++----- .../cpu_monitor/cpu_monitor_base.hpp | 24 +++++++------ .../src/cpu_monitor/cpu_monitor_base.cpp | 35 +++++++++++-------- 4 files changed, 46 insertions(+), 36 deletions(-) diff --git a/system/system_monitor/config/cpu_monitor.param.yaml b/system/system_monitor/config/cpu_monitor.param.yaml index 2f049519aa17d..cae88d6a965b7 100644 --- a/system/system_monitor/config/cpu_monitor.param.yaml +++ b/system/system_monitor/config/cpu_monitor.param.yaml @@ -2,6 +2,7 @@ ros__parameters: usage_warn: 0.96 usage_error: 1.00 - usage_count: 2 + usage_warn_count: 2 + usage_error_count: 2 usage_avg: true msr_reader_port: 7634 diff --git a/system/system_monitor/docs/ros_parameters.md b/system/system_monitor/docs/ros_parameters.md index e42f843c78992..442be2f29ca9c 100644 --- a/system/system_monitor/docs/ros_parameters.md +++ b/system/system_monitor/docs/ros_parameters.md @@ -4,15 +4,17 @@ cpu_monitor: -| Name | Type | Unit | Default | Notes | -| :-------------- | :---: | :-----: | :-----: | :---------------------------------------------------------------------------- | -| temp_warn | float | DegC | 90.0 | Generates warning when CPU temperature reaches a specified value or higher. | -| temp_error | float | DegC | 95.0 | Generates error when CPU temperature reaches a specified value or higher. 
| -| usage_warn | float | %(1e-2) | 0.90 | Generates warning when CPU usage reaches a specified value or higher. | -| usage_error | float | %(1e-2) | 1.00 | Generates error when CPU usage reaches a specified value or higher. | -| load1_warn | float | %(1e-2) | 0.90 | Generates warning when load average 1min reaches a specified value or higher. | -| load5_warn | float | %(1e-2) | 0.80 | Generates warning when load average 5min reaches a specified value or higher. | -| msr_reader_port | int | n/a | 7634 | Port number to connect to msr_reader. | +| Name | Type | Unit | Default | Notes | +| :---------------- | :---: | :-----: | :-----: | :--------------------------------------------------------------------------------------------------------- | +| temp_warn | float | DegC | 90.0 | Generates warning when CPU temperature reaches a specified value or higher. | +| temp_error | float | DegC | 95.0 | Generates error when CPU temperature reaches a specified value or higher. | +| usage_warn | float | %(1e-2) | 0.96 | Generates warning when CPU usage reaches a specified value or higher and lasts for usage_warn_count counts. | +| usage_error | float | %(1e-2) | 1.00 | Generates error when CPU usage reaches a specified value or higher and lasts for usage_error_count counts. | +| usage_warn_count | int | n/a | 2 | Generates warning when CPU usage reaches usage_warn value or higher and lasts for the specified number of counts. | +| usage_error_count | int | n/a | 2 | Generates error when CPU usage reaches usage_error value or higher and lasts for the specified number of counts. | +| load1_warn | float | %(1e-2) | 0.90 | Generates warning when load average 1min reaches a specified value or higher. | +| load5_warn | float | %(1e-2) | 0.80 | Generates warning when load average 5min reaches a specified value or higher. | +| msr_reader_port | int | n/a | 7634 | Port number to connect to msr_reader. 
| ## HDD Monitor diff --git a/system/system_monitor/include/system_monitor/cpu_monitor/cpu_monitor_base.hpp b/system/system_monitor/include/system_monitor/cpu_monitor/cpu_monitor_base.hpp index f53ba45d043b9..a9bf7f17cdecf 100644 --- a/system/system_monitor/include/system_monitor/cpu_monitor/cpu_monitor_base.hpp +++ b/system/system_monitor/include/system_monitor/cpu_monitor/cpu_monitor_base.hpp @@ -135,17 +135,19 @@ class CPUMonitorBase : public rclcpp::Node diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics - char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name - int num_cores_; //!< @brief number of cores - std::vector temps_; //!< @brief CPU list for temperature - std::vector freqs_; //!< @brief CPU list for frequency - std::vector usage_check_cnt_; //!< @brief CPU list for usage over check counter - bool mpstat_exists_; //!< @brief flag if mpstat exists - - float usage_warn_; //!< @brief CPU usage(%) to generate warning - float usage_error_; //!< @brief CPU usage(%) to generate error - int usage_count_; //!< @brief CPU usage(%) usage over continuous count - bool usage_avg_; //!< @brief Check CPU usage calculated as averages among all processors + char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name + int num_cores_; //!< @brief number of cores + std::vector temps_; //!< @brief CPU list for temperature + std::vector freqs_; //!< @brief CPU list for frequency + std::vector usage_warn_check_cnt_; //!< @brief CPU list for usage over warn check counter + std::vector usage_error_check_cnt_; //!< @brief CPU list for usage over error check counter + bool mpstat_exists_; //!< @brief flag if mpstat exists + + float usage_warn_; //!< @brief CPU usage(%) to generate warning + float usage_error_; //!< @brief CPU usage(%) to generate error + int usage_warn_count_; //!< @brief continuous count over usage_warn_ to generate warning + int usage_error_count_; //!< @brief continuous count over usage_error_ to generate error + 
bool usage_avg_; //!< @brief Check CPU usage calculated as averages among all processors /** * @brief CPU temperature status messages diff --git a/system/system_monitor/src/cpu_monitor/cpu_monitor_base.cpp b/system/system_monitor/src/cpu_monitor/cpu_monitor_base.cpp index 44e375052c542..90766fbcc79fc 100644 --- a/system/system_monitor/src/cpu_monitor/cpu_monitor_base.cpp +++ b/system/system_monitor/src/cpu_monitor/cpu_monitor_base.cpp @@ -47,12 +47,14 @@ CPUMonitorBase::CPUMonitorBase(const std::string & node_name, const rclcpp::Node mpstat_exists_(false), usage_warn_(declare_parameter("usage_warn", 0.96)), usage_error_(declare_parameter("usage_error", 1.00)), - usage_count_(declare_parameter("usage_count", 2)), + usage_warn_count_(declare_parameter("usage_warn_count", 2)), + usage_error_count_(declare_parameter("usage_error_count", 2)), usage_avg_(declare_parameter("usage_avg", true)) { gethostname(hostname_, sizeof(hostname_)); num_cores_ = boost::thread::hardware_concurrency(); - usage_check_cnt_.resize(num_cores_ + 2); // 2 = all + dummy + usage_warn_check_cnt_.resize(num_cores_ + 2); // 2 = all + dummy + usage_error_check_cnt_.resize(num_cores_ + 2); // 2 = all + dummy // Check if command exists fs::path p = bp::search_path("mpstat"); @@ -233,7 +235,8 @@ void CPUMonitorBase::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & st } catch (const std::exception & e) { stat.summary(DiagStatus::ERROR, "mpstat exception"); stat.add("mpstat", e.what()); - std::fill(usage_check_cnt_.begin(), usage_check_cnt_.end(), 0); + std::fill(usage_warn_check_cnt_.begin(), usage_warn_check_cnt_.end(), 0); + std::fill(usage_error_check_cnt_.begin(), usage_error_check_cnt_.end(), 0); cpu_usage.all.status = CpuStatus::STALE; cpu_usage.cpus.clear(); publishCpuUsage(cpu_usage); @@ -268,22 +271,24 @@ int CPUMonitorBase::CpuUsageToLevel(const std::string & cpu_name, float usage) } // convert CPU usage to level - int level; - if (usage >= usage_error_) { - usage_check_cnt_[idx] = 
usage_count_; - level = DiagStatus::ERROR; - } else if (usage >= usage_warn_) { - if (usage_check_cnt_[idx] < usage_count_) { - usage_check_cnt_[idx]++; - } - if (usage_check_cnt_[idx] >= usage_count_) { - level = DiagStatus::ERROR; + int level = DiagStatus::OK; + if (usage >= usage_warn_) { + if (usage_warn_check_cnt_[idx] < usage_warn_count_) { + usage_warn_check_cnt_[idx]++; } else { level = DiagStatus::WARN; } } else { - usage_check_cnt_[idx] = 0; - level = DiagStatus::OK; + usage_warn_check_cnt_[idx] = 0; + } + if (usage >= usage_error_) { + if (usage_error_check_cnt_[idx] < usage_error_count_) { + usage_error_check_cnt_[idx]++; + } else { + level = DiagStatus::ERROR; + } + } else { + usage_error_check_cnt_[idx] = 0; } return level;