Skip to content

Commit

Permalink
change method of CPU usage monitoring
Browse files Browse the repository at this point in the history
Signed-off-by: kk-inoue-esol <kk-inoue@esol.co.jp>
  • Loading branch information
kk-inoue-esol committed Mar 22, 2022
1 parent 85da02b commit 6c16055
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 36 deletions.
3 changes: 2 additions & 1 deletion system/system_monitor/config/cpu_monitor.param.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
ros__parameters:
usage_warn: 0.96
usage_error: 1.00
usage_count: 2
usage_warn_count: 2
usage_error_count: 2
usage_avg: true
msr_reader_port: 7634
20 changes: 11 additions & 9 deletions system/system_monitor/docs/ros_parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@

cpu_monitor:

| Name | Type | Unit | Default | Notes |
| :-------------- | :---: | :-----: | :-----: | :---------------------------------------------------------------------------- |
| temp_warn | float | DegC | 90.0 | Generates warning when CPU temperature reaches a specified value or higher. |
| temp_error | float | DegC | 95.0 | Generates error when CPU temperature reaches a specified value or higher. |
| usage_warn | float | %(1e-2) | 0.90 | Generates warning when CPU usage reaches a specified value or higher. |
| usage_error | float | %(1e-2) | 1.00 | Generates error when CPU usage reaches a specified value or higher. |
| load1_warn | float | %(1e-2) | 0.90 | Generates warning when load average 1min reaches a specified value or higher. |
| load5_warn | float | %(1e-2) | 0.80 | Generates warning when load average 5min reaches a specified value or higher. |
| msr_reader_port | int | n/a | 7634 | Port number to connect to msr_reader. |
| Name | Type | Unit | Default | Notes |
| :---------------- | :---: | :-----: | :-----: | :--------------------------------------------------------------------------------------------------------- |
| temp_warn | float | DegC | 90.0 | Generates warning when CPU temperature reaches a specified value or higher. |
| temp_error | float | DegC | 95.0 | Generates error when CPU temperature reaches a specified value or higher. |
| usage_warn | float | %(1e-2) | 0.90 | Generates warning when CPU usage reaches a specified value or higher and lasts for usage_warn_count counts. |
| usage_error | float | %(1e-2) | 1.00 | Generates error when CPU usage reaches a specified value or higher and lasts for usage_error_count counts. |
| usage_warn_count | int | n/a | 2 | Generates warning when CPU usage reaches usage_warn value or higher and lasts for the specified number of counts. |
| usage_error_count | int | n/a | 2 | Generates error when CPU usage reaches usage_error value or higher and lasts for the specified number of counts. |
| load1_warn | float | %(1e-2) | 0.90 | Generates warning when load average 1min reaches a specified value or higher. |
| load5_warn | float | %(1e-2) | 0.80 | Generates warning when load average 5min reaches a specified value or higher. |
| msr_reader_port | int | n/a | 7634 | Port number to connect to msr_reader. |

## <u>HDD Monitor</u>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,17 +135,19 @@ class CPUMonitorBase : public rclcpp::Node

diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics

char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name
int num_cores_; //!< @brief number of cores
std::vector<cpu_temp_info> temps_; //!< @brief CPU list for temperature
std::vector<cpu_freq_info> freqs_; //!< @brief CPU list for frequency
std::vector<int> usage_check_cnt_; //!< @brief CPU list for usage over check counter
bool mpstat_exists_; //!< @brief flag if mpstat exists

float usage_warn_; //!< @brief CPU usage(%) to generate warning
float usage_error_; //!< @brief CPU usage(%) to generate error
int usage_count_; //!< @brief CPU usage(%) usage over continuous count
bool usage_avg_; //!< @brief Check CPU usage calculated as averages among all processors
char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name
int num_cores_; //!< @brief number of cores
std::vector<cpu_temp_info> temps_; //!< @brief CPU list for temperature
std::vector<cpu_freq_info> freqs_; //!< @brief CPU list for frequency
std::vector<int> usage_warn_check_cnt_; //!< @brief CPU list for usage over warn check counter
std::vector<int> usage_error_check_cnt_; //!< @brief CPU list for usage over error check counter
bool mpstat_exists_; //!< @brief flag if mpstat exists

float usage_warn_; //!< @brief CPU usage(%) to generate warning
float usage_error_; //!< @brief CPU usage(%) to generate error
int usage_warn_count_; //!< @brief continuous count over usage_warn_ to generate warning
int usage_error_count_; //!< @brief continuous count over usage_error_ to generate error
bool usage_avg_; //!< @brief Check CPU usage calculated as averages among all processors

/**
* @brief CPU temperature status messages
Expand Down
35 changes: 20 additions & 15 deletions system/system_monitor/src/cpu_monitor/cpu_monitor_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,14 @@ CPUMonitorBase::CPUMonitorBase(const std::string & node_name, const rclcpp::Node
mpstat_exists_(false),
usage_warn_(declare_parameter<float>("usage_warn", 0.96)),
usage_error_(declare_parameter<float>("usage_error", 1.00)),
usage_count_(declare_parameter<int>("usage_count", 2)),
usage_warn_count_(declare_parameter<int>("usage_warn_count", 2)),
usage_error_count_(declare_parameter<int>("usage_error_count", 2)),
usage_avg_(declare_parameter<bool>("usage_avg", true))
{
gethostname(hostname_, sizeof(hostname_));
num_cores_ = boost::thread::hardware_concurrency();
usage_check_cnt_.resize(num_cores_ + 2); // 2 = all + dummy
usage_warn_check_cnt_.resize(num_cores_ + 2); // 2 = all + dummy
usage_error_check_cnt_.resize(num_cores_ + 2); // 2 = all + dummy

// Check if command exists
fs::path p = bp::search_path("mpstat");
Expand Down Expand Up @@ -233,7 +235,8 @@ void CPUMonitorBase::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & st
} catch (const std::exception & e) {
stat.summary(DiagStatus::ERROR, "mpstat exception");
stat.add("mpstat", e.what());
std::fill(usage_check_cnt_.begin(), usage_check_cnt_.end(), 0);
std::fill(usage_warn_check_cnt_.begin(), usage_warn_check_cnt_.end(), 0);
std::fill(usage_error_check_cnt_.begin(), usage_error_check_cnt_.end(), 0);
cpu_usage.all.status = CpuStatus::STALE;
cpu_usage.cpus.clear();
publishCpuUsage(cpu_usage);
Expand Down Expand Up @@ -268,22 +271,24 @@ int CPUMonitorBase::CpuUsageToLevel(const std::string & cpu_name, float usage)
}

// convert CPU usage to level
int level;
if (usage >= usage_error_) {
usage_check_cnt_[idx] = usage_count_;
level = DiagStatus::ERROR;
} else if (usage >= usage_warn_) {
if (usage_check_cnt_[idx] < usage_count_) {
usage_check_cnt_[idx]++;
}
if (usage_check_cnt_[idx] >= usage_count_) {
level = DiagStatus::ERROR;
int level = DiagStatus::OK;
if (usage >= usage_warn_) {
if (usage_warn_check_cnt_[idx] < usage_warn_count_) {
usage_warn_check_cnt_[idx]++;
} else {
level = DiagStatus::WARN;
}
} else {
usage_check_cnt_[idx] = 0;
level = DiagStatus::OK;
usage_warn_check_cnt_[idx] = 0;
}
if (usage >= usage_error_) {
if (usage_error_check_cnt_[idx] < usage_error_count_) {
usage_error_check_cnt_[idx]++;
} else {
level = DiagStatus::ERROR;
}
} else {
usage_error_check_cnt_[idx] = 0;
}

return level;
Expand Down

0 comments on commit 6c16055

Please sign in to comment.