From bf825194931565a7078ddfa0e27cd6e3420b889f Mon Sep 17 00:00:00 2001 From: "noriyuki.h" Date: Wed, 6 Apr 2022 13:37:16 +0900 Subject: [PATCH] feat: add CRC error monitoring to net_monitor Signed-off-by: noriyuki.h Signed-off-by: v-nakayama7440-esol --- .../system_monitor/net_monitor.param.yaml | 2 + .../diagnostic_aggregator/system.param.yaml | 6 + .../config/net_monitor.param.yaml | 2 + system/system_monitor/docs/ros_parameters.md | 10 +- .../system_monitor/docs/topics_net_monitor.md | 18 ++ .../net_monitor/net_monitor.hpp | 110 +++++++- .../src/net_monitor/net_monitor.cpp | 264 +++++++++++++----- 7 files changed, 327 insertions(+), 85 deletions(-) diff --git a/launch/tier4_system_launch/config/system_monitor/net_monitor.param.yaml b/launch/tier4_system_launch/config/system_monitor/net_monitor.param.yaml index f8dc832d7b2f2..e91aa4ca3fbb3 100644 --- a/launch/tier4_system_launch/config/system_monitor/net_monitor.param.yaml +++ b/launch/tier4_system_launch/config/system_monitor/net_monitor.param.yaml @@ -3,3 +3,5 @@ devices: ["*"] traffic_reader_port: 7636 monitor_program: "greengrass" + crc_error_check_duration: 1 + crc_error_count_threshold: 1 diff --git a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml index 2cc4bca27724a..12f1b386bed3a 100644 --- a/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml +++ b/system/system_error_monitor/config/diagnostic_aggregator/system.param.yaml @@ -122,6 +122,12 @@ contains: [": Network Traffic"] timeout: 3.0 + netowork_crc_error: + type: diagnostic_aggregator/GenericAnalyzer + path: network_crc_error + contains: [": Network CRC Error"] + timeout: 3.0 + storage: type: diagnostic_aggregator/AnalyzerGroup path: storage diff --git a/system/system_monitor/config/net_monitor.param.yaml b/system/system_monitor/config/net_monitor.param.yaml index 953d32d788ccf..686ee349b0765 100644 --- a/system/system_monitor/config/net_monitor.param.yaml +++ b/system/system_monitor/config/net_monitor.param.yaml @@ -3,3 +3,5 @@ devices: ["*"] traffic_reader_port: 7636 monitor_program: "greengrass" + crc_error_check_duration: 1 + crc_error_count_threshold: 1 diff --git a/system/system_monitor/docs/ros_parameters.md b/system/system_monitor/docs/ros_parameters.md index 779297492e00a..044c1eb10a5d0 100644 --- a/system/system_monitor/docs/ros_parameters.md +++ b/system/system_monitor/docs/ros_parameters.md @@ -53,10 +53,12 @@ mem_monitor: net_monitor: -| Name | Type | Unit | Default | Notes | -| :--------- | :----------: | :-----: | :-----: | :----------------------------------------------------------------------------------- | -| devices | list[string] | n/a | none | The name of network interface to monitor. (e.g. eth0, \* for all network interfaces) | -| usage_warn | float | %(1e-2) | 0.95 | Generates warning when network usage reaches a specified value or higher. | +| Name | Type | Unit | Default | Notes | +| :------------------------ | :----------: | :-----: | :-----: | :-------------------------------------------------------------------------------------------------------------- | +| devices | list[string] | n/a | none | The name of network interface to monitor. (e.g. eth0, \* for all network interfaces) | +| usage_warn | float | %(1e-2) | 0.95 | Generates warning when network usage reaches a specified value or higher. | +| crc_error_check_duration | int | sec | 1 | CRC error check duration. | +| crc_error_count_threshold | int | n/a | 1 | Generates warning when count of CRC errors during CRC error check duration reaches a specified value or higher. | ## NTP Monitor diff --git a/system/system_monitor/docs/topics_net_monitor.md b/system/system_monitor/docs/topics_net_monitor.md index 36fdeea890e6a..261cede53de21 100644 --- a/system/system_monitor/docs/topics_net_monitor.md +++ b/system/system_monitor/docs/topics_net_monitor.md @@ -63,3 +63,21 @@ | key | value (example) | | ----- | ----------------------------------------------------- | | error | [nethogs -t] execve failed: No such file or directory | + +## Network CRC Error + +/diagnostics/net_monitor: Network CRC Error + +[summary] + +| level | message | +| ----- | ------- | +| OK | OK | + +[values] + +| key | value (example) | +| ------------------------------------------ | --------------- | +| Network [0-9]: interface name | wlp82s0 | +| Network [0-9]: total rx_crc_errors | 0 | +| Network [0-9]: rx_crc_errors per unit time | 0 | diff --git a/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp b/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp index 8aafb4b08b083..53f3e1250c9f0 100644 --- a/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp +++ b/system/system_monitor/include/system_monitor/net_monitor/net_monitor.hpp @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -55,11 +56,6 @@ class NetMonitor : public rclcpp::Node */ ~NetMonitor(); - /** - * @brief Update the diagnostic state. - */ - void update(); - /** * @brief Shutdown nl80211 object */ @@ -86,6 +82,15 @@ class NetMonitor : public rclcpp::Node void monitorTraffic( diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** + * @brief check CRC error + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @note NOLINT syntax is needed since diagnostic_updater asks for a non-const reference + * to pass diagnostic message updated in this function to diagnostic publish calls. + */ + void checkCrcError( + diagnostic_updater::DiagnosticStatusWrapper & stat); // NOLINT(runtime/references) + /** * @brief get wireless speed * @param [in] ifa_name interface name @@ -93,17 +98,104 @@ class NetMonitor : public rclcpp::Node */ float getWirelessSpeed(const char * ifa_name); + /** + * @brief timer callback + */ + void onTimer(); + + /** + * @brief update Network information list + */ + void updateNetworkInfoList(); + + /** + * @brief check NetMonitor General Infomation + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @return check result + */ + bool checkGeneralInfo(diagnostic_updater::DiagnosticStatusWrapper & stat); + + /** + * @brief Network information + */ + struct NetworkInfo + { + int mtu_errno; //!< @brief errno set by ioctl() with SIOCGIFMTU + int ethtool_errno; //!< @brief errno set by ioctl() with SIOCETHTOOL + bool is_running; //!< @brief resource allocated flag + std::string interface_name; //!< @brief interface name + float speed; //!< @brief network capacity + int mtu; //!< @brief MTU + float rx_traffic; //!< @brief traffic received + float tx_traffic; //!< @brief traffic transmitted + float rx_usage; //!< @brief network capacity usage rate received + float tx_usage; //!< @brief network capacity usage rate transmitted + unsigned int rx_bytes; //!< @brief total bytes received + unsigned int rx_errors; //!< @brief bad packets received + unsigned int tx_bytes; //!< @brief total bytes transmitted + unsigned int tx_errors; //!< @brief packet transmit problems + unsigned int collisions; //!< @brief number of collisions during packet transmissions + + NetworkInfo() + : mtu_errno(0), + ethtool_errno(0), + is_running(false), + interface_name(""), + speed(0.0), + mtu(0), + rx_traffic(0.0), + tx_traffic(0.0), + rx_usage(0.0), + tx_usage(0.0), + rx_bytes(0), + rx_errors(0), + tx_bytes(0), + tx_errors(0), + collisions(0) + { + } + }; + + /** + * @brief determine if it is a supported network + * @param [in] net_info network infomation + * @param [in] index index of network infomation index + * @param [out] stat diagnostic message passed directly to diagnostic publish calls + * @param [out] error_str error string + * @return result of determining whether it is a supported network + */ + bool isSupportedNetwork( + const NetworkInfo & net_info, int index, diagnostic_updater::DiagnosticStatusWrapper & stat, + std::string & error_str); + diagnostic_updater::Updater updater_; //!< @brief Updater class which advertises to /diagnostics + rclcpp::TimerBase::SharedPtr timer_; //!< @brief timer to get Network information char hostname_[HOST_NAME_MAX + 1]; //!< @brief host name std::map bytes_; //!< @brief list of bytes rclcpp::Time last_update_time_; //!< @brief last update time std::vector device_params_; //!< @brief list of devices - NL80211 nl80211_; // !< @brief 802.11 netlink-based interface + NL80211 nl80211_; //!< @brief 802.11 netlink-based interface + int getifaddrs_errno_; //!< @brief errno set by getifaddrs() + std::vector net_info_list_; //!< @brief list of Network information - std::string monitor_program_; //!< @brief nethogs monitor program name - bool nethogs_all_; //!< @brief nethogs result all mode - int traffic_reader_port_; //!< @brief port number to connect to traffic_reader + /** + * @brief CRC errors information + */ + typedef struct crc_errors + { + std::deque errors_queue; //!< @brief queue that holds count of CRC errors + unsigned int last_rx_crc_errors; //!< @brief rx_crc_error at the time of the last monitoring + + crc_errors() : last_rx_crc_errors(0) {} + } crc_errors; + std::map crc_errors_; //!< @brief list of CRC errors + + std::string monitor_program_; //!< @brief nethogs monitor program name + bool nethogs_all_; //!< @brief nethogs result all mode + int traffic_reader_port_; //!< @brief port number to connect to traffic_reader + unsigned int crc_error_check_duration_; //!< @brief CRC error check duration + unsigned int crc_error_count_threshold_; //!< @brief CRC error count threshold /** * @brief Network usage status messages diff --git a/system/system_monitor/src/net_monitor/net_monitor.cpp b/system/system_monitor/src/net_monitor/net_monitor.cpp index c1c06ee6f1d4d..983af5668a082 100644 --- a/system/system_monitor/src/net_monitor/net_monitor.cpp +++ b/system/system_monitor/src/net_monitor/net_monitor.cpp @@ -47,8 +47,12 @@ NetMonitor::NetMonitor(const rclcpp::NodeOptions & options) device_params_( declare_parameter>("devices", std::vector())), monitor_program_(declare_parameter("monitor_program", "greengrass")), - traffic_reader_port_(declare_parameter("traffic_reader_port", TRAFFIC_READER_PORT)) + traffic_reader_port_(declare_parameter("traffic_reader_port", TRAFFIC_READER_PORT)), + crc_error_check_duration_(declare_parameter("crc_error_check_duration", 1)), + crc_error_count_threshold_(declare_parameter("crc_error_count_threshold", 1)) { + using namespace std::literals::chrono_literals; + if (monitor_program_.empty()) { monitor_program_ = GET_ALL_STR; nethogs_all_ = true; @@ -60,23 +64,27 @@ NetMonitor::NetMonitor(const rclcpp::NodeOptions & options) updater_.setHardwareID(hostname_); updater_.add("Network Usage", this, &NetMonitor::checkUsage); updater_.add("Network Traffic", this, &NetMonitor::monitorTraffic); + updater_.add("Network CRC Error", this, &NetMonitor::checkCrcError); nl80211_.init(); + + // get Network information for the first time + updateNetworkInfoList(); + + timer_ = rclcpp::create_timer(this, get_clock(), 1s, std::bind(&NetMonitor::onTimer, this)); } NetMonitor::~NetMonitor() { shutdown_nl80211(); } -void NetMonitor::update() { updater_.force_update(); } - void NetMonitor::shutdown_nl80211() { nl80211_.shutdown(); } -void NetMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) +void NetMonitor::onTimer() { updateNetworkInfoList(); } + +void NetMonitor::updateNetworkInfoList() { - // Remember start time to measure elapsed time - const auto t_start = SystemMonitorUtility::startMeasurement(); + net_info_list_.clear(); if (device_params_.empty()) { - stat.summary(DiagStatus::ERROR, "invalid device parameter"); return; } @@ -86,22 +94,12 @@ void NetMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) rclcpp::Duration duration = this->now() - last_update_time_; // Get network interfaces + getifaddrs_errno_ = 0; if (getifaddrs(&ifas) < 0) { - stat.summary(DiagStatus::ERROR, "getifaddrs error"); - stat.add("getifaddrs", strerror(errno)); + getifaddrs_errno_ = errno; return; } - int level = DiagStatus::OK; - int whole_level = DiagStatus::OK; - int index = 0; - std::string error_str; - float rx_traffic{0.0}; - float tx_traffic{0.0}; - float rx_usage{0.0}; - float tx_usage{0.0}; - std::vector interface_names; - for (ifa = ifas; ifa; ifa = ifa->ifa_next) { // Skip no addr if (!ifa->ifa_addr) { @@ -127,90 +125,117 @@ void NetMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) struct ifreq ifrc; struct ethtool_cmd edata; + net_info_list_.emplace_back(); + auto & net_info = net_info_list_.back(); + + net_info.interface_name = std::string(ifa->ifa_name); + // Get MTU information fd = socket(AF_INET, SOCK_DGRAM, 0); strncpy(ifrm.ifr_name, ifa->ifa_name, IFNAMSIZ - 1); if (ioctl(fd, SIOCGIFMTU, &ifrm) < 0) { - if (errno == ENOTSUP) { - stat.add(fmt::format("Network {}: status", index), "Not Supported"); - } else { - stat.add(fmt::format("Network {}: status", index), "Error"); - error_str = "ioctl error"; - } - - stat.add(fmt::format("Network {}: interface name", index), ifa->ifa_name); - stat.add("ioctl(SIOCGIFMTU)", strerror(errno)); - - ++index; + net_info.mtu_errno = errno; close(fd); - interface_names.push_back(ifa->ifa_name); continue; } // Get network capacity - float speed = 0.0; strncpy(ifrc.ifr_name, ifa->ifa_name, IFNAMSIZ - 1); ifrc.ifr_data = (caddr_t)&edata; edata.cmd = ETHTOOL_GSET; if (ioctl(fd, SIOCETHTOOL, &ifrc) < 0) { // possibly wireless connection, get bitrate(MBit/s) - speed = nl80211_.getBitrate(ifa->ifa_name); - if (speed <= 0) { - if (errno == ENOTSUP) { - stat.add(fmt::format("Network {}: status", index), "Not Supported"); - } else { - stat.add(fmt::format("Network {}: status", index), "Error"); - error_str = "ioctl error"; - } - - stat.add(fmt::format("Network {}: interface name", index), ifa->ifa_name); - stat.add("ioctl(SIOCETHTOOL)", strerror(errno)); - - ++index; + net_info.speed = nl80211_.getBitrate(ifa->ifa_name); + if (net_info.speed <= 0) { + net_info.ethtool_errno = errno; close(fd); - interface_names.push_back(ifa->ifa_name); continue; } } else { - speed = edata.speed; + net_info.speed = edata.speed; } - level = (ifa->ifa_flags & IFF_RUNNING) ? DiagStatus::OK : DiagStatus::ERROR; + net_info.is_running = (ifa->ifa_flags & IFF_RUNNING); auto * stats = (struct rtnl_link_stats *)ifa->ifa_data; - if (bytes_.find(ifa->ifa_name) != bytes_.end()) { - rx_traffic = toMbit(stats->rx_bytes - bytes_[ifa->ifa_name].rx_bytes) / duration.seconds(); - tx_traffic = toMbit(stats->tx_bytes - bytes_[ifa->ifa_name].tx_bytes) / duration.seconds(); - rx_usage = rx_traffic / speed; - tx_usage = tx_traffic / speed; + if (bytes_.find(net_info.interface_name) != bytes_.end()) { + net_info.rx_traffic = + toMbit(stats->rx_bytes - bytes_[net_info.interface_name].rx_bytes) / duration.seconds(); + net_info.tx_traffic = + toMbit(stats->tx_bytes - bytes_[net_info.interface_name].tx_bytes) / duration.seconds(); + net_info.rx_usage = net_info.rx_traffic / net_info.speed; + net_info.tx_usage = net_info.tx_traffic / net_info.speed; } - stat.add(fmt::format("Network {}: status", index), usage_dict_.at(level)); - stat.add(fmt::format("Network {}: interface name", index), ifa->ifa_name); - stat.addf(fmt::format("Network {}: rx_usage", index), "%.2f%%", rx_usage * 1e+2); - stat.addf(fmt::format("Network {}: tx_usage", index), "%.2f%%", tx_usage * 1e+2); - stat.addf(fmt::format("Network {}: rx_traffic", index), "%.2f MBit/s", rx_traffic); - stat.addf(fmt::format("Network {}: tx_traffic", index), "%.2f MBit/s", tx_traffic); - stat.addf(fmt::format("Network {}: capacity", index), "%.1f MBit/s", speed); - stat.add(fmt::format("Network {}: mtu", index), ifrm.ifr_mtu); - stat.add(fmt::format("Network {}: rx_bytes", index), stats->rx_bytes); - stat.add(fmt::format("Network {}: rx_errors", index), stats->rx_errors); - stat.add(fmt::format("Network {}: tx_bytes", index), stats->tx_bytes); - stat.add(fmt::format("Network {}: tx_errors", index), stats->tx_errors); - stat.add(fmt::format("Network {}: collisions", index), stats->collisions); + net_info.mtu = ifrm.ifr_mtu; + net_info.rx_bytes = stats->rx_bytes; + net_info.rx_errors = stats->rx_errors; + net_info.tx_bytes = stats->tx_bytes; + net_info.tx_errors = stats->tx_errors; + net_info.collisions = stats->collisions; close(fd); - bytes_[ifa->ifa_name].rx_bytes = stats->rx_bytes; - bytes_[ifa->ifa_name].tx_bytes = stats->tx_bytes; - - ++index; + bytes_[net_info.interface_name].rx_bytes = stats->rx_bytes; + bytes_[net_info.interface_name].tx_bytes = stats->tx_bytes; - interface_names.push_back(ifa->ifa_name); + // Get the count of CRC errors + crc_errors & crc_ers = crc_errors_[net_info.interface_name]; + crc_ers.errors_queue.push_back(stats->rx_crc_errors - crc_ers.last_rx_crc_errors); + while (crc_ers.errors_queue.size() > crc_error_check_duration_) { + crc_ers.errors_queue.pop_front(); + } + crc_ers.last_rx_crc_errors = stats->rx_crc_errors; } freeifaddrs(ifas); + last_update_time_ = this->now(); +} + +void NetMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + if (!checkGeneralInfo(stat)) { + return; + } + + int level = DiagStatus::OK; + int whole_level = DiagStatus::OK; + int index = 0; + std::string error_str; + std::vector interface_names; + + for (const auto & net_info : net_info_list_) { + if (!isSupportedNetwork(net_info, index, stat, error_str)) { + ++index; + interface_names.push_back(net_info.interface_name); + continue; + } + + level = net_info.is_running ? DiagStatus::OK : DiagStatus::ERROR; + + stat.add(fmt::format("Network {}: status", index), usage_dict_.at(level)); + stat.add(fmt::format("Network {}: interface name", index), net_info.interface_name); + stat.addf(fmt::format("Network {}: rx_usage", index), "%.2f%%", net_info.rx_usage * 1e+2); + stat.addf(fmt::format("Network {}: tx_usage", index), "%.2f%%", net_info.tx_usage * 1e+2); + stat.addf(fmt::format("Network {}: rx_traffic", index), "%.2f MBit/s", net_info.rx_traffic); + stat.addf(fmt::format("Network {}: tx_traffic", index), "%.2f MBit/s", net_info.tx_traffic); + stat.addf(fmt::format("Network {}: capacity", index), "%.1f MBit/s", net_info.speed); + stat.add(fmt::format("Network {}: mtu", index), net_info.mtu); + stat.add(fmt::format("Network {}: rx_bytes", index), net_info.rx_bytes); + stat.add(fmt::format("Network {}: rx_errors", index), net_info.rx_errors); + stat.add(fmt::format("Network {}: tx_bytes", index), net_info.tx_bytes); + stat.add(fmt::format("Network {}: tx_errors", index), net_info.tx_errors); + stat.add(fmt::format("Network {}: collisions", index), net_info.collisions); + + ++index; + + interface_names.push_back(net_info.interface_name); + } + // Check if specified device exists for (const auto & device : device_params_) { // Skip if all devices specified @@ -234,12 +259,107 @@ void NetMonitor::checkUsage(diagnostic_updater::DiagnosticStatusWrapper & stat) stat.summary(whole_level, usage_dict_.at(whole_level)); } - last_update_time_ = this->now(); + // Measure elapsed time since start time and report + SystemMonitorUtility::stopMeasurement(t_start, stat); +} + +void NetMonitor::checkCrcError(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + // Remember start time to measure elapsed time + const auto t_start = SystemMonitorUtility::startMeasurement(); + + if (!checkGeneralInfo(stat)) { + return; + } + + int whole_level = DiagStatus::OK; + int index = 0; + std::string error_str; + + for (const auto & net_info : net_info_list_) { + if (!isSupportedNetwork(net_info, index, stat, error_str)) { + ++index; + continue; + } + + crc_errors & crc_ers = crc_errors_[net_info.interface_name]; + unsigned int unit_rx_crc_errors = 0; + + for (auto errors : crc_ers.errors_queue) { + unit_rx_crc_errors += errors; + } + + stat.add(fmt::format("Network {}: interface name", index), net_info.interface_name); + stat.add(fmt::format("Network {}: total rx_crc_errors", index), crc_ers.last_rx_crc_errors); + stat.add(fmt::format("Network {}: rx_crc_errors per unit time", index), unit_rx_crc_errors); + + if (unit_rx_crc_errors >= crc_error_count_threshold_) { + whole_level = std::max(whole_level, static_cast(DiagStatus::WARN)); + error_str = "CRC error"; + } + + ++index; + } + + if (!error_str.empty()) { + stat.summary(whole_level, error_str); + } else { + stat.summary(whole_level, "OK"); + } // Measure elapsed time since start time and report SystemMonitorUtility::stopMeasurement(t_start, stat); } +bool NetMonitor::checkGeneralInfo(diagnostic_updater::DiagnosticStatusWrapper & stat) +{ + if (device_params_.empty()) { + stat.summary(DiagStatus::ERROR, "invalid device parameter"); + return false; + } + + if (getifaddrs_errno_ != 0) { + stat.summary(DiagStatus::ERROR, "getifaddrs error"); + stat.add("getifaddrs", strerror(getifaddrs_errno_)); + return false; + } + return true; +} + +bool NetMonitor::isSupportedNetwork( + const NetworkInfo & net_info, int index, diagnostic_updater::DiagnosticStatusWrapper & stat, + std::string & error_str) +{ + // MTU information + if (net_info.mtu_errno != 0) { + if (net_info.mtu_errno == ENOTSUP) { + stat.add(fmt::format("Network {}: status", index), "Not Supported"); + } else { + stat.add(fmt::format("Network {}: status", index), "Error"); + error_str = "ioctl error"; + } + + stat.add(fmt::format("Network {}: interface name", index), net_info.interface_name); + stat.add("ioctl(SIOCGIFMTU)", strerror(net_info.mtu_errno)); + return false; + } + + // network capacity + if (net_info.speed <= 0) { + if (net_info.ethtool_errno == ENOTSUP) { + stat.add(fmt::format("Network {}: status", index), "Not Supported"); + } else { + stat.add(fmt::format("Network {}: status", index), "Error"); + error_str = "ioctl error"; + } + + stat.add(fmt::format("Network {}: interface name", index), net_info.interface_name); + stat.add("ioctl(SIOCETHTOOL)", strerror(net_info.ethtool_errno)); + return false; + } + return true; +} + #include // workaround for build errors void NetMonitor::monitorTraffic(diagnostic_updater::DiagnosticStatusWrapper & stat)