Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RDMA collector #3176

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions collector/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ func readUintFromFile(path string) (uint64, error) {
return value, nil
}

func readStringFromFile(path string) string {
data, err := os.ReadFile(path)
if err != nil {
return ""
}
return strings.TrimSpace(string(data))
}

var metricNameRegex = regexp.MustCompile(`_*[^0-9A-Za-z_]+_*`)

// SanitizeMetricName sanitize the given metric name by replacing invalid characters by underscores.
Expand Down
316 changes: 316 additions & 0 deletions collector/rdma_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,316 @@
// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !nordma
// +build !nordma

// The hard work of collecting data from the kernel via the MLNX_OFED interfaces is done by
// https://github.com/Mellanox/rdmamap
// by Mellanox. Used under the Apache 2.0 license.

package collector

import (
"fmt"
"log/slog"
"os"
"path/filepath"
"regexp"
"strings"
"sync"

"github.com/Mellanox/rdmamap"
"github.com/alecthomas/kingpin/v2"
"github.com/prometheus/client_golang/prometheus"
)

var (
rdmaDeviceInclude = kingpin.Flag("collector.rdma.device-include", "Regexp of rdma devices to include (mutually exclusive to device-exclude).").String()
rdmaDeviceExclude = kingpin.Flag("collector.rdma.device-exclude", "Regexp of rdma devices to exclude (mutually exclusive to device-include).").String()
rdmaIncludedMetrics = kingpin.Flag("collector.rdma.metrics-include", "Regexp of rdma stats to include.").Default(".*").String()

lookupTable = map[string]string{
"port_rcv_data": "mlx5_port_rcv_data_total",
"port_rcv_packets": "mlx5_port_rcv_packets_total",
"port_multicast_rcv_packets": "mlx5_port_multicast_rcv_packets_total",
"port_unicast_rcv_packets": "mlx5_port_unicast_rcv_packets_total",
"port_xmit_data": "mlx5_port_xmit_data_total",
"port_xmit_packets": "mlx5_port_xmit_packets_total",
"port_rcv_switch_relay_errors": "mlx5_port_rcv_switch_relay_errors_total",
"port_rcv_errors": "mlx5_port_rcv_errors_total",
"port_rcv_constraint_errors": "mlx5_port_rcv_constraint_errors_total",
"local_link_integrity_errors": "mlx5_local_link_integrity_errors_total",
"port_xmit_wait": "mlx5_port_xmit_wait_total",
"port_multicast_xmit_packets": "mlx5_port_multicast_xmit_packets_total",
"port_unicast_xmit_packets": "mlx5_port_unicast_xmit_packets_total",
"port_xmit_discards": "mlx5_port_xmit_discards_total",
"port_xmit_constraint_errors": "mlx5_port_xmit_constraint_errors_total",
"port_rcv_remote_physical_errors": "mlx5_port_rcv_remote_physical_errors_total",
"symbol_error": "mlx5_symbol_error_total",
"VL15_dropped": "mlx5_vl15_dropped_total",
"link_error_recovery": "mlx5_link_error_recovery_total",
"link_downed": "mlx5_link_downed_total",
"duplicate_request": "mlx5_duplicate_request_total",
"implied_nak_seq_err": "mlx5_implied_nak_seq_err_total",
"lifespan": "mlx5_lifespan_ms",
"local_ack_timeout_err": "mlx5_local_ack_timeout_err_total",
"np_cnp_sent": "mlx5_np_cnp_sent_total",
"np_ecn_marked_roce_packets": "mlx5_np_ecn_marked_roce_packets_total",
"out_of_buffer": "mlx5_out_of_buffer_total",
"out_of_sequence": "mlx5_out_of_sequence_total",
"packet_seq_err": "mlx5_packet_seq_err_total",
"req_cqe_error": "mlx5_req_cqe_error_total",
"req_cqe_flush_error": "mlx5_req_cqe_flush_error_total",
"req_remote_access_errors": "mlx5_req_remote_access_errors_total",
"req_remote_invalid_request": "mlx5_req_remote_invalid_request_total",
"resp_cqe_error": "mlx5_resp_cqe_error_total",
"resp_cqe_flush_error": "mlx5_resp_cqe_flush_error_total",
"resp_local_length_error": "mlx5_resp_local_length_error_total",
"resp_remote_access_errors": "mlx5_resp_remote_access_errors_total",
"rnr_nak_retry_err": "mlx5_rnr_nak_retry_err_total",
"rp_cnp_handled": "mlx5_rp_cnp_handled_total",
"rp_cnp_ignored": "mlx5_rp_cnp_ignored_total",
"rx_atomic_requests": "mlx5_rx_atomic_requests_total",
"rx_dct_connect": "mlx5_rx_dct_connect_total",
"rx_read_requests": "mlx5_rx_read_requests_total",
"rx_write_requests": "mlx5_rx_write_requests_total",
"rx_icrc_encapsulated": "mlx5_rx_icrc_encapsulated_total",
"roce_adp_retrans": "mlx5_roce_adp_retrans_total",
"roce_adp_retrans_to": "mlx5_roce_adp_retrans_to_total",
"roce_slow_restart": "mlx5_roce_slow_restart_total",
"roce_slow_restart_cnps": "mlx5_roce_slow_restart_cnps_total",
"roce_slow_restart_trans": "mlx5_roce_slow_restart_trans_total",
}

// https://enterprise-support.nvidia.com/s/article/understanding-mlx5-linux-counters-and-status-parameters
portCounters = map[string]string{
"mlx5_port_rcv_data_total": "Total number of data octets received on all VLs from the port (divided by 4, counting in double words)",
"mlx5_port_rcv_packets_total": "Total number of received packets (may include packets with errors)",
"mlx5_port_multicast_rcv_packets_total": "Total number of multicast packets received (including those with errors)",
"mlx5_port_unicast_rcv_packets_total": "Total number of unicast packets received (including those with errors)",
"mlx5_port_xmit_data_total": "Total number of data octets transmitted on all VLs from the port (divided by 4, counting in double words)",
"mlx5_port_xmit_packets_total": "Total number of transmitted packets (may include packets with errors)",
"mlx5_port_rcv_switch_relay_errors_total": "Total number of packets discarded because they could not be forwarded by the switch relay",
"mlx5_port_rcv_errors_total": "Total number of received packets with errors",
"mlx5_port_rcv_constraint_errors_total": "Total number of packets discarded due to constraints on the switch physical port",
"mlx5_local_link_integrity_errors_total": "Total number of times local physical errors exceeded the threshold and caused a local link integrity failure",
"mlx5_port_xmit_wait_total": "Total number of ticks during which the port had data to transmit but no data was sent due to insufficient credits or lack of arbitration",
"mlx5_port_multicast_xmit_packets_total": "Total number of multicast packets transmitted (including those with errors)",
"mlx5_port_unicast_xmit_packets_total": "Total number of unicast packets transmitted (including those with errors)",
"mlx5_port_xmit_discards_total": "Total number of outbound packets discarded because the port is down or congested",
"mlx5_port_xmit_constraint_errors_total": "Total number of packets not transmitted due to constraints on the switch physical port",
"mlx5_port_rcv_remote_physical_errors_total": "Total number of packets marked with the EBP delimiter received on the port",
"mlx5_symbol_error_total": "Total number of minor link errors detected on one or more physical lanes",
"mlx5_vl15_dropped_total": "Total number of incoming VL15 packets dropped due to resource limitations (e.g., lack of buffers)",
"mlx5_link_error_recovery_total": "Total number of successful link error recovery processes completed by the Port Training state machine",
"mlx5_link_downed_total": "Total number of failed link error recovery processes that caused the link to be downed",
}

hwCounters = map[string]string{
"mlx5_duplicate_request_total": "Total number of received packets that were duplicates of previous requests",
"mlx5_implied_nak_seq_err_total": "Total number of times the requested ACK had a PSN larger than the expected PSN for an RDMA read or response",
"mlx5_lifespan_ms": "Maximum period in milliseconds which defines the aging of counter reads",
"mlx5_local_ack_timeout_err_total": "Total number of times the QP's ACK timer expired for RC, XRC, or DCT QPs at the sender side (retry limit not exceeded)",
"mlx5_np_cnp_sent_total": "Total number of CNP packets sent by the Notification Point due to congestion in the RoCEv2 IP header (ECN bits)",
"mlx5_np_ecn_marked_roce_packets_total": "Total number of RoCEv2 packets received marked with ECN (congestion experienced)",
"mlx5_out_of_buffer_total": "Total number of drops due to lack of WQE for the associated QPs",
"mlx5_out_of_sequence_total": "Total number of out-of-sequence packets received",
"mlx5_packet_seq_err_total": "Total number of received NAK sequence error packets (QP retry limit not exceeded)",
"mlx5_req_cqe_error_total": "Total number of times the requester detected CQEs completed with errors",
"mlx5_req_cqe_flush_error_total": "Total number of times the requester detected CQEs completed with flushed errors",
"mlx5_req_remote_access_errors_total": "Total number of times the requester detected remote access errors",
"mlx5_req_remote_invalid_request_total": "Total number of times the requester detected remote invalid request errors",
"mlx5_resp_cqe_error_total": "Total number of times the responder detected CQEs completed with errors",
"mlx5_resp_cqe_flush_error_total": "Total number of times the responder detected CQEs completed with flushed errors",
"mlx5_resp_local_length_error_total": "Total number of times the responder detected local length errors",
"mlx5_resp_remote_access_errors_total": "Total number of times the responder detected remote access errors",
"mlx5_rnr_nak_retry_err_total": "Total number of received RNR NAK packets (QP retry limit not exceeded)",
"mlx5_rp_cnp_handled_total": "Total number of CNP packets handled by the Reaction Point HCA to throttle transmission rate",
"mlx5_rp_cnp_ignored_total": "Total number of CNP packets ignored by the Reaction Point HCA",
"mlx5_rx_atomic_requests_total": "Total number of received ATOMIC requests for associated QPs",
"mlx5_rx_dct_connect_total": "Total number of received connection requests for associated DCTs",
"mlx5_rx_read_requests_total": "Total number of received READ requests for associated QPs",
"mlx5_rx_write_requests_total": "Total number of received WRITE requests for associated QPs",
"mlx5_rx_icrc_encapsulated_total": "Total number of RoCE packets with ICRC errors",
"mlx5_roce_adp_retrans_total": "Total number of adaptive retransmissions for RoCE traffic",
"mlx5_roce_adp_retrans_to_total": "Total number of times RoCE traffic reached timeout due to adaptive retransmission",
"mlx5_roce_slow_restart_total": "Total number of times RoCE slow restart was used",
"mlx5_roce_slow_restart_cnps_total": "Total number of times RoCE slow restart generated CNP packets",
"mlx5_roce_slow_restart_trans_total": "Total number of times RoCE slow restart changed state to slow restart",
}
)

type rdmaCollector struct {
entries map[string]*prometheus.Desc
entriesMutex sync.Mutex
deviceFilter deviceFilter
infoDesc *prometheus.Desc
metricsPattern *regexp.Regexp
logger *slog.Logger
}

// makeRdmaCollector is the internal constructor for rdmaCollector.
func makeRdmaCollector(logger *slog.Logger) (*rdmaCollector, error) {
if *rdmaDeviceInclude != "" {
logger.Info("Parsed flag --collector.rdma.device-include", "flag", *rdmaDeviceInclude)
}
if *rdmaDeviceExclude != "" {
logger.Info("Parsed flag --collector.rdma.device-exclude", "flag", *rdmaDeviceExclude)
}
if *rdmaIncludedMetrics != "" {
logger.Info("Parsed flag --collector.rdma.metrics-include", "flag", *rdmaIncludedMetrics)
}

// Update paths to respect the mount points setup.
for _, dir := range []*string{
&rdmamap.RdmaClassDir,
&rdmamap.RdmaIbUcmDir,
&rdmamap.RdmaUmadDir,
&rdmamap.RdmaUverbsDir,
&rdmamap.PciDevDir,
&rdmamap.AuxDevDir,
} {
*dir = strings.TrimPrefix(*dir, "/sys")
*dir = sysFilePath(*dir)
}
for _, dir := range []*string{
&rdmamap.RdmaUcmDevice,
&rdmamap.RdmaDeviceDir,
} {
*dir = rootfsFilePath(*dir)
}

entries := make(map[string]*prometheus.Desc, len(portCounters)+len(hwCounters))
for _, counters := range []map[string]string{portCounters, hwCounters} {
for metric, help := range counters {
entries[metric] = prometheus.NewDesc(
buildRdmaFQName(metric),
help,
[]string{"device", "port", "interfaces"}, nil,
)
}
}

// Pre-populate some common rdma metrics.
return &rdmaCollector{
deviceFilter: newDeviceFilter(*rdmaDeviceExclude, *rdmaDeviceInclude),
metricsPattern: regexp.MustCompile(*rdmaIncludedMetrics),
logger: logger,
entries: entries,
infoDesc: prometheus.NewDesc(
buildRdmaFQName("info"),
"A metric with a constant '1' value labeled by device, vendor_id, device_id, firmware_version, driver_version.",
[]string{"device", "vendor_id", "device_id", "firmware_version", "driver_version"}, nil,
),
}, nil
}

func init() {
registerCollector("rdma", defaultDisabled, NewRdmaCollector)
}

// Generate the fully-qualified metric name for the rdma metric.
func buildRdmaFQName(metric string) string {
metricName := strings.TrimLeft(strings.ToLower(SanitizeMetricName(metric)), "_")
return prometheus.BuildFQName(namespace, "rdma", metricName)
}

// NewRdmaCollector returns a new Collector exposing rdma stats.
func NewRdmaCollector(logger *slog.Logger) (Collector, error) {
return makeRdmaCollector(logger)
}

func getNetworkInterfaces(rdmaDeviceName string) string {
var ifs []string

dir := filepath.Join(rdmamap.RdmaClassDir, rdmaDeviceName, "device", "net")
fd, err := os.Open(dir)
if err != nil {
return ""
}
defer fd.Close()

fileInfos, err := fd.Readdir(-1)
if err != nil {
return ""
}

for i := range fileInfos {
if fileInfos[i].Name() == "." || fileInfos[i].Name() == ".." {
continue
}
ifs = append(ifs, fileInfos[i].Name())
}
return strings.Join(ifs, ",")
}

func (c *rdmaCollector) Update(ch chan<- prometheus.Metric) error {
rdmaDevices := rdmamap.GetRdmaDeviceList()
if len(rdmaDevices) == 0 {
return fmt.Errorf("no rdma devices found")
}

for _, device := range rdmaDevices {
if c.deviceFilter.ignored(device) {
continue
}

interfaces := getNetworkInterfaces(device)

stats, err := rdmamap.GetRdmaSysfsAllPortsStats(device)
if err != nil {
c.logger.Error("rdma stats error", "err", err, "device", device)
continue
}

updateFunc := func(key string, value float64, labelValues ...string) {
metric, ok := lookupTable[key]
if !ok {
c.logger.Warn("rdma metric not found in lookup table", "key", key)
return
}
if !c.metricsPattern.MatchString(metric) {
c.logger.Debug("rdma metric excluded", "metric", metric)
return
}
entry := c.entry(metric)
if entry == nil {
c.logger.Warn("rdma metric not found", "metric", metric)
return
}
ch <- prometheus.MustNewConstMetric(entry, prometheus.GaugeValue,
value, labelValues...)
}

for _, portstats := range stats.PortStats {
for _, stat := range append(portstats.HwStats, portstats.Stats...) {
updateFunc(stat.Name, float64(stat.Value), device, fmt.Sprintf("%d", portstats.Port), interfaces)
}
}

vendorID := readStringFromFile(filepath.Join(rdmamap.RdmaClassDir, device, "device", "vendor"))
deviceID := readStringFromFile(filepath.Join(rdmamap.RdmaClassDir, device, "device", "device"))
firmwareVersion := readStringFromFile(filepath.Join(rdmamap.RdmaClassDir, "mlx5_0", "fw_ver"))
driverVersion := readStringFromFile(sysFilePath("module/mlx5_core/version"))
ch <- prometheus.MustNewConstMetric(c.infoDesc, prometheus.GaugeValue, 1.0,
device, vendorID, deviceID, firmwareVersion, driverVersion)
}

return nil
}

func (c *rdmaCollector) entry(key string) *prometheus.Desc {
c.entriesMutex.Lock()
defer c.entriesMutex.Unlock()
return c.entries[key]
}
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/prometheus/node_exporter
go 1.22.0

require (
github.com/Mellanox/rdmamap v1.1.1-0.20241212105033-37bd11cc4c57
github.com/alecthomas/kingpin/v2 v2.4.0
github.com/beevik/ntp v1.4.3
github.com/coreos/go-systemd/v22 v22.5.0
Expand Down Expand Up @@ -48,6 +49,8 @@ require (
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
github.com/siebenmann/go-kstat v0.0.0-20210513183136-173c9b0a9973 // indirect
github.com/vishvananda/netlink v1.1.0 // indirect
github.com/vishvananda/netns v0.0.4 // indirect
github.com/xhit/go-str2duration/v2 v2.1.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
github.com/Mellanox/rdmamap v1.1.1-0.20241212105033-37bd11cc4c57 h1:ffMnYJFt7Bgp/2s2fOsQ0LpKfuU4xCk4afAtQG1wuBM=
github.com/Mellanox/rdmamap v1.1.1-0.20241212105033-37bd11cc4c57/go.mod h1:D3ffy5KqtmeWfuW0cX/GQW0J6S3k8aORk4bf9CBOhng=
github.com/alecthomas/kingpin/v2 v2.4.0 h1:f48lwail6p8zpO1bC4TxtqACaGqHYA22qkHjHpqDjYY=
github.com/alecthomas/kingpin/v2 v2.4.0/go.mod h1:0gyi0zQnjuFk8xrkNKamJoyUo382HRL7ATRpFZCw6tE=
github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc=
Expand Down Expand Up @@ -96,6 +98,11 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0=
github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE=
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU=
github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8=
github.com/vishvananda/netns v0.0.4/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM=
github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc=
github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU=
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
Expand All @@ -112,6 +119,7 @@ golang.org/x/oauth2 v0.24.0 h1:KTBBxWqUa0ykRPLtV69rRto9TLXcqYkeswu48x/gvNE=
golang.org/x/oauth2 v0.24.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ=
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20211031064116-611d5d643895/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
Expand Down
Loading