Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

infiniband: Add new collector for InfiniBand statistics #450

Merged
merged 5 commits into from
Feb 16, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions AUTHORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ The following individuals have contributed code to this repository
* Ken Herner <ken@modulus.io>
* Matt Layher <mdlayher@gmail.com>
* Matthias Rampke <matthias@rampke.de>
* Robert Clark <robert.d.clark@hpe.com>
* Siavash Safi <siavash.safi@gmail.com>
* Stephen Shirley <kormat@gmail.com>
* Steve Durrheimer <s.durrheimer@gmail.com>
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ entropy | Exposes available entropy. | Linux
filefd | Exposes file descriptor statistics from `/proc/sys/fs/file-nr`. | Linux
filesystem | Exposes filesystem statistics, such as disk space used. | Darwin, Dragonfly, FreeBSD, Linux, OpenBSD
hwmon | Expose hardware monitoring and sensor data from `/sys/class/hwmon/`. | Linux
infiniband | Exposes network statistics specific to InfiniBand configurations. | Linux
loadavg | Exposes load average. | Darwin, Dragonfly, FreeBSD, Linux, NetBSD, OpenBSD, Solaris
mdadm | Exposes statistics about devices in `/proc/mdstat` (does nothing if no `/proc/mdstat` present). | Linux
meminfo | Exposes memory statistics. | Darwin, Dragonfly, FreeBSD, Linux
Expand Down
32 changes: 32 additions & 0 deletions collector/fixtures/e2e-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,38 @@ node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp2"} 84
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp3"} 84
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp4"} 84
node_hwmon_temp_max_celsius{chip="platform_coretemp_1",sensor="temp5"} 84
# HELP node_infiniband_link_downed_total Number of times the link failed to recover from an error state and went down
# TYPE node_infiniband_link_downed_total counter
node_infiniband_link_downed_total{device="mlx4_0",port="1"} 0
node_infiniband_link_downed_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_link_error_recovery_total Number of times the link successfully recovered from an error state
# TYPE node_infiniband_link_error_recovery_total counter
node_infiniband_link_error_recovery_total{device="mlx4_0",port="1"} 0
node_infiniband_link_error_recovery_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_multicast_packets_received_total Number of multicast packets received (including errors)
# TYPE node_infiniband_multicast_packets_received_total counter
node_infiniband_multicast_packets_received_total{device="mlx4_0",port="1"} 93
node_infiniband_multicast_packets_received_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_multicast_packets_transmitted_total Number of multicast packets transmitted (including errors)
# TYPE node_infiniband_multicast_packets_transmitted_total counter
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="1"} 16
node_infiniband_multicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_port_data_received_bytes Number of data octets received on all links
# TYPE node_infiniband_port_data_received_bytes counter
node_infiniband_port_data_received_bytes{device="mlx4_0",port="1"} 4.631917e+06
node_infiniband_port_data_received_bytes{device="mlx4_0",port="2"} 0
# HELP node_infiniband_port_data_transmitted_bytes Number of data octets transmitted on all links
# TYPE node_infiniband_port_data_transmitted_bytes counter
node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="1"} 3.73344e+06
node_infiniband_port_data_transmitted_bytes{device="mlx4_0",port="2"} 0
# HELP node_infiniband_unicast_packets_received_total Number of unicast packets received (including errors)
# TYPE node_infiniband_unicast_packets_received_total counter
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="1"} 61148
node_infiniband_unicast_packets_received_total{device="mlx4_0",port="2"} 0
# HELP node_infiniband_unicast_packets_transmitted_total Number of unicast packets transmitted (including errors)
# TYPE node_infiniband_unicast_packets_transmitted_total counter
node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="1"} 61239
node_infiniband_unicast_packets_transmitted_total{device="mlx4_0",port="2"} 0
# HELP node_intr Total number of interrupts serviced.
# TYPE node_intr counter
node_intr 8.885917e+06
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
93
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
16
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
4631917
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3733440
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
61148
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
61239
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0
177 changes: 177 additions & 0 deletions collector/infiniband_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build linux
// +build !noinfiniband

package collector

import (
"errors"
"path/filepath"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
)

// infinibandPath is the InfiniBand class directory. Update passes it through
// sysFilePath (presumably resolving it against the sysfs mount point — confirm
// against that helper) before looking for devices and ports.
const infinibandPath = "class/infiniband"

// Sentinel errors returned by the discovery helpers so that Update can tell
// "nothing present" apart from any other failure.
var (
// errInfinibandNoDevicesFound is returned by infinibandDevices when its glob matches nothing.
errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected")
// errInfinibandNoPortsFound is returned by infinibandPorts when a device has no port entries.
errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected")
)

// infinibandCollector exports per-port InfiniBand counters that it reads from
// files under the InfiniBand class directory.
type infinibandCollector struct {
// metricDescs maps a metric name (the same keys as counters) to its Prometheus descriptor.
metricDescs map[string]*prometheus.Desc
// counters maps a metric name to the counter file that backs it and its help text.
counters map[string]infinibandMetric
}

// infinibandMetric describes one counter exposed by the collector.
// Field order matters: NewInfiniBandCollector builds values with positional
// (unkeyed) composite literals.
type infinibandMetric struct {
// File is the name of the counter file inside a port's "counters" directory.
File string
// Help is the help text attached to the metric descriptor.
Help string
}

// init registers the collector's constructor in Factories under the name
// "infiniband".
func init() {
Factories["infiniband"] = NewInfiniBandCollector
}

// NewInfiniBandCollector returns a Collector that exposes InfiniBand port
// counters, each labelled with the device and port it was read from.
// The returned error is always nil.
func NewInfiniBandCollector() (Collector, error) {
	const subsystem = "infiniband"

	// Metric name -> counter file (inside a port's "counters" directory) and
	// the help text published with it.
	//
	// NOTE(review): the kernel sysfs ABI describes port_rcv_data and
	// port_xmit_data as octet counts divided by 4, while the values are
	// exported unscaled under *_bytes names — confirm whether a factor of 4
	// is intended.
	counters := map[string]infinibandMetric{
		"link_downed_total":                   {File: "link_downed", Help: "Number of times the link failed to recover from an error state and went down"},
		"link_error_recovery_total":           {File: "link_error_recovery", Help: "Number of times the link successfully recovered from an error state"},
		"multicast_packets_received_total":    {File: "multicast_rcv_packets", Help: "Number of multicast packets received (including errors)"},
		"multicast_packets_transmitted_total": {File: "multicast_xmit_packets", Help: "Number of multicast packets transmitted (including errors)"},
		"port_data_received_bytes":            {File: "port_rcv_data", Help: "Number of data octets received on all links"},
		"port_data_transmitted_bytes":         {File: "port_xmit_data", Help: "Number of data octets transmitted on all links"},
		"unicast_packets_received_total":      {File: "unicast_rcv_packets", Help: "Number of unicast packets received (including errors)"},
		"unicast_packets_transmitted_total":   {File: "unicast_xmit_packets", Help: "Number of unicast packets transmitted (including errors)"},
	}

	// One descriptor per counter, keyed by the same metric name.
	descs := make(map[string]*prometheus.Desc)
	for name, counter := range counters {
		fqName := prometheus.BuildFQName(Namespace, subsystem, name)
		descs[name] = prometheus.NewDesc(fqName, counter.Help, []string{"device", "port"}, nil)
	}

	return &infinibandCollector{
		metricDescs: descs,
		counters:    counters,
	}, nil
}

// infinibandDevices returns the names of the entries found directly under
// infinibandPath, which are the InfiniBand device names. When nothing matches
// it returns errInfinibandNoDevicesFound; a glob failure is returned as is.
func infinibandDevices(infinibandPath string) ([]string, error) {
	matches, err := filepath.Glob(filepath.Join(infinibandPath, "/*"))
	switch {
	case err != nil:
		return nil, err
	case len(matches) == 0:
		log.Debugf("Unable to detect InfiniBand devices")
		return nil, errInfinibandNoDevicesFound
	}

	// Every match is a full path; only its last element (the device name) is
	// of interest, so the slice is rewritten in place.
	for idx := range matches {
		matches[idx] = filepath.Base(matches[idx])
	}

	return matches, nil
}

// infinibandPorts returns the port identifiers of the given InfiniBand device,
// i.e. the names of the entries under <infinibandPath>/<device>/ports. When
// the device has no such entries it returns errInfinibandNoPortsFound; a glob
// failure is returned as is.
func infinibandPorts(infinibandPath, device string) ([]string, error) {
	pattern := filepath.Join(infinibandPath, device, "ports/*")

	found, err := filepath.Glob(pattern)
	if err != nil {
		return nil, err
	}
	if len(found) == 0 {
		log.Debugf("Unable to detect ports for %s", device)
		return nil, errInfinibandNoPortsFound
	}

	// Reduce each full path to its final element, which is the port number.
	for idx, fullPath := range found {
		found[idx] = filepath.Base(fullPath)
	}

	return found, nil
}

// readMetric reads the unsigned integer stored in directory/metricFile.
// On failure it logs the file name at debug level and returns 0 together with
// the error from readUintFromFile.
func readMetric(directory, metricFile string) (uint64, error) {
	value, err := readUintFromFile(filepath.Join(directory, metricFile))
	if err == nil {
		return value, nil
	}

	log.Debugf("Error reading %q file", metricFile)
	return 0, err
}

// Update walks every InfiniBand device and port and sends one counter metric
// per configured counter file to ch, labelled with the device and port.
//
// Finding no devices is not an error (InfiniBand is likely not installed), and
// a device without ports is skipped. Any other discovery error, or a failure
// to read a counter file, aborts the update and is returned.
func (c *infinibandCollector) Update(ch chan<- prometheus.Metric) (err error) {
	devices, err := infinibandDevices(sysFilePath(infinibandPath))
	if err == errInfinibandNoDevicesFound {
		// Nothing to report; the collector is silently skipped.
		return nil
	}
	if err != nil {
		return err
	}

	for _, device := range devices {
		ports, portsErr := infinibandPorts(sysFilePath(infinibandPath), device)
		if portsErr == errInfinibandNoPortsFound {
			// This device has no ports; move on to the next one.
			continue
		}
		if portsErr != nil {
			return portsErr
		}

		for _, port := range ports {
			portDir := sysFilePath(filepath.Join(infinibandPath, device, "ports", port))
			countersDir := filepath.Join(portDir, "counters")

			// Emit one metric for each known counter file of this port.
			for name, counter := range c.counters {
				value, readErr := readMetric(countersDir, counter.File)
				if readErr != nil {
					return readErr
				}

				ch <- prometheus.MustNewConstMetric(
					c.metricDescs[name],
					prometheus.CounterValue,
					float64(value),
					device,
					port,
				)
			}
		}
	}

	return nil
}
40 changes: 40 additions & 0 deletions collector/infiniband_linux_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collector

import (
"testing"
)

// TestInfiniBandDevices checks that infinibandDevices finds exactly one device
// in the fixture tree.
func TestInfiniBandDevices(t *testing.T) {
	devices, err := infinibandDevices("fixtures/sys/class/infiniband")
	if err != nil {
		t.Fatal(err)
	}

	// Fatalf rather than Fatal: the message contains a %d verb, which Fatal
	// would print literally instead of substituting the count.
	if l := len(devices); l != 1 {
		t.Fatalf("Retrieved an unexpected number of InfiniBand devices: %d", l)
	}
}

// TestInfiniBandPorts checks that infinibandPorts finds exactly two ports for
// the mlx4_0 device in the fixture tree.
func TestInfiniBandPorts(t *testing.T) {
	ports, err := infinibandPorts("fixtures/sys/class/infiniband", "mlx4_0")
	if err != nil {
		t.Fatal(err)
	}

	// Fatalf rather than Fatal: the message contains a %d verb, which Fatal
	// would print literally instead of substituting the count.
	if l := len(ports); l != 2 {
		t.Fatalf("Retrieved an unexpected number of InfiniBand ports: %d", l)
	}
}
1 change: 1 addition & 0 deletions end-to-end-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ collectors=$(cat << COLLECTORS
entropy
filefd
hwmon
infiniband
ksmd
loadavg
mdadm
Expand Down
2 changes: 1 addition & 1 deletion node_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ import (
)

const (
defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs"
defaultCollectors = "conntrack,cpu,diskstats,entropy,edac,filefd,filesystem,hwmon,infiniband,loadavg,mdadm,meminfo,netdev,netstat,sockstat,stat,textfile,time,uname,vmstat,wifi,zfs"
)

var (
Expand Down