Skip to content
This repository has been archived by the owner on Mar 27, 2024. It is now read-only.

fix(nvme): handle string and float values #993

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 32 additions & 26 deletions modules/nvme/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"path/filepath"
"strconv"
"time"
)

Expand Down Expand Up @@ -42,32 +43,32 @@ func (n *NVMe) collectNVMeDevice(mx map[string]int64, devicePath string) error {

device := extractDeviceFromPath(devicePath)

mx["device_"+device+"_temperature"] = int64(float64(stats.Temperature) - 273.15) // Kelvin => Celsius
mx["device_"+device+"_percentage_used"] = stats.PercentUsed
mx["device_"+device+"_available_spare"] = stats.AvailSpare
mx["device_"+device+"_data_units_read"] = stats.DataUnitsRead * 1000 * 512
mx["device_"+device+"_data_units_written"] = stats.DataUnitsWritten * 1000 * 512
mx["device_"+device+"_host_read_commands"] = stats.HostReadCommands
mx["device_"+device+"_host_write_commands"] = stats.HostWriteCommands
mx["device_"+device+"_power_cycles"] = stats.PowerCycles
mx["device_"+device+"_power_on_time"] = stats.PowerOnHours * 3600 // hours => seconds
mx["device_"+device+"_unsafe_shutdowns"] = stats.UnsafeShutdowns
mx["device_"+device+"_media_errors"] = stats.MediaErrors
mx["device_"+device+"_num_err_log_entries"] = stats.NumErrLogEntries
mx["device_"+device+"_controller_busy_time"] = stats.ControllerBusyTime * 60 // minutes => seconds
mx["device_"+device+"_warning_temp_time"] = stats.WarningTempTime * 60 // minutes => seconds
mx["device_"+device+"_critical_comp_time"] = stats.CriticalCompTime * 60 // minutes => seconds
mx["device_"+device+"_thm_temp1_trans_count"] = stats.ThmTemp1TransCount
mx["device_"+device+"_thm_temp2_trans_count"] = stats.ThmTemp2TransCount
mx["device_"+device+"_thm_temp1_total_time"] = stats.ThmTemp1TotalTime // seconds
mx["device_"+device+"_thm_temp2_total_time"] = stats.ThmTemp2TotalTime // seconds

mx["device_"+device+"_critical_warning_available_spare"] = boolToInt(stats.CriticalWarning&1 != 0)
mx["device_"+device+"_critical_warning_temp_threshold"] = boolToInt(stats.CriticalWarning&(1<<1) != 0)
mx["device_"+device+"_critical_warning_nvm_subsystem_reliability"] = boolToInt(stats.CriticalWarning&(1<<2) != 0)
mx["device_"+device+"_critical_warning_read_only"] = boolToInt(stats.CriticalWarning&(1<<3) != 0)
mx["device_"+device+"_critical_warning_volatile_mem_backup_failed"] = boolToInt(stats.CriticalWarning&(1<<4) != 0)
mx["device_"+device+"_critical_warning_persistent_memory_read_only"] = boolToInt(stats.CriticalWarning&(1<<5) != 0)
mx["device_"+device+"_temperature"] = int64(float64(parseValue(stats.Temperature)) - 273.15) // Kelvin => Celsius
mx["device_"+device+"_percentage_used"] = parseValue(stats.PercentUsed)
mx["device_"+device+"_available_spare"] = parseValue(stats.AvailSpare)
mx["device_"+device+"_data_units_read"] = parseValue(stats.DataUnitsRead) * 1000 * 512 // units => bytes
mx["device_"+device+"_data_units_written"] = parseValue(stats.DataUnitsWritten) * 1000 * 512 // units => bytes
mx["device_"+device+"_host_read_commands"] = parseValue(stats.HostReadCommands)
mx["device_"+device+"_host_write_commands"] = parseValue(stats.HostWriteCommands)
mx["device_"+device+"_power_cycles"] = parseValue(stats.PowerCycles)
mx["device_"+device+"_power_on_time"] = parseValue(stats.PowerOnHours) * 3600 // hours => seconds
mx["device_"+device+"_unsafe_shutdowns"] = parseValue(stats.UnsafeShutdowns)
mx["device_"+device+"_media_errors"] = parseValue(stats.MediaErrors)
mx["device_"+device+"_num_err_log_entries"] = parseValue(stats.NumErrLogEntries)
mx["device_"+device+"_controller_busy_time"] = parseValue(stats.ControllerBusyTime) * 60 // minutes => seconds
mx["device_"+device+"_warning_temp_time"] = parseValue(stats.WarningTempTime) * 60 // minutes => seconds
mx["device_"+device+"_critical_comp_time"] = parseValue(stats.CriticalCompTime) * 60 // minutes => seconds
mx["device_"+device+"_thm_temp1_trans_count"] = parseValue(stats.ThmTemp1TransCount)
mx["device_"+device+"_thm_temp2_trans_count"] = parseValue(stats.ThmTemp2TransCount)
mx["device_"+device+"_thm_temp1_total_time"] = parseValue(stats.ThmTemp1TotalTime) // seconds
mx["device_"+device+"_thm_temp2_total_time"] = parseValue(stats.ThmTemp2TotalTime) // seconds

mx["device_"+device+"_critical_warning_available_spare"] = boolToInt(parseValue(stats.CriticalWarning)&1 != 0)
mx["device_"+device+"_critical_warning_temp_threshold"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<1) != 0)
mx["device_"+device+"_critical_warning_nvm_subsystem_reliability"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<2) != 0)
mx["device_"+device+"_critical_warning_read_only"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<3) != 0)
mx["device_"+device+"_critical_warning_volatile_mem_backup_failed"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<4) != 0)
mx["device_"+device+"_critical_warning_persistent_memory_read_only"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<5) != 0)

return nil
}
Expand Down Expand Up @@ -110,3 +111,8 @@ func boolToInt(v bool) int64 {
}
return 0
}

// parseValue converts the raw text of an nvme-cli value into an int64.
//
// The text may be an integer ("310"), a float ("310.0") or anything else the
// tool printed. Integer text is parsed exactly; other numeric text is parsed
// as a float and truncated toward zero. Values outside the int64 range are
// clamped to the nearest bound, and NaN or non-numeric text yields 0, so a
// single odd field never aborts collection of the remaining metrics.
func parseValue(s nvmeNumber) int64 {
	const (
		maxInt64 = 1<<63 - 1
		minInt64 = -1 << 63
		// limit is 2^63 as a float64: the first magnitude that no longer
		// fits into an int64 on the positive side.
		limit = float64(1 << 63)
	)

	raw := string(s)

	// Lossless path first: going through float64 would silently round
	// integers above 2^53.
	if i, err := strconv.ParseInt(raw, 10, 64); err == nil {
		return i
	}

	f, err := strconv.ParseFloat(raw, 64)
	switch {
	case errors.Is(err, strconv.ErrSyntax):
		// Best effort: not a number at all.
		return 0
	case f >= limit:
		return maxInt64
	case f <= -limit:
		return minInt64
	case f > -limit && f < limit:
		return int64(f)
	default:
		// Only NaN fails every comparison above; converting it to an
		// integer would be implementation-defined.
		return 0
	}
}
59 changes: 34 additions & 25 deletions modules/nvme/exec.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package nvme

import (
"bytes"
"context"
"encoding/json"
"os/exec"
Expand All @@ -9,37 +10,45 @@ import (

type nvmeDeviceList struct {
Devices []struct {
DevicePath string `json:"DevicePath"`
UsedBytes int64 `json:"UsedBytes"`
PhysicalSize int64 `json:"PhysicalSize"`
SectorSize int64 `json:"SectorSize"`
DevicePath string `json:"DevicePath"`
UsedBytes nvmeNumber `json:"UsedBytes"`
PhysicalSize nvmeNumber `json:"PhysicalSize"`
SectorSize nvmeNumber `json:"SectorSize"`
}
}

// See "Health Information Log Page" in the Current Specification Version
// https://nvmexpress.org/developers/nvme-specification/
type nvmeDeviceSmartLog struct {
CriticalWarning int64 `json:"critical_warning"`
Temperature int64 `json:"temperature"`
AvailSpare int64 `json:"avail_spare"`
SpareThresh int64 `json:"spare_thresh"`
PercentUsed int64 `json:"percent_used"`
DataUnitsRead int64 `json:"data_units_read"`
DataUnitsWritten int64 `json:"data_units_written"`
HostReadCommands int64 `json:"host_read_commands"`
HostWriteCommands int64 `json:"host_write_commands"`
ControllerBusyTime int64 `json:"controller_busy_time"`
PowerCycles int64 `json:"power_cycles"`
PowerOnHours int64 `json:"power_on_hours"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
MediaErrors int64 `json:"media_errors"`
NumErrLogEntries int64 `json:"num_err_log_entries"`
WarningTempTime int64 `json:"warning_temp_time"`
CriticalCompTime int64 `json:"critical_comp_time"`
ThmTemp1TransCount int64 `json:"thm_temp1_trans_count"`
ThmTemp2TransCount int64 `json:"thm_temp2_trans_count"`
ThmTemp1TotalTime int64 `json:"thm_temp1_total_time"`
ThmTemp2TotalTime int64 `json:"thm_temp2_total_time"`
CriticalWarning nvmeNumber `json:"critical_warning"`
Temperature nvmeNumber `json:"temperature"`
AvailSpare nvmeNumber `json:"avail_spare"`
SpareThresh nvmeNumber `json:"spare_thresh"`
PercentUsed nvmeNumber `json:"percent_used"`
DataUnitsRead nvmeNumber `json:"data_units_read"`
DataUnitsWritten nvmeNumber `json:"data_units_written"`
HostReadCommands nvmeNumber `json:"host_read_commands"`
HostWriteCommands nvmeNumber `json:"host_write_commands"`
ControllerBusyTime nvmeNumber `json:"controller_busy_time"`
PowerCycles nvmeNumber `json:"power_cycles"`
PowerOnHours nvmeNumber `json:"power_on_hours"`
UnsafeShutdowns nvmeNumber `json:"unsafe_shutdowns"`
MediaErrors nvmeNumber `json:"media_errors"`
NumErrLogEntries nvmeNumber `json:"num_err_log_entries"`
WarningTempTime nvmeNumber `json:"warning_temp_time"`
CriticalCompTime nvmeNumber `json:"critical_comp_time"`
ThmTemp1TransCount nvmeNumber `json:"thm_temp1_trans_count"`
ThmTemp2TransCount nvmeNumber `json:"thm_temp2_trans_count"`
ThmTemp1TotalTime nvmeNumber `json:"thm_temp1_total_time"`
ThmTemp2TotalTime nvmeNumber `json:"thm_temp2_total_time"`
}

// nvmeNumber holds the raw text of a numeric value reported by nvme-cli.
//
// nvme-cli 2.1.1 exposes some values as strings, so the same field may arrive
// as a bare JSON number or as a quoted string. The text is stored without
// surrounding quotes and is converted to a number by the collector later on.
type nvmeNumber string

// UnmarshalJSON implements json.Unmarshaler.
// It stores the JSON token verbatim, stripping surrounding double quotes.
// It never returns an error.
func (n *nvmeNumber) UnmarshalJSON(b []byte) error {
	// By encoding/json convention a JSON null leaves the value untouched
	// instead of being stored as the literal text "null".
	if string(b) == "null" {
		return nil
	}
	*n = nvmeNumber(bytes.Trim(b, "\""))
	return nil
}

type nvmeCLIExec struct {
Expand Down
167 changes: 157 additions & 10 deletions modules/nvme/nvme_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,19 @@ import (
)

var (
dataNVMeListJSON, _ = os.ReadFile("testdata/nvme-list.json")
dataNVMeListEmptyJSON, _ = os.ReadFile("testdata/nvme-list-empty.json")
dataNVMeSmartLogJSON, _ = os.ReadFile("testdata/nvme-smart-log.json")
dataNVMeListJSON, _ = os.ReadFile("testdata/nvme-list.json")
dataNVMeListEmptyJSON, _ = os.ReadFile("testdata/nvme-list-empty.json")
dataNVMeSmartLogJSON, _ = os.ReadFile("testdata/nvme-smart-log.json")
dataNVMeSmartLogStringJSON, _ = os.ReadFile("testdata/nvme-smart-log-string.json")
dataNVMeSmartLogFloatJSON, _ = os.ReadFile("testdata/nvme-smart-log-float.json")
)

func Test_testDataIsValid(t *testing.T) {
for name, data := range map[string][]byte{
"dataNVMeListJSON": dataNVMeListJSON,
"dataNVMeListEmptyJSON": dataNVMeListEmptyJSON,
"dataNVMeSmartLogJSON": dataNVMeSmartLogJSON,
"dataNVMeListJSON": dataNVMeListJSON,
"dataNVMeListEmptyJSON": dataNVMeListEmptyJSON,
"dataNVMeSmartLogStringJSON": dataNVMeSmartLogStringJSON,
"dataNVMeSmartLogFloatJSON": dataNVMeSmartLogFloatJSON,
} {
require.NotNilf(t, data, name)
}
Expand Down Expand Up @@ -177,6 +180,132 @@ func TestNVMe_Collect(t *testing.T) {
},
},
},
"success if all calls successful with string values": {
{
prepare: prepareCaseStringValuesOK,
check: func(t *testing.T, n *NVMe) {
mx := n.Collect()

expected := map[string]int64{
"device_nvme0n1_available_spare": 100,
"device_nvme0n1_controller_busy_time": 497040,
"device_nvme0n1_critical_comp_time": 0,
"device_nvme0n1_critical_warning_available_spare": 0,
"device_nvme0n1_critical_warning_nvm_subsystem_reliability": 0,
"device_nvme0n1_critical_warning_persistent_memory_read_only": 0,
"device_nvme0n1_critical_warning_read_only": 0,
"device_nvme0n1_critical_warning_temp_threshold": 0,
"device_nvme0n1_critical_warning_volatile_mem_backup_failed": 0,
"device_nvme0n1_data_units_read": 5068041216000,
"device_nvme0n1_data_units_written": 69712734208000,
"device_nvme0n1_host_read_commands": 313528805,
"device_nvme0n1_host_write_commands": 1928062610,
"device_nvme0n1_media_errors": 0,
"device_nvme0n1_num_err_log_entries": 110,
"device_nvme0n1_percentage_used": 2,
"device_nvme0n1_power_cycles": 64,
"device_nvme0n1_power_on_time": 17906400,
"device_nvme0n1_temperature": 36,
"device_nvme0n1_thm_temp1_total_time": 0,
"device_nvme0n1_thm_temp1_trans_count": 0,
"device_nvme0n1_thm_temp2_total_time": 0,
"device_nvme0n1_thm_temp2_trans_count": 0,
"device_nvme0n1_unsafe_shutdowns": 39,
"device_nvme0n1_warning_temp_time": 0,
"device_nvme1n1_available_spare": 100,
"device_nvme1n1_controller_busy_time": 497040,
"device_nvme1n1_critical_comp_time": 0,
"device_nvme1n1_critical_warning_available_spare": 0,
"device_nvme1n1_critical_warning_nvm_subsystem_reliability": 0,
"device_nvme1n1_critical_warning_persistent_memory_read_only": 0,
"device_nvme1n1_critical_warning_read_only": 0,
"device_nvme1n1_critical_warning_temp_threshold": 0,
"device_nvme1n1_critical_warning_volatile_mem_backup_failed": 0,
"device_nvme1n1_data_units_read": 5068041216000,
"device_nvme1n1_data_units_written": 69712734208000,
"device_nvme1n1_host_read_commands": 313528805,
"device_nvme1n1_host_write_commands": 1928062610,
"device_nvme1n1_media_errors": 0,
"device_nvme1n1_num_err_log_entries": 110,
"device_nvme1n1_percentage_used": 2,
"device_nvme1n1_power_cycles": 64,
"device_nvme1n1_power_on_time": 17906400,
"device_nvme1n1_temperature": 36,
"device_nvme1n1_thm_temp1_total_time": 0,
"device_nvme1n1_thm_temp1_trans_count": 0,
"device_nvme1n1_thm_temp2_total_time": 0,
"device_nvme1n1_thm_temp2_trans_count": 0,
"device_nvme1n1_unsafe_shutdowns": 39,
"device_nvme1n1_warning_temp_time": 0,
}

assert.Equal(t, expected, mx)
},
},
},
"success if all calls successful with float values": {
{
prepare: prepareCaseFloatValuesOK,
check: func(t *testing.T, n *NVMe) {
mx := n.Collect()

expected := map[string]int64{
"device_nvme0n1_available_spare": 100,
"device_nvme0n1_controller_busy_time": 497040,
"device_nvme0n1_critical_comp_time": 0,
"device_nvme0n1_critical_warning_available_spare": 0,
"device_nvme0n1_critical_warning_nvm_subsystem_reliability": 0,
"device_nvme0n1_critical_warning_persistent_memory_read_only": 0,
"device_nvme0n1_critical_warning_read_only": 0,
"device_nvme0n1_critical_warning_temp_threshold": 0,
"device_nvme0n1_critical_warning_volatile_mem_backup_failed": 0,
"device_nvme0n1_data_units_read": 5068041216000,
"device_nvme0n1_data_units_written": 69712734208000,
"device_nvme0n1_host_read_commands": 313528805,
"device_nvme0n1_host_write_commands": 1928062610,
"device_nvme0n1_media_errors": 0,
"device_nvme0n1_num_err_log_entries": 110,
"device_nvme0n1_percentage_used": 2,
"device_nvme0n1_power_cycles": 64,
"device_nvme0n1_power_on_time": 17906400,
"device_nvme0n1_temperature": 36,
"device_nvme0n1_thm_temp1_total_time": 0,
"device_nvme0n1_thm_temp1_trans_count": 0,
"device_nvme0n1_thm_temp2_total_time": 0,
"device_nvme0n1_thm_temp2_trans_count": 0,
"device_nvme0n1_unsafe_shutdowns": 39,
"device_nvme0n1_warning_temp_time": 0,
"device_nvme1n1_available_spare": 100,
"device_nvme1n1_controller_busy_time": 497040,
"device_nvme1n1_critical_comp_time": 0,
"device_nvme1n1_critical_warning_available_spare": 0,
"device_nvme1n1_critical_warning_nvm_subsystem_reliability": 0,
"device_nvme1n1_critical_warning_persistent_memory_read_only": 0,
"device_nvme1n1_critical_warning_read_only": 0,
"device_nvme1n1_critical_warning_temp_threshold": 0,
"device_nvme1n1_critical_warning_volatile_mem_backup_failed": 0,
"device_nvme1n1_data_units_read": 5068041216000,
"device_nvme1n1_data_units_written": 69712734208000,
"device_nvme1n1_host_read_commands": 313528805,
"device_nvme1n1_host_write_commands": 1928062610,
"device_nvme1n1_media_errors": 0,
"device_nvme1n1_num_err_log_entries": 110,
"device_nvme1n1_percentage_used": 2,
"device_nvme1n1_power_cycles": 64,
"device_nvme1n1_power_on_time": 17906400,
"device_nvme1n1_temperature": 36,
"device_nvme1n1_thm_temp1_total_time": 0,
"device_nvme1n1_thm_temp1_trans_count": 0,
"device_nvme1n1_thm_temp2_total_time": 0,
"device_nvme1n1_thm_temp2_trans_count": 0,
"device_nvme1n1_unsafe_shutdowns": 39,
"device_nvme1n1_warning_temp_time": 0,
}

assert.Equal(t, expected, mx)
},
},
},
"fail if 'nvme list' returns an empty list": {
{
prepare: prepareCaseEmptyList,
Expand Down Expand Up @@ -227,6 +356,14 @@ func prepareCaseOK(n *NVMe) {
n.exec = &mockNVMeCLIExec{}
}

// prepareCaseStringValuesOK swaps in a mock nvme CLI with the smartLogString
// flag set.
func prepareCaseStringValuesOK(n *NVMe) {
	mock := new(mockNVMeCLIExec)
	mock.smartLogString = true
	n.exec = mock
}

// prepareCaseFloatValuesOK swaps in a mock nvme CLI with the smartLogFloat
// flag set.
func prepareCaseFloatValuesOK(n *NVMe) {
	mock := new(mockNVMeCLIExec)
	mock.smartLogFloat = true
	n.exec = mock
}

// prepareCaseEmptyList swaps in a mock nvme CLI with the emptyList flag set.
// NOTE(review): presumably this makes the mock's list() report no devices;
// confirm against the mock implementation.
func prepareCaseEmptyList(n *NVMe) {
	mock := new(mockNVMeCLIExec)
	mock.emptyList = true
	n.exec = mock
}
Expand All @@ -240,9 +377,11 @@ func prepareCaseErrOnSmartLog(n *NVMe) {
}

type mockNVMeCLIExec struct {
errOnList bool
errOnSmartLog bool
emptyList bool
errOnList bool
errOnSmartLog bool
emptyList bool
smartLogString bool
smartLogFloat bool
}

func (m *mockNVMeCLIExec) list() (*nvmeDeviceList, error) {
Expand Down Expand Up @@ -271,8 +410,16 @@ func (m *mockNVMeCLIExec) smartLog(_ string) (*nvmeDeviceSmartLog, error) {
return nil, errors.New("mock.smartLog() no devices error")
}

data := dataNVMeSmartLogJSON
if m.smartLogString {
data = dataNVMeSmartLogStringJSON
}
if m.smartLogFloat {
data = dataNVMeSmartLogFloatJSON
}

var v nvmeDeviceSmartLog
if err := json.Unmarshal(dataNVMeSmartLogJSON, &v); err != nil {
if err := json.Unmarshal(data, &v); err != nil {
return nil, err
}

Expand Down
24 changes: 24 additions & 0 deletions modules/nvme/testdata/nvme-smart-log-float.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"critical_warning": 0,
"temperature": 310.0,
"avail_spare": 100.0,
"spare_thresh": 5.0,
"percent_used": 2.0,
"endurance_grp_critical_warning_summary": 0,
"data_units_read": 9898518.0,
"data_units_written": 136157684.0,
"host_read_commands": 313528805.0,
"host_write_commands": 1928062610.0,
"controller_busy_time": 8284.0,
"power_cycles": 64.0,
"power_on_hours": 4974.0,
"unsafe_shutdowns": 39.0,
"media_errors": 0,
"num_err_log_entries": 110.0,
"warning_temp_time": 0,
"critical_comp_time": 0,
"thm_temp1_trans_count": 0,
"thm_temp2_trans_count": 0,
"thm_temp1_total_time": 0,
"thm_temp2_total_time": 0
}
Loading