Skip to content
This repository has been archived by the owner on Mar 27, 2024. It is now read-only.

Commit

Permalink
fix(nvme): handle string and float values (#993)
Browse files Browse the repository at this point in the history
  • Loading branch information
ilyam8 authored Nov 19, 2022
1 parent 49775c8 commit 87fddac
Show file tree
Hide file tree
Showing 5 changed files with 271 additions and 61 deletions.
58 changes: 32 additions & 26 deletions modules/nvme/collect.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"path/filepath"
"strconv"
"time"
)

Expand Down Expand Up @@ -42,32 +43,32 @@ func (n *NVMe) collectNVMeDevice(mx map[string]int64, devicePath string) error {

device := extractDeviceFromPath(devicePath)

mx["device_"+device+"_temperature"] = int64(float64(stats.Temperature) - 273.15) // Kelvin => Celsius
mx["device_"+device+"_percentage_used"] = stats.PercentUsed
mx["device_"+device+"_available_spare"] = stats.AvailSpare
mx["device_"+device+"_data_units_read"] = stats.DataUnitsRead * 1000 * 512
mx["device_"+device+"_data_units_written"] = stats.DataUnitsWritten * 1000 * 512
mx["device_"+device+"_host_read_commands"] = stats.HostReadCommands
mx["device_"+device+"_host_write_commands"] = stats.HostWriteCommands
mx["device_"+device+"_power_cycles"] = stats.PowerCycles
mx["device_"+device+"_power_on_time"] = stats.PowerOnHours * 3600 // hours => seconds
mx["device_"+device+"_unsafe_shutdowns"] = stats.UnsafeShutdowns
mx["device_"+device+"_media_errors"] = stats.MediaErrors
mx["device_"+device+"_num_err_log_entries"] = stats.NumErrLogEntries
mx["device_"+device+"_controller_busy_time"] = stats.ControllerBusyTime * 60 // minutes => seconds
mx["device_"+device+"_warning_temp_time"] = stats.WarningTempTime * 60 // minutes => seconds
mx["device_"+device+"_critical_comp_time"] = stats.CriticalCompTime * 60 // minutes => seconds
mx["device_"+device+"_thm_temp1_trans_count"] = stats.ThmTemp1TransCount
mx["device_"+device+"_thm_temp2_trans_count"] = stats.ThmTemp2TransCount
mx["device_"+device+"_thm_temp1_total_time"] = stats.ThmTemp1TotalTime // seconds
mx["device_"+device+"_thm_temp2_total_time"] = stats.ThmTemp2TotalTime // seconds

mx["device_"+device+"_critical_warning_available_spare"] = boolToInt(stats.CriticalWarning&1 != 0)
mx["device_"+device+"_critical_warning_temp_threshold"] = boolToInt(stats.CriticalWarning&(1<<1) != 0)
mx["device_"+device+"_critical_warning_nvm_subsystem_reliability"] = boolToInt(stats.CriticalWarning&(1<<2) != 0)
mx["device_"+device+"_critical_warning_read_only"] = boolToInt(stats.CriticalWarning&(1<<3) != 0)
mx["device_"+device+"_critical_warning_volatile_mem_backup_failed"] = boolToInt(stats.CriticalWarning&(1<<4) != 0)
mx["device_"+device+"_critical_warning_persistent_memory_read_only"] = boolToInt(stats.CriticalWarning&(1<<5) != 0)
mx["device_"+device+"_temperature"] = int64(float64(parseValue(stats.Temperature)) - 273.15) // Kelvin => Celsius
mx["device_"+device+"_percentage_used"] = parseValue(stats.PercentUsed)
mx["device_"+device+"_available_spare"] = parseValue(stats.AvailSpare)
mx["device_"+device+"_data_units_read"] = parseValue(stats.DataUnitsRead) * 1000 * 512 // units => bytes
mx["device_"+device+"_data_units_written"] = parseValue(stats.DataUnitsWritten) * 1000 * 512 // units => bytes
mx["device_"+device+"_host_read_commands"] = parseValue(stats.HostReadCommands)
mx["device_"+device+"_host_write_commands"] = parseValue(stats.HostWriteCommands)
mx["device_"+device+"_power_cycles"] = parseValue(stats.PowerCycles)
mx["device_"+device+"_power_on_time"] = parseValue(stats.PowerOnHours) * 3600 // hours => seconds
mx["device_"+device+"_unsafe_shutdowns"] = parseValue(stats.UnsafeShutdowns)
mx["device_"+device+"_media_errors"] = parseValue(stats.MediaErrors)
mx["device_"+device+"_num_err_log_entries"] = parseValue(stats.NumErrLogEntries)
mx["device_"+device+"_controller_busy_time"] = parseValue(stats.ControllerBusyTime) * 60 // minutes => seconds
mx["device_"+device+"_warning_temp_time"] = parseValue(stats.WarningTempTime) * 60 // minutes => seconds
mx["device_"+device+"_critical_comp_time"] = parseValue(stats.CriticalCompTime) * 60 // minutes => seconds
mx["device_"+device+"_thm_temp1_trans_count"] = parseValue(stats.ThmTemp1TransCount)
mx["device_"+device+"_thm_temp2_trans_count"] = parseValue(stats.ThmTemp2TransCount)
mx["device_"+device+"_thm_temp1_total_time"] = parseValue(stats.ThmTemp1TotalTime) // seconds
mx["device_"+device+"_thm_temp2_total_time"] = parseValue(stats.ThmTemp2TotalTime) // seconds

mx["device_"+device+"_critical_warning_available_spare"] = boolToInt(parseValue(stats.CriticalWarning)&1 != 0)
mx["device_"+device+"_critical_warning_temp_threshold"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<1) != 0)
mx["device_"+device+"_critical_warning_nvm_subsystem_reliability"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<2) != 0)
mx["device_"+device+"_critical_warning_read_only"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<3) != 0)
mx["device_"+device+"_critical_warning_volatile_mem_backup_failed"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<4) != 0)
mx["device_"+device+"_critical_warning_persistent_memory_read_only"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<5) != 0)

return nil
}
Expand Down Expand Up @@ -110,3 +111,8 @@ func boolToInt(v bool) int64 {
}
return 0
}

// parseValue converts the textual form of an nvme-cli numeric value to int64.
//
// The input may be a plain integer ("9898518") or a float ("9898518.0").
// Integers are parsed directly so that large counters keep full 64-bit
// precision; going through float64 would silently round anything above 2^53.
// Floats are truncated toward zero.
//
// Collection is best-effort: empty, malformed, NaN, infinite or out-of-range
// input yields 0 rather than an error.
func parseValue(s nvmeNumber) int64 {
	str := string(s)

	// Fast, lossless path for integer input.
	if i, err := strconv.ParseInt(str, 10, 64); err == nil {
		return i
	}

	f, err := strconv.ParseFloat(str, 64)
	if err != nil {
		// Deliberately ignored: an unparsable value is reported as 0.
		return 0
	}

	// Converting a float that does not fit into int64 (or NaN) is
	// implementation-defined in Go, so guard the conversion explicitly.
	// The comparisons are false for NaN, which therefore also yields 0.
	const limit = float64(1 << 63)
	if f >= -limit && f < limit {
		return int64(f)
	}
	return 0
}
59 changes: 34 additions & 25 deletions modules/nvme/exec.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package nvme

import (
"bytes"
"context"
"encoding/json"
"os/exec"
Expand All @@ -9,37 +10,45 @@ import (

type nvmeDeviceList struct {
Devices []struct {
DevicePath string `json:"DevicePath"`
UsedBytes int64 `json:"UsedBytes"`
PhysicalSize int64 `json:"PhysicalSize"`
SectorSize int64 `json:"SectorSize"`
DevicePath string `json:"DevicePath"`
UsedBytes nvmeNumber `json:"UsedBytes"`
PhysicalSize nvmeNumber `json:"PhysicalSize"`
SectorSize nvmeNumber `json:"SectorSize"`
}
}

// See "Health Information Log Page" in the Current Specification Version
// https://nvmexpress.org/developers/nvme-specification/
type nvmeDeviceSmartLog struct {
CriticalWarning int64 `json:"critical_warning"`
Temperature int64 `json:"temperature"`
AvailSpare int64 `json:"avail_spare"`
SpareThresh int64 `json:"spare_thresh"`
PercentUsed int64 `json:"percent_used"`
DataUnitsRead int64 `json:"data_units_read"`
DataUnitsWritten int64 `json:"data_units_written"`
HostReadCommands int64 `json:"host_read_commands"`
HostWriteCommands int64 `json:"host_write_commands"`
ControllerBusyTime int64 `json:"controller_busy_time"`
PowerCycles int64 `json:"power_cycles"`
PowerOnHours int64 `json:"power_on_hours"`
UnsafeShutdowns int64 `json:"unsafe_shutdowns"`
MediaErrors int64 `json:"media_errors"`
NumErrLogEntries int64 `json:"num_err_log_entries"`
WarningTempTime int64 `json:"warning_temp_time"`
CriticalCompTime int64 `json:"critical_comp_time"`
ThmTemp1TransCount int64 `json:"thm_temp1_trans_count"`
ThmTemp2TransCount int64 `json:"thm_temp2_trans_count"`
ThmTemp1TotalTime int64 `json:"thm_temp1_total_time"`
ThmTemp2TotalTime int64 `json:"thm_temp2_total_time"`
CriticalWarning nvmeNumber `json:"critical_warning"`
Temperature nvmeNumber `json:"temperature"`
AvailSpare nvmeNumber `json:"avail_spare"`
SpareThresh nvmeNumber `json:"spare_thresh"`
PercentUsed nvmeNumber `json:"percent_used"`
DataUnitsRead nvmeNumber `json:"data_units_read"`
DataUnitsWritten nvmeNumber `json:"data_units_written"`
HostReadCommands nvmeNumber `json:"host_read_commands"`
HostWriteCommands nvmeNumber `json:"host_write_commands"`
ControllerBusyTime nvmeNumber `json:"controller_busy_time"`
PowerCycles nvmeNumber `json:"power_cycles"`
PowerOnHours nvmeNumber `json:"power_on_hours"`
UnsafeShutdowns nvmeNumber `json:"unsafe_shutdowns"`
MediaErrors nvmeNumber `json:"media_errors"`
NumErrLogEntries nvmeNumber `json:"num_err_log_entries"`
WarningTempTime nvmeNumber `json:"warning_temp_time"`
CriticalCompTime nvmeNumber `json:"critical_comp_time"`
ThmTemp1TransCount nvmeNumber `json:"thm_temp1_trans_count"`
ThmTemp2TransCount nvmeNumber `json:"thm_temp2_trans_count"`
ThmTemp1TotalTime nvmeNumber `json:"thm_temp1_total_time"`
ThmTemp2TotalTime nvmeNumber `json:"thm_temp2_total_time"`
}

// nvmeNumber holds a numeric value from nvme-cli JSON output in its raw
// textual form.
//
// nvme-cli 2.1.1 exposes some values as strings, so the same field may arrive
// either as a JSON number (310, 310.0) or as a quoted string ("310"). The text
// is stored with surrounding quotes removed; converting it to a number is left
// to the code that reads the field.
type nvmeNumber string

// UnmarshalJSON implements json.Unmarshaler. It accepts both JSON numbers and
// JSON strings and stores the token without its surrounding double quotes.
// It never returns an error; no validation of the content is done here.
func (n *nvmeNumber) UnmarshalJSON(b []byte) error {
	// By encoding/json convention a JSON null is a no-op and must not
	// overwrite the current value with the literal text "null".
	if string(b) == "null" {
		return nil
	}
	*n = nvmeNumber(bytes.Trim(b, `"`))
	return nil
}

type nvmeCLIExec struct {
Expand Down
167 changes: 157 additions & 10 deletions modules/nvme/nvme_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,19 @@ import (
)

var (
dataNVMeListJSON, _ = os.ReadFile("testdata/nvme-list.json")
dataNVMeListEmptyJSON, _ = os.ReadFile("testdata/nvme-list-empty.json")
dataNVMeSmartLogJSON, _ = os.ReadFile("testdata/nvme-smart-log.json")
dataNVMeListJSON, _ = os.ReadFile("testdata/nvme-list.json")
dataNVMeListEmptyJSON, _ = os.ReadFile("testdata/nvme-list-empty.json")
dataNVMeSmartLogJSON, _ = os.ReadFile("testdata/nvme-smart-log.json")
dataNVMeSmartLogStringJSON, _ = os.ReadFile("testdata/nvme-smart-log-string.json")
dataNVMeSmartLogFloatJSON, _ = os.ReadFile("testdata/nvme-smart-log-float.json")
)

func Test_testDataIsValid(t *testing.T) {
for name, data := range map[string][]byte{
"dataNVMeListJSON": dataNVMeListJSON,
"dataNVMeListEmptyJSON": dataNVMeListEmptyJSON,
"dataNVMeSmartLogJSON": dataNVMeSmartLogJSON,
"dataNVMeListJSON": dataNVMeListJSON,
"dataNVMeListEmptyJSON": dataNVMeListEmptyJSON,
"dataNVMeSmartLogStringJSON": dataNVMeSmartLogStringJSON,
"dataNVMeSmartLogFloatJSON": dataNVMeSmartLogFloatJSON,
} {
require.NotNilf(t, data, name)
}
Expand Down Expand Up @@ -177,6 +180,132 @@ func TestNVMe_Collect(t *testing.T) {
},
},
},
"success if all calls successful with string values": {
{
prepare: prepareCaseStringValuesOK,
check: func(t *testing.T, n *NVMe) {
mx := n.Collect()

expected := map[string]int64{
"device_nvme0n1_available_spare": 100,
"device_nvme0n1_controller_busy_time": 497040,
"device_nvme0n1_critical_comp_time": 0,
"device_nvme0n1_critical_warning_available_spare": 0,
"device_nvme0n1_critical_warning_nvm_subsystem_reliability": 0,
"device_nvme0n1_critical_warning_persistent_memory_read_only": 0,
"device_nvme0n1_critical_warning_read_only": 0,
"device_nvme0n1_critical_warning_temp_threshold": 0,
"device_nvme0n1_critical_warning_volatile_mem_backup_failed": 0,
"device_nvme0n1_data_units_read": 5068041216000,
"device_nvme0n1_data_units_written": 69712734208000,
"device_nvme0n1_host_read_commands": 313528805,
"device_nvme0n1_host_write_commands": 1928062610,
"device_nvme0n1_media_errors": 0,
"device_nvme0n1_num_err_log_entries": 110,
"device_nvme0n1_percentage_used": 2,
"device_nvme0n1_power_cycles": 64,
"device_nvme0n1_power_on_time": 17906400,
"device_nvme0n1_temperature": 36,
"device_nvme0n1_thm_temp1_total_time": 0,
"device_nvme0n1_thm_temp1_trans_count": 0,
"device_nvme0n1_thm_temp2_total_time": 0,
"device_nvme0n1_thm_temp2_trans_count": 0,
"device_nvme0n1_unsafe_shutdowns": 39,
"device_nvme0n1_warning_temp_time": 0,
"device_nvme1n1_available_spare": 100,
"device_nvme1n1_controller_busy_time": 497040,
"device_nvme1n1_critical_comp_time": 0,
"device_nvme1n1_critical_warning_available_spare": 0,
"device_nvme1n1_critical_warning_nvm_subsystem_reliability": 0,
"device_nvme1n1_critical_warning_persistent_memory_read_only": 0,
"device_nvme1n1_critical_warning_read_only": 0,
"device_nvme1n1_critical_warning_temp_threshold": 0,
"device_nvme1n1_critical_warning_volatile_mem_backup_failed": 0,
"device_nvme1n1_data_units_read": 5068041216000,
"device_nvme1n1_data_units_written": 69712734208000,
"device_nvme1n1_host_read_commands": 313528805,
"device_nvme1n1_host_write_commands": 1928062610,
"device_nvme1n1_media_errors": 0,
"device_nvme1n1_num_err_log_entries": 110,
"device_nvme1n1_percentage_used": 2,
"device_nvme1n1_power_cycles": 64,
"device_nvme1n1_power_on_time": 17906400,
"device_nvme1n1_temperature": 36,
"device_nvme1n1_thm_temp1_total_time": 0,
"device_nvme1n1_thm_temp1_trans_count": 0,
"device_nvme1n1_thm_temp2_total_time": 0,
"device_nvme1n1_thm_temp2_trans_count": 0,
"device_nvme1n1_unsafe_shutdowns": 39,
"device_nvme1n1_warning_temp_time": 0,
}

assert.Equal(t, expected, mx)
},
},
},
"success if all calls successful with float values": {
{
prepare: prepareCaseFloatValuesOK,
check: func(t *testing.T, n *NVMe) {
mx := n.Collect()

expected := map[string]int64{
"device_nvme0n1_available_spare": 100,
"device_nvme0n1_controller_busy_time": 497040,
"device_nvme0n1_critical_comp_time": 0,
"device_nvme0n1_critical_warning_available_spare": 0,
"device_nvme0n1_critical_warning_nvm_subsystem_reliability": 0,
"device_nvme0n1_critical_warning_persistent_memory_read_only": 0,
"device_nvme0n1_critical_warning_read_only": 0,
"device_nvme0n1_critical_warning_temp_threshold": 0,
"device_nvme0n1_critical_warning_volatile_mem_backup_failed": 0,
"device_nvme0n1_data_units_read": 5068041216000,
"device_nvme0n1_data_units_written": 69712734208000,
"device_nvme0n1_host_read_commands": 313528805,
"device_nvme0n1_host_write_commands": 1928062610,
"device_nvme0n1_media_errors": 0,
"device_nvme0n1_num_err_log_entries": 110,
"device_nvme0n1_percentage_used": 2,
"device_nvme0n1_power_cycles": 64,
"device_nvme0n1_power_on_time": 17906400,
"device_nvme0n1_temperature": 36,
"device_nvme0n1_thm_temp1_total_time": 0,
"device_nvme0n1_thm_temp1_trans_count": 0,
"device_nvme0n1_thm_temp2_total_time": 0,
"device_nvme0n1_thm_temp2_trans_count": 0,
"device_nvme0n1_unsafe_shutdowns": 39,
"device_nvme0n1_warning_temp_time": 0,
"device_nvme1n1_available_spare": 100,
"device_nvme1n1_controller_busy_time": 497040,
"device_nvme1n1_critical_comp_time": 0,
"device_nvme1n1_critical_warning_available_spare": 0,
"device_nvme1n1_critical_warning_nvm_subsystem_reliability": 0,
"device_nvme1n1_critical_warning_persistent_memory_read_only": 0,
"device_nvme1n1_critical_warning_read_only": 0,
"device_nvme1n1_critical_warning_temp_threshold": 0,
"device_nvme1n1_critical_warning_volatile_mem_backup_failed": 0,
"device_nvme1n1_data_units_read": 5068041216000,
"device_nvme1n1_data_units_written": 69712734208000,
"device_nvme1n1_host_read_commands": 313528805,
"device_nvme1n1_host_write_commands": 1928062610,
"device_nvme1n1_media_errors": 0,
"device_nvme1n1_num_err_log_entries": 110,
"device_nvme1n1_percentage_used": 2,
"device_nvme1n1_power_cycles": 64,
"device_nvme1n1_power_on_time": 17906400,
"device_nvme1n1_temperature": 36,
"device_nvme1n1_thm_temp1_total_time": 0,
"device_nvme1n1_thm_temp1_trans_count": 0,
"device_nvme1n1_thm_temp2_total_time": 0,
"device_nvme1n1_thm_temp2_trans_count": 0,
"device_nvme1n1_unsafe_shutdowns": 39,
"device_nvme1n1_warning_temp_time": 0,
}

assert.Equal(t, expected, mx)
},
},
},
"fail if 'nvme list' returns an empty list": {
{
prepare: prepareCaseEmptyList,
Expand Down Expand Up @@ -227,6 +356,14 @@ func prepareCaseOK(n *NVMe) {
n.exec = &mockNVMeCLIExec{}
}

// prepareCaseStringValuesOK swaps the collector's executor for a mock that has
// its smartLogString flag set.
func prepareCaseStringValuesOK(n *NVMe) {
	mock := new(mockNVMeCLIExec)
	mock.smartLogString = true
	n.exec = mock
}

// prepareCaseFloatValuesOK swaps the collector's executor for a mock that has
// its smartLogFloat flag set.
func prepareCaseFloatValuesOK(n *NVMe) {
	mock := new(mockNVMeCLIExec)
	mock.smartLogFloat = true
	n.exec = mock
}

// prepareCaseEmptyList swaps the collector's executor for a mock that has its
// emptyList flag set.
func prepareCaseEmptyList(n *NVMe) {
	mock := new(mockNVMeCLIExec)
	mock.emptyList = true
	n.exec = mock
}
Expand All @@ -240,9 +377,11 @@ func prepareCaseErrOnSmartLog(n *NVMe) {
}

type mockNVMeCLIExec struct {
errOnList bool
errOnSmartLog bool
emptyList bool
errOnList bool
errOnSmartLog bool
emptyList bool
smartLogString bool
smartLogFloat bool
}

func (m *mockNVMeCLIExec) list() (*nvmeDeviceList, error) {
Expand Down Expand Up @@ -271,8 +410,16 @@ func (m *mockNVMeCLIExec) smartLog(_ string) (*nvmeDeviceSmartLog, error) {
return nil, errors.New("mock.smartLog() no devices error")
}

data := dataNVMeSmartLogJSON
if m.smartLogString {
data = dataNVMeSmartLogStringJSON
}
if m.smartLogFloat {
data = dataNVMeSmartLogFloatJSON
}

var v nvmeDeviceSmartLog
if err := json.Unmarshal(dataNVMeSmartLogJSON, &v); err != nil {
if err := json.Unmarshal(data, &v); err != nil {
return nil, err
}

Expand Down
24 changes: 24 additions & 0 deletions modules/nvme/testdata/nvme-smart-log-float.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{
"critical_warning": 0,
"temperature": 310.0,
"avail_spare": 100.0,
"spare_thresh": 5.0,
"percent_used": 2.0,
"endurance_grp_critical_warning_summary": 0,
"data_units_read": 9898518.0,
"data_units_written": 136157684.0,
"host_read_commands": 313528805.0,
"host_write_commands": 1928062610.0,
"controller_busy_time": 8284.0,
"power_cycles": 64.0,
"power_on_hours": 4974.0,
"unsafe_shutdowns": 39.0,
"media_errors": 0,
"num_err_log_entries": 110.0,
"warning_temp_time": 0,
"critical_comp_time": 0,
"thm_temp1_trans_count": 0,
"thm_temp2_trans_count": 0,
"thm_temp1_total_time": 0,
"thm_temp2_total_time": 0
}
Loading

0 comments on commit 87fddac

Please sign in to comment.