forked from influxdata/telegraf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
smart.go
457 lines (395 loc) · 12.3 KB
/
smart.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
package smart
import (
"bufio"
"fmt"
"os/exec"
"path"
"regexp"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/plugins/inputs"
)
var (
// modelInfo matches the device model line; capture group 2 holds the model.
// Device Model: APPLE SSD SM256E
// Product: HUH721212AL5204
// Model Number: TS128GMTE850
modelInfo = regexp.MustCompile("^(Device Model|Product|Model Number):\\s+(.*)$")
// serialInfo matches the serial number line (case-insensitively);
// capture group 1 holds the serial number.
// Serial Number: S0X5NZBC422720
serialInfo = regexp.MustCompile("(?i)^Serial Number:\\s+(.*)$")
// wwnInfo matches the WWN line; capture group 1 holds the id, spaces included.
// LU WWN Device Id: 5 002538 655584d30
wwnInfo = regexp.MustCompile("^LU WWN Device Id:\\s+(.*)$")
// usercapacityInfo matches the capacity line; capture group 1 holds the
// byte count, still containing its thousands separators.
// User Capacity: 251,000,193,024 bytes [251 GB]
usercapacityInfo = regexp.MustCompile("^User Capacity:\\s+([0-9,]+)\\s+bytes.*$")
// smartEnabledInfo matches the SMART support line; capture group 1 holds
// the reported state.
// SMART support is: Enabled
smartEnabledInfo = regexp.MustCompile("^SMART support is:\\s+(\\w+)$")
// smartOverallHealth matches either form of the health line; capture
// group 2 holds the result word.
// SMART overall-health self-assessment test result: PASSED
// SMART Health Status: OK
// PASSED, FAILED, UNKNOWN
smartOverallHealth = regexp.MustCompile("^(SMART overall-health self-assessment test result|SMART Health Status):\\s+(\\w+).*$")
// sasNvmeAttr is a SAS or NVME SMART attribute, printed as "<name>: <value>".
// Capture group 1 is the name (looked up in sasNvmeAttributes), group 2 the value.
sasNvmeAttr = regexp.MustCompile(`^([^:]+):\s+(.+)$`)
// attribute matches one row of the attribute table. Capture groups, in
// order: id, name, flags, value, worst, threshold, fail, raw value.
// ID# ATTRIBUTE_NAME FLAGS VALUE WORST THRESH FAIL RAW_VALUE
// 1 Raw_Read_Error_Rate -O-RC- 200 200 000 - 0
// 5 Reallocated_Sector_Ct PO--CK 100 100 000 - 0
// 192 Power-Off_Retract_Count -O--C- 097 097 000 - 14716
attribute = regexp.MustCompile("^\\s*([0-9]+)\\s(\\S+)\\s+([-P][-O][-S][-R][-C][-K])\\s+([0-9]+)\\s+([0-9]+)\\s+([0-9-]+)\\s+([-\\w]+)\\s+([\\w\\+\\.]+).*$")
// deviceFieldIds maps an attribute id to the name of the smart_device field
// that receives that attribute's raw value. Ids 190 and 194 both feed temp_c.
deviceFieldIds = map[string]string{
"1": "read_error_rate",
"7": "seek_error_rate",
"190": "temp_c",
"194": "temp_c",
"199": "udma_crc_errors",
}
// sasNvmeAttributes is keyed by the attribute name as it appears before the
// colon in a line matched by sasNvmeAttr. ID (optional) and Name become the
// "id" and "name" tags of the emitted metric. Parse (optional) converts the
// text after the colon; when it is nil, parseCommaSeperatedInt is used.
sasNvmeAttributes = map[string]struct {
ID string
Name string
Parse func(fields, deviceFields map[string]interface{}, str string) error
}{
"Accumulated start-stop cycles": {
ID: "4",
Name: "Start_Stop_Count",
},
"Accumulated load-unload cycles": {
ID: "193",
Name: "Load_Cycle_Count",
},
"Current Drive Temperature": {
ID: "194",
Name: "Temperature_Celsius",
Parse: parseTemperature,
},
"Temperature": {
ID: "194",
Name: "Temperature_Celsius",
Parse: parseTemperature,
},
"Power Cycles": {
ID: "12",
Name: "Power_Cycle_Count",
},
"Power On Hours": {
ID: "9",
Name: "Power_On_Hours",
},
"Media and Data Integrity Errors": {
Name: "Media_and_Data_Integrity_Errors",
},
"Error Information Log Entries": {
Name: "Error_Information_Log_Entries",
},
"Critical Warning": {
Name: "Critical_Warning",
// The value is read as a 0x-prefixed hexadecimal number.
Parse: func(fields, _ map[string]interface{}, str string) error {
var value int64
if _, err := fmt.Sscanf(str, "0x%x", &value); err != nil {
return err
}
fields["raw_value"] = value
return nil
},
},
"Available Spare": {
Name: "Available_Spare",
// A trailing "%" is stripped before the value is parsed as an integer.
Parse: func(fields, deviceFields map[string]interface{}, str string) error {
return parseCommaSeperatedInt(fields, deviceFields, strings.TrimSuffix(str, "%"))
},
},
}
)
// Smart holds the settings used to collect S.M.A.R.T. data by running smartctl.
type Smart struct {
// Path is the smartctl executable to run; Gather returns an error when it is empty.
Path string
// Nocheck is handed to smartctl as the value of its -n option.
Nocheck string
// Attributes enables emitting one "smart_attribute" metric per parsed attribute.
Attributes bool
// Excludes lists device names that are dropped from the result of "smartctl --scan".
Excludes []string
// Devices lists the devices to query; each entry is split on spaces and
// appended to the smartctl arguments. When empty, devices are discovered
// with "smartctl --scan".
Devices []string
// UseSudo makes every smartctl invocation go through "sudo -n".
UseSudo bool
// Timeout is passed to internal.CombinedOutputTimeout for every smartctl invocation.
Timeout internal.Duration
}
// sampleConfig is the example configuration text returned by SampleConfig.
var sampleConfig = `
## Optionally specify the path to the smartctl executable
# path = "/usr/bin/smartctl"
## On most platforms smartctl requires root access.
## Setting 'use_sudo' to true will make use of sudo to run smartctl.
## Sudo must be configured to allow the telegraf user to run smartctl
## without a password.
# use_sudo = false
## Skip checking disks in this power mode. Defaults to
## "standby" to not wake up disks that have stopped rotating.
## See --nocheck in the man pages for smartctl.
## smartctl version 5.41 and 5.42 have faulty detection of
## power mode and might require changing this value to
## "never" depending on your disks.
# nocheck = "standby"
## Gather all returned S.M.A.R.T. attribute metrics and the detailed
## information from each drive into the 'smart_attribute' measurement.
# attributes = false
## Optionally specify devices to exclude from reporting.
# excludes = [ "/dev/pass6" ]
## Optionally specify devices and device type, if unset
## a scan (smartctl --scan) for S.M.A.R.T. devices will be
## done and all found will be included except for the
## excluded in excludes.
# devices = [ "/dev/ada0 -d atacam" ]
## Timeout for the smartctl command to complete.
# timeout = "30s"
`
// NewSmart returns a Smart whose Timeout is preset to 30 seconds; every other
// field is left at its zero value.
func NewSmart() *Smart {
	s := new(Smart)
	s.Timeout = internal.Duration{Duration: 30 * time.Second}
	return s
}
// SampleConfig returns the example configuration text held in sampleConfig.
func (m *Smart) SampleConfig() string {
return sampleConfig
}
// Description returns a one-line summary of what this input collects.
func (m *Smart) Description() string {
	const summary = "Read metrics from storage devices supporting S.M.A.R.T."
	return summary
}
// Gather collects metrics for every configured device, or for every device
// found by a scan when none is configured, and reports them through acc.
// It returns an error when no smartctl path is set or when the scan fails;
// per-device failures are reported by the workers through acc instead.
func (m *Smart) Gather(acc telegraf.Accumulator) error {
	if m.Path == "" {
		return fmt.Errorf("smartctl not found: verify that smartctl is installed and that smartctl is in your PATH")
	}

	targets := m.Devices
	if len(targets) == 0 {
		// Nothing configured explicitly: fall back to discovery.
		scanned, err := m.scan()
		if err != nil {
			return err
		}
		targets = scanned
	}

	m.getAttributes(acc, targets)
	return nil
}
// runCmd executes command with args, prefixed by "sudo -n" when sudo is true,
// and returns whatever internal.CombinedOutputTimeout yields for the given
// timeout. It is a variable rather than a function, so it can be reassigned.
var runCmd = func(timeout internal.Duration, sudo bool, command string, args ...string) ([]byte, error) {
	var cmd *exec.Cmd
	if sudo {
		sudoArgs := append([]string{"-n", command}, args...)
		cmd = exec.Command("sudo", sudoArgs...)
	} else {
		cmd = exec.Command(command, args...)
	}
	return internal.CombinedOutputTimeout(cmd, timeout.Duration)
}
// scan runs "smartctl --scan" and returns the first space-separated token of
// every output line that has more than one token, skipping names listed in
// m.Excludes. On command failure it returns an empty slice and an error that
// includes the command output.
func (m *Smart) scan() ([]string, error) {
	out, err := runCmd(m.Timeout, m.UseSudo, m.Path, "--scan")
	if err != nil {
		return []string{}, fmt.Errorf("failed to run command '%s --scan': %s - %s", m.Path, err, string(out))
	}

	found := []string{}
	for _, entry := range strings.Split(string(out), "\n") {
		tokens := strings.Split(entry, " ")
		if len(tokens) < 2 {
			// Blank or single-token lines do not describe a device.
			continue
		}
		name := strings.TrimSpace(tokens[0])
		if excludedDev(m.Excludes, name) {
			continue
		}
		found = append(found, name)
	}
	return found, nil
}
// excludedDev reports whether the device named by deviceLine appears in
// excludes. Only the text before the first space of deviceLine is compared,
// and the comparison is an exact string match.
func excludedDev(excludes []string, deviceLine string) bool {
	name := strings.SplitN(deviceLine, " ", 2)[0]
	for _, candidate := range excludes {
		if candidate == name {
			return true
		}
	}
	return false
}
// getAttributes starts one gatherDisk goroutine per entry in devices and
// blocks until all of them have finished.
func (m *Smart) getAttributes(acc telegraf.Accumulator, devices []string) {
	var pending sync.WaitGroup
	for i := range devices {
		pending.Add(1)
		go gatherDisk(acc, m.Timeout, m.UseSudo, m.Attributes, m.Path, m.Nocheck, devices[i], &pending)
	}
	pending.Wait()
}
// exitStatus extracts the process exit status from err.
//
// When err is an *exec.ExitError carrying a syscall.WaitStatus, the exit
// status is returned with a nil error. Any other value, nil included, is
// handed back unchanged alongside a status of 0.
//
// Command line parse errors are denoted by the exit code having the 0 bit set.
// All other errors are drive/communication errors and should be ignored.
func exitStatus(err error) (int, error) {
	exitErr, isExit := err.(*exec.ExitError)
	if !isExit {
		return 0, err
	}
	waitStatus, isWait := exitErr.Sys().(syscall.WaitStatus)
	if !isWait {
		return 0, err
	}
	return waitStatus.ExitStatus(), nil
}
// gatherDisk runs smartctl against a single device, parses the output line by
// line and reports the result through acc.
//
// device may carry extra smartctl arguments after the device node, separated
// by spaces; they are appended to the command line as-is. One "smart_device"
// metric is always emitted after parsing. When collectAttributes is true, a
// "smart_attribute" metric is additionally emitted for every attribute table
// row and every recognized SAS/NVMe attribute line.
//
// If the command fails in a way exitStatus cannot translate into an exit
// status, the failure is reported with acc.AddError and nothing else is
// emitted. wg.Done is called when the function returns.
func gatherDisk(acc telegraf.Accumulator, timeout internal.Duration, usesudo, collectAttributes bool, smartctl, nocheck, device string, wg *sync.WaitGroup) {
defer wg.Done()
// smartctl 5.41 & 5.42 are broken regarding handling of --nocheck/-n
args := []string{"--info", "--health", "--attributes", "--tolerance=verypermissive", "-n", nocheck, "--format=brief"}
args = append(args, strings.Split(device, " ")...)
out, e := runCmd(timeout, usesudo, smartctl, args...)
outStr := string(out)
// Ignore all exit statuses except if it is a command line parse error
// (the local exitStatus shadows the helper of the same name from here on)
exitStatus, er := exitStatus(e)
if er != nil {
acc.AddError(fmt.Errorf("failed to run command '%s %s': %s - %s", smartctl, strings.Join(args, " "), e, outStr))
return
}
// deviceTags/deviceFields accumulate across all lines and form the
// smart_device metric emitted at the end.
deviceTags := map[string]string{}
deviceNode := strings.Split(device, " ")[0]
deviceTags["device"] = path.Base(deviceNode)
deviceFields := make(map[string]interface{})
deviceFields["exit_status"] = exitStatus
scanner := bufio.NewScanner(strings.NewReader(outStr))
for scanner.Scan() {
line := scanner.Text()
// Every line is tried against each device-info pattern in turn.
model := modelInfo.FindStringSubmatch(line)
if len(model) > 2 {
deviceTags["model"] = model[2]
}
serial := serialInfo.FindStringSubmatch(line)
if len(serial) > 1 {
deviceTags["serial_no"] = serial[1]
}
wwn := wwnInfo.FindStringSubmatch(line)
if len(wwn) > 1 {
// drop the spaces inside the id
deviceTags["wwn"] = strings.Replace(wwn[1], " ", "", -1)
}
capacity := usercapacityInfo.FindStringSubmatch(line)
if len(capacity) > 1 {
// drop the thousands separators
deviceTags["capacity"] = strings.Replace(capacity[1], ",", "", -1)
}
enabled := smartEnabledInfo.FindStringSubmatch(line)
if len(enabled) > 1 {
deviceTags["enabled"] = enabled[1]
}
health := smartOverallHealth.FindStringSubmatch(line)
if len(health) > 2 {
deviceFields["health_ok"] = (health[2] == "PASSED" || health[2] == "OK")
}
// tags/fields are per-line and only ever used for a smart_attribute metric.
tags := map[string]string{}
fields := make(map[string]interface{})
if collectAttributes {
// Copy the device tags seen so far onto the attribute metric.
keys := [...]string{"device", "model", "serial_no", "wwn", "capacity", "enabled"}
for _, key := range keys {
if value, ok := deviceTags[key]; ok {
tags[key] = value
}
}
}
attr := attribute.FindStringSubmatch(line)
if len(attr) > 1 {
// attribute has been found, add it only if collectAttributes is true
if collectAttributes {
tags["id"] = attr[1]
tags["name"] = attr[2]
tags["flags"] = attr[3]
fields["exit_status"] = exitStatus
// Numeric columns that fail to parse are simply left out of the metric.
if i, err := strconv.ParseInt(attr[4], 10, 64); err == nil {
fields["value"] = i
}
if i, err := strconv.ParseInt(attr[5], 10, 64); err == nil {
fields["worst"] = i
}
if i, err := strconv.ParseInt(attr[6], 10, 64); err == nil {
fields["threshold"] = i
}
tags["fail"] = attr[7]
if val, err := parseRawValue(attr[8]); err == nil {
fields["raw_value"] = val
}
acc.AddFields("smart_attribute", fields, tags)
}
// If the attribute matches on the one in deviceFieldIds
// save the raw value to a field.
if field, ok := deviceFieldIds[attr[1]]; ok {
if val, err := parseRawValue(attr[8]); err == nil {
deviceFields[field] = val
}
}
} else {
// what was found is not a vendor attribute
if matches := sasNvmeAttr.FindStringSubmatch(line); len(matches) > 2 {
if attr, ok := sasNvmeAttributes[matches[1]]; ok {
tags["name"] = attr.Name
if attr.ID != "" {
tags["id"] = attr.ID
}
// Fall back to plain integer parsing when the attribute has no parser of its own.
parse := parseCommaSeperatedInt
if attr.Parse != nil {
parse = attr.Parse
}
// The parser runs even when collectAttributes is false because it
// receives deviceFields and may write to it.
if err := parse(fields, deviceFields, matches[2]); err != nil {
continue
}
// if the field is classified as an attribute, only add it
// if collectAttributes is true
if collectAttributes {
acc.AddFields("smart_attribute", fields, tags)
}
}
}
}
}
acc.AddFields("smart_device", deviceFields, deviceTags)
}
func parseRawValue(rawVal string) (int64, error) {
// Integer
if i, err := strconv.ParseInt(rawVal, 10, 64); err == nil {
return i, nil
}
// Duration: 65h+33m+09.259s
unit := regexp.MustCompile("^(.*)([hms])$")
parts := strings.Split(rawVal, "+")
if len(parts) == 0 {
return 0, fmt.Errorf("Couldn't parse RAW_VALUE '%s'", rawVal)
}
duration := int64(0)
for _, part := range parts {
timePart := unit.FindStringSubmatch(part)
if len(timePart) == 0 {
continue
}
switch timePart[2] {
case "h":
duration += parseInt(timePart[1]) * int64(3600)
case "m":
duration += parseInt(timePart[1]) * int64(60)
case "s":
// drop fractions of seconds
duration += parseInt(strings.Split(timePart[1], ".")[0])
default:
// Unknown, ignore
}
}
return duration, nil
}
// parseInt converts str from base 10 to an int64 and returns 0 whenever the
// conversion fails, including on overflow.
func parseInt(str string) int64 {
	n, err := strconv.ParseInt(str, 10, 64)
	if err != nil {
		return 0
	}
	return n
}
// parseCommaSeperatedInt removes every comma from str, parses the remainder
// as a base-10 int64 and stores it in fields["raw_value"]. On a parse failure
// the error is returned and fields is left untouched. The second map is
// unused; it exists so the function fits the Parse signature used by
// sasNvmeAttributes.
func parseCommaSeperatedInt(fields, _ map[string]interface{}, str string) error {
	digits := strings.Replace(str, ",", "", -1)
	parsed, err := strconv.ParseInt(digits, 10, 64)
	if err == nil {
		fields["raw_value"] = parsed
	}
	return err
}
// parseTemperature reads a leading integer followed by " C" from str (for
// example "38 C") and records it both as fields["raw_value"] and as
// deviceFields["temp_c"]. When str does not scan, the error is returned and
// neither map is modified.
func parseTemperature(fields, deviceFields map[string]interface{}, str string) error {
	var celsius int64
	_, err := fmt.Sscanf(str, "%d C", &celsius)
	if err != nil {
		return err
	}
	deviceFields["temp_c"] = celsius
	fields["raw_value"] = celsius
	return nil
}
// init registers the plugin under the name "smart". Each instance created by
// the registered factory starts from NewSmart, has Nocheck set to "standby",
// and has Path set to the smartctl found by exec.LookPath, if there is one.
func init() {
	inputs.Add("smart", func() telegraf.Input {
		plugin := NewSmart()
		plugin.Nocheck = "standby"
		// A failed lookup yields an empty string, which leaves Path unset.
		if smartctl, _ := exec.LookPath("smartctl"); smartctl != "" {
			plugin.Path = smartctl
		}
		return plugin
	})
}