-
Notifications
You must be signed in to change notification settings - Fork 0
/
36-diskstatus
executable file
·333 lines (292 loc) · 12.7 KB
/
36-diskstatus
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
#!/bin/bash
# NAME: 36-diskstatus.sh
#
# DESCRIPTION: Report status of each drive installed, including temperature as
# reported by HDDtemp and drive testing status from smartctl.
# If HDDtemp is not available or no temperature is found, the script
# will fall back to smartctl scrapes for temperature information.
#
# NVMe devices will use nvme-cli to obtain error and wear-level
# indicators.
#
# Originally Based on: https://github.com/yboetz/motd
#
# AUTHOR : Richard J. DURSO
# DATE : 08/20/2023
# VERSION : 1.13
##############################################################################
# Abort on any unhandled command failure (-e) and on use of an unset
# variable (-u).
set -eu

# Only continue when the effective user is root (EUID 0).
if (( EUID != 0 )); then
  printf '%s\n' "${0##*/}: Please run as root!" >&2
  exit 1
fi

# Verify that every external utility listed in CHECK_UTILS can be found;
# stop at the first one that is missing.
CHECK_UTILS="nvme smartctl"
for checkname in ${CHECK_UTILS}; do
  if command -v "${checkname}" >/dev/null 2>&1; then
    continue
  fi
  printf '%s\n' "${0##*/}: ${checkname} is not installed. Aborting." >&2
  exit 1
done
#---[ Variables and Constants ]-----------------------------------------------
# Warning and critical temperatures vary by device. The defaults below are
# historically based on personal preferences for spinning-rust HDDs; acceptable
# temperatures for SATA SSDs and NVMe devices are higher. Adjust these as you
# see fit.
# Max temperature: readings above this value are shown in red. One value per
# unit; the one matching the unit of the displayed reading is used.
MAX_TEMP_C=45
MAX_TEMP_F=113
# Warning temperature, expressed as a percentage of the max temperature.
# Readings above MAX_TEMP * WARN_PERCENT / 100 are shown in yellow.
WARN_PERCENT=85
# NVMe/SATA SSD wear-level thresholds, compared against the "percent of life
# left" figure when it is colorized (warning and critical levels).
LIFE_WARN=30
LIFE_CRIT=15
# Colors: escape sequences, expanded later when the table is printed with
# printf '%b'.
# white="\e[39m"
yellow="\e[1;33m"
green="\e[1;32m"
red="\e[1;31m"
dim="\e[2m"
undim="\e[0m"
# hddtemp can already report F or C temperatures for SATA devices. If this
# script grabs a temperature from smartctl and it is reported in "C", set the
# following to /bin/true if you want that value converted to F. The value is
# executed as a command and its exit status decides, so use /bin/false to keep
# Celsius.
convert_c_to_f=/bin/true
# Log files to check. Leave empty to use journalctl, or specify the log files
# to search as an array, for example:
# logfiles=(/var/log/syslog /var/log/syslog.1)
logfiles=
# How many days back to search journalctl entries (from that many days ago up
# to today), e.g. the past 3 days. Only used when logfiles is empty.
journalctl_days=3
#------------------------------------------------------------------------------
#---[ You can update these ]---------------------------------------------------
# Types of disks to look for. Used by awk as a regular expression selecting
# which /dev/disk/by-id entry types to include (IDE/SATA, NVMe) - you might add
# another alternative for USB: "usb-".
findthese="ata-|scsi-SATA_|nvme-"
# Used by awk as a regular expression to drop unwanted disk devices which
# matched the pattern above.
ignorethese="nvme-(eui|nvme).*|nvme-.*_1[[:space:]]"
# Used by sed to strip text from disk device names. This does not alter device
# selection like the two patterns above; it only makes the displayed disk
# device names nicer (shorter).
sed_filter="s/^scsi-SATA_//; s/^ata-//; s/Series_//; s/^nvme-//; s/_with_Heatsink//;"
#-------------------------------------------------------------------------------
#---[ Do Not Update These ]-----------------------------------------------------
# Number of disks displayed per table row (a row break is inserted after this
# many entries).
# NOTE(review): COLUMNS is also a variable bash itself maintains (terminal
# width, see the checkwinsize shell option) - confirm it cannot be overwritten
# when this script is run from an interactive terminal.
COLUMNS=2
# These are not disks: partitions, cd-roms, etc. Used by awk as a regular
# expression to exclude such by-id entries.
notdisks="part[0-9]|sr[0-9]|/dm-|/md-|total"
# Every /dev/disk/by-id symlink as "<link path> <link target>", sorted by
# target, with anything matching the "notdisks" pattern removed.
alldisks=$(
  find /dev/disk/by-id/ -type l -printf "%p %l\n" \
    | sort -k 2 \
    | awk -F/ -v a="$notdisks" '$0 !~ a { print }'
)

# Simple device names (sda, sdb, sdc, ...) of the wanted disk types. These are
# generated dynamically because they change when disks are removed.
readarray -t disksalias <<< "$(
  awk -F/ -v a="${findthese}" -v b="${ignorethese}" \
    '$0 ~ a && $0 !~ b { print $7 }' <<< "${alldisks}" \
    | uniq
)"

# Raw by-id names to work on, de-duplicated.
# Assumption: the last 15 characters (device alias plus serial number) are
# enough to recognise a duplicate entry for the same device, so adjacent lines
# sharing that tail are collapsed (reverse, compare the first 15 chars,
# reverse back).
readarray -t rawdisks <<< "$(
  awk -F'[/ ]' -v a="${findthese}" -v b="${ignorethese}" \
    '$0 ~ a && $0 !~ b { print $5}' <<< "${alldisks}" \
    | rev \
    | uniq -w 15 \
    | rev
)"

# Display names: raw names with the sed_filter substitutions applied.
readarray -t disks <<< "$(
  printf '%s\n' "${rawdisks[@]}" \
    | sed "${sed_filter}"
)"

# Shorter display name: everything to the LEFT of the right-most "_".
readarray -t shortnames <<< "$(
  printf '%s\n' "${disks[@]}" \
    | sed -r 's/^(.*)_.*$/\1/'
)"

# Last 4 characters of each display name (tail of the disk serial number).
readarray -t diskserials <<< "$(
  printf '%s\n' "${disks[@]}" \
    | sed -r 's/.*(....)$/\1/'
)"
#-------------------------------------------------------------------------------
#######################################
# Build a colorized "[NN%]" life-remaining indicator.
# Globals:
#   percent_left (read)  - integer percent of device life left; may be empty
#   LIFE_WARN    (read)  - below this percent the indicator is yellow
#   LIFE_CRIT    (read)  - below this percent the indicator is red
#   green, yellow, red, dim, undim (read) - color escape sequences
#   life         (written) - the formatted indicator; left untouched when
#                            percent_left is empty
# Arguments: none
# Returns:   0
#######################################
__colorize_percent_left() {
  # Nothing to format when no percentage was determined.
  [[ -n "${percent_left}" ]] || return 0

  # Test the critical threshold first: LIFE_CRIT is lower than LIFE_WARN, so
  # checking the warning level first would make the critical color unreachable.
  local life_color="${green}"
  if [[ "${percent_left}" -lt "${LIFE_CRIT}" ]]; then
    life_color="${red}"
  elif [[ "${percent_left}" -lt "${LIFE_WARN}" ]]; then
    life_color="${yellow}"
  fi

  life="${dim} ${life_color}[${undim}${life_color}${percent_left}%${dim}]${undim}"
}
#-------------------------------------------------------------------------------
# grep exits non-zero when nothing matches (a system with only NVMe devices
# might not have any smartd self-test entries), so errexit is suspended while
# the log lines are collected.
set +e
if [[ -z "${logfiles}" ]]; then
  # No log files configured: take the smartd entries from journalctl, newest
  # first, going back journalctl_days days.
  lines=$(
    journalctl -r -t smartd \
      --since "$(date +%Y-%m-%d -d "${journalctl_days} days ago")" \
      | grep -hiP 'smartd.*previous self-test'
  )
else
  # Read the configured log files in reverse so the newest entries come first.
  lines=$(
    tac "${logfiles[@]}" 2>/dev/null \
      | grep -hiP 'smartd.*previous self-test'
  )
fi
set -e
out=""
# Build one comma-separated record per disk ("name (alias):,temp,status,") and
# append it to $out. A "\n" escape is appended after every $COLUMNS records so
# the final table shows that many disks per row.
for i in "${!disksalias[@]}"; do # for every /dev/sdX device name
  #---[ self-test status ]----------------------------------------------------
  # Look up the by-id path for this device alias. awk stops at the first
  # matching entry, so at most one name is returned.
  readarray -t possible_names <<< "$(echo "${alldisks[@]}" | awk -v a="${disksalias[$i]}" '$0 ~ a { print $1;exit }')"
  for name in "${possible_names[@]}"; do
    life=""
    percent_left=""
    error_status_code=""
    # awk prints the last two words of the first log line mentioning this
    # device and exits with status 1 to signal "found". errexit has to be off
    # so that status can be inspected, and nothing may run between the
    # assignment and the $? test.
    set +e
    result=$(echo "${lines}" | awk -v a="${name}" '$0 ~ a {print $(NF-1),$NF; exit 1 }')
    if [ $? -eq 1 ]; then
      set -e
      # SATA SSDs have a Wear_Leveling_Count or Percent_Lifetime_Remain that
      # goes from 100 to 0 in column "VALUE" (4th column); if detected report
      # it as percent left. Only the first matching attribute is used: a drive
      # exposing both would otherwise yield a multi-line value that breaks the
      # arithmetic below.
      percent_left=$(smartctl -a "/dev/${disksalias[$i]}" | awk '/(Wear_Leveling_Count|Percent_Lifetime_Remain)/{print $4; exit}')
      if [[ ${percent_left} =~ ^[0-9]+$ ]]; then
        # Base10 to Base10 conversion to remove any leading zeros
        percent_left="$(( 10#${percent_left} ))"
        # if result is "without error" change to "passed" to make room for
        # the percent_left display
        [[ "${result}" == "without error" ]] && result="passed"
        __colorize_percent_left
      fi
      break
    fi
    set -e

    # If no result, see if unexpected results can be found
    if [[ -z ${result} ]]; then
      if [ "$(echo "${lines}" | grep -m 1 -HiP "${name}.*self-test" | grep -ci "skip")" -eq 1 ]; then # Test skipped
        result="skipped"
        break
      fi
      if [ "$(echo "${lines}" | grep -m 1 -HiP "${name}.*self-test" | grep -ci "in progress")" -eq 1 ]; then # Test in progress
        result="in progress"
        break
      fi
      # Uncomment this to discover other reasons for no test results
      # If this shows nothing, then data has rotated out of where "$logfiles" is checking.
      # tac $logfiles 2>/dev/null | grep -m 1 -HiP "${name}.*self-test"

      # if NVMe device, get status from nvme-cli reporting
      if [ "$(echo "${disksalias[$i]}" | grep -ci "nvme")" -eq 1 ]; then
        # Get the device wear level estimate (percentage_used) and subtract it
        # from 100%. When nvme-cli reports nothing parseable the indicator is
        # skipped instead of aborting on an incomplete arithmetic expression.
        percent_used="$(nvme smart-log -H "/dev/${disksalias[$i]}" | awk -F '[:% ]' '/^percentage_used/{print $3}')"
        if [[ ${percent_used} =~ ^[0-9]+$ ]]; then
          percent_left="$(( 100 - 10#${percent_used} ))"
          __colorize_percent_left
        fi
        # Use nvme-cli error-log to see if error count is equal to zero
        if [ "$(nvme error-log -e 1 "/dev/${disksalias[$i]}" | awk '/^error_count.*: /{print $(NF)}')" -eq 0 ]; then
          #result="without error"
          result="passed"
          break
        else
          # Not zero error-count
          # Determine value of status_field, return just INVALID_FIELD, READ_ERROR, etc..
          # examples:
          # status_field : 0x4004(INVALID_FIELD: A reserved coded value or an unsupported value in a defined field)
          # 0x2002(INVALID_FIELD: A reserved coded value or an unsupported value in a defined field)
          # status_field : 0xc502(READ_ERROR: The read data could not be recovered from the media)
          error_status_code="$(nvme error-log -e 1 "/dev/${disksalias[$i]}" | awk -F '[(:]' '/^status_field/{print $3}' | tr -d '[:blank:]')"
          # INVALID_FIELD errors can usually be ignored. smartctl is known to
          # issue unsupported commands which some Samsung devices log as an
          # INVALID_FIELD or InvalidFieldinCommand instead of ignoring. For
          # now, this type of error code will be ignored.
          if echo "${error_status_code}" | grep -q -i "invalid"; then
            result="passed"
            error_status_code=""
          else
            #result="with error"
            result="error"
          fi
        fi
      else
        # Still no result, device not being monitored? See if smartctl has a
        # status we can show
        result="$(smartctl -a "/dev/${disksalias[$i]}" | awk '/^SMART.*result:/{print $(NF)}')"
      fi
    fi
  done

  #---[ temperature ]---------------------------------------------------------
  # if NVMe device get temp and unit from smartctl
  if [ "$(echo "${disksalias[$i]}" | grep -ci "nvme")" -eq 1 ]; then
    # Assume something like: "Temperature: 37 Celsius" is returned
    devicetmp=$(smartctl -A "/dev/${disksalias[$i]}" | awk '/^Temperature:/{print $2, $3}')
    temp=$(echo "${devicetmp}" | awk '{print $1}')
    unit=$(echo "${devicetmp}" | awk '{print substr($2,1,1)}') # Only need 1st character
  else
    # Get SATA device temp from smartctl
    # Assume something like:
    # "194 Temperature_Celsius 0x0022 025 055 000 Old_age Always - 25 (Min/Max 22/44)"
    # "190 Airflow_Temperature_Cel 0x0032 070 056 000 Old_age Always - 30"
    # A drive may report both attributes; only the first one listed is used so
    # that temp is always a single number.
    temp=$(smartctl -A "/dev/${disksalias[$i]}" | awk '/Temperature_Cel/{print $10; exit}')
    unit="C"
  fi

  # if we have a temp see if we need to convert to F
  if [ -n "$temp" ]; then
    # convert_c_to_f is run as a command; its exit status enables conversion
    if "${convert_c_to_f}" && { [ "$unit" == "C" ] || [ "$unit" == "c" ]; }; then
      temp=$(( 9 * ${temp} / 5 + 32 ))
      unit="F"
    fi
  fi

  # Determine if MAX_TEMP is based on C or F
  case "${unit}" in
    "f" | "F")
      MAX_TEMP=$MAX_TEMP_F
      ;;
    "c" | "C")
      MAX_TEMP=$MAX_TEMP_C
      ;;
    *) # Unknown value
      MAX_TEMP=0
      unit=""
      ;;
  esac

  # HDDTEMP can return an error if a drive was hot-swapped since restart
  # Daemon would need to be restarted to pick up the new drive
  case "${temp}" in
    "ERR")
      color=$red
      ;;
    "UNK")
      color=$dim
      ;;
    "" | "32" | "0")
      # No reading at all, or 0C / 32F: device likely has no actual temp
      # sensor. Show N/A rather than a bare unit or a bogus value.
      temp="N/A"
      color="$dim"
      unit=""
      ;;
    *) # A temperature value
      if [[ "${temp}" -gt "${MAX_TEMP}" ]]; then
        color="$red"
      else
        WARN_TEMP=$(( ${MAX_TEMP} * ${WARN_PERCENT} / 100 ))
        if [[ "${temp}" -gt "${WARN_TEMP}" ]]; then
          color="$yellow"
        else
          color="$green"
        fi
      fi
      ;;
  esac

  #---[ status color ]--------------------------------------------------------
  # Set color based on known or unknown status
  case "${result}" in
    "without error" | "PASSED" | "passed")
      status_color="$green"
      ;;
    "with error" | "FAILED!" | "error")
      status_color="$red"
      ;;
    "skipped")
      status_color="$yellow"
      ;;
    "in progress")
      status_color="$yellow"
      ;;
    "a reset")
      result="interrupted"
      status_color="$yellow"
      ;;
    *)
      # Anything unrecognised is shown verbatim, marked as untested
      result="untested[${result}]"
      status_color="$dim"
      ;;
  esac

  # print temp & smartd error
  out+="${shortnames[$i]}_${diskserials[$i]} (${disksalias[$i]}):,${color}${temp}${unit}${undim},${status_color}${result}${error_status_code}${life}${undim},"
  # insert \n every $COLUMNS column
  if [ $(( (i + 1) % ${COLUMNS} )) -eq 0 ]; then
    out+="\n"
  fi
done
# Terminate the last (possibly partial) row of records.
out+="\n"

printf '\n%s\n' "disk status:"

# Expand the escape sequences stored in $out, then let column lay the records
# out as a table: -t is table format, -s is the column separator, -d is do not
# display the header. The two temperature columns are right-aligned and every
# output line is indented by sed.
printf '%b' "${out}" \
  | column -t -s ',' -d \
    --table-columns DEVICE,TMP1,STATUS,DEVICE,TMP2,STATUS \
    --table-right TMP1,TMP2 \
  | sed -e 's/^/ /'