forked from ngi644/datadog_nvml
-
Notifications
You must be signed in to change notification settings - Fork 2
/
nvml.py
88 lines (78 loc) · 3.53 KB
/
nvml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# encoding: utf-8
# project
from checks import AgentCheck
# psutil
import psutil
# pynvml
import pynvml
# Module metadata: version string and authors of this check.
__version__ = '0.1.4'
__author__ = 'Takashi NAGAI, Alejandro Ferrari'
class NvmlCheck(AgentCheck):
    """Agent check that publishes per-GPU metrics read through pynvml.

    For every device reported by NVML the check emits gauges for temperature,
    memory (total/used/free), GPU and memory utilization, encoder and decoder
    utilization, and the GPU memory used by each compute process.  One
    ``nvml.check`` service check summarises the run: ``OK`` when every NVML
    query succeeded, ``CRITICAL`` (with the joined error messages) otherwise.
    """

    def _dict2list(self, tags=None):
        """Convert a tag mapping into a list of ``key:value`` strings.

        :param tags: mapping of tag name to tag value; ``None`` is treated as
            an empty mapping.
        :return: list of unicode strings, one ``key:value`` entry per item.
        """
        if tags is None:
            # Avoid a shared mutable default argument.
            tags = {}
        return [u"{k}:{v}".format(k=k, v=v) for k, v in tags.items()]

    def check(self, instance):
        """Collect metrics for every GPU and emit the ``nvml.check`` status.

        :param instance: check instance configuration; not read by this check.
        """
        msg_list = []

        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError as err:
            # Nothing can be queried without an initialised library; report
            # the failure instead of letting the check crash silently.
            self.service_check('nvml.check', AgentCheck.CRITICAL,
                               message=u'nvmlInit:{}'.format(err))
            return

        try:
            try:
                device_count = pynvml.nvmlDeviceGetCount()
            except pynvml.NVMLError as err:
                # Record the failure so the service check turns CRITICAL
                # rather than reporting OK with zero devices.
                msg_list.append(u'nvmlDeviceGetCount:{}'.format(err))
                device_count = 0

            for device_id in range(device_count):
                self._collect_device(device_id, msg_list)
        finally:
            # Always pair nvmlInit() with nvmlShutdown(), even when a query
            # above raised an unexpected exception.
            try:
                pynvml.nvmlShutdown()
            except pynvml.NVMLError as err:
                msg_list.append(u'nvmlShutdown:{}'.format(err))

        if msg_list:
            status = AgentCheck.CRITICAL
            msg = u','.join(msg_list)
        else:
            status = AgentCheck.OK
            msg = u'Ok'
        self.service_check('nvml.check', status, message=msg)

    def _collect_device(self, device_id, msg_list):
        """Emit every gauge for one GPU, appending NVML errors to *msg_list*.

        :param device_id: NVML device index.
        :param msg_list: list that receives one message per failed query.
        """
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            name = pynvml.nvmlDeviceGetName(handle)
        except pynvml.NVMLError as err:
            # Skip only this device; the remaining devices are still polled.
            msg_list.append(
                u'nvmlDeviceGetHandleByIndex/nvmlDeviceGetName({}):{}'.format(
                    device_id, err))
            return

        tags = dict(name="{}-{}".format(name, device_id))
        d_tags = self._dict2list(tags)

        # temperature info
        try:
            temp = pynvml.nvmlDeviceGetTemperature(
                handle, pynvml.NVML_TEMPERATURE_GPU)
            # NOTE(review): the metric name ends with a dot; kept unchanged so
            # existing dashboards/monitors keep matching -- confirm intent.
            self.gauge('nvml.temp.', temp, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetTemperature:{}'.format(err))

        # memory info
        try:
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
            self.gauge('nvml.mem.total', mem.total, tags=d_tags)
            self.gauge('nvml.mem.used', mem.used, tags=d_tags)
            self.gauge('nvml.mem.free', mem.free, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetMemoryInfo:{}'.format(err))

        # utilization GPU/Memory info
        try:
            util_rate = pynvml.nvmlDeviceGetUtilizationRates(handle)
            self.gauge('nvml.util.gpu', util_rate.gpu, tags=d_tags)
            self.gauge('nvml.util.memory', util_rate.memory, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetUtilizationRates:{}'.format(err))

        # utilization Encoder info (first element of the returned pair)
        try:
            util_encoder = pynvml.nvmlDeviceGetEncoderUtilization(handle)
            encoder_value = int(util_encoder[0])
            self.log.debug('nvml.util.encoder %s', encoder_value)
            self.gauge('nvml.util.encoder', encoder_value, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetEncoderUtilization:{}'.format(err))

        # utilization Decoder info (first element of the returned pair)
        try:
            util_decoder = pynvml.nvmlDeviceGetDecoderUtilization(handle)
            decoder_value = int(util_decoder[0])
            self.log.debug('nvml.util.decoder %s', decoder_value)
            self.gauge('nvml.util.decoder', decoder_value, tags=d_tags)
        except pynvml.NVMLError as err:
            msg_list.append(u'nvmlDeviceGetDecoderUtilization:{}'.format(err))

        # Compute running processes
        try:
            processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
        except pynvml.NVMLError as err:
            msg_list.append(
                u'nvmlDeviceGetComputeRunningProcesses:{}'.format(err))
            return
        for ps in processes:
            self._collect_process(ps, tags, msg_list)

    def _collect_process(self, ps, tags, msg_list):
        """Emit the GPU-memory gauge for one compute process.

        :param ps: process entry returned by
            ``nvmlDeviceGetComputeRunningProcesses`` (``pid``,
            ``usedGpuMemory``).
        :param tags: device tag mapping; copied, never mutated.
        :param msg_list: list that receives one message per failed query.
        """
        if ps.usedGpuMemory is None:
            # Presumably pynvml reports None when the driver cannot provide
            # the value; there is nothing meaningful to send as a gauge.
            return
        try:
            process_name = pynvml.nvmlSystemGetProcessName(ps.pid)
        except pynvml.NVMLError as err:
            # One failing lookup must not hide the other processes.
            msg_list.append(u'nvmlSystemGetProcessName:{}'.format(err))
            return

        p_tags = tags.copy()
        p_tags['pid'] = ps.pid
        # NOTE(review): this overwrites the device ``name`` tag with the
        # process name; kept as-is to preserve the emitted tag set.
        p_tags['name'] = process_name
        self.gauge('nvml.process.used_gpu_memory', ps.usedGpuMemory,
                   tags=self._dict2list(p_tags))