-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmonitor.py
114 lines (89 loc) · 4 KB
/
monitor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
### Monitor class
import os
import logging
from threading import Thread
import psutil
from transformers.utils import is_py3nvml_available, is_torch_available, logging
try:
import py3nvml.py3nvml as pynvml
except:
logging.warning("pynvml does not load, no monitoring available")
import time
class Monitor:
    """Sample CPU and GPU usage of one process from a background thread.

    Typical use: call ``start_monitor()``, run the workload, call
    ``stop_monitor()``, then read the peaks with ``get_stats()``.

    Args:
        sampling_rate: Seconds used both as the CPU measurement window and
            as the pause between two samples, so consecutive samples are
            roughly ``2 * sampling_rate`` apart. Defaults to 0.01 when
            ``None``.
        pid: Id of the process to watch. Defaults to the current process
            when ``None``.
    """

    def __init__(self, sampling_rate=None, pid=None):
        self.sampling_rate = sampling_rate if sampling_rate is not None else 0.01
        self.pid = pid if pid is not None else os.getpid()
        self.gpu = None  # NVML device handle, filled in by _find_gpu()
        self.should_stop = False  # polled by the sampling loop
        self.thread = None  # created by start_monitor()
        self.accounting_enabled = False
        self.stats = []  # peaks of the last completed run

    def _find_gpu(self):
        """Select the NVML device to watch and probe its accounting mode.

        With exactly one device, that device is used. Otherwise every device
        is scanned for a compute process whose pid equals ``self.pid``; when
        several devices match, the last one wins. ``self.gpu`` is left
        untouched (``None`` on a fresh instance) when nothing matches.
        """
        device_count = pynvml.nvmlDeviceGetCount()
        if device_count == 1:
            self.gpu = pynvml.nvmlDeviceGetHandleByIndex(0)
        else:
            for index in range(device_count):
                handle = pynvml.nvmlDeviceGetHandleByIndex(index)
                processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                if any(proc.pid == self.pid for proc in processes):
                    self.gpu = handle

        if self.gpu is None:
            # No device hosts the process: there is no handle to query.
            self.accounting_enabled = False
            return

        # NOTE: clearing the accounting statistics would require toggling the
        # accounting mode off and on, which needs root privileges.
        try:
            mode = pynvml.nvmlDeviceGetAccountingMode(self.gpu)
        except pynvml.NVMLError:
            # Accounting is only a refinement; without it the device-wide
            # utilization is used instead.
            self.accounting_enabled = False
        else:
            self.accounting_enabled = mode == pynvml.NVML_FEATURE_ENABLED

    def _sample_gpu(self):
        """Return ``(utilization_percent, memory_mb)`` for the watched process.

        A value of ``-1`` marks a figure that could not be obtained: both
        are ``-1`` when no device was selected or when the device lists no
        compute process to share the device-wide utilization between.
        """
        if self.gpu is None:
            return -1, -1

        gpu_processes = pynvml.nvmlDeviceGetComputeRunningProcesses(self.gpu)

        used_gpumem = 0
        for proc in gpu_processes:
            if proc.pid == self.pid:
                memory_bytes = proc.usedGpuMemory
                # The NVML bindings are expected to report None when the
                # driver cannot tell the per-process memory.
                used_gpumem = -1 if memory_bytes is None else memory_bytes / 1024 / 1024

        used_gpu = 0
        if self.accounting_enabled:
            try:
                used_gpu = pynvml.nvmlDeviceGetAccountingStats(self.gpu, self.pid).gpuUtilization
            except pynvml.NVMLError:
                # e.g. no accounting record for this pid yet; fall through
                # to the device-wide estimate below.
                pass

        if not used_gpu:
            if not gpu_processes:
                return -1, -1
            util = pynvml.nvmlDeviceGetUtilizationRates(self.gpu)
            # Approximation: share the device utilization evenly between
            # the compute processes running on it.
            used_gpu = util.gpu / len(gpu_processes)

        return used_gpu, used_gpumem

    @staticmethod
    def _peak_per_metric(samples):
        """Return the column-wise maximum of equally sized sample tuples.

        An empty ``samples`` sequence yields an empty list.
        """
        return [max(column) for column in zip(*samples)]

    def _monitor(self):
        """Thread body: sample until ``should_stop`` is set, then store peaks.

        Each sample is ``(cpu %, cpu memory MB, gpu %, gpu memory MB)``. The
        CPU percentage is divided by the CPU count reported by psutil. NVML
        is shut down even when sampling fails.
        """
        pynvml.nvmlInit()
        try:
            self._find_gpu()
            cpu_process = psutil.Process(self.pid)
            cpu_count = psutil.cpu_count()
            samples = []
            while not self.should_stop:
                # cpu_percent() blocks for sampling_rate seconds to measure.
                used_cpu = cpu_process.cpu_percent(self.sampling_rate) / cpu_count
                used_cpumem = cpu_process.memory_info().rss // (1024 * 1024)
                used_gpu, used_gpumem = self._sample_gpu()
                samples.append((used_cpu, used_cpumem, used_gpu, used_gpumem))
                time.sleep(self.sampling_rate)
            self.stats = self._peak_per_metric(samples)
        finally:
            pynvml.nvmlShutdown()

    def start_monitor(self):
        """Start sampling in a new background thread."""
        self.should_stop = False
        self.thread = Thread(target=self._monitor)
        self.thread.start()

    def stop_monitor(self):
        """Ask the sampling thread to stop and wait for it to finish.

        Calling this before ``start_monitor()`` is a no-op apart from
        setting the stop flag.
        """
        self.should_stop = True
        if self.thread is not None:
            self.thread.join()

    def get_stats(self):
        """Return the peaks ``[cpu %, cpu MB, gpu %, gpu MB]`` of the last run.

        The list is empty until a run has finished with at least one sample.
        """
        return self.stats