This repository has been archived by the owner on Oct 12, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Feature: Include GPU monitoring (#24)
- Loading branch information
1 parent
1d30fd3
commit 961f77b
Showing
12 changed files
with
348 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
package nvml | ||
|
||
type NvmlClient interface { | ||
Init() error | ||
Shutdown() error | ||
GetDeviceCount() (uint, error) | ||
|
||
DeviceGetHandleByIndex(index uint) (Device, error) | ||
DeviceGetMemoryInfo(device Device) (Memory, error) | ||
DeviceGetUtilizationRates(device Device) (GPUUtilization, error) | ||
} | ||
|
||
// GPUUtilization holds the utilization figures reported for a single device.
type GPUUtilization struct {
	// GPU is the utilization of the GPU itself.
	// NOTE(review): presumably a percentage (0-100) as reported by NVML — confirm.
	GPU uint
	// Memory is the utilization of the device memory.
	// NOTE(review): presumably a percentage (0-100) as reported by NVML — confirm.
	Memory uint
}
|
||
// Memory describes the frame-buffer (FB) memory of a single device.
type Memory struct {
	// Total is the total installed FB memory, in bytes.
	Total uint64
	// Free is the unallocated FB memory, in bytes.
	Free uint64
	// Used is the allocated FB memory, in bytes.
	Used uint64
}
|
||
// Device is an opaque GPU handle. It has no methods; each platform-specific
// client stores its own concrete handle type in it and asserts it back.
type Device interface{}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// +build linux | ||
|
||
package nvml | ||
|
||
import ( | ||
nvml_linux "github.com/mindprince/gonvml" | ||
) | ||
|
||
type LinuxDevice = nvml_linux.Device | ||
|
||
type LinuxNvmlClient struct { | ||
} | ||
|
||
func New() (*LinuxNvmlClient, error) { | ||
client := LinuxNvmlClient{} | ||
|
||
return &client, nil | ||
} | ||
|
||
func (client *LinuxNvmlClient) Init() error { | ||
return nvml_linux.Initialize() | ||
} | ||
|
||
func (client *LinuxNvmlClient) Shutdown() error { | ||
return nvml_linux.Shutdown() | ||
} | ||
|
||
func (client *LinuxNvmlClient) GetDeviceCount() (uint, error) { | ||
value, err := nvml_linux.DeviceCount() | ||
if err != nil { | ||
return 0, err | ||
} | ||
|
||
return uint(value), nil | ||
} | ||
|
||
func (client *LinuxNvmlClient) DeviceGetUtilizationRates(device Device) (GPUUtilization, error) { | ||
linuxDevice := device.(LinuxDevice) | ||
gpu, memory, err := linuxDevice.UtilizationRates() | ||
if err != nil { | ||
return GPUUtilization{GPU: 0, Memory: 0}, err | ||
} | ||
|
||
use := GPUUtilization{ | ||
GPU: gpu, | ||
Memory: memory, | ||
} | ||
return use, nil | ||
} | ||
|
||
func (client *LinuxNvmlClient) DeviceGetMemoryInfo(device Device) (Memory, error) { | ||
linuxDevice := device.(LinuxDevice) | ||
total, used, err := linuxDevice.MemoryInfo() | ||
if err != nil { | ||
return Memory{Used: used, Total: total}, err | ||
} | ||
return Memory{Used: used, Total: total}, nil | ||
} | ||
|
||
func (client *LinuxNvmlClient) DeviceGetHandleByIndex(index uint) (Device, error) { | ||
device, err := nvml_linux.DeviceHandleByIndex(uint(index)) | ||
if err != nil { | ||
return Device(device), err | ||
} | ||
return Device(device), nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
// +build windows | ||
|
||
package nvml | ||
|
||
import ( | ||
nvml_win "github.com/mxpv/nvml-go" | ||
) | ||
|
||
type WinDevice struct { | ||
handle nvml_win.Device | ||
} | ||
|
||
func New() (*WinNvmlClient, error) { | ||
api, err := nvml_win.New("") | ||
|
||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
client := WinNvmlClient{ | ||
api: api, | ||
} | ||
|
||
return &client, nil | ||
} | ||
|
||
type WinNvmlClient struct { | ||
api *nvml_win.API | ||
} | ||
|
||
func (client *WinNvmlClient) Init() error { | ||
return client.api.Init() | ||
} | ||
|
||
func (client *WinNvmlClient) Shutdown() error { | ||
return client.api.Shutdown() | ||
} | ||
|
||
func (client *WinNvmlClient) GetDeviceCount() (uint, error) { | ||
value, err := client.api.DeviceGetCount() | ||
if err != nil { | ||
return 0, err | ||
} | ||
|
||
return uint(value), nil | ||
} | ||
|
||
func (client *WinNvmlClient) DeviceGetUtilizationRates(device Device) (GPUUtilization, error) { | ||
winDevice := device.(WinDevice) | ||
value, err := client.api.DeviceGetUtilizationRates(winDevice.handle) | ||
if err != nil { | ||
return GPUUtilization{GPU: 0, Memory: 0}, err | ||
} | ||
|
||
use := GPUUtilization{ | ||
GPU: uint(value.GPU), | ||
Memory: uint(value.Memory), | ||
} | ||
return use, nil | ||
} | ||
|
||
func (client *WinNvmlClient) DeviceGetMemoryInfo(device Device) (Memory, error) { | ||
winDevice := device.(WinDevice) | ||
use, err := client.api.DeviceGetMemoryInfo(winDevice.handle) | ||
if err != nil { | ||
return Memory(use), err | ||
} | ||
return Memory(use), nil | ||
} | ||
|
||
func (client *WinNvmlClient) DeviceGetHandleByIndex(index uint) (Device, error) { | ||
handle, err := client.api.DeviceGetHandleByIndex(uint32(index)) | ||
if err != nil { | ||
return Device(WinDevice{handle: handle}), err | ||
} | ||
return Device(WinDevice{handle: handle}), nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
package batchinsights | ||
|
||
import ( | ||
"fmt" | ||
"github.com/Azure/batch-insights/nvml" | ||
) | ||
|
||
type GPUStatsCollector struct { | ||
nvml nvml.NvmlClient | ||
deviceCount uint | ||
} | ||
|
||
// GPUUsage is one usage sample for a single GPU.
type GPUUsage struct {
	// GPU is the GPU utilization rate as reported by NVML.
	GPU float64
	// Memory is the used share of the device memory, as a percentage.
	Memory float64
}
|
||
func NewGPUStatsCollector() GPUStatsCollector { | ||
nvmlClient, err := nvml.New() | ||
|
||
if err != nil { | ||
fmt.Println("No GPU detected. Nvidia driver might be missing") | ||
} else { | ||
err = nvmlClient.Init() | ||
|
||
if err != nil { | ||
fmt.Println("No GPU detected. Nvidia driver might be missing. Error while initializing NVML", err) | ||
nvmlClient = nil | ||
} else { | ||
deviceCount, err := nvmlClient.GetDeviceCount() | ||
|
||
if err != nil { | ||
fmt.Println(err) | ||
} else { | ||
fmt.Printf("NVML is loaded found %d gpus\n", deviceCount) | ||
} | ||
|
||
return GPUStatsCollector{ | ||
nvml: nvmlClient, | ||
deviceCount: deviceCount, | ||
} | ||
} | ||
} | ||
return GPUStatsCollector{} | ||
} | ||
|
||
func (gpu GPUStatsCollector) GetStats() []GPUUsage { | ||
if gpu.nvml == nil { | ||
return nil | ||
} | ||
|
||
var uses []GPUUsage | ||
|
||
for i := uint(0); i < gpu.deviceCount; i++ { | ||
device, err := gpu.nvml.DeviceGetHandleByIndex(i) | ||
if err != nil { | ||
fmt.Println(err) | ||
continue | ||
} | ||
|
||
memory, err := gpu.nvml.DeviceGetMemoryInfo(device) | ||
|
||
if err != nil { | ||
fmt.Println(err) | ||
} | ||
|
||
use, err := gpu.nvml.DeviceGetUtilizationRates(device) | ||
|
||
if err != nil { | ||
fmt.Println(err) | ||
} | ||
|
||
usage := GPUUsage{ | ||
GPU: float64(use.GPU), | ||
Memory: float64(memory.Used) / float64(memory.Total) * 100, | ||
} | ||
uses = append(uses, usage) | ||
} | ||
return uses | ||
} | ||
|
||
func (gpu GPUStatsCollector) Shutdown() { | ||
if gpu.nvml == nil { | ||
return | ||
} | ||
gpu.nvml.Shutdown() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,4 +11,5 @@ type NodeStats struct { | |
diskUsage []*disk.UsageStat | ||
diskIO *IOStats | ||
netIO *IOStats | ||
gpus []GPUUsage | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Installs the NVIDIA 410 driver, CUDA 10.0 and basic build tooling from the
# NVIDIA ubuntu1804 repository. It writes under /etc/apt and installs
# packages, so it needs root privileges.
set -e
# apt remove nvidia-cuda-toolkit
# apt remove nvidia-*
# apt-get is used throughout: the `apt` front end does not have a stable CLI
# for scripts.
apt-get update
# Fetch the repository signing key over HTTPS so it cannot be replaced in
# transit; a swapped key would let an attacker sign arbitrary packages.
# NOTE(review): confirm 7fa2af80 is still the key NVIDIA signs this repository with.
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list
apt-get update
apt-get install -y --no-install-recommends nvidia-driver-410
apt-get install -y --no-install-recommends cuda-10-0
apt-get install -y --no-install-recommends git binutils bison build-essential
Oops, something went wrong.