Skip to content
This repository has been archived by the owner on Oct 12, 2023. It is now read-only.

Commit

Permalink
Feature: Include GPU monitoring (#24)
Browse files Browse the repository at this point in the history
  • Loading branch information
timotheeguerin authored Dec 12, 2018
1 parent 1d30fd3 commit 961f77b
Show file tree
Hide file tree
Showing 12 changed files with 348 additions and 3 deletions.
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ require (
github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f
github.com/dustin/go-humanize v0.0.0-20180713052910-9f541cc9db5d
github.com/go-ole/go-ole v1.2.1 // indirect
github.com/mindprince/gonvml v0.0.0-20180514031326-b364b296c732
github.com/mxpv/nvml-go v0.0.0-20180227003457-e07f8c26812d
github.com/pkg/errors v0.8.0
github.com/satori/go.uuid v1.2.0 // indirect
github.com/shirou/gopsutil v2.18.11+incompatible
golang.org/x/sys v0.0.0-20180907202204-917fdcba135d // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,20 @@ code.cloudfoundry.org/clock v0.0.0-20180518195852-02e53af36e6c/go.mod h1:QD9Lzhd
github.com/Azure/batch-insights v0.0.0-20180614201012-6d427c8344af h1:egr4mBTro2rKGAXfOAkkW0MRcu+k14V/UiajbY0bqUo=
github.com/Microsoft/ApplicationInsights-Go v0.4.2 h1:HIZoGXMiKNwAtMAgCSSX35j9mP+DjGF9ezfBvxMDLLg=
github.com/Microsoft/ApplicationInsights-Go v0.4.2/go.mod h1:CukZ/G66zxXtI+h/VcVn3eVVDGDHfXM2zVILF7bMmsg=
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20181114021304-b70474fb8511 h1:A9x/8mtuZ6Sg3QYV5bP2QCHQ53aLVaAd/G8EAZmujtg=
github.com/NVIDIA/gpu-monitoring-tools v0.0.0-20181114021304-b70474fb8511/go.mod h1:nMOvShGpWaf0bXwXmeu4k+O4uziuaEI8pWzIj3BUrOA=
github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f h1:5ZfJxyXo8KyX8DgGXC5B7ILL8y51fci/qYz2B4j8iLY=
github.com/StackExchange/wmi v0.0.0-20180725035823-b12b22c5341f/go.mod h1:3eOhrUMpNV+6aFIbp5/iudMxNCF27Vw2OZgy4xEx0Fg=
github.com/dustin/go-humanize v0.0.0-20180713052910-9f541cc9db5d h1:lDrio3iIdNb0Gw9CgH7cQF+iuB5mOOjdJ9ERNJCBgb4=
github.com/dustin/go-humanize v0.0.0-20180713052910-9f541cc9db5d/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/go-ole/go-ole v1.2.1 h1:2lOsA72HgjxAuMlKpFiCbHTvu44PIVkZ5hqm3RSdI/E=
github.com/go-ole/go-ole v1.2.1/go.mod h1:7FAglXiTm7HKlQRDeOQ6ZNUHidzCWXuZWq/1dTyBNF8=
github.com/mindprince/gonvml v0.0.0-20180514031326-b364b296c732 h1:Dl/79RxNt1t6AYIMhKzyofqooXgw6+LZtAN4EIXRLCk=
github.com/mindprince/gonvml v0.0.0-20180514031326-b364b296c732/go.mod h1:2eu9pRWp8mo84xCg6KswZ+USQHjwgRhNp06sozOdsTY=
github.com/mxpv/nvml-go v0.0.0-20180227003457-e07f8c26812d h1:lQo1zUtnGr52K2a+Ll3DNDoukmPeuHK11baUNGRDSt0=
github.com/mxpv/nvml-go v0.0.0-20180227003457-e07f8c26812d/go.mod h1:PS1oTOPfvtFjl9T7nduA/RYrIpqtRh2Nvk++rQCZ2q8=
github.com/pkg/errors v0.8.0 h1:WdK/asTD0HN+q6hsWO3/vpuAkAr+tw6aNJNDFFf0+qw=
github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww=
github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
github.com/shirou/gopsutil v2.18.11+incompatible h1:PMFTKnFTr/YTRW5rbLK4vWALV3a+IGXse5nvhSjztmg=
Expand Down
25 changes: 25 additions & 0 deletions nvml/nvml.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package nvml

// NvmlClient is the platform-neutral set of NVML operations this project uses
// to monitor GPUs. LinuxNvmlClient (linux builds) and WinNvmlClient (windows
// builds) each provide these methods on top of a different NVML binding.
type NvmlClient interface {
// Init initializes the underlying NVML binding.
Init() error
// Shutdown releases the underlying NVML binding.
Shutdown() error
// GetDeviceCount returns the number of devices reported by NVML.
GetDeviceCount() (uint, error)

// DeviceGetHandleByIndex returns the handle of the device at index. The
// handle is only meaningful to the client that produced it.
DeviceGetHandleByIndex(index uint) (Device, error)
// DeviceGetMemoryInfo returns the memory figures of device.
DeviceGetMemoryInfo(device Device) (Memory, error)
// DeviceGetUtilizationRates returns the utilization figures of device.
DeviceGetUtilizationRates(device Device) (GPUUtilization, error)
}

// GPUUtilization holds the utilization figures returned by
// NvmlClient.DeviceGetUtilizationRates.
type GPUUtilization struct {
// GPU is the compute utilization; the stats collector reports it as a
// percentage.
GPU uint
// Memory is the memory utilization rate as reported by NVML.
// NOTE(review): presumably a percentage like GPU — confirm against NVML docs.
Memory uint
}

// Memory holds the device memory figures returned by
// NvmlClient.DeviceGetMemoryInfo.
//
// The field order and types must stay as they are: the windows client builds
// this value with a direct struct conversion from its binding's memory type.
type Memory struct {
Total uint64 // Total installed FB memory (in bytes).
Free uint64 // Unallocated FB memory (in bytes).
Used uint64 // Allocated FB memory (in bytes).
}

// Device is an opaque device handle. It has no methods: each client returns
// its own concrete type from DeviceGetHandleByIndex (LinuxDevice or WinDevice)
// and type-asserts it back when the handle is passed to its other methods.
type Device interface {
}
66 changes: 66 additions & 0 deletions nvml/nvml_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// +build linux

package nvml

import (
nvml_linux "github.com/mindprince/gonvml"
)

// LinuxDevice is the concrete device handle used on linux; it is an alias of
// the gonvml binding's Device type.
type LinuxDevice = nvml_linux.Device

// LinuxNvmlClient provides the NvmlClient methods on linux by delegating to
// the package-level functions of the gonvml binding. It carries no state.
type LinuxNvmlClient struct {
}

// New returns a LinuxNvmlClient. It performs no work and always returns a nil
// error; the error result exists so the signature matches the windows build.
func New() (*LinuxNvmlClient, error) {
	return &LinuxNvmlClient{}, nil
}

// Init initializes NVML through the gonvml binding and returns its error.
func (c *LinuxNvmlClient) Init() error {
	return nvml_linux.Initialize()
}

// Shutdown shuts NVML down through the gonvml binding and returns its error.
func (c *LinuxNvmlClient) Shutdown() error {
	return nvml_linux.Shutdown()
}

// GetDeviceCount returns the number of devices reported by the gonvml
// binding, or 0 and the binding's error when the query fails.
func (c *LinuxNvmlClient) GetDeviceCount() (uint, error) {
	count, err := nvml_linux.DeviceCount()
	if err != nil {
		return 0, err
	}
	return uint(count), nil
}

// DeviceGetUtilizationRates returns the GPU and memory utilization rates of
// device. On failure it returns a zero GPUUtilization and the binding's error.
//
// device must be a LinuxDevice (as returned by DeviceGetHandleByIndex); any
// other dynamic type makes the type assertion panic.
func (c *LinuxNvmlClient) DeviceGetUtilizationRates(device Device) (GPUUtilization, error) {
	gpuRate, memoryRate, err := device.(LinuxDevice).UtilizationRates()
	if err != nil {
		return GPUUtilization{}, err
	}
	return GPUUtilization{GPU: gpuRate, Memory: memoryRate}, nil
}

// DeviceGetMemoryInfo returns the memory figures of device. On failure it
// returns a zero Memory and the binding's error.
//
// The gonvml binding only reports total and used memory, so Free is derived
// as Total - Used rather than being left at zero.
// NOTE(review): this assumes total = used + free for the driver in use —
// confirm against the NVML memory info documentation.
//
// device must be a LinuxDevice (as returned by DeviceGetHandleByIndex); any
// other dynamic type makes the type assertion panic.
func (c *LinuxNvmlClient) DeviceGetMemoryInfo(device Device) (Memory, error) {
	linuxDevice := device.(LinuxDevice)
	total, used, err := linuxDevice.MemoryInfo()
	if err != nil {
		return Memory{}, err
	}

	memory := Memory{Total: total, Used: used}
	// Guard the unsigned subtraction so an inconsistent reading cannot wrap
	// around to a huge Free value.
	if used <= total {
		memory.Free = total - used
	}
	return memory, nil
}

// DeviceGetHandleByIndex returns the gonvml handle of the device at index,
// wrapped as a Device, together with the binding's error. The handle value
// obtained from the binding is returned even when the error is non-nil.
func (c *LinuxNvmlClient) DeviceGetHandleByIndex(index uint) (Device, error) {
	handle, err := nvml_linux.DeviceHandleByIndex(index)
	return Device(handle), err
}
77 changes: 77 additions & 0 deletions nvml/nvml_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// +build windows

package nvml

import (
nvml_win "github.com/mxpv/nvml-go"
)

// WinDevice is the concrete device handle used on windows; it wraps the
// nvml-go binding's Device value.
type WinDevice struct {
handle nvml_win.Device
}

// New creates the nvml-go API object (passing an empty path argument) and
// returns a WinNvmlClient wrapping it. If the API object cannot be created,
// it returns nil and that error.
func New() (*WinNvmlClient, error) {
	api, err := nvml_win.New("")
	if err != nil {
		return nil, err
	}
	return &WinNvmlClient{api: api}, nil
}

// WinNvmlClient provides the NvmlClient methods on windows by delegating to
// an nvml-go API object created in New.
type WinNvmlClient struct {
api *nvml_win.API
}

// Init initializes NVML through the wrapped nvml-go API and returns its error.
func (c *WinNvmlClient) Init() error {
	return c.api.Init()
}

// Shutdown shuts NVML down through the wrapped nvml-go API and returns its
// error.
func (c *WinNvmlClient) Shutdown() error {
	return c.api.Shutdown()
}

// GetDeviceCount returns the number of devices reported by the nvml-go API,
// converted to uint, or 0 and the API's error when the query fails.
func (c *WinNvmlClient) GetDeviceCount() (uint, error) {
	count, err := c.api.DeviceGetCount()
	if err != nil {
		return 0, err
	}
	return uint(count), nil
}

// DeviceGetUtilizationRates returns the GPU and memory utilization rates of
// device, converted to uint. On failure it returns a zero GPUUtilization and
// the API's error.
//
// device must be a WinDevice (as returned by DeviceGetHandleByIndex); any
// other dynamic type makes the type assertion panic.
func (c *WinNvmlClient) DeviceGetUtilizationRates(device Device) (GPUUtilization, error) {
	handle := device.(WinDevice).handle
	rates, err := c.api.DeviceGetUtilizationRates(handle)
	if err != nil {
		return GPUUtilization{}, err
	}
	return GPUUtilization{GPU: uint(rates.GPU), Memory: uint(rates.Memory)}, nil
}

// DeviceGetMemoryInfo returns the memory figures of device, converted
// directly from the nvml-go memory value, together with the API's error. The
// converted value is returned even when the error is non-nil.
//
// device must be a WinDevice (as returned by DeviceGetHandleByIndex); any
// other dynamic type makes the type assertion panic.
func (c *WinNvmlClient) DeviceGetMemoryInfo(device Device) (Memory, error) {
	handle := device.(WinDevice).handle
	info, err := c.api.DeviceGetMemoryInfo(handle)
	return Memory(info), err
}

// DeviceGetHandleByIndex looks up the device at index (narrowed to uint32 for
// the nvml-go API) and returns it wrapped in a WinDevice, together with the
// API's error. The wrapped handle is returned even when the error is non-nil.
func (c *WinNvmlClient) DeviceGetHandleByIndex(index uint) (Device, error) {
	handle, err := c.api.DeviceGetHandleByIndex(uint32(index))
	return Device(WinDevice{handle: handle}), err
}
17 changes: 14 additions & 3 deletions pkg/appinsights.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@ func (service AppInsightsService) UploadStats(stats NodeStats) {
}

for _, usage := range stats.diskUsage {
// client.TrackMetric("Disk usage", disk_usage.used, properties={"Disk": name})
// client.TrackMetric("Disk free", disk_usage.free, properties={"Disk": name})
usedMetric := appinsights.NewMetricTelemetry("Disk usage", float64(usage.Used))
usedMetric.Properties["Disk"] = usage.Path
client.Track(usedMetric)
Expand All @@ -41,10 +39,23 @@ func (service AppInsightsService) UploadStats(stats NodeStats) {
}

client.TrackMetric("Memory used", float64(stats.memory.Used))
client.TrackMetric("Memory available", float64(stats.memory.Free))
client.TrackMetric("Memory available", float64(stats.memory.Total-stats.memory.Used))
client.TrackMetric("Disk read", float64(stats.diskIO.readBps))
client.TrackMetric("Disk write", float64(stats.diskIO.writeBps))
client.TrackMetric("Network read", float64(stats.netIO.readBps))
client.TrackMetric("Network write", float64(stats.netIO.writeBps))

if len(stats.gpus) > 0 {
for cpuN, usage := range stats.gpus {
gpuMetric := appinsights.NewMetricTelemetry("Gpu usage", usage.GPU)
gpuMetric.Properties["GPU #"] = strconv.Itoa(cpuN)
client.Track(gpuMetric)

gpuMemoryMetric := appinsights.NewMetricTelemetry("Gpu memory usage", usage.Memory)
gpuMemoryMetric.Properties["GPU #"] = strconv.Itoa(cpuN)
client.Track(gpuMemoryMetric)
}
}

client.Channel().Flush()
}
12 changes: 12 additions & 0 deletions pkg/batchinsights.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,13 @@ func getDiskToWatch() []string {
func ListenForStats(poolId string, nodeId string, appInsightsKey string) {
var diskIO = IOAggregator{}
var netIO = IOAggregator{}
var gpuStatsCollector = NewGPUStatsCollector()
defer gpuStatsCollector.Shutdown()

var appInsightsService = createAppInsightsService(poolId, nodeId, appInsightsKey)

for _ = range time.Tick(STATS_POLL_RATE) {
gpuStatsCollector.GetStats()

v, _ := mem.VirtualMemory()
var cpus, err = cpu.PerCpuPercent()
Expand All @@ -62,6 +66,7 @@ func ListenForStats(poolId string, nodeId string, appInsightsKey string) {
diskUsage: getDiskUsage(),
diskIO: getDiskIO(&diskIO),
netIO: getNetIO(&netIO),
gpus: gpuStatsCollector.GetStats(),
}

if appInsightsService != nil {
Expand Down Expand Up @@ -143,6 +148,13 @@ func printStats(stats NodeStats) {
if stats.netIO != nil {
fmt.Printf("NET IO: R:%sps, S:%sps\n", humanize.Bytes(stats.netIO.readBps), humanize.Bytes(stats.netIO.writeBps))
}

if len(stats.gpus) > 0 {
fmt.Printf("GPU(s) usage:\n")
for _, usage := range stats.gpus {
fmt.Printf(" - GPU: %f%%, Memory: %f%%\n", usage.GPU, usage.Memory)
}
}
fmt.Println()
fmt.Println()
}
Expand Down
87 changes: 87 additions & 0 deletions pkg/gpu_stats_collector.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package batchinsights

import (
"fmt"
"github.com/Azure/batch-insights/nvml"
)

// GPUStatsCollector samples per-GPU usage through an NVML client.
// The zero value is usable: with a nil client, GetStats returns nil and
// Shutdown does nothing.
type GPUStatsCollector struct {
// nvml is the NVML client; nil when NVML could not be loaded or initialized.
nvml nvml.NvmlClient
// deviceCount is the device count obtained when the collector was created.
deviceCount uint
}

// GPUUsage is one usage sample for a single GPU.
type GPUUsage struct {
// GPU is the utilization rate reported by NVML, as a float.
GPU float64
// Memory is the used device memory as a percentage of the total.
Memory float64
}

// NewGPUStatsCollector creates the NVML client, initializes it and records
// the device count.
//
// If the client cannot be created or initialized, a message is printed and an
// empty collector (nil client) is returned. If only the device count query
// fails, the error is printed and the collector is returned with a device
// count of zero.
func NewGPUStatsCollector() GPUStatsCollector {
	client, err := nvml.New()
	if err != nil {
		fmt.Println("No GPU detected. Nvidia driver might be missing")
		return GPUStatsCollector{}
	}

	if err = client.Init(); err != nil {
		fmt.Println("No GPU detected. Nvidia driver might be missing. Error while initializing NVML", err)
		return GPUStatsCollector{}
	}

	count, err := client.GetDeviceCount()
	if err != nil {
		fmt.Println(err)
	} else {
		fmt.Printf("NVML is loaded found %d gpus\n", count)
	}

	return GPUStatsCollector{nvml: client, deviceCount: count}
}

// GetStats returns one usage sample per GPU that could be queried.
//
// It returns nil when no NVML client is available. A device whose handle,
// memory info or utilization query fails has the error printed and is left
// out of the result for this call, so no sample is built from zeroed data.
// That previously produced a NaN memory percentage from 0/0.
func (gpu GPUStatsCollector) GetStats() []GPUUsage {
	if gpu.nvml == nil {
		return nil
	}

	var uses []GPUUsage

	for i := uint(0); i < gpu.deviceCount; i++ {
		device, err := gpu.nvml.DeviceGetHandleByIndex(i)
		if err != nil {
			fmt.Println(err)
			continue
		}

		memory, err := gpu.nvml.DeviceGetMemoryInfo(device)
		if err != nil {
			fmt.Println(err)
			continue
		}

		use, err := gpu.nvml.DeviceGetUtilizationRates(device)
		if err != nil {
			fmt.Println(err)
			continue
		}

		// A zero total would make the percentage NaN; report 0 instead.
		var memoryPercent float64
		if memory.Total > 0 {
			memoryPercent = float64(memory.Used) / float64(memory.Total) * 100
		}

		uses = append(uses, GPUUsage{
			GPU:    float64(use.GPU),
			Memory: memoryPercent,
		})
	}
	return uses
}

// Shutdown shuts down the NVML client, if there is one. A shutdown error is
// printed rather than silently discarded; nothing is returned to the caller.
func (gpu GPUStatsCollector) Shutdown() {
	if gpu.nvml == nil {
		return
	}
	if err := gpu.nvml.Shutdown(); err != nil {
		fmt.Println("Error while shutting down NVML", err)
	}
}
1 change: 1 addition & 0 deletions pkg/node_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ type NodeStats struct {
diskUsage []*disk.UsageStat
diskIO *IOStats
netIO *IOStats
gpus []GPUUsage
}
10 changes: 10 additions & 0 deletions scripts/gpu-init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Installs the NVIDIA 410 driver, CUDA 10.0 and basic build tools from
# NVIDIA's Ubuntu 18.04 CUDA repository.
# Expects to run with root privileges: it writes under /etc/apt and calls
# the package manager without sudo.
set -e
# apt remove nvidia-cuda-toolkit
# apt remove nvidia-*
# apt-get is used throughout: it is the interface intended for scripts.
apt-get update
# Fetch the repository signing key and packages over HTTPS so neither can be
# tampered with in transit.
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 /" > /etc/apt/sources.list.d/cuda.list
apt-get update
apt-get install -y nvidia-driver-410 --no-install-recommends
apt-get install -y cuda-10-0 --no-install-recommends
apt-get install -y git binutils bison build-essential --no-install-recommends
Loading

0 comments on commit 961f77b

Please sign in to comment.