Signed-off-by: limengxuan <limengxuan@4paradigm.com>

fix scheduler logs & provide metrics for vgpu
volcano-sh · Oct 30, 2023 · 3880421 · 3880421
1 parent 0c1e1f2
commit 3880421
Show file tree

Hide file tree

Showing 2 changed files with 83 additions and 8 deletions.
diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go b/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go
@@ -74,7 +74,6 @@ func NewGPUDevice(id int, mem uint) *GPUDevice {
 }
 
 func NewGPUDevices(name string, node *v1.Node) *GPUDevices {
-	klog.V(3).Infoln("into devices")
 	if node == nil {
 		return nil
 	}
@@ -141,6 +140,7 @@ func (gs *GPUDevices) AddResource(pod *v1.Pod) {
 			}
 		}
 	}
+	gs.GetStatus()
 }
 
 // SubResource frees the gpu hold by the pod
@@ -180,25 +180,21 @@ func (gs *GPUDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) erro
 }
 
 func (gs *GPUDevices) FilterNode(pod *v1.Pod) (int, string, error) {
-	klog.V(5).Infoln("4pdvgpu DeviceSharing starts filtering pods", pod.Name)
 	if VGPUEnable {
+		klog.V(5).Infoln("4pdvgpu DeviceSharing starts filtering pods", pod.Name)
 		fit, _, err := checkNodeGPUSharingPredicate(pod, gs, true)
 		if err != nil || !fit {
 			klog.Errorln("deviceSharing err=", err.Error())
 			return devices.Unschedulable, fmt.Sprintf("4pdvgpuDeviceSharing %s", err.Error()), err
 		}
+		klog.V(5).Infoln("4pdvgpu DeviceSharing successfully filters pods")
 	}
-	klog.V(5).Infoln("4pdvgpu DeviceSharing successfully filters pods")
 	return devices.Success, "", nil
 }
 
-func (gs *GPUDevices) GetStatus() string {
-	return ""
-}
-
 func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) error {
-	klog.V(3).Infoln("VGPU DeviceSharing:Into AllocateToPod", pod.Name)
 	if VGPUEnable {
+		klog.V(3).Infoln("VGPU DeviceSharing:Into AllocateToPod", pod.Name)
 		fit, device, err := checkNodeGPUSharingPredicate(pod, gs, false)
 		if err != nil || !fit {
 			klog.Errorln("DeviceSharing err=", err.Error())
@@ -224,6 +220,7 @@ func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) err
 		if err != nil {
 			return err
 		}
+		gs.GetStatus()
 		klog.V(3).Infoln("DeviceSharing:Allocate Success")
 	}
 	return nil

diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/metrics.go b/pkg/scheduler/api/devices/nvidia/vgpu/metrics.go
@@ -0,0 +1,78 @@
+/*
+Copyright 2023 The Volcano Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package vgpu
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto" // auto-registers collectors in the default registry
+)
+
+const (
+	// VolcanoNamespace is the "volcano" metric-name prefix; in this file it is passed as the Subsystem of every vgpu gauge.
+	VolcanoNamespace = "volcano"
+
+	// OnSessionOpen is the "OnSessionOpen" label value; it is not referenced by the code visible in this file.
+	OnSessionOpen = "OnSessionOpen"
+
+	// OnSessionClose is the "OnSessionClose" label value; it is not referenced by the code visible in this file.
+	OnSessionClose = "OnSessionClose"
+)
+
+// vgpu gauges, created through promauto and labelled by device ID ("devID").
+// GetStatus sets them from the fields of each device in GPUDevices.Device.
+var (
+	// VGPUDevicesSharedNumber is set from the device's UsedNum field.
+	VGPUDevicesSharedNumber = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Subsystem: VolcanoNamespace,
+			Name:      "vgpu_device_shared_number",
+			Help:      "The number of vgpu tasks sharing this card",
+		},
+		[]string{"devID"},
+	)
+	// VGPUDevicesSharedMemory is set from the device's UsedMem field.
+	VGPUDevicesSharedMemory = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Subsystem: VolcanoNamespace,
+			Name:      "vgpu_device_allocated_memory",
+			Help:      "The number of vgpu memory allocated in this card",
+		},
+		[]string{"devID"},
+	)
+	// VGPUDevicesSharedCores is set from the device's UsedCore field.
+	VGPUDevicesSharedCores = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Subsystem: VolcanoNamespace,
+			Name:      "vgpu_device_allocated_cores",
+			Help:      "The percentage of gpu compute cores allocated in this card",
+		},
+		[]string{"devID"},
+	)
+	// VGPUDevicesMemoryLimit is set from the device's Memory field, i.e. the
+	// card's memory size rather than the amount currently allocated (that is
+	// reported by VGPUDevicesSharedMemory).
+	VGPUDevicesMemoryLimit = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Subsystem: VolcanoNamespace,
+			Name:      "vgpu_device_memory_limit",
+			Help:      "The total amount of device memory in this card",
+		},
+		[]string{"devID"},
+	)
+)
+
+// GetStatus walks gs.Device and copies each device's UsedNum, UsedMem, Memory
+// and UsedCore fields into the vgpu gauges, using the device UUID as the
+// "devID" label value. It always returns an empty string.
+func (gs *GPUDevices) GetStatus() string {
+	for _, dev := range gs.Device {
+		devID := dev.UUID
+
+		sharedNum := float64(dev.UsedNum)
+		VGPUDevicesSharedNumber.WithLabelValues(devID).Set(sharedNum)
+
+		usedMem := float64(dev.UsedMem)
+		VGPUDevicesSharedMemory.WithLabelValues(devID).Set(usedMem)
+
+		memLimit := float64(dev.Memory)
+		VGPUDevicesMemoryLimit.WithLabelValues(devID).Set(memLimit)
+
+		usedCores := float64(dev.UsedCore)
+		VGPUDevicesSharedCores.WithLabelValues(devID).Set(usedCores)
+	}
+	return ""
+}