Skip to content

Commit

Permalink
Signed-off-by: limengxuan <limengxuan@4paradigm.com>
Browse files Browse the repository at this point in the history
fix scheduler logs & provide metrics for vgpu
  • Loading branch information
archlitchi committed Oct 30, 2023
1 parent 0c1e1f2 commit 3880421
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 8 deletions.
13 changes: 5 additions & 8 deletions pkg/scheduler/api/devices/nvidia/vgpu/device_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ func NewGPUDevice(id int, mem uint) *GPUDevice {
}

func NewGPUDevices(name string, node *v1.Node) *GPUDevices {
klog.V(3).Infoln("into devices")
if node == nil {
return nil
}
Expand Down Expand Up @@ -141,6 +140,7 @@ func (gs *GPUDevices) AddResource(pod *v1.Pod) {
}
}
}
gs.GetStatus()
}

// SubResource frees the gpu held by the pod
Expand Down Expand Up @@ -180,25 +180,21 @@ func (gs *GPUDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) erro
}

// FilterNode runs the vGPU sharing predicate for pod against the devices in gs.
//
// When VGPUEnable is false the check is skipped and devices.Success is
// returned. Otherwise the result of checkNodeGPUSharingPredicate decides the
// outcome: on an error, or when the predicate reports that the pod does not
// fit, it returns devices.Unschedulable, a reason string prefixed with
// "4pdvgpuDeviceSharing", and a non-nil error; on success it returns
// devices.Success, an empty reason and a nil error.
func (gs *GPUDevices) FilterNode(pod *v1.Pod) (int, string, error) {
	if !VGPUEnable {
		return devices.Success, "", nil
	}

	klog.V(5).Infoln("4pdvgpu DeviceSharing starts filtering pods", pod.Name)
	fit, _, err := checkNodeGPUSharingPredicate(pod, gs, true)
	if err == nil && !fit {
		// The predicate rejected the pod without saying why. Synthesize an
		// error so the reporting below never dereferences a nil error.
		err = fmt.Errorf("vgpu sharing predicate rejected pod %s", pod.Name)
	}
	if err != nil {
		klog.Errorln("deviceSharing err=", err.Error())
		return devices.Unschedulable, fmt.Sprintf("4pdvgpuDeviceSharing %s", err.Error()), err
	}

	klog.V(5).Infoln("4pdvgpu DeviceSharing successfully filters pods")
	return devices.Success, "", nil
}

// GetStatus is a placeholder implementation: it reads nothing from gs and
// always returns the empty string.
func (gs *GPUDevices) GetStatus() string {
return ""
}

func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) error {
klog.V(3).Infoln("VGPU DeviceSharing:Into AllocateToPod", pod.Name)
if VGPUEnable {
klog.V(3).Infoln("VGPU DeviceSharing:Into AllocateToPod", pod.Name)
fit, device, err := checkNodeGPUSharingPredicate(pod, gs, false)
if err != nil || !fit {
klog.Errorln("DeviceSharing err=", err.Error())
Expand All @@ -224,6 +220,7 @@ func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) err
if err != nil {
return err
}
gs.GetStatus()
klog.V(3).Infoln("DeviceSharing:Allocate Success")
}
return nil
Expand Down
78 changes: 78 additions & 0 deletions pkg/scheduler/api/devices/nvidia/vgpu/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
Copyright 2023 The Volcano Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package vgpu

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto" // auto-registry collectors in default registry
)

const (
// VolcanoNamespace is the prefix used for volcano metrics in prometheus.
// The vGPU gauges declared in this file pass it as their Subsystem.
VolcanoNamespace = "volcano"

// OnSessionOpen is the label value "OnSessionOpen".
// NOTE(review): not referenced anywhere in this file — confirm it is needed in this package.
OnSessionOpen = "OnSessionOpen"

// OnSessionClose is the label value "OnSessionClose".
// NOTE(review): not referenced anywhere in this file — confirm it is needed in this package.
OnSessionClose = "OnSessionClose"
)

// vGPU device gauges. Every gauge carries a single "devID" label and is
// registered in the default prometheus registry by promauto at package
// initialisation.
var (
	// VGPUDevicesSharedNumber reports, per device, the number of vgpu tasks
	// sharing the card.
	VGPUDevicesSharedNumber = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_shared_number",
			Help:      "The number of vgpu tasks sharing this card",
		},
		[]string{"devID"},
	)
	// VGPUDevicesSharedMemory reports, per device, the vgpu memory currently
	// allocated on the card.
	VGPUDevicesSharedMemory = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_allocated_memory",
			Help:      "The number of vgpu memory allocated in this card",
		},
		[]string{"devID"},
	)
	// VGPUDevicesSharedCores reports, per device, the percentage of gpu
	// compute cores currently allocated on the card.
	VGPUDevicesSharedCores = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_allocated_cores",
			Help:      "The percentage of gpu compute cores allocated in this card",
		},
		[]string{"devID"},
	)
	// VGPUDevicesMemoryLimit reports, per device, the total memory of the
	// card (the upper bound for VGPUDevicesSharedMemory), not an allocation.
	VGPUDevicesMemoryLimit = promauto.NewGaugeVec(
		prometheus.GaugeOpts{
			Subsystem: VolcanoNamespace,
			Name:      "vgpu_device_memory_limit",
			// The previous text described this value as "allocated", which
			// made it read like a duplicate of vgpu_device_allocated_memory.
			Help: "The total device memory of this card",
		},
		[]string{"devID"},
	)
)

// GetStatus publishes the current state of every device in gs.Device to the
// vGPU prometheus gauges, using the device UUID as the "devID" label value:
// shared task count (UsedNum), allocated memory (UsedMem), total memory
// (Memory) and allocated cores (UsedCore).
//
// The returned status string is always empty. A nil receiver is treated as
// having no devices, because NewGPUDevices returns nil when given a nil node.
func (gs *GPUDevices) GetStatus() string {
	if gs == nil {
		return ""
	}
	for _, dev := range gs.Device {
		VGPUDevicesSharedNumber.WithLabelValues(dev.UUID).Set(float64(dev.UsedNum))
		VGPUDevicesSharedMemory.WithLabelValues(dev.UUID).Set(float64(dev.UsedMem))
		VGPUDevicesMemoryLimit.WithLabelValues(dev.UUID).Set(float64(dev.Memory))
		VGPUDevicesSharedCores.WithLabelValues(dev.UUID).Set(float64(dev.UsedCore))
	}
	return ""
}

0 comments on commit 3880421

Please sign in to comment.