From af6d7b925f4d3f938886e19fd6251d4c9bff53dc Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 22 Aug 2024 10:55:24 +0200 Subject: [PATCH 1/2] [no-relnote] Address integer overflow linting errors Signed-off-by: Evan Lezar --- .golangci.yml | 3 +++ api/config/v1/replicas.go | 4 ++-- internal/cuda/api.go | 1 + internal/rm/health.go | 38 ++++++++++++++++++++++++-------------- internal/vgpu/pciutil.go | 10 +++++----- internal/vgpu/vgpu.go | 8 ++++---- 6 files changed, 39 insertions(+), 25 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 22d53eda1..4f2c8a6e6 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -22,6 +22,9 @@ linters-settings: local-prefixes: github.com/NVIDIA/k8s-device-plugin issues: + exclude: + # A conversion of a uint8 to an int cannot overflow. + - "G115: integer overflow conversion uint8 -> int" exclude-rules: # We use math/rand instead of crypto/rand for unique names in e2e tests. - path: tests/e2e/ diff --git a/api/config/v1/replicas.go b/api/config/v1/replicas.go index c24c7109c..0e00dd57a 100644 --- a/api/config/v1/replicas.go +++ b/api/config/v1/replicas.go @@ -299,9 +299,9 @@ func (s *ReplicatedDevices) UnmarshalJSON(b []byte) error { result := make([]ReplicatedDeviceRef, len(slice)) for i, s := range slice { // Match a uint as a GPU index and convert it to a string - var index uint + var index uint64 if err = json.Unmarshal(s, &index); err == nil { - result[i] = ReplicatedDeviceRef(strconv.Itoa(int(index))) + result[i] = ReplicatedDeviceRef(strconv.FormatUint(index, 10)) continue } // Match strings as valid entries if they are GPU indices, MIG indices, or UUIDs diff --git a/internal/cuda/api.go b/internal/cuda/api.go index d41c59417..e43ce4a2b 100644 --- a/internal/cuda/api.go +++ b/internal/cuda/api.go @@ -65,6 +65,7 @@ func DriverGetVersion() (int, Result) { // DeviceGet returns the device with the specified index. func DeviceGet(index int) (Device, Result) { var device Device + //nolint:gosec // Since index is internal-only, we ignore possible overflow errors here. r := cuDeviceGet(&device, int32(index)) return device, r diff --git a/internal/rm/health.go b/internal/rm/health.go index 259288850..3a308ff3e 100644 --- a/internal/rm/health.go +++ b/internal/rm/health.go @@ -88,8 +88,8 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic }() parentToDeviceMap := make(map[string]*Device) - deviceIDToGiMap := make(map[string]int) - deviceIDToCiMap := make(map[string]int) + deviceIDToGiMap := make(map[string]uint32) + deviceIDToCiMap := make(map[string]uint32) eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError) for _, d := range devices { @@ -112,7 +112,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic supportedEvents, ret := gpu.GetSupportedEventTypes() if ret != nvml.SUCCESS { - klog.Infof("Unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret) + klog.Infof("unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret) unhealthy <- d continue } @@ -176,7 +176,7 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic if d.IsMigDevice() && e.GpuInstanceId != 0xFFFFFFFF && e.ComputeInstanceId != 0xFFFFFFFF { gi := deviceIDToGiMap[d.ID] ci := deviceIDToCiMap[d.ID] - if !(uint32(gi) == e.GpuInstanceId && uint32(ci) == e.ComputeInstanceId) { + if !(gi == e.GpuInstanceId && ci == e.ComputeInstanceId) { continue } klog.Infof("Event for mig device %v (gi=%v, ci=%v)", d.ID, gi, ci) @@ -215,7 +215,7 @@ func getAdditionalXids(input string) []uint64 { // getDevicePlacement returns the placement of the specified device. // For a MIG device the placement is defined by the 3-tuple // For a full device the returned 3-tuple is the device's uuid and 0xFFFFFFFF for the other two elements. -func (r *nvmlResourceManager) getDevicePlacement(d *Device) (string, int, int, error) { +func (r *nvmlResourceManager) getDevicePlacement(d *Device) (string, uint32, uint32, error) { if !d.IsMigDevice() { return d.GetUUID(), 0xFFFFFFFF, 0xFFFFFFFF, nil } @@ -223,7 +223,7 @@ func (r *nvmlResourceManager) getDevicePlacement(d *Device) (string, int, int, e } // getMigDeviceParts returns the parent GI and CI ids of the MIG device. -func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, int, int, error) { +func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, uint32, uint32, error) { if !d.IsMigDevice() { return "", 0, 0, fmt.Errorf("cannot get GI and CI of full device") } @@ -250,32 +250,42 @@ func (r *nvmlResourceManager) getMigDeviceParts(d *Device) (string, int, int, er if ret != nvml.SUCCESS { return "", 0, 0, fmt.Errorf("failed to get Compute Instance ID: %v", ret) } - return parentUUID, gi, ci, nil + //nolint:gosec // We know that the values returned from Get*InstanceId are within the valid uint32 range. + return parentUUID, uint32(gi), uint32(ci), nil } return parseMigDeviceUUID(uuid) } // parseMigDeviceUUID splits the MIG device UUID into the parent device UUID and ci and gi -func parseMigDeviceUUID(mig string) (string, int, int, error) { +func parseMigDeviceUUID(mig string) (string, uint32, uint32, error) { tokens := strings.SplitN(mig, "-", 2) if len(tokens) != 2 || tokens[0] != "MIG" { - return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") + return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") } tokens = strings.SplitN(tokens[1], "/", 3) if len(tokens) != 3 || !strings.HasPrefix(tokens[0], "GPU-") { - return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") + return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") } - gi, err := strconv.Atoi(tokens[1]) + gi, err := toUint32(tokens[1]) if err != nil { - return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") + return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") } - ci, err := strconv.Atoi(tokens[2]) + ci, err := toUint32(tokens[2]) if err != nil { - return "", 0, 0, fmt.Errorf("Unable to parse UUID as MIG device") + return "", 0, 0, fmt.Errorf("unable to parse UUID as MIG device") } return tokens[0], gi, ci, nil } + +func toUint32(s string) (uint32, error) { + u, err := strconv.ParseUint(s, 10, 32) + if err != nil { + return 0, err + } + //nolint:gosec // Since we parse s with a 32-bit size this will not overflow. + return uint32(u), nil +} diff --git a/internal/vgpu/pciutil.go b/internal/vgpu/pciutil.go index 112daee1b..ea1664961 100644 --- a/internal/vgpu/pciutil.go +++ b/internal/vgpu/pciutil.go @@ -122,11 +122,11 @@ func (d *PCIDevice) GetVendorSpecificCapability() ([]byte, error) { } var visited [256]byte - pos := int(GetByte(d.Config, PciCapabilityList)) + pos := GetByte(d.Config, PciCapabilityList) for pos != 0 { - id := int(GetByte(d.Config, pos+PciCapabilityListID)) - next := int(GetByte(d.Config, pos+PciCapabilityListNext)) - length := int(GetByte(d.Config, pos+PciCapabilityLength)) + id := GetByte(d.Config, pos+PciCapabilityListID) + next := GetByte(d.Config, pos+PciCapabilityListNext) + length := GetByte(d.Config, pos+PciCapabilityLength) if visited[pos] != 0 { // chain looped @@ -149,7 +149,7 @@ func (d *PCIDevice) GetVendorSpecificCapability() ([]byte, error) { } // GetByte returns a single byte of data at specified position -func GetByte(buffer []byte, pos int) uint8 { +func GetByte(buffer []byte, pos uint8) uint8 { return buffer[pos] } diff --git a/internal/vgpu/vgpu.go b/internal/vgpu/vgpu.go index 15ccffe0e..828e2cb7e 100644 --- a/internal/vgpu/vgpu.go +++ b/internal/vgpu/vgpu.go @@ -40,7 +40,7 @@ type Info struct { const ( // VGPUCapabilityRecordStart indicates offset of beginning vGPU capability record - VGPUCapabilityRecordStart = 5 + VGPUCapabilityRecordStart uint8 = 5 // HostDriverVersionLength indicates max length of driver version HostDriverVersionLength = 10 // HostDriverBranchLength indicates max length of driver branch @@ -116,14 +116,14 @@ func (d *Device) GetInfo() (*Info, error) { foundDriverVersionRecord := false pos := VGPUCapabilityRecordStart record := GetByte(d.vGPUCapability, VGPUCapabilityRecordStart) - for record != 0 && pos < len(d.vGPUCapability) { + for record != 0 && int(pos) < len(d.vGPUCapability) { // find next record recordLength := GetByte(d.vGPUCapability, pos+1) - pos += int(recordLength) + pos += recordLength record = GetByte(d.vGPUCapability, pos) } - if record == 0 && pos+2+HostDriverVersionLength+HostDriverBranchLength <= len(d.vGPUCapability) { + if record == 0 && int(pos+2+HostDriverVersionLength+HostDriverBranchLength) <= len(d.vGPUCapability) { foundDriverVersionRecord = true // found vGPU host driver version record type // initialized at record data byte, i.e pos + 1(record id byte) + 1(record lengh byte) From efeab001ec440c5f23cb05e32ecb4da606789c77 Mon Sep 17 00:00:00 2001 From: Monokaix Date: Wed, 4 Sep 2024 15:46:07 +0800 Subject: [PATCH 2/2] Update owner information Signed-off-by: Monokaix --- OWNERS | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 OWNERS diff --git a/OWNERS b/OWNERS new file mode 100644 index 000000000..543fa787a --- /dev/null +++ b/OWNERS @@ -0,0 +1,8 @@ +reviewers: + - william-wang + - archlitchi + - wangyang0616 + - Monokaix +approvers: + - william-wang + - Monokaix