Skip to content

Commit

Permalink
koordlet: supply rdma devices (#2276)
Browse files Browse the repository at this point in the history
Signed-off-by: wangjianyu.wjy <wangjianyu.wjy@alibaba-inc.com>
Co-authored-by: wangjianyu.wjy <wangjianyu.wjy@alibaba-inc.com>
  • Loading branch information
ZiMengSheng and wangjianyu.wjy authored Dec 5, 2024
1 parent b8b1d89 commit 7c94119
Show file tree
Hide file tree
Showing 15 changed files with 492 additions and 25 deletions.
2 changes: 1 addition & 1 deletion docker/koordlet.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ RUN go build -a -o koordlet cmd/koordlet/main.go

FROM --platform=$TARGETPLATFORM nvidia/cuda:11.8.0-base-ubuntu22.04
WORKDIR /
RUN apt-get update && apt-get install -y lvm2 iptables && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y lvm2 iptables pciutils && rm -rf /var/lib/apt/lists/*
COPY --from=builder /go/src/github.com/koordinator-sh/koordinator/koordlet .
COPY --from=builder /usr/local/lib /usr/lib
ENTRYPOINT ["/koordlet"]
8 changes: 8 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ module github.com/koordinator-sh/koordinator
go 1.20

require (
github.com/Mellanox/rdmamap v1.1.0
github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82
github.com/cakturk/go-netstat v0.0.0-20200220111822-e5b49efee7a5
github.com/containerd/nri v0.6.1
Expand All @@ -19,6 +20,7 @@ require (
github.com/golang/protobuf v1.5.3
github.com/google/go-cmp v0.5.9
github.com/google/uuid v1.3.0
github.com/jaypipes/ghw v0.12.0
github.com/jedib0t/go-pretty/v6 v6.4.0
github.com/k8stopologyawareschedwg/noderesourcetopology-api v0.1.1
github.com/mohae/deepcopy v0.0.0-20170603005431-491d3605edfb
Expand Down Expand Up @@ -65,23 +67,29 @@ require (

require (
cloud.google.com/go/compute/metadata v0.2.3 // indirect
github.com/StackExchange/wmi v1.2.1 // indirect
github.com/antlr/antlr4/runtime/Go/antlr/v4 v4.0.0-20230305170008-8188dc5388df // indirect
github.com/cenkalti/backoff/v4 v4.2.1 // indirect
github.com/containerd/containerd v1.6.9 // indirect
github.com/containerd/ttrpc v1.2.3 // indirect
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/evanphx/json-patch/v5 v5.6.0 // indirect
github.com/ghodss/yaml v1.0.0 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/google/cel-go v0.16.1 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/pprof v0.0.0-20220829040838-70bd9ae97f40 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3 // indirect
github.com/jaypipes/pcidb v1.0.0 // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/stoewer/go-strcase v1.2.0 // indirect
golang.org/x/exp v0.0.0-20220827204233-334a2380cb91 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20230525234035-dd9d682886f9 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20230731190214-cbb8c96f2d6d // indirect
howett.net/plist v1.0.0 // indirect
k8s.io/controller-manager v0.28.7 // indirect
k8s.io/dynamic-resource-allocation v0.28.7 // indirect
k8s.io/gengo v0.0.0-20220902162205-c0856e24416d // indirect
Expand Down
15 changes: 15 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,8 @@ github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab h1:UKkYhof1njT1
github.com/JeffAshton/win_pdh v0.0.0-20161109143554-76bb4ee9f0ab/go.mod h1:3VYc5hodBMJ5+l/7J4xAyMeuM2PNuepvHlGs8yilUCA=
github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk=
github.com/MakeNowJust/heredoc v1.0.0/go.mod h1:mG5amYoWBHf8vpLOuehzbGGw0EHxpZZ6lCpQ4fNJ8LE=
github.com/Mellanox/rdmamap v1.1.0 h1:A/W1wAXw+6vm58f3VklrIylgV+eDJlPVIMaIKuxgUT4=
github.com/Mellanox/rdmamap v1.1.0/go.mod h1:fN+/V9lf10ABnDCwTaXRjeeWijLt2iVLETnK+sx/LY8=
github.com/Microsoft/go-winio v0.4.11/go.mod h1:VhR8bwka0BXejwEJY73c50VrPtXAaKcyvVC4A4RozmA=
github.com/Microsoft/go-winio v0.4.14/go.mod h1:qXqCSQ3Xa7+6tgxaGTIe4Kpcdsi+P8jBhyzoq1bpyYA=
github.com/Microsoft/go-winio v0.4.15-0.20190919025122-fc70bd9a86b5/go.mod h1:tTuCMEN+UleMWgg9dVx4Hu52b1bJo+59jBh3ajtinzw=
Expand Down Expand Up @@ -264,6 +266,8 @@ github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cq
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU=
github.com/Shopify/logrus-bugsnag v0.0.0-20171204204709-577dee27f20d/go.mod h1:HI8ITrYtUY+O+ZhtlqUnD8+KwNPOyugEhfP9fdUIaEQ=
github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA=
github.com/StackExchange/wmi v1.2.1/go.mod h1:rcmrprowKIVzvc+NUiLncP2uuArMWLCbu9SBzvHz7e8=
github.com/a8m/tree v0.0.0-20210115125333-10a5fd5b637d/go.mod h1:FSdwKX97koS5efgm8WevNf7XS3PqtyFkKDDXrz778cg=
github.com/ajstarks/deck v0.0.0-20200831202436-30c9fc6549a9/go.mod h1:JynElWSGnm/4RlzPXRlREEwqTHAN3T56Bv2ITsFT3gY=
github.com/ajstarks/deck/generate v0.0.0-20210309230005-c3f852c02e19/go.mod h1:T13YZdzov6OU0A1+RfKZiZN9ca6VeKdBdyDV+BY97Tk=
Expand Down Expand Up @@ -586,6 +590,7 @@ github.com/fsnotify/fsnotify v1.6.0/go.mod h1:sl3t1tCWJFWoRz9R8WJCbQihKKwmorjAbS
github.com/fullsailor/pkcs7 v0.0.0-20190404230743-d7302db945fa/go.mod h1:KnogPXtdwXqoenmZCw6S+25EAm2MkxbG0deNDu4cbSA=
github.com/fvbommel/sortorder v1.1.0/go.mod h1:uk88iVf1ovNn1iLfgUVU2F9o5eO30ui720w+kxuqRs0=
github.com/garyburd/redigo v0.0.0-20150301180006-535138d7bcd7/go.mod h1:NR3MbYisc3/PwhQ00EMzDiPmrwpPxAn5GI05/YaO1SY=
github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk=
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
Expand Down Expand Up @@ -621,6 +626,9 @@ github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre
github.com/go-logr/zapr v1.2.3/go.mod h1:eIauM6P8qSvTw5o2ez6UEAfGjQKrxQTl5EoK+Qa2oG4=
github.com/go-logr/zapr v1.2.4 h1:QHVo+6stLbfJmYGkQ7uGHUCu5hnAFAj6mDe6Ea0SeOo=
github.com/go-logr/zapr v1.2.4/go.mod h1:FyHWQIzQORZ0QVE1BtVHv3cKtNLuXsbNLtpuhNapBOA=
github.com/go-ole/go-ole v1.2.5/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE=
github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
github.com/go-openapi/jsonreference v0.20.1/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
Expand Down Expand Up @@ -852,6 +860,10 @@ github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLf
github.com/ionos-cloud/sdk-go/v6 v6.1.3 h1:vb6yqdpiqaytvreM0bsn2pXw+1YDvEk2RKSmBAQvgDQ=
github.com/ishidawataru/sctp v0.0.0-20190723014705-7c296d48a2b5/go.mod h1:DM4VvS+hD/kDi1U1QsX2fnZowwBhqD0Dk3bRPKF/Oc8=
github.com/j-keck/arping v0.0.0-20160618110441-2cf9dc699c56/go.mod h1:ymszkNOg6tORTn+6F6j+Jc8TOr5osrynvN6ivFWZ2GA=
github.com/jaypipes/ghw v0.12.0 h1:xU2/MDJfWmBhJnujHY9qwXQLs3DBsf0/Xa9vECY0Tho=
github.com/jaypipes/ghw v0.12.0/go.mod h1:jeJGbkRB2lL3/gxYzNYzEDETV1ZJ56OKr+CSeSEym+g=
github.com/jaypipes/pcidb v1.0.0 h1:vtZIfkiCUE42oYbJS0TAq9XSfSmcsgo9IdxSm9qzYU8=
github.com/jaypipes/pcidb v1.0.0/go.mod h1:TnYUvqhPBzCKnH34KrIX22kAeEbDCSRJ9cqLRCuNDfk=
github.com/jedib0t/go-pretty/v6 v6.4.0 h1:YlI/2zYDrweA4MThiYMKtGRfT+2qZOO65ulej8GTcVI=
github.com/jedib0t/go-pretty/v6 v6.4.0/go.mod h1:MgmISkTWDSFu0xOqiZ0mKNntMQ2mDgOcwOkwBEkMDJI=
github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
Expand Down Expand Up @@ -1859,6 +1871,7 @@ gopkg.in/square/go-jose.v2 v2.6.0/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME=
gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI=
gopkg.in/yaml.v1 v1.0.0-20140924161607-9f9df34309c0/go.mod h1:WDnlLJ4WF5VGsH/HVa3CI79GS0ol3YnhVnKP89i0kNg=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
Expand All @@ -1883,6 +1896,8 @@ honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt
honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.1.3/go.mod h1:NgwopIslSNH47DimFoV78dnkksY2EFtX0ajyb3K/las=
howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM=
howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g=
k8s.io/api v0.28.7 h1:YKIhBxjXKaxuxWJnwohV0aGjRA5l4IU0Eywf/q19AVI=
k8s.io/api v0.28.7/go.mod h1:y4RbcjCCMff1930SG/TcP3AUKNfaJUgIeUp58e/2vyY=
k8s.io/apiextensions-apiserver v0.28.7 h1:NQlzP/vmvIO9Qt7wQTdMe9sGWGkozQZMPk9suehAvR8=
Expand Down
7 changes: 7 additions & 0 deletions pkg/features/koordlet_features.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,12 @@ const (
// Accelerators enables GPU related feature in koordlet. Only Nvidia GPUs supported.
Accelerators featuregate.Feature = "Accelerators"

// owner: @ZiMengSheng
// alpha: v1.6
//
// NetDevices enables RDMA related feature in koordlet.
RDMADevices featuregate.Feature = "RDMADevices"

// owner: @songtao98 @zwzhang0107
// alpha: v1.0
//
Expand Down Expand Up @@ -164,6 +170,7 @@ var (
CgroupReconcile: {Default: false, PreRelease: featuregate.Alpha},
NodeTopologyReport: {Default: true, PreRelease: featuregate.Beta},
Accelerators: {Default: false, PreRelease: featuregate.Alpha},
RDMADevices: {Default: false, PreRelease: featuregate.Alpha},
CPICollector: {Default: false, PreRelease: featuregate.Alpha},
Libpfm4: {Default: false, PreRelease: featuregate.Alpha},
PSICollector: {Default: false, PreRelease: featuregate.Alpha},
Expand Down
11 changes: 10 additions & 1 deletion pkg/koordlet/metricsadvisor/devices/gpu/collector_gpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"errors"
"fmt"
"sort"
"strings"
"sync"
"time"

Expand All @@ -30,6 +31,7 @@ import (

"github.com/koordinator-sh/koordinator/pkg/features"
"github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache"
"github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/devices/helper"
"github.com/koordinator-sh/koordinator/pkg/koordlet/util"
)

Expand Down Expand Up @@ -121,7 +123,14 @@ func (g *gpuDeviceManager) initGPUData() error {
if ret != nvml.SUCCESS {
return fmt.Errorf("unable to get pci info: %v", nvml.ErrorString(ret))
}
nodeID, pcie, busID, err := parseGPUPCIInfo(pciInfo.BusIdLegacy)
busIDBuilder := &strings.Builder{}
for _, v := range pciInfo.BusIdLegacy {
if v != 0 {
busIDBuilder.WriteByte(byte(v))
}
}
busID := strings.ToLower(busIDBuilder.String())
nodeID, pcie, busID, err := helper.ParsePCIInfo(busID)
if err != nil {
return err
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package gpu
package helper

import (
"bytes"
Expand All @@ -23,7 +23,6 @@ import (
"path/filepath"
"regexp"
"strconv"
"strings"

"github.com/koordinator-sh/koordinator/pkg/koordlet/util/system"
)
Expand All @@ -32,14 +31,7 @@ var (
pcieRegexp = regexp.MustCompile(`pci\d{4}:[0-9a-fA-F]{2}`)
)

func parseGPUPCIInfo(busIdLegacy [16]int8) (int32, string, string, error) {
busIDBuilder := &strings.Builder{}
for _, v := range busIdLegacy {
if v != 0 {
busIDBuilder.WriteByte(byte(v))
}
}
busID := strings.ToLower(busIDBuilder.String())
func ParsePCIInfo(busID string) (int32, string, string, error) {
nodeID, err := getNUMANodeID(busID)
if err != nil {
return 0, "", "", fmt.Errorf("failed to parse NUMA Node ID, err: %w", err)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package gpu
package helper

import (
"fmt"
Expand Down Expand Up @@ -74,7 +74,7 @@ func Test_parseGPUPCIInfo(t *testing.T) {
for i, v := range tt.busID {
busIdLegacy[i] = int8(v)
}
nodeID, pcie, busID, err := parseGPUPCIInfo(busIdLegacy)
nodeID, pcie, busID, err := ParsePCIInfo(tt.busID)
if (err != nil) && !tt.wantErr {
t.Errorf("expect wantErr=%v but got err=%v", tt.wantErr, err)
return
Expand Down
77 changes: 77 additions & 0 deletions pkg/koordlet/metricsadvisor/devices/rdma/collector_rdma.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
Copyright 2022 The Koordinator Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package rdma

import (
corev1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"

"github.com/koordinator-sh/koordinator/pkg/features"
"github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache"
"github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/framework"
)

const (
DeviceCollectorName = "RDMA"
)

type rdmaCollector struct {
enabled bool
}

func New(opt *framework.Options) framework.DeviceCollector {
return &rdmaCollector{
enabled: features.DefaultKoordletFeatureGate.Enabled(features.RDMADevices),
}
}

func (g *rdmaCollector) Shutdown() {
}

func (g *rdmaCollector) Enabled() bool {
return g.enabled
}

func (g *rdmaCollector) Setup(fra *framework.Context) {
}

func (g *rdmaCollector) Run(stopCh <-chan struct{}) {
}

func (g *rdmaCollector) Started() bool {
return true
}

func (g *rdmaCollector) Infos() metriccache.Devices {
netDevices, err := GetNetDevice()
if err != nil {
klog.Errorf("failed to get net device: %v", err)
}
return netDevices
}

func (g *rdmaCollector) GetNodeMetric() ([]metriccache.MetricSample, error) {
return nil, nil
}

func (g *rdmaCollector) GetPodMetric(uid, podParentDir string, cs []corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
return nil, nil
}

func (g *rdmaCollector) GetContainerMetric(containerID, podParentDir string, c *corev1.ContainerStatus) ([]metriccache.MetricSample, error) {
return nil, nil
}
Loading

0 comments on commit 7c94119

Please sign in to comment.