Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

new flag "--gpus" to enable Nvidia container runtime #17314

Merged
merged 13 commits into from
Oct 6, 2023
20 changes: 20 additions & 0 deletions cmd/minikube/cmd/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -1302,6 +1302,12 @@ func validateFlags(cmd *cobra.Command, drvName string) {
}
}

if cmd.Flags().Changed(gpus) {
if err := validateGPUs(viper.GetString(gpus), drvName, viper.GetString(containerRuntime)); err != nil {
exit.Message(reason.Usage, "{{.err}}", out.V{"err": err})
}
}

if driver.IsSSH(drvName) {
sshIPAddress := viper.GetString(sshIPAddress)
if sshIPAddress == "" {
Expand Down Expand Up @@ -1438,6 +1444,20 @@ func validateRuntime(rtime string) error {
return nil
}

// validateGPUs checks the value passed to the gpus flag and whether it can be
// used with the selected driver and container runtime.
//
// An empty value is always accepted. Any other value must be "nvidia" or
// "all", the driver must equal constants.Docker, and the runtime must equal
// either constants.Docker or constants.DefaultContainerRuntime; otherwise an
// error describing the violated rule is returned.
func validateGPUs(value, drvName, rtime string) error {
	switch value {
	case "":
		// flag value is empty: nothing to validate
		return nil
	case "nvidia", "all":
		// recognised option; continue to the driver/runtime check below
	default:
		return errors.Errorf(`The gpus flag must be passed a value of "nvidia" or "all"`)
	}

	dockerDriver := drvName == constants.Docker
	dockerRuntime := rtime == constants.Docker || rtime == constants.DefaultContainerRuntime
	if !dockerDriver || !dockerRuntime {
		return errors.Errorf("The gpus flag can only be used with the docker driver and docker container-runtime")
	}
	return nil
}

func getContainerRuntime(old *config.ClusterConfig) string {
paramRuntime := viper.GetString(containerRuntime)

Expand Down
3 changes: 3 additions & 0 deletions cmd/minikube/cmd/start_flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ const (
socketVMnetPath = "socket-vmnet-path"
staticIP = "static-ip"
autoPauseInterval = "auto-pause-interval"
gpus = "gpus"
spowelljr marked this conversation as resolved.
Show resolved Hide resolved
)

var (
Expand Down Expand Up @@ -204,6 +205,7 @@ func initMinikubeFlags() {
startCmd.Flags().Bool(disableMetrics, false, "If set, disables metrics reporting (CPU and memory usage), this can improve CPU usage. Defaults to false.")
startCmd.Flags().String(staticIP, "", "Set a static IP for the minikube cluster, the IP must be: private, IPv4, and the last octet must be between 2 and 254, for example 192.168.200.200 (Docker and Podman drivers only)")
startCmd.Flags().Duration(autoPauseInterval, time.Minute*1, "Duration of inactivity before the minikube VM is paused (default 1m0s). To disable, set to 0s")
startCmd.Flags().StringP(gpus, "g", "", "Allow pods to use your NVIDIA GPUs. Options include: [all,nvidia] (Docker driver with Docker container-runtime only)")
}

// initKubernetesFlags inits the commandline flags for Kubernetes related options
Expand Down Expand Up @@ -595,6 +597,7 @@ func generateNewConfigFromFlags(cmd *cobra.Command, k8sVersion string, rtime str
},
MultiNodeRequested: viper.GetInt(nodes) > 1,
AutoPauseInterval: viper.GetDuration(autoPauseInterval),
GPUs: viper.GetString(gpus),
}
cc.VerifyComponents = interpretWaitFlag(*cmd)
if viper.GetBool(createMount) && driver.IsKIC(drvName) {
Expand Down
30 changes: 29 additions & 1 deletion cmd/minikube/cmd/start_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -444,7 +444,6 @@ func TestValidateRuntime(t *testing.T) {
runtime: "docker",
errorMsg: "",
},

{
runtime: "test",
errorMsg: fmt.Sprintf("Invalid Container Runtime: test. Valid runtimes are: %v", cruntime.ValidRuntimes()),
Expand Down Expand Up @@ -860,3 +859,32 @@ func TestImageMatchesBinaryVersion(t *testing.T) {
}
}
}

// TestValidateGPUs checks validateGPUs against accepted and rejected
// combinations of gpus value, driver name and container runtime.
func TestValidateGPUs(t *testing.T) {
	tests := []struct {
		gpus     string
		drvName  string
		runtime  string
		errorMsg string // expected error text; empty means no error is expected
	}{
		{"", "kvm", "containerd", ""},
		{"all", "docker", "docker", ""},
		{"nvidia", "docker", "docker", ""},
		{"all", "docker", "", ""},
		{"nvidia", "docker", "", ""},
		{"all", "kvm", "docker", "The gpus flag can only be used with the docker driver and docker container-runtime"},
		{"nvidia", "docker", "containerd", "The gpus flag can only be used with the docker driver and docker container-runtime"},
		{"cat", "docker", "docker", `The gpus flag must be passed a value of "nvidia" or "all"`},
	}

	for _, tc := range tests {
		gotError := ""
		if err := validateGPUs(tc.gpus, tc.drvName, tc.runtime); err != nil {
			gotError = err.Error()
		}
		if gotError != tc.errorMsg {
			// Report the error text, not the error value: formatting a nil
			// error with %q prints "%!q(<nil>)". Inputs are quoted so that
			// empty strings stay visible in the failure output.
			t.Errorf("validateGPUs(%q, %q, %q) = %q; want = %q", tc.gpus, tc.drvName, tc.runtime, gotError, tc.errorMsg)
		}
	}
}
4 changes: 4 additions & 0 deletions deploy/addons/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,4 +166,8 @@ var (
// Kubeflow assets for kubeflow addon
//go:embed kubeflow/*.yaml
Kubeflow embed.FS

// NvidiaDevicePlugin assets for nvidia-device-plugin addon
//go:embed nvidia-device-plugin/*.tmpl
NvidiaDevicePlugin embed.FS
)
56 changes: 56 additions & 0 deletions deploy/addons/nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# DaemonSet that runs the nvidia-device-plugin container in the kube-system namespace.
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: nvidia-device-plugin-daemonset
namespace: kube-system
spec:
selector:
matchLabels:
name: nvidia-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
labels:
name: nvidia-device-plugin-ds
spec:
# Tolerate the nvidia.com/gpu NoSchedule taint so the pod can be scheduled on nodes that carry it.
tolerations:
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
# Mark this pod as a critical add-on; when enabled, the critical add-on
# scheduler reserves resources for critical add-on pods so that they can
# be rescheduled after a failure.
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
priorityClassName: "system-node-critical"
containers:
# Image reference is the registry prefix followed by .Images.NvidiaDevicePlugin.
# NOTE(review): presumably the registry falls back from the custom registry to
# .ImageRepository to the default registry — confirm against the template's "default" helper.
- image: {{.CustomRegistries.NvidiaDevicePlugin | default .ImageRepository | default .Registries.NvidiaDevicePlugin}}{{.Images.NvidiaDevicePlugin}}
name: nvidia-device-plugin-ctr
env:
# NOTE(review): presumably stops the plugin from failing when its initialization
# hits an error (e.g. no GPU on the node) — confirm against the NVIDIA k8s-device-plugin docs.
- name: FAIL_ON_INIT_ERROR
value: "false"
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
# The host's kubelet device-plugins directory is mounted at the same path inside the container.
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
5 changes: 5 additions & 0 deletions pkg/addons/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,4 +227,9 @@ var Addons = []*Addon{
set: SetBool,
callbacks: []setFn{EnableOrDisableAddon},
},
{
name: "nvidia-device-plugin",
set: SetBool,
callbacks: []setFn{EnableOrDisableAddon},
},
}
1 change: 1 addition & 0 deletions pkg/drivers/kic/kic.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ func (d *Driver) Create() error {
ExtraArgs: append([]string{"--expose", fmt.Sprintf("%d", d.NodeConfig.APIServerPort)}, d.NodeConfig.ExtraArgs...),
OCIBinary: d.NodeConfig.OCIBinary,
APIServerPort: d.NodeConfig.APIServerPort,
GPUs: d.NodeConfig.GPUs,
}

networkName := d.NodeConfig.Network
Expand Down
3 changes: 3 additions & 0 deletions pkg/drivers/kic/oci/oci.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@ func CreateContainerNode(p CreateParams) error {
runArgs = append(runArgs, "--network", p.Network)
runArgs = append(runArgs, "--ip", p.IP)
}
if p.GPUs != "" {
runArgs = append(runArgs, "--gpus", "all")
}

memcgSwap := hasMemorySwapCgroup()
memcg := HasMemoryCgroup()
Expand Down
3 changes: 2 additions & 1 deletion pkg/drivers/kic/oci/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ type CreateParams struct {
ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080...
OCIBinary string // docker or podman
Network string // network name that the container will attach to
IP string // static IP to assign for th container in the cluster network
IP string // static IP to assign the container in the cluster network
GPUs string // add NVIDIA GPU devices to the container
}

// createOpt is an option for Create
Expand Down
1 change: 1 addition & 0 deletions pkg/drivers/kic/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,5 @@ type Config struct {
StaticIP string // static IP for the kic cluster
ExtraArgs []string // a list of any extra option to pass to oci binary during creation time, for example --expose 8080...
ListenAddress string // IP Address to listen to
GPUs string // add NVIDIA GPU devices to the container
}
13 changes: 13 additions & 0 deletions pkg/minikube/assets/addons.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ func (a *Addon) IsEnabledOrDefault(cc *config.ClusterConfig) bool {
return a.enabled
}

// EnableByDefault marks the addon as enabled by default, so that it is turned
// on at cluster start. It only sets the enabled field on this Addon value.
func (a *Addon) EnableByDefault() {
a.enabled = true
}

// Addons is the list of addons
// TODO: Make dynamically loadable: move this data to a .yaml file within each addon directory
var Addons = map[string]*Addon{
Expand Down Expand Up @@ -774,6 +779,14 @@ var Addons = map[string]*Addon{
MustBinAsset(addons.Kubeflow, "kubeflow/kubeflow.yaml", vmpath.GuestAddonsDir, "kubeflow.yaml", "0640"),
}, false, "kubeflow", "3rd party", "", "", nil, nil,
),
"nvidia-device-plugin": NewAddon([]*BinAsset{
MustBinAsset(addons.NvidiaDevicePlugin, "nvidia-device-plugin/nvidia-device-plugin.yaml.tmpl", vmpath.GuestAddonsDir, "nvidia-device-plugin.yaml", "0640"),
}, false, "nvidia-device-plugin", "3rd party (NVIDIA)", "", "",
map[string]string{
"NvidiaDevicePlugin": "nvidia/k8s-device-plugin:v0.14.1@sha256:15c4280d13a61df703b12d1fd1b5b5eec4658157db3cb4b851d3259502310136",
}, map[string]string{
"NvidiaDevicePlugin": "nvcr.io",
}),
}

// parseMapString creates a map based on `str` which is encoded as <key1>=<value1>,<key2>=<value2>,...
Expand Down
1 change: 1 addition & 0 deletions pkg/minikube/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ type ClusterConfig struct {
SSHAuthSock string
SSHAgentPID int
AutoPauseInterval time.Duration // Specifies interval of time to wait before checking if cluster should be paused
GPUs string
}

// KubernetesConfig contains the parameters used to configure the VM Kubernetes.
Expand Down
8 changes: 5 additions & 3 deletions pkg/minikube/cruntime/cruntime.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func (cs ContainerState) String() string {

// ValidRuntimes lists the supported container runtimes
func ValidRuntimes() []string {
return []string{"docker", "nvidia-docker", "cri-o", "containerd"}
return []string{"docker", "cri-o", "containerd"}
}

// CommandRunner is the subset of command.Runner this package consumes
Expand Down Expand Up @@ -155,6 +155,8 @@ type Config struct {
KubernetesVersion semver.Version
// InsecureRegistry list of insecure registries
InsecureRegistry []string
// GPUs add GPU devices to the container
GPUs bool
}

// ListContainersOptions are the options to use for listing containers
Expand Down Expand Up @@ -210,7 +212,7 @@ func New(c Config) (Manager, error) {
sm := sysinit.New(c.Runner)

switch c.Type {
case "", "docker", "nvidia-docker":
case "", "docker":
sp := c.Socket
cs := ""
// There is no more dockershim socket, in Kubernetes version 1.24 and beyond
Expand All @@ -219,7 +221,6 @@ func New(c Config) (Manager, error) {
cs = "cri-docker.socket"
}
return &Docker{
Type: c.Type,
Socket: sp,
Runner: c.Runner,
NetworkPlugin: c.NetworkPlugin,
Expand All @@ -228,6 +229,7 @@ func New(c Config) (Manager, error) {
Init: sm,
UseCRI: (sp != ""), // !dockershim
CRIService: cs,
GPUs: c.GPUs,
}, nil
case "crio", "cri-o":
return &CRIO{
Expand Down
18 changes: 1 addition & 17 deletions pkg/minikube/cruntime/cruntime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ func TestName(t *testing.T) {
}{
{"", "Docker"},
{"docker", "Docker"},
{"nvidia-docker", "Docker"},
{"crio", "CRI-O"},
{"cri-o", "CRI-O"},
{"containerd", "containerd"},
Expand Down Expand Up @@ -125,7 +124,6 @@ func TestCGroupDriver(t *testing.T) {
want string
}{
{"docker", "cgroupfs"},
{"nvidia-docker", "cgroupfs"},
{"crio", "cgroupfs"},
{"containerd", "cgroupfs"},
}
Expand Down Expand Up @@ -157,12 +155,6 @@ func TestKubeletOptions(t *testing.T) {
{"docker", "1.24.0", map[string]string{
"container-runtime-endpoint": "unix:///var/run/cri-dockerd.sock",
}},
{"nvidia-docker", "1.23.0", map[string]string{
"container-runtime": "docker",
}},
{"nvidia-docker", "1.25.0", map[string]string{
"container-runtime-endpoint": "unix:///var/run/cri-dockerd.sock",
}},
{"crio", "1.25.0", map[string]string{
"container-runtime-endpoint": "unix:///var/run/crio/crio.sock",
}},
Expand Down Expand Up @@ -688,13 +680,6 @@ func TestEnable(t *testing.T) {
"crio": SvcExited,
"crio-shutdown": SvcExited,
}},
{"nvidia-docker", defaultServices,
map[string]serviceState{
"docker": SvcRestarted,
"containerd": SvcExited,
"crio": SvcExited,
"crio-shutdown": SvcExited,
}},
{"containerd", defaultServices,
map[string]serviceState{
"docker": SvcExited,
Expand Down Expand Up @@ -736,7 +721,6 @@ func TestContainerFunctions(t *testing.T) {
runtime string
}{
{"docker"},
{"nvidia-docker"},
{"crio"},
{"containerd"},
}
Expand All @@ -746,7 +730,7 @@ func TestContainerFunctions(t *testing.T) {
t.Run(tc.runtime, func(t *testing.T) {
runner := NewFakeRunner(t)
prefix := ""
if tc.runtime == "docker" || tc.runtime == "nvidia-docker" {
if tc.runtime == "docker" {
prefix = "k8s_"
}
runner.containers = map[string]string{
Expand Down
Loading