Add support for requesting GPUs (#509)
* Add support for requesting GPUs

* Refine unit test

* Add GPU spec and refine unit test

* Add user guide about GPU and refine code format

* Fix typo
tkanng authored and liyinan926 committed Jun 7, 2019
1 parent df94871 commit fbdd41b
Showing 5 changed files with 313 additions and 1 deletion.
29 changes: 29 additions & 0 deletions docs/user-guide.md
@@ -11,6 +11,7 @@ The Kubernetes Operator for Apache Spark ships with a command-line tool called `
* [Specifying Hadoop Configuration](#specifying-hadoop-configuration)
* [Writing Driver Specification](#writing-driver-specification)
* [Writing Executor Specification](#writing-executor-specification)
* [Requesting GPU Resources](#requesting-gpu-resources)
* [Mounting Secrets](#mounting-secrets)
* [Mounting ConfigMaps](#mounting-configmaps)
* [Mounting a ConfigMap storing Spark Configuration Files](#mounting-a-configmap-storing-spark-configuration-files)
@@ -145,6 +146,34 @@ spec:
      version: 2.4.0
```

### Requesting GPU Resources

A `SparkApplication` can request GPU resources for the driver and executor pods through the optional fields `.spec.driver.gpu` and `.spec.executor.gpu`, respectively. Below is an example:

```yaml
spec:
  driver:
    cores: 0.1
    coreLimit: "200m"
    memory: "512m"
    gpu:
      name: "amd.com/gpu" # GPU resource name
      quantity: 1         # number of GPUs to request
    labels:
      version: 2.4.0
    serviceAccount: spark
  executor:
    cores: 1
    instances: 1
    memory: "512m"
    gpu:
      name: "nvidia.com/gpu"
      quantity: 1
```
Note that the mutating admission webhook is required for this feature to work. Please refer to the [Quick Start Guide](quick-start-guide.md) for instructions on enabling the mutating admission webhook.
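
For illustration, here is a minimal sketch of the JSON patch the webhook would emit for the executor example above when the container defines no resource limits yet. The `patchOperation` struct is an assumed mirror of the unexported type in `pkg/webhook/patch.go`, and the container index is illustrative:

```go
package main

import (
	"encoding/json"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// patchOperation mirrors the unexported struct in pkg/webhook (assumed shape).
type patchOperation struct {
	Op    string      `json:"op"`
	Path  string      `json:"path"`
	Value interface{} `json:"value"`
}

func main() {
	// The patch addGPU emits when the executor container (index 0 here)
	// has no resource limits yet: add the whole limits map at once.
	op := patchOperation{
		Op:   "add",
		Path: "/spec/containers/0/resources/limits",
		Value: corev1.ResourceList{
			corev1.ResourceName("nvidia.com/gpu"): *resource.NewQuantity(1, resource.DecimalSI),
		},
	}
	b, _ := json.Marshal(op)
	fmt.Println(string(b))
	// {"op":"add","path":"/spec/containers/0/resources/limits","value":{"nvidia.com/gpu":"1"}}
}
```

When the container already has resource limits, the webhook instead patches a single entry under the existing map; see `addGPU` in `pkg/webhook/patch.go` below.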


### Mounting Secrets

As mentioned above, both the driver specification and executor specification have an optional field `secrets` for configuring the list of Kubernetes Secrets to be mounted into the driver and executors, respectively. The field is a map with the names of the Secrets as keys and values specifying the mount path and type of each Secret. For instance, the following example shows a driver specification with a Secret named `gcp-svc-account` of type `GCPServiceAccount` to be mounted to `/mnt/secrets` in the driver pod.
10 changes: 10 additions & 0 deletions pkg/apis/sparkoperator.k8s.io/v1beta1/types.go
@@ -346,6 +346,9 @@ type SparkPodSpec struct {
	// MemoryOverhead is the amount of off-heap memory to allocate in cluster mode, in MiB unless otherwise specified.
	// Optional.
	MemoryOverhead *string `json:"memoryOverhead,omitempty"`
	// GPU specifies the GPU requirement for the pod.
	// Optional.
	GPU *GPUSpec `json:"gpu,omitempty"`
	// Image is the container image to use. Overrides Spec.Image if set.
	// Optional.
	Image *string `json:"image,omitempty"`
@@ -499,6 +502,13 @@ type PrometheusSpec struct {
	Configuration *string `json:"configuration,omitempty"`
}

type GPUSpec struct {
	// Name is the GPU resource name, such as nvidia.com/gpu or amd.com/gpu.
	Name string `json:"name"`
	// Quantity is the number of GPUs to request for the driver or executor.
	Quantity int64 `json:"quantity"`
}

// PrometheusMonitoringEnabled returns if Prometheus monitoring is enabled or not.
func (s *SparkApplication) PrometheusMonitoringEnabled() bool {
	return s.Spec.Monitoring != nil && s.Spec.Monitoring.Prometheus != nil
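For reference, the executor's `gpu:` block in the user-guide example above deserializes into this new `GPUSpec` type. A minimal sketch, assuming the repository's import path as used in `pkg/webhook/patch.go`:

```go
package main

import (
	"fmt"

	"github.com/GoogleCloudPlatform/spark-on-k8s-operator/pkg/apis/sparkoperator.k8s.io/v1beta1"
)

func main() {
	// Programmatic equivalent of:
	//   gpu:
	//     name: "nvidia.com/gpu"
	//     quantity: 1
	gpu := v1beta1.GPUSpec{
		Name:     "nvidia.com/gpu",
		Quantity: 1,
	}
	fmt.Printf("requesting %d x %s\n", gpu.Quantity, gpu.Name)
}
```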
33 changes: 33 additions & 0 deletions pkg/apis/sparkoperator.k8s.io/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

48 changes: 47 additions & 1 deletion pkg/webhook/patch.go
@@ -18,10 +18,12 @@ package webhook

import (
	"fmt"
	"strings"

	"github.com/golang/glog"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/GoogleCloudPlatform/spark-on-k8s-operator/pkg/apis/sparkoperator.k8s.io/v1beta1"
@@ -74,7 +76,10 @@ func patchSparkPod(pod *corev1.Pod, app *v1beta1.SparkApplication) []patchOperat
			patchOps = append(patchOps, *op)
		}
	}
	// Add a GPU resource limit to the driver or executor container if requested.
	op = addGPU(pod, app)
	if op != nil {
		patchOps = append(patchOps, *op)
	}
	return patchOps
}

@@ -354,6 +359,47 @@ func addSidecarContainers(pod *corev1.Pod, app *v1beta1.SparkApplication) []patc
	return ops
}

func addGPU(pod *corev1.Pod, app *v1beta1.SparkApplication) *patchOperation {
	var gpu *v1beta1.GPUSpec
	if util.IsDriverPod(pod) {
		gpu = app.Spec.Driver.GPU
	}
	if util.IsExecutorPod(pod) {
		gpu = app.Spec.Executor.GPU
	}
	if gpu == nil {
		return nil
	}
	if gpu.Name == "" {
		glog.V(2).Infof("Please specify a GPU resource name, e.g. nvidia.com/gpu or amd.com/gpu. Current GPU spec: %+v", gpu)
		return nil
	}
	if gpu.Quantity <= 0 {
		glog.V(2).Infof("GPU Quantity must be positive. Current GPU spec: %+v", gpu)
		return nil
	}
	i := 0
	// Find the driver or executor container in the pod.
	for ; i < len(pod.Spec.Containers); i++ {
		if pod.Spec.Containers[i].Name == sparkDriverContainerName ||
			pod.Spec.Containers[i].Name == sparkExecutorContainerName {
			break
		}
	}
	path := fmt.Sprintf("/spec/containers/%d/resources/limits", i)
	var value interface{}
	if len(pod.Spec.Containers[i].Resources.Limits) == 0 {
		// The container has no resource limits yet, so add the whole limits map.
		value = corev1.ResourceList{
			corev1.ResourceName(gpu.Name): *resource.NewQuantity(gpu.Quantity, resource.DecimalSI),
		}
	} else {
		// Limits already exist: patch in a single entry, escaping the resource
		// name so the "/" in e.g. "nvidia.com/gpu" is not treated as a JSON
		// Pointer path separator.
		encoder := strings.NewReplacer("~", "~0", "/", "~1")
		path += "/" + encoder.Replace(gpu.Name)
		value = *resource.NewQuantity(gpu.Quantity, resource.DecimalSI)
	}
	return &patchOperation{Op: "add", Path: path, Value: value}
}
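
The `strings.NewReplacer("~", "~0", "/", "~1")` in the branch above applies RFC 6901 JSON Pointer escaping, so the `/` in a resource name such as `nvidia.com/gpu` is not misread as a path separator. A standalone sketch of the resulting path:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// RFC 6901: "~" must be escaped as "~0" and "/" as "~1" inside a
	// JSON Pointer segment; otherwise "nvidia.com/gpu" would be read as
	// two segments ("nvidia.com" and "gpu").
	encoder := strings.NewReplacer("~", "~0", "/", "~1")
	fmt.Println("/spec/containers/0/resources/limits/" + encoder.Replace("nvidia.com/gpu"))
	// Output: /spec/containers/0/resources/limits/nvidia.com~1gpu
}
```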

func hasContainer(pod *corev1.Pod, container *corev1.Container) bool {
	for _, c := range pod.Spec.Containers {
		if container.Name == c.Name && container.Image == c.Image {
