Add support for requesting GPUs (#509)
* Add support for requesting GPUs

* Refine unit test

* Add GPU spec and refine unit test

* Add user guide about GPU and refine code format

* Fix typo
tkanng authored and liyinan926 committed Jun 7, 2019
1 parent df94871 commit fbdd41b
Showing 5 changed files with 313 additions and 1 deletion.
29 changes: 29 additions & 0 deletions docs/user-guide.md
@@ -11,6 +11,7 @@ The Kubernetes Operator for Apache Spark ships with a command-line tool called `
* [Specifying Hadoop Configuration](#specifying-hadoop-configuration)
* [Writing Driver Specification](#writing-driver-specification)
* [Writing Executor Specification](#writing-executor-specification)
* [Requesting GPU Resources](#requesting-gpu-resources)
* [Mounting Secrets](#mounting-secrets)
* [Mounting ConfigMaps](#mounting-configmaps)
* [Mounting a ConfigMap storing Spark Configuration Files](#mounting-a-configmap-storing-spark-configuration-files)
@@ -145,6 +146,34 @@ spec:
      version: 2.4.0
```

### Requesting GPU Resources

A `SparkApplication` can request GPU resources for the driver and executor pods through the optional fields `.spec.driver.gpu` and `.spec.executor.gpu`, respectively. Below is an example:

```yaml
spec:
  driver:
    cores: 0.1
    coreLimit: "200m"
    memory: "512m"
    gpu:
      name: "amd.com/gpu" # GPU resource name
      quantity: 1         # number of GPUs to request
    labels:
      version: 2.4.0
    serviceAccount: spark
  executor:
    cores: 1
    instances: 1
    memory: "512m"
    gpu:
      name: "nvidia.com/gpu"
      quantity: 1
```
Note that the mutating admission webhook is required for this feature to work. Please refer to the [Quick Start Guide](quick-start-guide.md) for instructions on enabling the mutating admission webhook.
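
For illustration, here is a minimal sketch of the JSON patch the webhook would emit for the executor example above when the container defines no resource limits yet. The `patchOperation` struct is an assumed mirror of the unexported type in `pkg/webhook/patch.go`, and the container index is illustrative:

```go
package main

import (
	"encoding/json"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// patchOperation mirrors the unexported struct in pkg/webhook (assumed shape).
type patchOperation struct {
	Op    string      `json:"op"`
	Path  string      `json:"path"`
	Value interface{} `json:"value"`
}

func main() {
	// The patch addGPU emits when the executor container (index 0 here)
	// has no resource limits yet: add the whole limits map at once.
	op := patchOperation{
		Op:   "add",
		Path: "/spec/containers/0/resources/limits",
		Value: corev1.ResourceList{
			corev1.ResourceName("nvidia.com/gpu"): *resource.NewQuantity(1, resource.DecimalSI),
		},
	}
	b, _ := json.Marshal(op)
	fmt.Println(string(b))
	// {"op":"add","path":"/spec/containers/0/resources/limits","value":{"nvidia.com/gpu":"1"}}
}
```

When the container already has resource limits, the webhook instead patches a single entry under the existing map; see `addGPU` in `pkg/webhook/patch.go` below.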


### Mounting Secrets

As mentioned above, both the driver specification and executor specification have an optional field `secrets` for configuring the list of Kubernetes Secrets to be mounted into the driver and executors, respectively. The field is a map with the names of the Secrets as keys and values specifying the mount path and type of each Secret. For instance, the following example shows a driver specification with a Secret named `gcp-svc-account` of type `GCPServiceAccount` to be mounted to `/mnt/secrets` in the driver pod.
10 changes: 10 additions & 0 deletions pkg/apis/sparkoperator.k8s.io/v1beta1/types.go
@@ -346,6 +346,9 @@ type SparkPodSpec struct {
	// MemoryOverhead is the amount of off-heap memory to allocate in cluster mode, in MiB unless otherwise specified.
	// Optional.
	MemoryOverhead *string `json:"memoryOverhead,omitempty"`
	// GPU specifies the GPU requirement for the pod.
	// Optional.
	GPU *GPUSpec `json:"gpu,omitempty"`
	// Image is the container image to use. Overrides Spec.Image if set.
	// Optional.
	Image *string `json:"image,omitempty"`
@@ -499,6 +502,13 @@ type PrometheusSpec struct {
	Configuration *string `json:"configuration,omitempty"`
}

type GPUSpec struct {
	// Name is the GPU resource name, such as nvidia.com/gpu or amd.com/gpu.
	Name string `json:"name"`
	// Quantity is the number of GPUs to request for the driver or executor.
	Quantity int64 `json:"quantity"`
}

// PrometheusMonitoringEnabled returns if Prometheus monitoring is enabled or not.
func (s *SparkApplication) PrometheusMonitoringEnabled() bool {
	return s.Spec.Monitoring != nil && s.Spec.Monitoring.Prometheus != nil
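For reference, the executor's `gpu:` block in the user-guide example above deserializes into this new `GPUSpec` type. A minimal sketch, assuming the repository's import path as used in `pkg/webhook/patch.go`:

```go
package main

import (
	"fmt"

	"github.com/GoogleCloudPlatform/spark-on-k8s-operator/pkg/apis/sparkoperator.k8s.io/v1beta1"
)

func main() {
	// Programmatic equivalent of:
	//   gpu:
	//     name: "nvidia.com/gpu"
	//     quantity: 1
	gpu := v1beta1.GPUSpec{
		Name:     "nvidia.com/gpu",
		Quantity: 1,
	}
	fmt.Printf("requesting %d x %s\n", gpu.Quantity, gpu.Name)
}
```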
33 changes: 33 additions & 0 deletions pkg/apis/sparkoperator.k8s.io/v1beta1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

48 changes: 47 additions & 1 deletion pkg/webhook/patch.go
@@ -18,10 +18,12 @@ package webhook

import (
	"fmt"
	"strings"

	"github.com/golang/glog"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/GoogleCloudPlatform/spark-on-k8s-operator/pkg/apis/sparkoperator.k8s.io/v1beta1"
@@ -74,7 +76,10 @@ func patchSparkPod(pod *corev1.Pod, app *v1beta1.SparkApplication) []patchOperat
			patchOps = append(patchOps, *op)
		}
	}
	// Add a GPU resource limit to the driver or executor container if requested.
	op = addGPU(pod, app)
	if op != nil {
		patchOps = append(patchOps, *op)
	}
	return patchOps
}

@@ -354,6 +359,47 @@ func addSidecarContainers(pod *corev1.Pod, app *v1beta1.SparkApplication) []patc
	return ops
}

func addGPU(pod *corev1.Pod, app *v1beta1.SparkApplication) *patchOperation {
	var gpu *v1beta1.GPUSpec
	if util.IsDriverPod(pod) {
		gpu = app.Spec.Driver.GPU
	}
	if util.IsExecutorPod(pod) {
		gpu = app.Spec.Executor.GPU
	}
	if gpu == nil {
		return nil
	}
	if gpu.Name == "" {
		glog.V(2).Infof("Please specify a GPU resource name, e.g. nvidia.com/gpu or amd.com/gpu. Current GPU spec: %+v", gpu)
		return nil
	}
	if gpu.Quantity <= 0 {
		glog.V(2).Infof("GPU Quantity must be positive. Current GPU spec: %+v", gpu)
		return nil
	}
	i := 0
	// Find the driver or executor container in the pod.
	for ; i < len(pod.Spec.Containers); i++ {
		if pod.Spec.Containers[i].Name == sparkDriverContainerName ||
			pod.Spec.Containers[i].Name == sparkExecutorContainerName {
			break
		}
	}
	path := fmt.Sprintf("/spec/containers/%d/resources/limits", i)
	var value interface{}
	if len(pod.Spec.Containers[i].Resources.Limits) == 0 {
		// The container has no resource limits yet, so add the whole limits map.
		value = corev1.ResourceList{
			corev1.ResourceName(gpu.Name): *resource.NewQuantity(gpu.Quantity, resource.DecimalSI),
		}
	} else {
		// Limits already exist: patch in a single entry, escaping the resource
		// name so the "/" in e.g. "nvidia.com/gpu" is not treated as a JSON
		// Pointer path separator.
		encoder := strings.NewReplacer("~", "~0", "/", "~1")
		path += "/" + encoder.Replace(gpu.Name)
		value = *resource.NewQuantity(gpu.Quantity, resource.DecimalSI)
	}
	return &patchOperation{Op: "add", Path: path, Value: value}
}
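
The `strings.NewReplacer("~", "~0", "/", "~1")` in the branch above applies RFC 6901 JSON Pointer escaping, so the `/` in a resource name such as `nvidia.com/gpu` is not misread as a path separator. A standalone sketch of the resulting path:

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	// RFC 6901: "~" must be escaped as "~0" and "/" as "~1" inside a
	// JSON Pointer segment; otherwise "nvidia.com/gpu" would be read as
	// two segments ("nvidia.com" and "gpu").
	encoder := strings.NewReplacer("~", "~0", "/", "~1")
	fmt.Println("/spec/containers/0/resources/limits/" + encoder.Replace("nvidia.com/gpu"))
	// Output: /spec/containers/0/resources/limits/nvidia.com~1gpu
}
```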

func hasContainer(pod *corev1.Pod, container *corev1.Container) bool {
	for _, c := range pod.Spec.Containers {
		if container.Name == c.Name && container.Image == c.Image {
