Skip to content

Commit

Permalink
Feature: Add pprof endpoint (kubeflow#2164)
Browse files Browse the repository at this point in the history
* add pprof support to the operator Controller Manager

Signed-off-by: ImpSy <3097030+ImpSy@users.noreply.github.com>

* add pprof support to helm chart

Signed-off-by: ImpSy <3097030+ImpSy@users.noreply.github.com>

---------

Signed-off-by: ImpSy <3097030+ImpSy@users.noreply.github.com>
(cherry picked from commit 75b9266)
Signed-off-by: Yi Chen <github@chenyicn.net>
  • Loading branch information
ImpSy authored and ChenYi015 committed Sep 19, 2024
1 parent ffd2ed8 commit 4a10b27
Show file tree
Hide file tree
Showing 8 changed files with 134 additions and 1 deletion.
3 changes: 3 additions & 0 deletions charts/spark-operator-chart/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ See [helm uninstall](https://helm.sh/docs/helm/helm_uninstall) for command docum
| controller.sidecars | list | `[]` | Sidecar containers for controller pods. |
| controller.podDisruptionBudget.enable | bool | `false` | Specifies whether to create pod disruption budget for controller. Ref: [Specifying a Disruption Budget for your Application](https://kubernetes.io/docs/tasks/run-application/configure-pdb/) |
| controller.podDisruptionBudget.minAvailable | int | `1` | The number of pods that must be available. Require `controller.replicas` to be greater than 1 |
| controller.pprof.enable | bool | `false` | Specifies whether to enable pprof. |
| controller.pprof.port | int | `6060` | Specifies pprof port. |
| controller.pprof.portName | string | `"pprof"` | Specifies pprof service port name. |
| webhook.enable | bool | `true` | Specifies whether to enable webhook. |
| webhook.replicas | int | `1` | Number of replicas of webhook server. |
| webhook.logLevel | string | `"info"` | Configure the verbosity of logging, can be one of `debug`, `info`, `error`. |
Expand Down
7 changes: 7 additions & 0 deletions charts/spark-operator-chart/templates/controller/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,13 @@ Create the name of the pod disruption budget to be used by controller
{{ include "spark-operator.controller.name" . }}-pdb
{{- end -}}

{{/*
Create the name of the service used by controller
*/}}
{{- define "spark-operator.controller.serviceName" -}}
{{ include "spark-operator.controller.name" . }}-svc
{{- end -}}

{{/*
Create the role policy rules for the controller in every Spark job namespace
*/}}
Expand Down
11 changes: 10 additions & 1 deletion charts/spark-operator-chart/templates/controller/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,20 @@ spec:
- --leader-election=true
- --leader-election-lock-name={{ include "spark-operator.controller.leaderElectionName" . }}
- --leader-election-lock-namespace={{ .Release.Namespace }}
{{- if .Values.prometheus.metrics.enable }}
{{- if .Values.controller.pprof.enable }}
- --pprof-bind-address=:{{ .Values.controller.pprof.port }}
{{- end }}
{{- if or .Values.prometheus.metrics.enable .Values.controller.pprof.enable }}
ports:
{{- if .Values.controller.pprof.enable }}
- name: {{ .Values.controller.pprof.portName | quote }}
containerPort: {{ .Values.controller.pprof.port }}
{{- end }}
{{- if .Values.prometheus.metrics.enable }}
- name: {{ .Values.prometheus.metrics.portName | quote }}
containerPort: {{ .Values.prometheus.metrics.port }}
{{- end }}
{{- end }}
{{- with .Values.controller.env }}
env:
{{- toYaml . | nindent 8 }}
Expand Down
31 changes: 31 additions & 0 deletions charts/spark-operator-chart/templates/controller/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{{/*
Copyright 2024 The Kubeflow authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/}}

{{- if .Values.controller.pprof.enable }}
apiVersion: v1
kind: Service
metadata:
name: {{ include "spark-operator.controller.serviceName" . }}
labels:
{{- include "spark-operator.controller.labels" . | nindent 4 }}
spec:
selector:
{{- include "spark-operator.controller.selectorLabels" . | nindent 4 }}
ports:
- port: {{ .Values.controller.pprof.port }}
targetPort: {{ .Values.controller.pprof.portName | quote }}
name: {{ .Values.controller.pprof.portName }}
{{- end }}
26 changes: 26 additions & 0 deletions charts/spark-operator-chart/tests/controller/deployment_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -567,3 +567,29 @@ tests:
asserts:
- failedTemplate:
errorMessage: "controller.replicas must be greater than 1 to enable topology spread constraints for controller pods"

- it: Should contain `--pprof-bind-address` arg if `controller.pprof.enable` is set to `true`
set:
controller:
pprof:
enable: true
port: 12345
asserts:
- contains:
path: spec.template.spec.containers[?(@.name=="spark-operator-controller")].args
content: --pprof-bind-address=:12345

- it: Should add pprof ports if `controller.pprof.enable` is set to `true`
set:
controller:
pprof:
enable: true
port: 12345
portName: pprof-test
asserts:
- contains:
path: spec.template.spec.containers[?(@.name=="spark-operator-controller")].ports
content:
name: pprof-test
containerPort: 12345
count: 1
44 changes: 44 additions & 0 deletions charts/spark-operator-chart/tests/controller/service_test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#
# Copyright 2024 The Kubeflow authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

suite: Test controller deployment

templates:
- controller/service.yaml

release:
name: spark-operator
namespace: spark-operator

tests:
- it: Should create the pprof service correctly
set:
controller:
pprof:
enable: true
port: 12345
portName: pprof-test
asserts:
- containsDocument:
apiVersion: v1
kind: Service
name: spark-operator-controller-svc
- equal:
path: spec.ports[0]
value:
port: 12345
targetPort: pprof-test
name: pprof-test
8 changes: 8 additions & 0 deletions charts/spark-operator-chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,14 @@ controller:
# Require `controller.replicas` to be greater than 1
minAvailable: 1

pprof:
# -- Specifies whether to enable pprof.
enable: false
# -- Specifies pprof port.
port: 6060
# -- Specifies pprof service port name.
portName: pprof

webhook:
# -- Specifies whether to enable webhook.
enable: true
Expand Down
5 changes: 5 additions & 0 deletions cmd/operator/controller/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ var (
metricsJobStartLatencyBuckets []float64

healthProbeBindAddress string
pprofBindAddress string
secureMetrics bool
enableHTTP2 bool
development bool
Expand Down Expand Up @@ -161,6 +162,9 @@ func NewStartCommand() *cobra.Command {
command.Flags().BoolVar(&secureMetrics, "secure-metrics", false, "If set the metrics endpoint is served securely")
command.Flags().BoolVar(&enableHTTP2, "enable-http2", false, "If set, HTTP/2 will be enabled for the metrics and webhook servers")

command.Flags().StringVar(&pprofBindAddress, "pprof-bind-address", "0", "The address the pprof endpoint binds to. "+
"If not set, it will be 0 in order to disable the pprof server")

flagSet := flag.NewFlagSet("controller", flag.ExitOnError)
ctrl.RegisterFlags(flagSet)
zapOptions.BindFlags(flagSet)
Expand Down Expand Up @@ -193,6 +197,7 @@ func start() {
TLSOpts: tlsOptions,
}),
HealthProbeBindAddress: healthProbeBindAddress,
PprofBindAddress: pprofBindAddress,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionLockName,
LeaderElectionNamespace: leaderElectionLockNamespace,
Expand Down

0 comments on commit 4a10b27

Please sign in to comment.