Skip to content

Commit

Permalink
Support custom metrics collector kind (#908)
Browse files Browse the repository at this point in the history
* Support custom metrics collector kind

* Fix python image version for v1alpha2
  • Loading branch information
hougangliu authored and k8s-ci-robot committed Nov 4, 2019
1 parent c95c144 commit 2df906e
Show file tree
Hide file tree
Showing 9 changed files with 211 additions and 24 deletions.
2 changes: 1 addition & 1 deletion cmd/suggestion/bayesianoptimization/v1alpha2/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3
FROM python:3.6

ADD . /usr/src/app/github.com/kubeflow/katib
WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/suggestion/bayesianoptimization/v1alpha2
Expand Down
2 changes: 1 addition & 1 deletion cmd/suggestion/grid/v1alpha2/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3
FROM python:3.6

ADD . /usr/src/app/github.com/kubeflow/katib
WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/suggestion/grid/v1alpha2
Expand Down
2 changes: 1 addition & 1 deletion cmd/suggestion/hyperband/v1alpha2/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3
FROM python:3.6

ADD . /usr/src/app/github.com/kubeflow/katib
WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/suggestion/hyperband/v1alpha2
Expand Down
2 changes: 1 addition & 1 deletion cmd/suggestion/random/v1alpha2/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3
FROM python:3.6

ADD . /usr/src/app/github.com/kubeflow/katib
WORKDIR /usr/src/app/github.com/kubeflow/katib/cmd/suggestion/random/v1alpha2
Expand Down
76 changes: 76 additions & 0 deletions examples/v1alpha3/custom-metricscollector-example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
apiVersion: "kubeflow.org/v1alpha3"
kind: Experiment
metadata:
namespace: kubeflow
labels:
controller-tools.k8s.io: "1.0"
name: custom-metricscollector-example
spec:
objective:
type: maximize
goal: 0.99
objectiveMetricName: accuracy
metricsCollectorSpec:
source:
fileSystemPath:
path: "/katib/mnist.log"
kind: File
collector:
kind: Custom
customCollector:
args:
- -m
- accuracy
- -s
- katib-manager.kubeflow:6789
- -path
- /katib/mnist.log
image: liuhougangxa/custom-metrics-collector:latest
imagePullPolicy: Always
name: custom-metrics-logger-and-collector
env:
- name: TrialNamePrefix
valueFrom:
fieldRef:
fieldPath: metadata.name
algorithm:
algorithmName: random
parallelTrialCount: 3
maxTrialCount: 12
maxFailedTrialCount: 3
parameters:
- name: --lr
parameterType: double
feasibleSpace:
min: "0.01"
max: "0.03"
- name: --momentum
parameterType: double
feasibleSpace:
min: "0.3"
max: "0.7"
trialTemplate:
goTemplate:
rawTemplate: |-
apiVersion: batch/v1
kind: Job
metadata:
name: {{.Trial}}
namespace: {{.NameSpace}}
spec:
template:
spec:
containers:
- name: {{.Trial}}
image: docker.io/liuhougangxa/pytorch-mnist
imagePullPolicy: Always
command:
- "python"
- "/var/mnist.py"
- "--epochs=1"
{{- with .HyperParameters}}
{{- range .}}
- "{{.Name}}={{.Value}}"
{{- end}}
{{- end}}
restartPolicy: Never
12 changes: 12 additions & 0 deletions pkg/webhook/v1alpha3/pod/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,19 @@ limitations under the License.

package pod

import (
common "github.com/kubeflow/katib/pkg/apis/controller/common/v1alpha3"
)

const (
// MasterRole is the literal string "master".
// NOTE(review): presumably a replica role value compared against pod metadata — confirm against callers.
MasterRole = "master"
// BatchJob is the literal string "Job".
// NOTE(review): presumably the kind name of a Kubernetes batch Job — confirm against callers.
BatchJob = "Job"
)

var (
// NeedWrapWorkerMetricsCollecterList enumerates the collector kinds for which
// needWrapWorkerContainer returns true: StdOut, TfEvent and File. Kinds that
// are not listed here (for example common.CustomCollector) are not matched.
NeedWrapWorkerMetricsCollecterList = [...]common.CollectorKind{
common.StdOutCollector,
common.TfEventCollector,
common.FileCollector,
}
)
68 changes: 48 additions & 20 deletions pkg/webhook/v1alpha3/pod/inject_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,41 +137,54 @@ func (s *sidecarInjector) Mutate(pod *v1.Pod, namespace string) (*v1.Pod, error)
return nil, err
}

injectContainer, err := s.getMetricsCollectorContainer(trial)
if err != nil {
return nil, err
}
mutatedPod.Spec.Containers = append(mutatedPod.Spec.Containers, *injectContainer)

mutatedPod.Spec.ServiceAccountName = pod.Spec.ServiceAccountName
mutatedPod.Spec.ShareProcessNamespace = pointer.BoolPtr(true)

mountPath, pathKind := getMountPath(trial.Spec.MetricsCollector)
if mountPath != "" {
if err = mutateVolume(mutatedPod, kind, mountPath, injectContainer.Name, pathKind); err != nil {
return nil, err
}
}
if needWrapWorkerContainer(trial.Spec.MetricsCollector) {
if err = wrapWorkerContainer(mutatedPod, kind, mountPath, pathKind, trial.Spec.MetricsCollector); err != nil {
return nil, err
}
}

log.Info("Inject metrics collector sidecar container", "Pod", pod.Name, "Trial", trialName)
return mutatedPod, nil
}

func (s *sidecarInjector) getMetricsCollectorContainer(trial *trialsv1alpha3.Trial) (*v1.Container, error) {
mc := trial.Spec.MetricsCollector
if mc.Collector.Kind == common.CustomCollector {
return mc.Collector.CustomCollector, nil
}
metricName := trial.Spec.Objective.ObjectiveMetricName
for _, v := range trial.Spec.Objective.AdditionalMetricNames {
metricName += ";"
metricName += v
}

image, err := katibconfig.GetMetricsCollectorImage(trial.Spec.MetricsCollector.Collector.Kind, s.client)
image, err := katibconfig.GetMetricsCollectorImage(mc.Collector.Kind, s.client)
if err != nil {
return nil, err
}
args := getMetricsCollectorArgs(trialName, metricName, trial.Spec.MetricsCollector)
args := getMetricsCollectorArgs(trial.Name, metricName, mc)
sidecarContainerName := getSidecarContainerName(trial.Spec.MetricsCollector.Collector.Kind)
injectContainer := v1.Container{
Name: sidecarContainerName,
Image: image,
Args: args,
ImagePullPolicy: v1.PullIfNotPresent,
}
mutatedPod.Spec.Containers = append(mutatedPod.Spec.Containers, injectContainer)
mutatedPod.Spec.ServiceAccountName = pod.Spec.ServiceAccountName
mutatedPod.Spec.ShareProcessNamespace = pointer.BoolPtr(true)

if mountPath, pathKind := getMountPath(trial.Spec.MetricsCollector); mountPath != "" {
if err = wrapWorkerContainer(
mutatedPod, kind, mountPath, pathKind, trial.Spec.MetricsCollector); err != nil {
return nil, err
}
if err = mutateVolume(mutatedPod, kind, mountPath, sidecarContainerName, pathKind); err != nil {
return nil, err
}
}

log.Info("Inject metrics collector sidecar container", "Pod", pod.Name, "Trial", trialName)

return mutatedPod, nil
return &injectContainer, nil
}

func getMetricsCollectorArgs(trialName, metricName string, mc common.MetricsCollectorSpec) []string {
Expand All @@ -189,11 +202,26 @@ func getMountPath(mc common.MetricsCollectorSpec) (string, common.FileSystemKind
return mc.Source.FileSystemPath.Path, common.FileKind
} else if mc.Collector.Kind == common.TfEventCollector {
return mc.Source.FileSystemPath.Path, common.DirectoryKind
} else if mc.Collector.Kind == common.CustomCollector {
if mc.Source == nil || mc.Source.FileSystemPath == nil {
return "", common.InvalidKind
}
return mc.Source.FileSystemPath.Path, mc.Source.FileSystemPath.Kind
} else {
return "", common.InvalidKind
}
}

// needWrapWorkerContainer reports whether the collector kind configured in mc
// is one of the kinds listed in NeedWrapWorkerMetricsCollecterList.
func needWrapWorkerContainer(mc common.MetricsCollectorSpec) bool {
	for i := range NeedWrapWorkerMetricsCollecterList {
		if NeedWrapWorkerMetricsCollecterList[i] == mc.Collector.Kind {
			return true
		}
	}
	return false
}

func wrapWorkerContainer(
pod *v1.Pod, jobKind, metricsFile string,
pathKind common.FileSystemKind,
Expand Down
64 changes: 64 additions & 0 deletions test/scripts/v1alpha3/run-custom-metricscollector.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash

# Copyright 2018 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This shell script is used to run the custom metrics collector e2e experiment
# against an existing cluster from our argo workflow

# Abort on the first failing command, on any reference to an unset variable,
# and on a failure anywhere inside a pipeline.
set -o errexit
set -o nounset
set -o pipefail

# Required environment. Because nounset is enabled, these assignments make the
# script stop right here if any of the referenced variables is not set.
CLUSTER_NAME="${CLUSTER_NAME}"
ZONE="${GCP_ZONE}"
PROJECT="${GCP_PROJECT}"
NAMESPACE="${DEPLOY_NAMESPACE}"
REGISTRY="${GCP_REGISTRY}"
GO_DIR="${GOPATH}/src/github.com/${REPO_OWNER}/${REPO_NAME}"

echo "Activating service-account"
gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"

echo "Configuring kubectl"

echo "CLUSTER_NAME: ${CLUSTER_NAME}"
echo "ZONE: ${GCP_ZONE}"
echo "PROJECT: ${GCP_PROJECT}"

# Fetch credentials for the target cluster and point the current kubectl
# context at the default namespace.
gcloud --project "${PROJECT}" container clusters get-credentials "${CLUSTER_NAME}" \
  --zone "${ZONE}"
kubectl config set-context "$(kubectl config current-context)" --namespace=default
USER="$(gcloud config get-value account)"

# Dump cluster and Katib state to the log before running the experiment.
echo "All Katib components are running."
kubectl version
kubectl cluster-info
echo "Katib deployments"
kubectl -n kubeflow get deploy
echo "Katib services"
kubectl -n kubeflow get svc
echo "Katib pods"
kubectl -n kubeflow get pod

cd "${GO_DIR}/test/e2e/v1alpha3"

# Run the experiment, then print diagnostics and delete the experiment
# resources. With errexit, a failing run-e2e-experiment stops the script
# before the describe/delete steps below.
echo "Running e2e custom metricscollector experiment"
export KUBECONFIG="$HOME/.kube/config"
./run-e2e-experiment ../../../examples/v1alpha3/custom-metricscollector-example.yaml
kubectl -n kubeflow describe suggestion
kubectl delete -f ../../../examples/v1alpha3/custom-metricscollector-example.yaml
kubectl describe pods
kubectl describe deploy
exit 0
7 changes: 7 additions & 0 deletions test/workflows/components/workflows-v1alpha3.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,10 @@
name: "run-file-metricscollector-e2e-tests",
template: "run-file-metricscollector-e2e-tests",
},
{
name: "run-custom-metricscollector-e2e-tests",
template: "run-custom-metricscollector-e2e-tests",
},
{
name: "run-bayesian-e2e-tests",
template: "run-bayesian-e2e-tests",
Expand Down Expand Up @@ -378,6 +382,9 @@
$.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("run-file-metricscollector-e2e-tests", testWorkerImage, [
"test/scripts/v1alpha3/run-file-metricscollector.sh",
]), // run file metrics collector test
$.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("run-custom-metricscollector-e2e-tests", testWorkerImage, [
"test/scripts/v1alpha3/run-custom-metricscollector.sh",
]), // run custom metrics collector test
$.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("create-pr-symlink", testWorkerImage, [
"python",
"-m",
Expand Down

0 comments on commit 2df906e

Please sign in to comment.