Skip to content

Commit

Permalink
Modify trial template configMap for new version of Trial Template
Browse files Browse the repository at this point in the history
Fix experiment defaults
change valid/invalid experiment
  • Loading branch information
andreyvelich committed Jun 8, 2020
1 parent 05c1ef6 commit cebec02
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 156 deletions.
76 changes: 62 additions & 14 deletions manifests/v1beta1/katib-controller/trialTemplateConfigmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,74 @@ kind: ConfigMap
metadata:
name: trial-template
namespace: kubeflow
labels:
app: katib-trial-templates
data:
defaultTrialTemplate.yaml: |-
apiVersion: batch/v1
kind: Job
metadata:
name: {{.Trial}}
namespace: {{.NameSpace}}
spec:
template:
spec:
containers:
- name: {{.Trial}}
image: docker.io/kubeflowkatib/mxnet-mnist
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"
- "--batch-size=64"
{{- with .HyperParameters}}
{{- range .}}
- "{{.Name}}={{.Value}}"
{{- end}}
{{- end}}
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"
- "--batch-size=64"
- "--lr=${trialParameters.learningRate}"
- "--num-layers=${trialParameters.numberLayers}"
- "--optimizer=${trialParameters.optimizer}"
restartPolicy: Never
# For ConfigMap templates, double quotes must be set in commands to correctly parse JSON parameters in the Trial Template (e.g. nn_config, architecture)
enasCPUTemplate: |-
apiVersion: batch/v1
kind: Job
spec:
template:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu
command:
- python3
- -u
- RunTrial.py
- --num_epochs=1
- "--architecture=\"${trialParameters.neuralNetworkArchitecture}\""
- "--nn_config=\"${trialParameters.neuralNetworkConfig}\""
restartPolicy: Never
pytorchJobTemplate: |-
apiVersion: "kubeflow.org/v1"
kind: PyTorchJob
spec:
pytorchReplicaSpecs:
Master:
replicas: 1
restartPolicy: OnFailure
template:
spec:
containers:
- name: pytorch
image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0
imagePullPolicy: Always
command:
- "python"
- "/var/mnist.py"
- "--lr=${trialParameters.learningRate}"
- "--momentum=${trialParameters.momentum}"
Worker:
replicas: 2
restartPolicy: OnFailure
template:
spec:
containers:
- name: pytorch
image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0
imagePullPolicy: Always
command:
- "python"
- "/var/mnist.py"
- "--lr=${trialParameters.learningRate}"
- "--momentum=${trialParameters.momentum}"

This file was deleted.

20 changes: 18 additions & 2 deletions pkg/apis/controller/experiments/v1beta1/experiment_defaults.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion pkg/controller.v1beta1/experiment/experiment_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1beta
trial.Spec.ParameterAssignments = trialAssignment.ParameterAssignments

runSpec, err := r.GetRunSpecWithHyperParameters(expInstance, trial.Name, trial.Namespace, hps)
logger.Info("RUN SPEC-------------", "runSpec", runSpec)
if err != nil {
logger.Error(err, "Fail to get RunSpec from experiment", expInstance.Name)
return err
Expand Down
52 changes: 30 additions & 22 deletions test/e2e/v1beta1/invalid-experiment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,40 +14,48 @@ spec:
- Train-accuracy
algorithm:
algorithmName: random
trialTemplate:
goTemplate:
rawTemplate: |-
apiVersion: batch/v1
kind: invalid-kind # invalid
metadata:
name: {{.Trial}}
namespace: {{.NameSpace}}
spec:
template:
spec:
containers:
- name: {{.Trial}}
image: docker.io/kubeflowkatib/mxnet-mnist
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"
- "--batch-size=64"
restartPolicy: Never
parameters:
- name: --lr
- name: lr
parameterType: double
feasibleSpace:
min: "0.01"
max: "0.03"
- name: --num-layers
- name: num-layers
parameterType: int
feasibleSpace:
min: "2"
max: "5"
- name: --optimizer
- name: optimizer
parameterType: categorical
feasibleSpace:
list:
- sgd
- adam
- ftrl
trialTemplate:
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- name: numberLayers
description: Number of training model layers
reference: num-layers
- name: optimizer
description: Training model optimizer (sgd, adam or ftrl)
reference: optimizer
trialSpec:
apiVersion: batch/v1 # Invalid, Kind must be specified
spec:
template:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"
- "--batch-size=64"
- "--lr=${trialParameters.learningRate}"
- "--num-layers=${trialParameters.numberLayers}"
- "--optimizer=${trialParameters.optimizer}"
restartPolicy: Never
53 changes: 31 additions & 22 deletions test/e2e/v1beta1/valid-experiment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,40 +14,49 @@ spec:
- Train-accuracy
algorithm:
algorithmName: random
trialTemplate:
goTemplate:
rawTemplate: |-
apiVersion: batch/v1
kind: Job
metadata:
name: {{.Trial}}
namespace: {{.NameSpace}}
spec:
template:
spec:
containers:
- name: {{.Trial}}
image: docker.io/kubeflowkatib/mxnet-mnist
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"
- "--batch-size=64"
restartPolicy: Never
parameters:
- name: --lr
- name: lr
parameterType: double
feasibleSpace:
min: "0.01"
max: "0.03"
- name: --num-layers
- name: num-layers
parameterType: int
feasibleSpace:
min: "2"
max: "5"
- name: --optimizer
- name: optimizer
parameterType: categorical
feasibleSpace:
list:
- sgd
- adam
- ftrl
trialTemplate:
trialParameters:
- name: learningRate
description: Learning rate for the training model
reference: lr
- name: numberLayers
description: Number of training model layers
reference: num-layers
- name: optimizer
description: Training model optimizer (sgd, adam or ftrl)
reference: optimizer
trialSpec:
apiVersion: batch/v1
kind: Job
spec:
template:
spec:
containers:
- name: training-container
image: docker.io/kubeflowkatib/mxnet-mnist
command:
- "python3"
- "/opt/mxnet-mnist/mnist.py"
- "--batch-size=64"
- "--lr=${trialParameters.learningRate}"
- "--num-layers=${trialParameters.numberLayers}"
- "--optimizer=${trialParameters.optimizer}"
restartPolicy: Never

0 comments on commit cebec02

Please sign in to comment.