From df434e411d5adf06a3c9630271cc36aaa2fe07f1 Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Wed, 4 Jul 2018 20:09:20 -0700 Subject: [PATCH] Put the full PyTorch prototype in the jsonnet file. (#1119) * The current pattern in Kubeflow is to put the complete prototype in the jsonnet file. This way the result of ks generate is a .jsonnet file containing a full spec. This makes it easy for users to do complex modifications starting with the prototype as an example. * Configure the prototype to do mnist by default. Fix #1114 --- .../prototypes/pytorch-job.jsonnet | 75 +++++++++++++++++- kubeflow/pytorch-job/pytorch-job.libsonnet | 79 ------------------- 2 files changed, 72 insertions(+), 82 deletions(-) delete mode 100644 kubeflow/pytorch-job/pytorch-job.libsonnet diff --git a/kubeflow/pytorch-job/prototypes/pytorch-job.jsonnet b/kubeflow/pytorch-job/prototypes/pytorch-job.jsonnet index 0d588f4b822..9d69337b5fa 100644 --- a/kubeflow/pytorch-job/prototypes/pytorch-job.jsonnet +++ b/kubeflow/pytorch-job/prototypes/pytorch-job.jsonnet @@ -5,7 +5,7 @@ // @param name string Name to give to each of the components // @optionalParam namespace string null Namespace to use for the components. It is automatically inherited from the environment if not set. // @optionalParam args string null Comma separated list of arguments to pass to the job -// @optionalParam image string null The docker image to use for the job. +// @optionalParam image string gcr.io/kubeflow-examples/pytorch-dist-mnist:v20180702-a57993c The docker image to use for the job. // @optionalParam image_gpu string null The docker image to use when using GPUs. 
// @optionalParam num_masters number 1 The number of masters to use // @optionalParam num_workers number 1 The number of workers to use @@ -13,6 +13,75 @@ local k = import "k.libsonnet"; -local all = import "kubeflow/pytorch-job/pytorch-job.libsonnet"; +local util = { + pytorchJobReplica(replicaType, number, args, image, numGpus=0):: + local baseContainer = { + image: image, + name: "pytorch", + }; + local containerArgs = if std.length(args) > 0 then + { + args: args, + } + else {}; + local resources = if numGpus > 0 then { + resources: { + limits: { + "nvidia.com/gpu": numGpus, + }, + }, + } else {}; + if number > 0 then + { + replicas: number, + template: { + spec: { + containers: [ + baseContainer + containerArgs + resources, + ], + restartPolicy: "OnFailure", + }, + }, + replicaType: replicaType, + } + else {}, +}; -std.prune(k.core.v1.list.new(all.pyTorchJobPrototype(params, env))) +local namespace = if params.namespace != "null" then params.namespace else env.namespace; // Honor the documented namespace param; fall back to the ks environment when it is unset. +local name = params.name; + +local argsParam = params.args; +local args = + if argsParam == "null" then + [] + else + std.split(argsParam, ","); + +local image = params.image; +local imageGpu = params.image_gpu; +local numMasters = params.num_masters; +local numWorkers = params.num_workers; +local numGpus = params.num_gpus; + +local workerSpec = if numGpus > 0 then + util.pytorchJobReplica("WORKER", numWorkers, args, imageGpu, numGpus) +else + util.pytorchJobReplica("WORKER", numWorkers, args, image); + +local masterSpec = util.pytorchJobReplica("MASTER", numMasters, args, image); +local replicas = [masterSpec, workerSpec]; + + +local job = { + apiVersion: "kubeflow.org/v1alpha1", + kind: "PyTorchJob", + metadata: { + name: name, + namespace: namespace, + }, + spec: { + replicaSpecs: replicas, + }, +}; + +std.prune(k.core.v1.list.new([job])) diff --git a/kubeflow/pytorch-job/pytorch-job.libsonnet b/kubeflow/pytorch-job/pytorch-job.libsonnet deleted file mode 100644 index b41b8d063f6..00000000000 --- a/kubeflow/pytorch-job/pytorch-job.libsonnet +++ 
/dev/null @@ -1,79 +0,0 @@ -local k = import "k.libsonnet"; - -local util = { - pytorchJobReplica(replicaType, number, args, image, numGpus=0):: - local baseContainer = { - image: image, - name: "pytorch", - }; - local containerArgs = if std.length(args) > 0 then - { - args: args, - } - else {}; - local resources = if numGpus > 0 then { - resources: { - limits: { - "nvidia.com/gpu": numGpus, - }, - }, - } else {}; - if number > 0 then - { - replicas: number, - template: { - spec: { - containers: [ - baseContainer + containerArgs + resources, - ], - restartPolicy: "OnFailure", - }, - }, - replicaType: replicaType, - } - else {}, -}; - -{ - pyTorchJobPrototype(params, env):: [ - $.parts(params, env).PyTorchJob, - ], - - parts(params, env):: { - local namespace = if params.namespace != "null" then params.namespace else env.namespace, - local name = params.name, - - local argsParam = params.args, - local args = - if argsParam == "null" then - [] - else - std.split(argsParam, ","), - - local image = params.image, - local imageGpu = params.image_gpu, - local numMasters = params.num_masters, - local numWorkers = params.num_workers, - local numGpus = params.num_gpus, - - local workerSpec = if numGpus > 0 then - util.pytorchJobReplica("WORKER", numWorkers, args, imageGpu, numGpus) - else - util.pytorchJobReplica("WORKER", numWorkers, args, image), - local masterSpec = util.pytorchJobReplica("MASTER", numMasters, args, image), - local replicas = [masterSpec, workerSpec], - - - PyTorchJob:: { - apiVersion: "kubeflow.org/v1alpha1", - kind: "PyTorchJob", - metadata: { - name: name, - namespace: namespace, - }, - spec: { - replicaSpecs: replicas, - }, - }, - }, -}