diff --git a/examples/v1alpha3/bayesianoptimization-example.yaml b/examples/v1alpha3/bayesianoptimization-example.yaml index 8cb97a180ff..ffec2f89bce 100644 --- a/examples/v1alpha3/bayesianoptimization-example.yaml +++ b/examples/v1alpha3/bayesianoptimization-example.yaml @@ -11,7 +11,7 @@ spec: goal: 0.99 objectiveMetricName: Validation-accuracy additionalMetricNames: - - accuracy + - Train-accuracy algorithm: algorithmName: bayesianoptimization algorithmSettings: @@ -51,10 +51,10 @@ spec: spec: containers: - name: {{.Trial}} - image: docker.io/kubeflowkatib/mxnet-mnist-example + image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" + - "python3" + - "/opt/mxnet-mnist/mnist.py" - "--batch-size=64" {{- with .HyperParameters}} {{- range .}} diff --git a/examples/v1alpha3/grid-example.yaml b/examples/v1alpha3/grid-example.yaml index ce3beae687b..26f5514335c 100644 --- a/examples/v1alpha3/grid-example.yaml +++ b/examples/v1alpha3/grid-example.yaml @@ -11,7 +11,7 @@ spec: goal: 0.99 objectiveMetricName: Validation-accuracy additionalMetricNames: - - accuracy + - Train-accuracy algorithm: algorithmName: grid parallelTrialCount: 3 @@ -32,8 +32,8 @@ spec: - name: --num-epochs parameterType: int feasibleSpace: - min: "20" - max: "40" + min: "10" + max: "15" # Grid doesn't support categorical, refer to https://chocolate.readthedocs.io/api/sample.html#chocolate.Grid # - name: --optimizer # parameterType: categorical @@ -55,10 +55,10 @@ spec: spec: containers: - name: {{.Trial}} - image: docker.io/kubeflowkatib/mxnet-mnist-example + image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" + - "python3" + - "/opt/mxnet-mnist/mnist.py" - "--batch-size=64" {{- with .HyperParameters}} {{- range .}} diff --git a/examples/v1alpha3/hyperband-example.yaml b/examples/v1alpha3/hyperband-example.yaml index f3f697285e9..d2001a64ea0 100644 --- a/examples/v1alpha3/hyperband-example.yaml +++ b/examples/v1alpha3/hyperband-example.yaml @@ -11,7 +11,7 @@ spec: goal: 0.99 objectiveMetricName: Validation-accuracy additionalMetricNames: - - accuracy + - Train-accuracy algorithm: algorithmName: hyperband algorithmSettings: @@ -58,10 +58,10 @@ spec: spec: containers: - name: {{.Trial}} - image: kubeflowkatib/mxnet-mnist-example + image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" + - "python3" + - "/opt/mxnet-mnist/mnist.py" - "--batch-size=64" {{- with .HyperParameters}} {{- range .}} diff --git a/examples/v1alpha3/mxnet-mnist/Dockerfile b/examples/v1alpha3/mxnet-mnist/Dockerfile new file mode 100644 index 00000000000..a08dc032d6b --- /dev/null +++ b/examples/v1alpha3/mxnet-mnist/Dockerfile @@ -0,0 +1,13 @@ +FROM ubuntu:16.04 + +RUN apt-get update && \ + apt-get install -y wget python3-dev gcc && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py + +RUN pip3 install mxnet + +ADD . /opt/mxnet-mnist +WORKDIR /opt/mxnet-mnist + +ENTRYPOINT ["python3", "/opt/mxnet-mnist/mnist.py"] diff --git a/examples/v1alpha3/mxnet-mnist/README.md b/examples/v1alpha3/mxnet-mnist/README.md new file mode 100644 index 00000000000..5707be4696a --- /dev/null +++ b/examples/v1alpha3/mxnet-mnist/README.md @@ -0,0 +1,6 @@ +# MXNet image classification example +This is an MXNet image classification training container that records the timestamps of the training metrics. + +It uses only a simple multilayer perceptron network (MLP).
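+ +For reference, with the logging format configured in `mnist.py`, the metric lines that Katib parses from the container stdout look roughly like this (the epoch numbers and accuracy values are illustrative): + +``` +2019-11-28T11:05:26Z INFO Epoch[2] Train-accuracy=0.956687 +2019-11-28T11:05:27Z INFO Epoch[2] Validation-accuracy=0.955414 +```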
+ +If you want to read more about this example, visit the official [incubator-mxnet](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification) GitHub repository. diff --git a/examples/v1alpha3/mxnet-mnist/common/__init__.py b/examples/v1alpha3/mxnet-mnist/common/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/v1alpha3/mxnet-mnist/common/fit.py b/examples/v1alpha3/mxnet-mnist/common/fit.py new file mode 100644 index 00000000000..8e8b0197960 --- /dev/null +++ b/examples/v1alpha3/mxnet-mnist/common/fit.py @@ -0,0 +1,340 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" example train fit utility """ +import logging +import os +import time +import re +import math +import mxnet as mx + +def get_epoch_size(args, kv): + return math.ceil(int(args.num_examples / kv.num_workers) / args.batch_size) + +def _get_lr_scheduler(args, kv): + if 'lr_factor' not in args or args.lr_factor >= 1: + return (args.lr, None) + epoch_size = get_epoch_size(args, kv) + begin_epoch = args.load_epoch if args.load_epoch else 0 + if 'pow' in args.lr_step_epochs: + lr = args.lr + max_up = args.num_epochs * epoch_size + pwr = float(re.sub('pow[- ]*', '', args.lr_step_epochs)) + poly_sched = mx.lr_scheduler.PolyScheduler(max_up, lr, pwr) + return (lr, poly_sched) + step_epochs = [int(l) for l in args.lr_step_epochs.split(',')] + lr = args.lr + for s in step_epochs: + if begin_epoch >= s: + lr *= args.lr_factor + if lr != args.lr: + logging.info('Adjust learning rate to %e for epoch %d', + lr, begin_epoch) + + steps = [epoch_size * (x - begin_epoch) + for x in step_epochs if x - begin_epoch > 0] + if steps: + return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor, + base_lr=args.lr)) + else: + return (lr, None) + +def _load_model(args, rank=0): + if 'load_epoch' not in args or args.load_epoch is None: + return (None, None, None) + assert args.model_prefix is not None + model_prefix = args.model_prefix + if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)): + model_prefix += "-%d" % (rank) + sym, arg_params, aux_params = mx.model.load_checkpoint( + model_prefix, args.load_epoch) + logging.info('Loaded model %s_%04d.params', model_prefix, args.load_epoch) + return (sym, arg_params, aux_params) + + +def _save_model(args, rank=0): + if args.model_prefix is None: + return None + return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % ( + args.model_prefix, rank), period=args.save_period) + + +def add_fit_args(parser): + """ + parser : argparse.ArgumentParser + return a parser extended with the args required by fit + """ + train = parser.add_argument_group('Training', 'model training') + train.add_argument('--network', type=str, + help='the neural network to use') + 
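# NOTE: the Katib experiment YAMLs above template a subset of these flags + # (e.g. --num-epochs, --lr, --optimizer) as search-space hyperparameters; + # any flag that is not swept falls back to the defaults declared here. + 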
train.add_argument('--num-layers', type=int, + help='number of layers in the neural network, \ + required by some networks such as resnet') + train.add_argument('--gpus', type=str, + help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu') + train.add_argument('--kv-store', type=str, default='device', + help='key-value store type') + train.add_argument('--num-epochs', type=int, default=100, + help='max num of epochs') + train.add_argument('--lr', type=float, default=0.1, + help='initial learning rate') + train.add_argument('--lr-factor', type=float, default=0.1, + help='the ratio to reduce lr on each step') + train.add_argument('--lr-step-epochs', type=str, + help='the epochs to reduce the lr, e.g. 30,60') + train.add_argument('--initializer', type=str, default='default', + help='the initializer type') + train.add_argument('--optimizer', type=str, default='sgd', + help='the optimizer type') + train.add_argument('--mom', type=float, default=0.9, + help='momentum for sgd') + train.add_argument('--wd', type=float, default=0.0001, + help='weight decay for sgd') + train.add_argument('--batch-size', type=int, default=128, + help='the batch size') + train.add_argument('--disp-batches', type=int, default=20, + help='show progress for every n batches') + train.add_argument('--model-prefix', type=str, + help='model prefix') + train.add_argument('--save-period', type=int, default=1, help='params saving period') + parser.add_argument('--monitor', dest='monitor', type=int, default=0, + help='log network parameters every N iters if larger than 0') + train.add_argument('--load-epoch', type=int, + help='load the model on an epoch using the model-load-prefix') + train.add_argument('--top-k', type=int, default=0, + help='report the top-k accuracy. 0 means no report.') + train.add_argument('--loss', type=str, default='', + help='show the cross-entropy or nll loss. ce stands for cross-entropy, nll_loss stands for negative log-likelihood loss') + train.add_argument('--test-io', type=int, default=0, + help='1 means test reading speed without training') + train.add_argument('--dtype', type=str, default='float32', + help='precision: float32 or float16') + train.add_argument('--gc-type', type=str, default='none', + help='type of gradient compression to use, \ + takes `2bit` or `none` for now') + train.add_argument('--gc-threshold', type=float, default=0.5, + help='threshold for 2bit gradient compression') + # additional parameters for large batch sgd + train.add_argument('--macrobatch-size', type=int, default=0, + help='distributed effective batch size') + train.add_argument('--warmup-epochs', type=int, default=5, + help='the epochs to ramp-up lr to scaled large-batch value') + train.add_argument('--warmup-strategy', type=str, default='linear', + help='the ramping-up strategy for large batch sgd') + train.add_argument('--profile-worker-suffix', type=str, default='', + help='profile worker actions into this file. 
During distributed training\ the saved filename will be rank1_ followed by this suffix') + train.add_argument('--profile-server-suffix', type=str, default='', + help='profile server actions into a file with name like rank1_ followed by this suffix \ + during distributed training') + train.add_argument('--use-imagenet-data-augmentation', type=int, default=0, + help='enable data augmentation of ImageNet data, default disabled') + return train + + +def fit(args, network, data_loader, **kwargs): + """ + train a model + args : argparse returns + network : the symbol definition of the neural network + data_loader : function that returns the train and val data iterators + """ + # kvstore + kv = mx.kvstore.create(args.kv_store) + if args.gc_type != 'none': + kv.set_gradient_compression({'type': args.gc_type, + 'threshold': args.gc_threshold}) + if args.profile_server_suffix: + mx.profiler.set_config(filename=args.profile_server_suffix, profile_all=True, profile_process='server') + mx.profiler.set_state(state='run', profile_process='server') + + if args.profile_worker_suffix: + if kv.num_workers > 1: + filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix + else: + filename = args.profile_worker_suffix + mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker') + mx.profiler.set_state(state='run', profile_process='worker') + + # logging + head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + logging.info('start with arguments %s', args) + + epoch_size = get_epoch_size(args, kv) + + # data iterators + (train, val) = data_loader(args, kv) + if 'dist' in args.kv_store and 'async' not in args.kv_store: + logging.info('Resizing training data to %d batches per machine', epoch_size) + # resize train iter to ensure each machine has same number of batches per epoch + # if not, dist_sync can hang at the end with one machine waiting for other machines + train = mx.io.ResizeIter(train, epoch_size) + + if args.test_io: + tic = time.time() + for i, batch in enumerate(train): + if isinstance(batch, list): + for b in batch: + for j in b.data: + j.wait_to_read() + else: + for j in batch.data: + j.wait_to_read() + if (i + 1) % args.disp_batches == 0: + logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i, + args.disp_batches * args.batch_size / (time.time() - tic)) + tic = time.time() + return + + # load model + if 'arg_params' in kwargs and 'aux_params' in kwargs: + arg_params = kwargs['arg_params'] + aux_params = kwargs['aux_params'] + else: + sym, arg_params, aux_params = _load_model(args, kv.rank) + if sym is not None: + assert sym.tojson() == network.tojson() + + # save model + checkpoint = _save_model(args, kv.rank) + + # devices for training + devs = mx.cpu() if args.gpus is None or args.gpus == "" else [ + mx.gpu(int(i)) for i in args.gpus.split(',')] + + # learning rate + lr, lr_scheduler = _get_lr_scheduler(args, kv) + + # create model + model = mx.mod.Module( + context=devs, + symbol=network + ) + + optimizer_params = { + 'learning_rate': lr, + 'wd': args.wd, + 'lr_scheduler': lr_scheduler, + 'multi_precision': True} + + # Only a limited number of optimizers have 'momentum' property + has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'} + if args.optimizer in has_momentum: + optimizer_params['momentum'] = args.mom + + monitor = mx.mon.Monitor( + args.monitor, pattern=".*") if args.monitor > 0 else None + + # A limited number of optimizers have a warmup period + 
has_warmup = {'lbsgd', 'lbnag'} + if args.optimizer in has_warmup: + nworkers = kv.num_workers + if epoch_size < 1: + epoch_size = 1 + macrobatch_size = args.macrobatch_size + if macrobatch_size < args.batch_size * nworkers: + macrobatch_size = args.batch_size * nworkers + #batch_scale = round(float(macrobatch_size) / args.batch_size / nworkers +0.4999) + batch_scale = math.ceil( + float(macrobatch_size) / args.batch_size / nworkers) + optimizer_params['updates_per_epoch'] = epoch_size + optimizer_params['begin_epoch'] = args.load_epoch if args.load_epoch else 0 + optimizer_params['batch_scale'] = batch_scale + optimizer_params['warmup_strategy'] = args.warmup_strategy + optimizer_params['warmup_epochs'] = args.warmup_epochs + optimizer_params['num_epochs'] = args.num_epochs + + if args.initializer == 'default': + if args.network == 'alexnet': + # AlexNet will not converge using Xavier + initializer = mx.init.Normal() + # VGG will not tend to converge using Xavier-Gaussian + elif args.network and 'vgg' in args.network: + initializer = mx.init.Xavier() + else: + initializer = mx.init.Xavier( + rnd_type='gaussian', factor_type="in", magnitude=2) + # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), + elif args.initializer == 'xavier': + initializer = mx.init.Xavier() + elif args.initializer == 'msra': + initializer = mx.init.MSRAPrelu() + elif args.initializer == 'orthogonal': + initializer = mx.init.Orthogonal() + elif args.initializer == 'normal': + initializer = mx.init.Normal() + elif args.initializer == 'uniform': + initializer = mx.init.Uniform() + elif args.initializer == 'one': + initializer = mx.init.One() + elif args.initializer == 'zero': + initializer = mx.init.Zero() + + # evaluation metrics + eval_metrics = ['accuracy'] + if args.top_k > 0: + eval_metrics.append(mx.metric.create( + 'top_k_accuracy', top_k=args.top_k)) + + supported_loss = ['ce', 'nll_loss'] + if len(args.loss) > 0: + # ce or nll loss is only applicable to softmax output + loss_type_list = args.loss.split(',') + if 'softmax_output' in network.list_outputs(): + for loss_type in loss_type_list: + loss_type = loss_type.strip() + if loss_type == 'nll': + loss_type = 'nll_loss' + if loss_type not in supported_loss: + logging.warning(loss_type + ' is not a valid loss type, only cross-entropy or ' \ + 'negative log-likelihood loss is supported!') + else: + eval_metrics.append(mx.metric.create(loss_type)) + else: + logging.warning("The output is not softmax_output, loss argument will be skipped!") + + # callbacks that run after each batch + batch_end_callbacks = [mx.callback.Speedometer( + args.batch_size, args.disp_batches)] + if 'batch_end_callback' in kwargs: + cbs = kwargs['batch_end_callback'] + batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs] + + # run + model.fit(train, + begin_epoch=args.load_epoch if args.load_epoch else 0, + num_epoch=args.num_epochs, + eval_data=val, + eval_metric=eval_metrics, + kvstore=kv, + optimizer=args.optimizer, + optimizer_params=optimizer_params, + initializer=initializer, + arg_params=arg_params, + aux_params=aux_params, + batch_end_callback=batch_end_callbacks, + epoch_end_callback=checkpoint, + allow_missing=True, + monitor=monitor) + + if args.profile_server_suffix: + mx.profiler.set_state(state='stop', profile_process='server') + if args.profile_worker_suffix: + mx.profiler.set_state(state='stop', profile_process='worker') diff --git a/examples/v1alpha3/mxnet-mnist/common/utils.py b/examples/v1alpha3/mxnet-mnist/common/utils.py new file mode 100644 index 
00000000000..87717020fdc --- /dev/null +++ b/examples/v1alpha3/mxnet-mnist/common/utils.py @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import errno + +import mxnet as mx + +def download_file(url, local_fname=None, force_write=False): + # requests is not installed by default + import requests + if local_fname is None: + local_fname = url.split('/')[-1] + if not force_write and os.path.exists(local_fname): + return local_fname + + dir_name = os.path.dirname(local_fname) + + if dir_name != "": + if not os.path.exists(dir_name): + try: # try to create the directory if it doesn't exist + os.makedirs(dir_name) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + + r = requests.get(url, stream=True) + assert r.status_code == 200, "failed to open %s" % url + with open(local_fname, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + return local_fname diff --git a/examples/v1alpha3/mxnet-mnist/mnist.py b/examples/v1alpha3/mxnet-mnist/mnist.py new file mode 100644 index 00000000000..b3478fdc29e --- /dev/null +++ b/examples/v1alpha3/mxnet-mnist/mnist.py @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +""" +Train mnist, see more explanation at https://mxnet.io/tutorials/python/mnist.html +""" +import os +import argparse +import logging +import mxnet as mx +import numpy as np +import gzip, struct +from common import fit +from common import utils +# This example only for mlp network +from symbols import mlp + +# Use this format (%Y-%m-%dT%H:%M:%SZ) to record timestamp of the metrics +logging.basicConfig( + format="%(asctime)s %(levelname)-8s %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.DEBUG) + +def read_data(label, image): + """ + download and read data into numpy + """ + base_url = 'http://yann.lecun.com/exdb/mnist/' + with gzip.open(utils.download_file(base_url+label, os.path.join('data',label))) as flbl: + magic, num = struct.unpack(">II", flbl.read(8)) + label = np.fromstring(flbl.read(), dtype=np.int8) + with gzip.open(utils.download_file(base_url+image, os.path.join('data',image)), 'rb') as fimg: + magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16)) + image = np.fromstring(fimg.read(), dtype=np.uint8).reshape(len(label), rows, cols) + return (label, image) + + +def to4d(img): + """ + reshape to 4D arrays + """ + return img.reshape(img.shape[0], 1, 28, 28).astype(np.float32)/255 + +def get_mnist_iter(args, kv): + """ + create data iterator with NDArrayIter + """ + (train_lbl, train_img) = read_data( + 'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz') + (val_lbl, val_img) = read_data( + 't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz') + train = mx.io.NDArrayIter( + to4d(train_img), train_lbl, args.batch_size, shuffle=True) + val = mx.io.NDArrayIter( + to4d(val_img), val_lbl, args.batch_size) + return (train, val) + +if __name__ == '__main__': + # parse args + parser = argparse.ArgumentParser(description="train mnist", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--num-classes', type=int, default=10, + help='the number of classes') + parser.add_argument('--num-examples', type=int, default=60000, + help='the number of training examples') + + parser.add_argument('--add_stn', action="store_true", default=False, help='Add Spatial Transformer Network Layer (lenet only)') + parser.add_argument('--image_shape', default='1, 28, 28', help='shape of training images') + + fit.add_fit_args(parser) + parser.set_defaults( + # network + network = 'mlp', + # train + gpus = None, + batch_size = 64, + disp_batches = 100, + num_epochs = 10, + lr = .05, + lr_step_epochs = '10' + ) + args = parser.parse_args() + + # load mlp network + sym = mlp.get_symbol(**vars(args)) + + # train + fit.fit(args, sym, get_mnist_iter) diff --git a/examples/v1alpha3/mxnet-mnist/symbols/__init__.py b/examples/v1alpha3/mxnet-mnist/symbols/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/examples/v1alpha3/mxnet-mnist/symbols/mlp.py b/examples/v1alpha3/mxnet-mnist/symbols/mlp.py new file mode 100644 index 00000000000..4b190b29db9 --- /dev/null +++ b/examples/v1alpha3/mxnet-mnist/symbols/mlp.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +a simple multilayer perceptron +""" +import mxnet as mx + +def get_symbol(num_classes=10, **kwargs): + data = mx.symbol.Variable('data') + data = mx.sym.Flatten(data=data) + fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=128) + act1 = mx.symbol.Activation(data=fc1, name='relu1', act_type="relu") + fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=64) + act2 = mx.symbol.Activation(data=fc2, name='relu2', act_type="relu") + fc3 = mx.symbol.FullyConnected(data=act2, name='fc3', num_hidden=num_classes) + mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') + return mlp diff --git a/examples/v1alpha3/random-example.yaml b/examples/v1alpha3/random-example.yaml index 26e0a61a81f..9742e2b2706 100644 --- a/examples/v1alpha3/random-example.yaml +++ b/examples/v1alpha3/random-example.yaml @@ -11,7 +11,7 @@ spec: goal: 0.99 objectiveMetricName: Validation-accuracy additionalMetricNames: - - accuracy + - Train-accuracy algorithm: algorithmName: random parallelTrialCount: 3 @@ -48,10 +48,10 @@ spec: spec: containers: - name: {{.Trial}} - image: docker.io/kubeflowkatib/mxnet-mnist-example + image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" + - "python3" + - "/opt/mxnet-mnist/mnist.py" - "--batch-size=64" {{- with .HyperParameters}} {{- range .}} diff --git a/examples/v1alpha3/tpe-example.yaml b/examples/v1alpha3/tpe-example.yaml index 664976ce720..524dfdb2d17 100644 --- a/examples/v1alpha3/tpe-example.yaml +++ b/examples/v1alpha3/tpe-example.yaml @@ -11,7 +11,7 @@ spec: goal: 0.99 objectiveMetricName: Validation-accuracy additionalMetricNames: - - accuracy + - Train-accuracy algorithm: algorithmName: tpe parallelTrialCount: 3 @@ -48,10 +48,10 @@ spec: spec: containers: - name: {{.Trial}} - image: docker.io/kubeflowkatib/mxnet-mnist-example + image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" + - "python3" + - "/opt/mxnet-mnist/mnist.py" - "--batch-size=64" {{- with .HyperParameters}} {{- range .}} diff --git a/manifests/v1alpha3/katib-controller/trialTemplateConfigmap.yaml b/manifests/v1alpha3/katib-controller/trialTemplateConfigmap.yaml index c0e9f0009d6..2b9e8e56727 100644 --- a/manifests/v1alpha3/katib-controller/trialTemplateConfigmap.yaml +++ b/manifests/v1alpha3/katib-controller/trialTemplateConfigmap.yaml @@ -15,10 +15,10 @@ data: spec: containers: - name: {{.Trial}} - image: docker.io/katib/mxnet-mnist-example + image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" + - "python3" + - "/opt/mxnet-mnist/mnist.py" - "--batch-size=64" {{- with .HyperParameters}} {{- range .}} diff --git a/test/e2e/v1alpha3/invalid-experiment.yaml b/test/e2e/v1alpha3/invalid-experiment.yaml index 918e2143c84..ca6065346fb 100644 --- a/test/e2e/v1alpha3/invalid-experiment.yaml +++ b/test/e2e/v1alpha3/invalid-experiment.yaml @@ -11,7 +11,7 @@ spec: goal: 0.99 objectiveMetricName: Validation-accuracy additionalMetricNames: - - accuracy + - 
Train-accuracy algorithm: algorithmName: random trialTemplate: @@ -27,10 +27,10 @@ spec: spec: containers: - name: {{.Trial}} - image: katib/mxnet-mnist-example + image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" + - "python3" + - "/opt/mxnet-mnist/mnist.py" - "--batch-size=64" restartPolicy: Never parameters: diff --git a/test/e2e/v1alpha3/valid-experiment.yaml b/test/e2e/v1alpha3/valid-experiment.yaml index e1453a9093a..487d6a581b5 100644 --- a/test/e2e/v1alpha3/valid-experiment.yaml +++ b/test/e2e/v1alpha3/valid-experiment.yaml @@ -11,7 +11,7 @@ spec: goal: 0.99 objectiveMetricName: Validation-accuracy additionalMetricNames: - - accuracy + - Train-accuracy algorithm: algorithmName: random trialTemplate: @@ -27,10 +27,10 @@ spec: spec: containers: - name: {{.Trial}} - image: katib/mxnet-mnist-example + image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" + - "python3" + - "/opt/mxnet-mnist/mnist.py" - "--batch-size=64" restartPolicy: Never parameters: diff --git a/test/scripts/v1alpha3/create-cluster.sh b/test/scripts/v1alpha3/create-cluster.sh index c409eceb874..15b420e14fb 100755 --- a/test/scripts/v1alpha3/create-cluster.sh +++ b/test/scripts/v1alpha3/create-cluster.sh @@ -33,6 +33,7 @@ echo "Creating GPU cluster" gcloud --project ${PROJECT} beta container clusters create ${CLUSTER_NAME} \ --zone ${ZONE} \ --machine-type=n1-standard-8 \ + --num-nodes=6 \ --cluster-version 1.14 echo "Configuring kubectl" gcloud --project ${PROJECT} container clusters get-credentials ${CLUSTER_NAME} \
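As a quick sanity check of how the trial templates above turn hyperparameters into CLI flags for mnist.py, here is a minimal sketch (not part of the patch; the flags are a subset of those declared in common/fit.py's add_fit_args(), and the sample values are illustrative):

```python
import argparse

# Subset of the training flags declared in common/fit.py (add_fit_args).
parser = argparse.ArgumentParser(description="rendered-trial sketch")
parser.add_argument('--batch-size', type=int, default=128)
parser.add_argument('--lr', type=float, default=0.1)
parser.add_argument('--num-epochs', type=int, default=100)
parser.add_argument('--optimizer', type=str, default='sgd')

# A trial rendered from the Katib template might invoke:
#   python3 /opt/mxnet-mnist/mnist.py --batch-size=64 --lr=0.027 --num-epochs=12
args = parser.parse_args(['--batch-size=64', '--lr=0.027', '--num-epochs=12'])
print(args.batch_size, args.lr, args.num_epochs)  # 64 0.027 12
```

Any flag the experiment does not sweep (e.g. --optimizer here) keeps its argparse default, which is why the YAML examples only pass --batch-size=64 plus the templated {{.HyperParameters}}.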