diff --git a/benchmark/python/metric/benchmark_metric.py b/benchmark/python/metric/benchmark_metric.py index 3c9abf6e3cc0..fc0f8da5d451 100644 --- a/benchmark/python/metric/benchmark_metric.py +++ b/benchmark/python/metric/benchmark_metric.py @@ -66,7 +66,7 @@ def data(self): def run_metric(name, data_gen_cls, i, n, c, pred_ctx, label_ctx, **kwargs): """ Helper function for running one metric benchmark """ - metric = mx.gluon.metric.create(name, **kwargs) + metric = mx.metric.create(name, **kwargs) data_gen = data_gen_cls(n, c, pred_ctx, label_ctx) try: label, pred = data_gen.data() @@ -105,7 +105,7 @@ def test_metric_performance(): output_dims = [128, 1024, 8192] ctxs = [mx.cpu(), mx.gpu()] - print("\nmx.gluon.metric benchmarks", file=sys.stderr) + print("\nmx.metric benchmarks", file=sys.stderr) print( "{:15}{:10}{:12}{:12}{:15}{:15}{}".format( 'Metric', 'Data-Ctx', 'Label-Ctx', 'Data Size', 'Batch Size', 'Output Dim', 'Elapsed Time'), diff --git a/benchmark/python/sparse/sparse_end2end.py b/benchmark/python/sparse/sparse_end2end.py new file mode 100644 index 000000000000..d032f9d6c38e --- /dev/null +++ b/benchmark/python/sparse/sparse_end2end.py @@ -0,0 +1,307 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import time +import argparse +import os +import multiprocessing +from mxnet.test_utils import * + +MAX_NUM_BATCH = 99999999 +COMP = "compute" +COMM = "communication" +IO = "io" + +parser = argparse.ArgumentParser(description="Run sparse linear regression " \ + "with distributed kvstore", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--profiler', type=int, default=0, + help='whether to use profiler') +parser.add_argument('--num-epoch', type=int, default=1, + help='number of epochs to train') +parser.add_argument('--batch-size', type=int, default=512, + help='number of examples per batch') +parser.add_argument('--num-batch', type=int, default=MAX_NUM_BATCH, + help='number of batches per epoch') +parser.add_argument('--dummy-iter', type=int, default=0, + help='whether to use dummy iterator to exclude io cost') +parser.add_argument('--kvstore', type=str, default=None, + help='what kvstore to use [local, dist_sync, etc]') +parser.add_argument('--sparse-log-level', type=str, default='DEBUG', + help='logging level [DEBUG, INFO, ERROR]') +parser.add_argument('--dataset', type=str, default='avazu', + help='what test dataset to use') +parser.add_argument('--num-gpu', type=int, default=0, + help='number of gpus to use. 
0 means using cpu(0);' + 'otherwise, use gpu(0),...,gpu(num_gpu-1)') +parser.add_argument('--output-dim', type=int, default=4, + help='number of columns of the forward output') +parser.add_argument('--dummy-metric', type=int, default=0, + help='whether to call update_metric') +parser.add_argument('--enable-logging-for', default="0", + help="Enable logging for the specified list of workers") +parser.add_argument('--measure-only', default=None, + help="Measure only", + choices=[IO, COMP, COMM]) +parser.add_argument('--omit-row-sparse-push', action='store_true', + help="omit row_sparse_push") + +class DummyIter(mx.io.DataIter): + "A dummy iterator that always return the same batch, used for speed testing" + def __init__(self, real_iter): + super(DummyIter, self).__init__() + self.real_iter = real_iter + self.provide_data = real_iter.provide_data + self.provide_label = real_iter.provide_label + self.batch_size = real_iter.batch_size + + for batch in real_iter: + self.the_batch = batch + break + + def __iter__(self): + return self + + def next(self): + return self.the_batch + +# testing dataset sources +avazu = { + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000001, + 'lc': 1719304, +} + +kdda = { + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216831, + 'lc': 510302, +} + +criteo = { + 'data_name': 'criteo.t', + 'data_origin_name': 'criteo.t.bz2', + 'url': "https://s3-us-west-2.amazonaws.com/sparse-dataset/criteo.t.bz2", + 'feature_dim': 8388621, + 'lc': 548787, +} + +datasets = { 'kdda' : kdda, 'avazu' : avazu , 'criteo': criteo } + + +def get_sym(feature_dim): + inputs = mx.symbol.Variable("data", stype='csr') + norm_init = mx.initializer.Normal(sigma=0.01) + weights = mx.symbol.Variable("w", shape=(feature_dim, 
args.output_dim), + init=norm_init, stype='row_sparse') + embed = mx.symbol.sparse.dot(inputs, weights) + softmax_output = mx.symbol.Variable("softmax_label") + model = mx.symbol.SoftmaxOutput(data=embed, label=softmax_output, name="out") + return model + + +def row_sparse_push(kv, param_arrays, grad_arrays, param_names): + for index, pair in enumerate(zip(param_arrays, grad_arrays)): + arg_list, grad_list = pair + if grad_list[0] is None: + continue + name = param_names[index] + kv.push(name, grad_list, priority=-index) + + +def row_sparse_pull(kv, key, data, slices, weight_array, priority): + # if have kvstore, need to pull corresponding rows of + # the weights to each context + # column indices (NDArray type) of the csr data + # used as the row_idx of the weight row-sparse matrix + row_indices = data.indices + if len(slices) == 1: + kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_indices) + else: # more than one slices, multi-GPU training. Need to retain weight rows according to data slices + # TODO(junwu): + # the following line blocks, may need to pre-compute + # and cache it outside the for loop + indptr = data.indptr.asnumpy() + row_idx_array = [] + for s in slices: + row_idx_array.append(row_indices[indptr[s.start]:indptr[s.stop]]) + kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_idx_array) + + +if __name__ == '__main__': + + # arg parser + args = parser.parse_args() + num_epoch = args.num_epoch + num_batch = args.num_batch + kvstore = args.kvstore + profiler = args.profiler > 0 + batch_size = args.batch_size if args.num_gpu == 0 else args.num_gpu * args.batch_size + dummy_iter = args.dummy_iter + dataset = args.dataset + log_level = args.sparse_log_level + measure_only = args.measure_only + num_cores = multiprocessing.cpu_count() + omit_row_sparse_push = args.omit_row_sparse_push + if measure_only == COMP or measure_only == IO: + assert not kvstore, "when compute_only or io_only is set, kvstore should be None" + 
num_batch = datasets[dataset]['lc'] / batch_size if num_batch == MAX_NUM_BATCH else num_batch + if measure_only == COMM: + assert (kvstore == "dist_async"), "when communication_only is set kvstore should be dist_async" + num_batch = datasets[dataset]['lc'] / batch_size if num_batch == MAX_NUM_BATCH else num_batch + + + contexts = mx.context.cpu(0) if args.num_gpu < 1\ + else [mx.context.gpu(i) for i in range(args.num_gpu)] + + # create kvstore when there are gpus + kv = mx.kvstore.create(kvstore) if kvstore else None + rank = kv.rank if kv is not None else 0 + num_worker = kv.num_workers if kv is not None else 1 + + # only print log for rank 0 worker + import logging + if log_level == 'ERROR': + log_level = logging.ERROR + elif log_level == 'DEBUG': + log_level = logging.DEBUG + else: + log_level = logging.INFO + + # Only log if it is in the list of workers to be logged + logging_workers_list = [int(i) for i in args.enable_logging_for.split(",")] + log_level = log_level if rank in logging_workers_list else logging.CRITICAL + + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=log_level, format=head) + + # dataset + assert(dataset in datasets), "unknown dataset " + dataset + metadata = datasets[dataset] + feature_dim = metadata['feature_dim'] + if logging: + logging.debug('preparing data ... 
') + data_dir = os.path.join(os.getcwd(), 'data') + path = os.path.join(data_dir, metadata['data_name']) + if not os.path.exists(path): + get_bz2_data(data_dir, metadata['data_name'], metadata['url'], + metadata['data_origin_name']) + assert os.path.exists(path) + + # data iterator + train_data = mx.io.LibSVMIter(data_libsvm=path, data_shape=(feature_dim,), + batch_size=batch_size, num_parts=num_worker, + part_index=rank) + if dummy_iter or measure_only == COMP or measure_only == COMM: + train_data = DummyIter(train_data) + + # model + model = get_sym(feature_dim) + + # module + mod = mx.mod.Module(symbol=model, data_names=['data'], + label_names=['softmax_label'], context=contexts) + mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label) + mod.init_params(initializer=mx.init.Uniform(scale=.1)) + sgd = mx.optimizer.SGD(momentum=0.0, clip_gradient=5.0, + learning_rate=0.1, rescale_grad=1.0/batch_size/num_worker) + mod.init_optimizer(optimizer=sgd, kvstore=kv) + # use accuracy as the metric + metric = mx.metric.create('acc') + + index = mod._exec_group.param_names.index('w') + # weight_array bound to executors of the contexts + weight_array = mod._exec_group.param_arrays[index] + + mx.nd.waitall() # sync point for initialization + # start profiler + if profiler: + device = 'cpu' + if args.num_gpu > 0: + device = 'gpu' + str(args.num_gpu) + name = 'profile_' + args.dataset + '_' + device + '_nworker' + str(num_worker)\ + + '_batchsize' + str(args.batch_size) + '_outdim' + str(args.output_dim) + '.json' + mx.profiler.set_config(profile_all=True, filename=name) + mx.profiler.set_state('run') + + logging.debug('start training ...') + start = time.time() + data_iter = iter(train_data) + time_cost_epoch = 0. + sum_cost_epoch = 0. + average_cost_epoch = 0. 
+ + for epoch in range(num_epoch): + start_time_epoch = time.time() + nbatch = 0 + end_of_batch = False + metric.reset() + next_batch = next(data_iter) + if kv is not None: + row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index) + while not end_of_batch: + nbatch += 1 + batch = next_batch + + if measure_only != IO and measure_only != COMM: + mod.forward_backward(batch) + # update parameters + mod.update() + if measure_only == COMM: + if nbatch == 1: + mod.forward_backward(batch) + mod.update() + elif not omit_row_sparse_push: + row_sparse_push(kv, mod._exec_group.param_arrays, mod._exec_group.grad_arrays, mod._exec_group.param_names) + + + try: + # pre fetch next batch + next_batch = next(data_iter) + if nbatch == num_batch: + raise StopIteration + if kv is not None: + row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index) + except StopIteration: + end_of_batch = True + # accumulate prediction accuracy + if args.dummy_metric == 0: + mod.update_metric(metric, batch.label) + else: # call waitall to replace update_metric as sync point + mx.nd.waitall() # sync point for the current minibatch + logging.info('epoch {}, {}'.format(epoch, metric.get())) + end_time_epoch = time.time() + if epoch == 0: + logging.debug("num_batches = {}".format(nbatch)) + logging.info('|device|num_worker|average_cost_epoch|rank|') + time_cost_epoch = end_time_epoch - start_time_epoch + if epoch > 0: + sum_cost_epoch = sum_cost_epoch + time_cost_epoch + average_cost_epoch = float(sum_cost_epoch) / epoch + logging.info('num_worker = {}, time cost per epoch = {}'.format(str(num_worker), str(time_cost_epoch))) + if args.num_gpu < 1: + logging.info('|cpu/{} cores| {} | {} | {} |'.format(str(num_cores), str(num_worker), str(average_cost_epoch), rank)) + data_iter.reset() + if profiler: + mx.profiler.set_state('stop') + end = time.time() + time_cost = end - start + logging.info('num_worker = {}, rank = {}, time cost = 
{}'.format(str(num_worker), str(rank), str(time_cost))) diff --git a/example/adversary/adversary_generation.ipynb b/example/adversary/adversary_generation.ipynb index 0dda371a8f41..76c5f4cff569 100644 --- a/example/adversary/adversary_generation.ipynb +++ b/example/adversary/adversary_generation.ipynb @@ -168,7 +168,7 @@ "epoch = 3\n", "for e in range(epoch):\n", " train_loss = 0.\n", - " acc = mx.gluon.metric.Accuracy()\n", + " acc = mx.metric.Accuracy()\n", " for i, (data, label) in enumerate(train_data):\n", " data = data.as_in_context(ctx)\n", " label = label.as_in_context(ctx)\n", @@ -223,7 +223,7 @@ " l = loss(output, label)\n", "l.backward()\n", "\n", - "acc = mx.gluon.metric.Accuracy()\n", + "acc = mx.metric.Accuracy()\n", "acc.update(label, output)\n", "\n", "print(\"Validation batch accuracy {}\".format(acc.get()[1]))" @@ -256,7 +256,7 @@ "\n", "output = net(data_perturbated) \n", "\n", - "acc = mx.gluon.metric.Accuracy()\n", + "acc = mx.metric.Accuracy()\n", "acc.update(label, output)\n", "\n", "print(\"Validation batch accuracy after perturbation {}\".format(acc.get()[1]))" diff --git a/example/autoencoder/variational_autoencoder/VAE_example.ipynb b/example/autoencoder/variational_autoencoder/VAE_example.ipynb new file mode 100755 index 000000000000..964e13725c69 --- /dev/null +++ b/example/autoencoder/variational_autoencoder/VAE_example.ipynb @@ -0,0 +1,1204 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import mxnet as mx\n", + "import numpy as np\n", + "import os\n", + "import logging\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.cm as cm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building a Variational Autoencoder in MXNet\n", + "\n", + "#### Xiaoyu Lu, July 5th, 2017\n", + "\n", + "This tutorial guides you through the process of building a variational encoder in MXNet. 
In this notebook we'll focus on an example using the MNIST handwritten digit recognition dataset. Refer to [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/) for more details on the model description.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "To complete this tutorial, we need the following Python packages:\n", + "\n", + "- numpy, matplotlib " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Loading the Data\n", + "\n", + "We first load the MNIST dataset, which contains 60000 training and 10000 test examples. The following code loads the data. These images are stored in a 4-D matrix with shape (`batch_size, num_channels, width, height`). For the MNIST dataset, there is only one color channel, and both width and height are 28, so we flatten each image into a vector of 28x28 = 784 pixels. See below for a visualization:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "60000 784\n" + ] + } + ], + "source": [ + "mnist = mx.test_utils.get_mnist()\n", + "image = np.reshape(mnist['train_data'],(60000,28*28))\n", + "label = image\n", + "image_test = np.reshape(mnist['test_data'],(10000,28*28))\n", + "label_test = image_test\n", + "[N,features] = np.shape(image) #number of examples and features\n", + "print(N,features)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAsMAAACWCAYAAAA7UIUvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFI5JREFUeJzt3X+wVfO/x/H3u9+JfikVnb5FaYTEnLmqe5mLDDKJwfGVaYzfVJRBP+5FgzHjd6YoMkyFcbvUJDToxnW7GDql6YpSFwnpVKRfqDM+94+2O+f9Waf94+xfa+3P8zHTnF6rtfd6n/a73afde62lzjkBAAAAQtSs3AUAAAAA5cJiGAAAAMFiMQwAAIBgsRgGAABAsFgMAwAAIFgshgEAABAsFsMAAAAIFothAAAABCuvxbCqnq+q61V1o6pOLlRRAAAAQCloU+9Ap6rNReQrETlXRL4XkRUicqVz7otDPaZLly6ud+/eTToe4uXbb7+V7du3azGemz6pLCtXrtzunOtajOemVyoH7ynIFu8pyEYu7ykt8jjOP4jIRufc1yIiqvpvIjJSRA65GO7du7fU1tbmcUjERXV1ddGemz6pLKq6qVjPTa9UDt5TkC3eU5CNXN5T8hmTOEZENjfI36e2Gap6o6rWqmrttm3b8jgcKhl9gmzRK8gGfYJs0Sso+gl0zrnZzrlq51x1165F+V8NVAD6BNmiV5AN+gTZoleQz2L4BxGpapB7prYBAAAAiZDPYniFiPRT1T6q2kpE/i4iiwtTFgAAAFB8TT6BzjlXr6rjROQdEWkuIi8459YWrDIAAACgyPK5moQ455aIyJIC1QIAAACUFHegAwAAQLBYDAMAACBYLIYBAAAQLBbDAAAACBaLYQAAAASLxTAAAACCxWIYAAAAwWIxDAAAgGCxGAYAAECw8roDHYDiue2220yeMWNGZJ8OHTqYvG7dOpO7d+9e+MIAAKggfDIMAACAYLEYBgAAQLBYDAMAACBYzAwXwdatW00eNGiQyUOHDjV5wYIFRa8J8bdq1SqTn376aZObNYv+23X37t0m19XVmczMMJBcf/zxh8nz5883+Zprrkn7+OXLl5s8ePBgk/ft2xd5TJs2bUw+cOCAyf77UOvWrdPWACQBnwwDAAAgWCyGAQAAECwWwwAAAAgWM8NFMHv2bJP9GeJFixaVshzEVH19vcnz5s3L+TmOP/54k3v16pVXTQDKY+/evZFtU6dONfnJJ580WVXTPuftt99u8mmnnWbyc889F3nMZZddZvL7779vcu/evU32z4Hxz1O48cYbTe7UqdOhCwbKhE+GAQAAECwWwwAAAAgWi2EAAAAEi5nhEnDOlbsExNAnn3xi8owZM3J+jsmTJ5vcsWPHvGoCUB6XXnppZNvSpUtNzjQj7Fu5cqXJtbW1GZ/vtddeS/ucO3bsyOk53333XZOXLFkSeU6uVZzehg0bTO7fv39knxEjRpg8a9Ysk48++ujCF+bZuXOnyX6v+OdTZTJu3LjItqqqqtwLywKfDAMAACBYLIYBAAAQLBbDAAAACBYzw0Wwdu1ak/0ZqksuuaSU5SAm/vzzT5OnTJmS0+NXrFgR2XbyySfnVRMyu+WWW0z+4osvTL7qqqtMHjhwoMmDBw8uTmFItLlz55r8wQcflKmS4vr6669N/uOPPyL7MDOc3vz5801ubNb7jTfeMNk/J8WfGT7qqKNMHjVqVNoaPvvss8g2v2d//PFHk7dt22ayf/5Uphn48847L7KNmWEAAACgwFgMAwAAIFgshgEAABAsZoaLwJ/F8TV2P3hUPn/m6sMPP0y7/xFHHGFyY9eJbNmyZf6FwfCvs/rss8+m3X/58uUm+3NwLVpE32b9uWL/epp9+vRJ+xz79u0zediwYWlrRPn512CdOnWqyQcOHChlOQXjnwMzcuRIky+//HKT27RpU/Sakqa+vt7k1atXm/zEE0/k/Jz+vK6
fff41rXOd7y2GIUOGlOxYfDIMAACAYLEYBgAAQLAyLoZV9QVVrVPVzxts66yqS1V1Q+prp+KWCQAAABReNjPDc0TkKRGZ12DbZBFZ5px7SFUnp/KkwpeXDPv37zd548aNZaoEcfbNN9/ktP8pp5xicvfu3QtZDg7Bn+dt27atyb/99lvax/uzdo3Ngq5cudLka665xuRmzeznFP68nn+Mww47zOQuXbpEjvnRRx+ZTD+V1po1a0z2/97wX1OR6LXJ/b7IpF27dia3b9/e5MauLeu/7/j8GeBWrVrlVBOidu3aZfLpp5+edv+amprItuHDh5vsn6OyaNEikzdt2pRLiSXhfw+NnW9RLBn/ZDnn/ktEfvY2jxSRv64YPldELi5wXQAAAEDRNXVmuJtzbkvq5z+JSLcC1QMAAACUTN4n0LmD/7cT/f+dFFW9UVVrVbU206U9EC76BNmiV5AN+gTZolfQ1MXwVlXtISKS+lp3qB2dc7Odc9XOuequXbs28XCodPQJskWvIBv0CbJFr6Cp08mLReRqEXko9fX1glWUQNu3bzf57bffNrmxEyMQnrfeeiun/R955JEiVYJ0jj/+eJP9P98zZ8402T/5xffMM89EtmX69Mk/cSqTPXv2pM0i0e9r4cKFJnPjjtKqq7OfITV2UwP/hDn/RMkrrrjC5NGjR5t87LHHmlxVVZVznSi8vXv3mnz33Xfn9Pjrr78+su2cc84x2e+FBx980OQtW7ZIvt59912T77zzTpMznWzsn5zsv1eW8qZS2Vxa7RUR+VhE+qvq96p6nRxcBJ+rqhtEZFgqAwAAAImS8ZNh59yVh/ilcw6xHQAAAEgE7kAHAACAYJXuisYV7OOPPza5sdkvhOfXX381edmyZTk9vrq6upDloIn8ubY77rgjp8dPmhS9H1GmmeBXXnnF5Ew38vEvqL9hw4bIPv4ccaZ5PhSWPyfaFL169TJ5xowZJvu9inj6/fffTZ4/f77J/nlGN998s8n+fHA2/N7w58kzWbduXWTbmDFjcnqODh06mPzhhx+afMwxx+T0fIXEJ8MAAAAIFothAAAABIvFMAAAAILFzHABnHrqqSb714Js1aqVyW3atCl6TSg//9qRP/zwQ9r9u3WzdzVn9rwy+O8H2bjhhhty2v/ee+81ee7cuZF9xo0bZ/LkyZNNHjFiRE7HRG78vyeOPPJIk3fs2JHxOdavX2/yrFmzTB45cqTJxx13XC4lokTat29v8oIFC0wePny4yY8//njRa/K9+eabJtfU1ET2yfR3lD+n7M8IDxgwoInVFR6fDAMAACBYLIYBAAAQLBbDAAAACBYzwwXw2Wefmbxv3z6T+/TpY3JTZggRf/X19Sb7830+f0b4pZdeMrlZM/6tiuy0a9fO5FGjRkX28WeGJ0yYUNSaYHXv3t3kfv36mZzNzLBv4sSJJk+dOtXkRx991GT/erUoj5YtW5o8ZMgQk9977z2TS3H96Hfeecfkiy66yORszmHxr8E+fvx4k8t5HeFM+NsWAAAAwWIxDAAAgGCxGAYAAECwmBkuAP/aef59xS+//PJSloMymT59usnLli1Lu/+JJ55o8tlnn13wmhCmZ555JuM+/gwrSuu1114zubq6OrKPf23yTOcR+OerjB071uS77rrL5BUrVkSeo2/fvia3aMEyodhat25t8uDBg4t+zO+++87kiy++OO3+Rx11VGTbLbfcYrI/w56keyrwyTAAAACCxWIYAAAAwWIxDAAAgGAxDFQA/uydfz2+QYMGlbIclEk2c5oNnXDCCUWqBKHZuXOnydOmTYvs48/vVVVVFbUmpOf/vTFv3rzIPqNHjzZ569ateR3Tnyk+6aSTIvv417g988wz8zom4sG/jrA/77t//36Tjz76aJM//fTTyHP6+yQZnwwDAAAgWCyGAQAAECwWwwAAAAgWM8MF4M/n+dcZXr16tckjRowoek2IP/oAheLPq2/bti2yj99vnMsQL41
dZ9y/hr1/DeBiuOKKK0xesGCByUOHDi16Dcjfrl27TPZnwb/99tu0j9+7d6/J/nkJIswMAwAAABWBxTAAAACCxWIYAAAAwWIxDAAAgGBxAl0B+DfZ8DMqz6ZNmyLbfvzxx7SPad68ucmnn356QWtCOH7++WeTp0+fbnLPnj0jj3n55ZeLWhMKz78xyubNm01euHChyffff7/JO3bsyPmY/smXY8eONfmjjz4yuW3btjkfA8W3Zs0akx977DGTM61TZs2aZfKAAQMKU1hM8ckwAAAAgsViGAAAAMFiMQwAAIBgMTNcAP6Fp3/66SeTL7roolKWgxLwX2MRkd9++y3tY8aMGWNy+/btC1oTKtf+/ftNfvjhh032+/HCCy+MPMfhhx9e+MJQVP55Bj169DC5Y8eOJvs3WiiE1q1bp60J8VBbW2tyrjd1qqmpMfmCCy7Iu6Yk4ZNhAAAABCvjYlhVq1T1fVX9QlXXqur41PbOqrpUVTekvnYqfrkAAABA4WTzyXC9iNzhnBsgIoNFZKyqDhCRySKyzDnXT0SWpTIAAACQGBlnhp1zW0RkS+rnu1X1SxE5RkRGisg/p3abKyL/KSKTilJlzK1atcpk//p9nTrxoXmlefbZZ3N+zFNPPWXyTTfdZPIJJ5yQV02oXFu2bDH5kUceMdm/1usDDzxQ9JqQ3u7du02eOHGiye3atTO5W7dukedwzpn86quvmuz/3ZPJn3/+aXKzZpk/DzvssMNyOgaK7/XXX49su/baa03OND/euXNnk6dNm2Zyhw4dmlhdMuU0M6yqvUXkVBH5RES6pRbKIiI/iUj0TzIAAAAQY1kvhlX1cBFZICITnHPmnxzu4D9f3SEed6Oq1qpqrX9nG+Av9AmyRa8gG/QJskWvIKvFsKq2lIML4Zedc3/d/3GrqvZI/XoPEalr7LHOudnOuWrnXHXXrl0LUTMqEH2CbNEryAZ9gmzRK8g4M6wHB2CfF5EvnXNPNPilxSJytYg8lPoaHWIJRN++fU32r/nZq1evUpaDmPLn//w+YWYYh7Jz506T/fMSHnvsMZMHDRpU9JqQ3ubNm01+7rnncn4O/z3Df939nIk/I9zY4/33IX8+vVWrVjkdE/nbtGmTyffcc09kn19++cXkTL0xbNgwk0M/tymbm278o4iMFpH/UdXVqW3/IgcXwf+uqteJyCYRqTnE4wEAAIBYyuZqEv8tIof6J8Y5hS0HAAAAKB3uQAcAAIBgZTMmgQz8a4D6szr79u0zmes2Jl+PHj1yfow/a3fWWWcVqhxUmPr6epOnTp1qcsuWLU0eOnRo0WtCbvzruF533XUmP//886Usp1Ht27ePbFuyZInJVVVVpSoHKXV19noEp512msn+fLBIdL7cd8YZZ5j84osvmtyiRdjLQT4ZBgAAQLBYDAMAACBYLIYBAAAQrLCHRApkz549JvvzpMwIV55bb701sm3mzJkm+/d2nz59elFrQuVYu3atyYsXLzb5wgsvNJnrCsdP9+7dTX7wwQdNfvPNN03eunVr0WsaP368yaNGjYrsw4xw6flriP79+5u8a5e56W9W15e+7777TJ4wYYLJoc8I+/hkGAAAAMFiMQwAAIBgsRgGAABAsBgaKQD/PuH+rBgqT2OvcWPXfgSa4oEHHjDZn+/z508Rfx07djT5q6++MnnMmDGRx7z00ks5HaO6utrkadOmmTxkyJCcng/FsXfvXpMvu+wyk/0Z4Wz4M8KTJk0y2b/OPSw+GQYAAECwWAwDAAAgWCyGAQAAECwWwwAAAAgWJ9AVwP3331/uEgBUsDPOOMPkgQMHlqkSNJV/EqSf586dG3lMY9uQfO3atTO5rq4up8fX1NREtk2ZMsVkbqqRGz4ZBgAAQLBYDAMAACBYLIYBAAAQLIZKAAAAymTx4sUmjxs3zuS2bduaPGfOnMhzMCOcHz4ZBgAAQLBYDAMAACBYLIYBAAAQLIZMACBm/JnBhQsXlqkSAMXWs2d
PkxctWlSmSsLFJ8MAAAAIFothAAAABIvFMAAAAIKlzrnSHUx1m4hsEpEuIrK9ZAduGmpM72/Oua7FeOKE9YlIMuqkV8qPGtMrRZ+I8DoUSqX3Cq9B4ZSrzqz7pKSL4f8/qGqtc6665AfOATWWX1K+vyTUmYQa85GE748a4yEJ3yM1ll8Svr8k1CiSjDoZkwAAAECwWAwDAAAgWOVaDM8u03FzQY3ll5TvLwl1JqHGfCTh+6PGeEjC90iN5ZeE7y8JNYokoM6yzAwDAAAAccCYBAAAAIJV0sWwqp6vqutVdaOqTi7lsdNR1RdUtU5VP2+wrbOqLlXVDamvncpcY5Wqvq+qX6jqWlUdH8c6CyWOvUKfxE8c+0SEXokjeqXJ9QXVJyLx7JW490mqnsT2SskWw6raXESeFpELRGSAiFypqgNKdfwM5ojI+d62ySKyzDnXT0SWpXI51YvIHc65ASIyWETGpn7/4lZn3mLcK3OEPomNGPeJCL0SK/RKXoLpE5FY98ociXefiCS5V5xzJfkhIkNE5J0GeYqITCnV8bOor7eIfN4grxeRHqmf9xCR9eWu0av3dRE5N+51Vlqv0Cfx+RHnPqFX4vWDXqFPKqFXktQnSeuVUo5JHCMimxvk71Pb4qqbc25L6uc/iUi3chbTkKr2FpFTReQTiXGdeUhSr8T2958+iZ3Yvgb0SuzE8jUIoE9EktUrsX0NktYrnECXBXfwnzOxuOyGqh4uIgtEZIJzblfDX4tTnSGK0+8/fRJvcXoN6JV4i8trQJ/EW5xegyT2SikXwz+ISFWD3DO1La62qmoPEZHU17oy1yOq2lIONtjLzrmFqc2xq7MAktQrsfv9p09iK3avAb0SW7F6DQLqE5Fk9UrsXoOk9kopF8MrRKSfqvZR1VYi8ncRWVzC4+dqsYhcnfr51XJw9qVsVFVF5HkR+dI590SDX4pVnQWSpF6J1e8/fRLbPhGJ2WtAr9Ar2QisT0SS1Suxeg0S3SslHqYeLiJficj/isi/lntgukFdr4jIFhE5IAfng64TkSPl4FmPG0TkP0Skc5lr/Cc5+F8La0RkderH8LjVWcm9Qp/E70cc+4ReiecPeoU+SXKvxL1Pkt4r3IEOAAAAweIEOgAAAASLxTAAAACCxWIYAAAAwWIxDAAAgGCxGAYAAECwWAwDAAAgWCyGAQAAECwWwwAAAAjW/wEgPmufEARJLAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "nsamples = 5\n", + "idx = np.random.choice(len(mnist['train_data']), nsamples)\n", + "_, axarr = plt.subplots(1, nsamples, sharex='col', sharey='row',figsize=(12,3))\n", + "\n", + "for i,j in enumerate(idx):\n", + " axarr[i].imshow(np.reshape(image[j,:],(28,28)), interpolation='nearest', cmap=cm.Greys)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can optionally save the parameters in the directory variable 'model_prefix'. We first create data iterators for MXNet, with each batch of data containing 100 images." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "model_prefix = None\n", + "\n", + "batch_size = 100\n", + "latent_dim = 5\n", + "nd_iter = mx.io.NDArrayIter(data={'data':image},label={'loss_label':label},\n", + " batch_size = batch_size)\n", + "nd_iter_test = mx.io.NDArrayIter(data={'data':image_test},label={'loss_label':label_test},\n", + " batch_size = batch_size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Building the Network Architecture\n", + "\n", + "### 2.1 Gaussian MLP as encoder\n", + "Next we construct the neural network. As in the [paper](https://arxiv.org/abs/1312.6114/), we use a *Multilayer Perceptron (MLP)* for both the encoder and the decoder. For the encoder, a Gaussian MLP is used as follows:\n", + "\n", + "\\begin{align}\n", + "\\log q_{\\phi}(z|x) &= \\log \\mathcal{N}(z;\\mu,\\sigma^2I) \\\\\n", + "\\textit{ where } \\mu &= W_2h+b_2, \\log \\sigma^2 = W_3h+b_3\\\\\n", + "h &= \\tanh(W_1x+b_1)\n", + "\\end{align}\n", + "\n", + "where $\\{W_1,W_2,W_3,b_1,b_2,b_3\\}$ are the weights and biases of the MLP.\n", + "Note below that `encoder_mu`(`mu`) and `encoder_logvar`(`logvar`) are symbols. 
So, we can use `get_internals()` to get their values, after which we can sample the latent variable $z$.\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "## define data and loss labels as symbols \n", + "data = mx.sym.var('data')\n", + "loss_label = mx.sym.var('loss_label')\n", + "\n", + "## define fully connected and activation layers for the encoder, where we used tanh activation function.\n", + "encoder_h = mx.sym.FullyConnected(data=data, name=\"encoder_h\",num_hidden=400)\n", + "act_h = mx.sym.Activation(data=encoder_h, act_type=\"tanh\",name=\"activation_h\")\n", + "\n", + "## define mu and log variance which are the fully connected layers of the previous activation layer\n", + "mu = mx.sym.FullyConnected(data=act_h, name=\"mu\",num_hidden = latent_dim)\n", + "logvar = mx.sym.FullyConnected(data=act_h, name=\"logvar\",num_hidden = latent_dim)\n", + "\n", + "## sample the latent variables z according to Normal(mu,var)\n", + "z = mu + mx.symbol.broadcast_mul(mx.symbol.exp(0.5 * logvar), \n", + " mx.symbol.random_normal(loc=0, scale=1, shape=(batch_size, latent_dim)),\n", + " name=\"z\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.2 Bernoulli MLP as decoder\n", + "\n", + "In this case let $p_\\theta(x|z)$ be a multivariate Bernoulli whose probabilities are computed from $z$ with a feed-forward neural network with a single hidden layer:\n", + "\n", + "\\begin{align}\n", + "\\log p(x|z) &= \\sum_{i=1}^D x_i\\log y_i + (1-x_i)\\log (1-y_i) \\\\\n", + "\\textit{ where } y &= f_\\sigma(W_5\\tanh (W_4z+b_4)+b_5)\n", + "\\end{align}\n", + "\n", + "where $f_\\sigma(\\cdot)$ is the elementwise sigmoid activation function, and $\\{W_4,W_5,b_4,b_5\\}$ are the weights and biases of the decoder MLP. 
A Bernoulli likelihood is suitable for this type of data, but you can easily extend it to other likelihood types by passing the argument `likelihood` to the `VAE` class; see section 4 for details." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# define fully connected and tanh activation layers for the decoder\n", + "decoder_z = mx.sym.FullyConnected(data=z, name=\"decoder_z\",num_hidden=400)\n", + "act_z = mx.sym.Activation(data=decoder_z, act_type=\"tanh\",name=\"activation_z\")\n", + "\n", + "# define the output layer with sigmoid activation function, where the dimension is equal to the input dimension\n", + "decoder_x = mx.sym.FullyConnected(data=act_z, name=\"decoder_x\",num_hidden=features)\n", + "y = mx.sym.Activation(data=decoder_x, act_type=\"sigmoid\",name='activation_x')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2.3 Joint Loss Function for the Encoder and the Decoder\n", + "\n", + "The variational lower bound, also called the evidence lower bound (ELBO), can be estimated as:\n", + "\n", + "\\begin{align}\n", + "\\mathcal{L}(\\theta,\\phi;x^{(i)}) \\approx \\frac{1}{2}\\sum_{j=1}^{J}\\left(1+\\log ((\\sigma_j^{(i)})^2)-(\\mu_j^{(i)})^2-(\\sigma_j^{(i)})^2\\right) + \\log p_\\theta(x^{(i)}|z^{(i)})\n", + "\\end{align}\n", + "\n", + "where $J$ is the dimension of the latent variable, the first term is the negative KL divergence of the approximate posterior from the prior, and the second term is an expected negative reconstruction error. We would like to maximize this lower bound, so we can define the loss to be $-\\mathcal{L}$ (minus ELBO) for MXNet to minimize." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# define the objective loss function that needs to be minimized\n", + "KL = 0.5*mx.symbol.sum(1+logvar-pow( mu,2)-mx.symbol.exp(logvar),axis=1)\n", + "loss = -mx.symbol.sum(mx.symbol.broadcast_mul(loss_label,mx.symbol.log(y)) \n", + " + mx.symbol.broadcast_mul(1-loss_label,mx.symbol.log(1-y)),axis=1)-KL\n", + "output = mx.symbol.MakeLoss(sum(loss),name='loss')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Training the model\n", + "\n", + "Now, we can define the model and train it. First we will initialize the weights and the biases to be Gaussian(0,0.01), and then use stochastic gradient descent for optimization. To warm start the training, one may also initialize with pre-trained parameters `arg_params` using `init=mx.initializer.Load(arg_params)`. \n", + "\n", + "To save intermediate results, we can optionally use `epoch_end_callback = mx.callback.do_checkpoint(model_prefix, 1)`, which saves the parameters to the path given by model_prefix once every $1$ epoch. To assess the performance, we output $-\\mathcal{L}$ (minus ELBO) after each epoch, with the command `eval_metric = 'Loss'` which is defined above. We will also plot the training loss for mini batches by accessing the log and saving it to a list, and then passing it to the argument `batch_end_callback`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# set up the log\n", + "nd_iter.reset()\n", + "logging.getLogger().setLevel(logging.DEBUG) \n", + "\n", + "# define function to trave back training loss\n", + "def log_to_list(period, lst):\n", + " def _callback(param):\n", + " \"\"\"The checkpoint function.\"\"\"\n", + " if param.nbatch % period == 0:\n", + " name, value = param.eval_metric.get()\n", + " lst.append(value)\n", + " return _callback\n", + "\n", + "# define the model\n", + "model = mx.mod.Module(\n", + " symbol = output ,\n", + " data_names=['data'],\n", + " label_names = ['loss_label'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Epoch[0] Train-loss=373.547317\n", + "INFO:root:Epoch[0] Time cost=5.020\n", + "INFO:root:Epoch[1] Train-loss=212.232684\n", + "INFO:root:Epoch[1] Time cost=4.651\n", + "INFO:root:Epoch[2] Train-loss=207.448528\n", + "INFO:root:Epoch[2] Time cost=4.665\n", + "INFO:root:Epoch[3] Train-loss=205.369479\n", + "INFO:root:Epoch[3] Time cost=4.758\n", + "INFO:root:Epoch[4] Train-loss=203.651983\n", + "INFO:root:Epoch[4] Time cost=4.672\n", + "INFO:root:Epoch[5] Train-loss=202.061007\n", + "INFO:root:Epoch[5] Time cost=5.087\n", + "INFO:root:Epoch[6] Train-loss=199.348143\n", + "INFO:root:Epoch[6] Time cost=5.056\n", + "INFO:root:Epoch[7] Train-loss=196.266242\n", + "INFO:root:Epoch[7] Time cost=4.813\n", + "INFO:root:Epoch[8] Train-loss=194.694945\n", + "INFO:root:Epoch[8] Time cost=4.776\n", + "INFO:root:Epoch[9] Train-loss=193.699284\n", + "INFO:root:Epoch[9] Time cost=4.756\n", + "INFO:root:Epoch[10] Train-loss=193.036517\n", + "INFO:root:Epoch[10] Time cost=4.757\n", + "INFO:root:Epoch[11] Train-loss=192.555736\n", + "INFO:root:Epoch[11] Time cost=4.678\n", + "INFO:root:Epoch[12] Train-loss=192.020813\n", + "INFO:root:Epoch[12] Time 
cost=4.630\n", + "INFO:root:Epoch[13] Train-loss=191.648876\n", + "INFO:root:Epoch[13] Time cost=5.158\n", + "INFO:root:Epoch[14] Train-loss=191.057798\n", + "INFO:root:Epoch[14] Time cost=4.781\n", + "INFO:root:Epoch[15] Train-loss=190.315835\n", + "INFO:root:Epoch[15] Time cost=5.117\n", + "INFO:root:Epoch[16] Train-loss=189.311271\n", + "INFO:root:Epoch[16] Time cost=4.707\n", + "INFO:root:Epoch[17] Train-loss=187.285967\n", + "INFO:root:Epoch[17] Time cost=4.745\n", + "INFO:root:Epoch[18] Train-loss=185.271324\n", + "INFO:root:Epoch[18] Time cost=4.692\n", + "INFO:root:Epoch[19] Train-loss=183.510888\n", + "INFO:root:Epoch[19] Time cost=4.762\n", + "INFO:root:Epoch[20] Train-loss=181.756008\n", + "INFO:root:Epoch[20] Time cost=4.838\n", + "INFO:root:Epoch[21] Train-loss=180.546818\n", + "INFO:root:Epoch[21] Time cost=4.764\n", + "INFO:root:Epoch[22] Train-loss=179.479776\n", + "INFO:root:Epoch[22] Time cost=4.791\n", + "INFO:root:Epoch[23] Train-loss=178.352077\n", + "INFO:root:Epoch[23] Time cost=4.981\n", + "INFO:root:Epoch[24] Train-loss=177.385084\n", + "INFO:root:Epoch[24] Time cost=5.292\n", + "INFO:root:Epoch[25] Train-loss=175.920123\n", + "INFO:root:Epoch[25] Time cost=5.097\n", + "INFO:root:Epoch[26] Train-loss=174.377171\n", + "INFO:root:Epoch[26] Time cost=4.907\n", + "INFO:root:Epoch[27] Train-loss=172.590589\n", + "INFO:root:Epoch[27] Time cost=4.484\n", + "INFO:root:Epoch[28] Train-loss=170.933683\n", + "INFO:root:Epoch[28] Time cost=4.348\n", + "INFO:root:Epoch[29] Train-loss=169.866807\n", + "INFO:root:Epoch[29] Time cost=4.647\n", + "INFO:root:Epoch[30] Train-loss=169.182084\n", + "INFO:root:Epoch[30] Time cost=5.034\n", + "INFO:root:Epoch[31] Train-loss=168.121719\n", + "INFO:root:Epoch[31] Time cost=5.615\n", + "INFO:root:Epoch[32] Train-loss=167.389992\n", + "INFO:root:Epoch[32] Time cost=4.733\n", + "INFO:root:Epoch[33] Train-loss=166.189067\n", + "INFO:root:Epoch[33] Time cost=5.041\n", + "INFO:root:Epoch[34] Train-loss=163.783392\n", + 
"INFO:root:Epoch[34] Time cost=5.168\n", + "INFO:root:Epoch[35] Train-loss=162.167959\n", + "INFO:root:Epoch[35] Time cost=5.019\n", + "INFO:root:Epoch[36] Train-loss=161.192039\n", + "INFO:root:Epoch[36] Time cost=5.064\n", + "INFO:root:Epoch[37] Train-loss=160.307114\n", + "INFO:root:Epoch[37] Time cost=5.180\n", + "INFO:root:Epoch[38] Train-loss=159.591957\n", + "INFO:root:Epoch[38] Time cost=5.440\n", + "INFO:root:Epoch[39] Train-loss=159.109593\n", + "INFO:root:Epoch[39] Time cost=5.119\n", + "INFO:root:Epoch[40] Train-loss=158.463844\n", + "INFO:root:Epoch[40] Time cost=5.299\n", + "INFO:root:Epoch[41] Train-loss=158.037287\n", + "INFO:root:Epoch[41] Time cost=4.856\n", + "INFO:root:Epoch[42] Train-loss=157.598576\n", + "INFO:root:Epoch[42] Time cost=5.227\n", + "INFO:root:Epoch[43] Train-loss=157.097344\n", + "INFO:root:Epoch[43] Time cost=5.237\n", + "INFO:root:Epoch[44] Train-loss=156.594472\n", + "INFO:root:Epoch[44] Time cost=4.783\n", + "INFO:root:Epoch[45] Train-loss=156.177069\n", + "INFO:root:Epoch[45] Time cost=4.834\n", + "INFO:root:Epoch[46] Train-loss=155.825302\n", + "INFO:root:Epoch[46] Time cost=4.902\n", + "INFO:root:Epoch[47] Train-loss=155.318117\n", + "INFO:root:Epoch[47] Time cost=4.966\n", + "INFO:root:Epoch[48] Train-loss=154.890766\n", + "INFO:root:Epoch[48] Time cost=5.012\n", + "INFO:root:Epoch[49] Train-loss=154.504158\n", + "INFO:root:Epoch[49] Time cost=4.844\n", + "INFO:root:Epoch[50] Train-loss=154.035214\n", + "INFO:root:Epoch[50] Time cost=4.736\n", + "INFO:root:Epoch[51] Train-loss=153.692903\n", + "INFO:root:Epoch[51] Time cost=5.057\n", + "INFO:root:Epoch[52] Train-loss=153.257554\n", + "INFO:root:Epoch[52] Time cost=5.044\n", + "INFO:root:Epoch[53] Train-loss=152.849715\n", + "INFO:root:Epoch[53] Time cost=4.783\n", + "INFO:root:Epoch[54] Train-loss=152.483047\n", + "INFO:root:Epoch[54] Time cost=4.842\n", + "INFO:root:Epoch[55] Train-loss=152.091617\n", + "INFO:root:Epoch[55] Time cost=5.044\n", + "INFO:root:Epoch[56] 
Train-loss=151.715490\n", + "INFO:root:Epoch[56] Time cost=5.029\n", + "INFO:root:Epoch[57] Train-loss=151.362293\n", + "INFO:root:Epoch[57] Time cost=4.873\n", + "INFO:root:Epoch[58] Train-loss=151.003241\n", + "INFO:root:Epoch[58] Time cost=4.729\n", + "INFO:root:Epoch[59] Train-loss=150.619678\n", + "INFO:root:Epoch[59] Time cost=5.068\n", + "INFO:root:Epoch[60] Train-loss=150.296043\n", + "INFO:root:Epoch[60] Time cost=4.458\n", + "INFO:root:Epoch[61] Train-loss=149.964152\n", + "INFO:root:Epoch[61] Time cost=4.828\n", + "INFO:root:Epoch[62] Train-loss=149.694102\n", + "INFO:root:Epoch[62] Time cost=5.012\n", + "INFO:root:Epoch[63] Train-loss=149.290113\n", + "INFO:root:Epoch[63] Time cost=5.193\n", + "INFO:root:Epoch[64] Train-loss=148.934186\n", + "INFO:root:Epoch[64] Time cost=4.999\n", + "INFO:root:Epoch[65] Train-loss=148.657502\n", + "INFO:root:Epoch[65] Time cost=4.810\n", + "INFO:root:Epoch[66] Train-loss=148.331948\n", + "INFO:root:Epoch[66] Time cost=5.201\n", + "INFO:root:Epoch[67] Train-loss=148.018539\n", + "INFO:root:Epoch[67] Time cost=4.833\n", + "INFO:root:Epoch[68] Train-loss=147.746825\n", + "INFO:root:Epoch[68] Time cost=5.187\n", + "INFO:root:Epoch[69] Train-loss=147.406399\n", + "INFO:root:Epoch[69] Time cost=5.355\n", + "INFO:root:Epoch[70] Train-loss=147.181831\n", + "INFO:root:Epoch[70] Time cost=4.989\n", + "INFO:root:Epoch[71] Train-loss=146.860770\n", + "INFO:root:Epoch[71] Time cost=4.934\n", + "INFO:root:Epoch[72] Train-loss=146.604369\n", + "INFO:root:Epoch[72] Time cost=5.283\n", + "INFO:root:Epoch[73] Train-loss=146.351628\n", + "INFO:root:Epoch[73] Time cost=5.062\n", + "INFO:root:Epoch[74] Train-loss=146.102506\n", + "INFO:root:Epoch[74] Time cost=4.540\n", + "INFO:root:Epoch[75] Train-loss=145.828805\n", + "INFO:root:Epoch[75] Time cost=4.875\n", + "INFO:root:Epoch[76] Train-loss=145.571626\n", + "INFO:root:Epoch[76] Time cost=4.856\n", + "INFO:root:Epoch[77] Train-loss=145.365383\n", + "INFO:root:Epoch[77] Time 
cost=5.003\n", + "INFO:root:Epoch[78] Train-loss=145.101047\n", + "INFO:root:Epoch[78] Time cost=4.718\n", + "INFO:root:Epoch[79] Train-loss=144.810765\n", + "INFO:root:Epoch[79] Time cost=5.127\n", + "INFO:root:Epoch[80] Train-loss=144.619876\n", + "INFO:root:Epoch[80] Time cost=4.737\n", + "INFO:root:Epoch[81] Train-loss=144.399066\n", + "INFO:root:Epoch[81] Time cost=4.742\n", + "INFO:root:Epoch[82] Train-loss=144.220090\n", + "INFO:root:Epoch[82] Time cost=4.810\n", + "INFO:root:Epoch[83] Train-loss=143.904279\n", + "INFO:root:Epoch[83] Time cost=5.176\n", + "INFO:root:Epoch[84] Train-loss=143.734935\n", + "INFO:root:Epoch[84] Time cost=4.921\n", + "INFO:root:Epoch[85] Train-loss=143.499403\n", + "INFO:root:Epoch[85] Time cost=4.692\n", + "INFO:root:Epoch[86] Train-loss=143.304287\n", + "INFO:root:Epoch[86] Time cost=4.778\n", + "INFO:root:Epoch[87] Train-loss=143.096145\n", + "INFO:root:Epoch[87] Time cost=4.962\n", + "INFO:root:Epoch[88] Train-loss=142.877920\n", + "INFO:root:Epoch[88] Time cost=4.815\n", + "INFO:root:Epoch[89] Train-loss=142.677429\n", + "INFO:root:Epoch[89] Time cost=5.127\n", + "INFO:root:Epoch[90] Train-loss=142.499622\n", + "INFO:root:Epoch[90] Time cost=5.463\n", + "INFO:root:Epoch[91] Train-loss=142.300291\n", + "INFO:root:Epoch[91] Time cost=4.639\n", + "INFO:root:Epoch[92] Train-loss=142.111362\n", + "INFO:root:Epoch[92] Time cost=5.064\n", + "INFO:root:Epoch[93] Train-loss=141.912848\n", + "INFO:root:Epoch[93] Time cost=4.894\n", + "INFO:root:Epoch[94] Train-loss=141.723130\n", + "INFO:root:Epoch[94] Time cost=4.635\n", + "INFO:root:Epoch[95] Train-loss=141.516580\n", + "INFO:root:Epoch[95] Time cost=5.063\n", + "INFO:root:Epoch[96] Train-loss=141.362380\n", + "INFO:root:Epoch[96] Time cost=4.785\n", + "INFO:root:Epoch[97] Train-loss=141.178878\n", + "INFO:root:Epoch[97] Time cost=4.699\n", + "INFO:root:Epoch[98] Train-loss=141.004168\n", + "INFO:root:Epoch[98] Time cost=4.959\n", + "INFO:root:Epoch[99] Train-loss=140.865592\n", + 
"INFO:root:Epoch[99] Time cost=5.155\n" + ] + } + ], + "source": [ + "# training the model, save training loss as a list.\n", + "training_loss=list()\n", + "\n", + "# initilize the parameters for training using Normal.\n", + "init = mx.init.Normal(0.01)\n", + "model.fit(nd_iter, # train data\n", + " initializer=init,\n", + " # if eval_data is supplied, test loss will also be reported\n", + " # eval_data = nd_iter_test,\n", + " optimizer='sgd', # use SGD to train\n", + " optimizer_params={'learning_rate':1e-3,'wd':1e-2}, \n", + " # save parameters for each epoch if model_prefix is supplied\n", + " epoch_end_callback = None if model_prefix==None else mx.callback.do_checkpoint(model_prefix, 1),\n", + " batch_end_callback = log_to_list(N/batch_size,training_loss), \n", + " num_epoch=100,\n", + " eval_metric = 'Loss')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:matplotlib.font_manager:findfont: Matching :family=sans-serif:style=normal:variant=normal:weight=normal:stretch=normal:size=12.0 to DejaVu Sans ('/usr/local/lib/python3.5/dist-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf') with score of 0.050000\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZAAAAEWCAYAAABIVsEJAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XmYXFWd//H3t6t676S7kw5kX4AAsgbMBMQNFQUZZ9AZxsFlwJVxfjjO4vxQ0BkUxVHHGZfRB0WG38AMi7tEh0cEFRjHYQmrbEIgW4ck3Ul3J7133brf3x/3VlPp1K0Kla6uTufzep56UnXurapz63bO957lnmPujoiIyEtVU+0MiIjIwUkBREREyqIAIiIiZVEAERGRsiiAiIhIWRRARESkLAogMqnM7Jtm9veTve9MYGaNZvYTM9ttZt+rwvdfbmbXHui+ZnammXVObu4S87HRzM6aiu+Sly5d7QzI9GFmG4EPuPud5X6Gu3+oEvvOEOcDhwNz3T2Y6i93989VYt+XwsyWAxuA2mr8BjK5VAOR/WZmh9QFRwWOdxnwTDkF56H228vBQQFEADCz/wCWAj8xswEzu9TMlpuZm9n7zWwz8Mt43++Z2fa4KeYeMzs+73P+3cw+Gz8/08w6zeyjZtZlZtvM7L1l7js3bv7ZY2YPmNlnzezXRY7nVWb2GzPrM7MtZvaeOP0uM/tA3n7vyf+c+HgvMbNngWfN7Goz+9KEz77VzP42fr7QzH5gZt1mtsHMPpKQn08D/wD8afz7vt/Maszsk2a2KT7mG8ysNd6/4G8/4TNzv9mleb/ZW83sXDN7xsx6zOzyvP0/ZWb/OeHzLzKzzWa208w+UWjfIr/x5fH7NprZu/LSf9/MHo7P1RYz+1Te2+6J/+2Lf4dXxO/5oJk9ZWb9ZvakmZ2a955VZvZY/Pf2HTNryPuut5jZI/F5/o2ZnZS37WNmtjX+zN+Z2RuKHY+Uwd310AN3B9gInJX3ejngwA1AM9AYp78PmAXUA18BHsl7z78Dn42fnwkEwJVALXAuMAS0l7HvLfGjCTgO2AL8OuE4lgH9wDviz5oLrIq33UXUTJfb9z35nxMf7x3AHKAReE38XRZvbweGgYVEF2APEgWGOuAI4Hng7IR8fQr4z7zX7wPWx+9rAX4I/Eex337C5+V+s3+Ij/ODQDdwU3x+jo/zumLi9+d9/rfj4zwZGAVeViivCd/7L/HfwGuBQeCYvO0nxr/PScAO4K0Tvjed93l/AmwFfg8w4ChgWd7f5P3x7z0HeAr4ULztFKALOA1IARfF+9cDx8TnbWHe9x5Z7f9jM+2hGojsj0+5+6C7DwO4+3Xu3u/uo0QFzcm5K+cCMsCV7p5x99uAAaL/3Pu9r5mlgD8GrnD3IXd/Eri+SH7fCdzp7jfHn7XL3R95Ccf7j+7eEx/vfxMVeK+Ot50P/K+7v0BU4M1z9yvdfczdnycqkC/Yz+95F/Av7v68uw8AlwEXTGiu2uu3LyADXOXuGaIA2wF8NT4/TwBPEgWHJJ9292F3fxR4tMS+E/29u4+6+93AfwFvB3D3u9z9t+4euvtjwM1EQSbJB4AvuvsDHlnv7pvytn/N3V9w9x7gJ8CqOP1i4Fvufp+7Z939eqIgeDqQJQokx5lZrbtvdPfnXsKxyX5QAJH9sSX3xMxSZvZ5M3vOzPYQXfFBVHAVssv3bvMfIrrafin7ziMa8LElb1v+84mWAAdSWIx/trs7UcH8jjjpncCN8fNlwMK4+aTPzPqAy4k6yvfHQiC/oNxEdJz57y92nBD9Ztn4eS7I7MjbPkzy7w2wPe95sXMzUa+7D+a93kR0PJjZaWb2q7hZbzfwIZL/PqD0+UrK4zLgoxN+/yVEtY71wF8TXeB0mdktZrZwP49N9pMCiORLmpo5P/2dwHnAWUArUdMARE0PldJN1GSyOC9tSZH9twBHJmwbJGoGy5lfYJ+Jv8PNwPl
mtoyoueQHed+zwd3b8h6z3P3cInnL9wJRIZizlOg48wPAdJ0uu93MmvNeLyU6Hoia0NYCS9y9FfgmL/59FDqeYuermC1Eta/837/J3W8GcPeb3P1VRL+xA18o4zukCAUQybeDqD2+mFlEzQS7iAriigz3zBdfYf8Q+JSZNZnZscCFRd5yI3CWmb3dzNJxB3yu2eMR4I/izzkKeP9+fP/DwE7gWuB2d++LN90P9MedtY1x7ewEM/u9/Ty0m4G/MbMVZtZC9Ft+xw+e4a2fNrM6M3s18BYgd2/LLKDH3UfMbA3RRUdONxCy99/ZtcDfmdnLLXJUHKxL+TbwobjGY2bWHHfgzzKzY8zs9WZWD4wQ1cTCAzxemUABRPL9I/DJuDng7xL2uYGouWIrUfv6vVOUtw8T1Xi2A/9BVPiOFtrR3TcTdcJ/FOghChq5tv0vA2NEwfJ6XmyOKuUmolrXTXnfkyUqOFcR3duQCzJJ/UETXRcfyz3x+0eAv9zP91bbdqCXqNZxI1HH9tPxtv8DXGlm/UQd/N/Nvcndh4CrgP+J/85Od/fvxWk3EQ1++DFRh3lR7r6OaODA1+O8rCcaFAFR/8fnic7JduAwoj4mmUS5kSUiBxUz+wIw390vqnZeRA5VqoHIQcHMjjWzk+KmijVETU8/qna+RA5lurtVDhaziJqtFhI1P/0zcGtVcyRyiFMTloiIlEVNWCIiUpYZ3YTV0dHhy5cvr3Y2REQOKg8++OBOd59Xar8ZHUCWL1/OunXrqp0NEZGDipltKr2XmrBERKRMCiAiIlIWBRARESmLAoiIiJRFAURERMqiACIiImVRABERkbLM6PtAREQOBkNjATv7x+gbHqOjpZ7DZzeQqonW4BocDdi2e4TB0YCxbMhoJmTPSIaewTH6hsYIQqcuXUNdqoa6dA2pGiNdY8ybVc/rj93fxTHLowAiIhIbC0I6e4foHwnIZEPGsiFhCKE7oTtmRo1ByozhTJau/lG69ozSP5IhlTJSZoQOA6MZBkYCRjIhqVRUoNeYMZLJMpzJMjSWZc9wht3xY2gsu1c+alPG/NYG9gwH7B7OlHUspyxtUwAREUni7uwaHGPTriG69oxgFhXWZjAahAyPRQX2SCbL8FiWoUyWgZGAPSOZ8SCRDZ0gdLbtHmZr7zBhGfPLNtamXgwyGC0NaVrq0zTU1hCETpCNtjXWpmisS9FYm2LJnCZOaKyltbGWuS11dLTU09ZYS/fAKFt6hnmhb5jWxloWtDWwsLWRWQ1p6tMp6tI1zG5M095UR1tTLemaGjLZkNEgJMg7nlwNppIUQESkYsaCkL6hMXqGxkiZ0VyfprkuTf9ohm27R3ihb5gg6zTXp2iuTzM0lmXzriE29wyxbfcw3QNj7OyPrvBDj2oC7kRX9DVGJhvuc/VeTLrGmNWQZlZDLbMa0lGTj0WftWpJO29btYhlc5tpb44K5nTKqE3VEJXFUYEcupONm40On93AvJZ66tLV7U5O1aRoqE1N+fcqgIjIPsLQ6R0ao3coM15oB2HISCa6qh8aC+jPu5Lfndcc0zs4xq7BMXYOjNI/Ut7y7rMb0ixqb6KjpY4jO5qZ3VhLjRmpuJzOhpANQ1I1NSyZ08jSOU0saG2Mt0VX+w210ZV+Q11N9G9titqUxg1NJgUQkRlmJJMlCJ3muhRm0VXz0FjAroExegbHxgv6gdGA0UyWkSBk93CGLT1DdPZGTSe7BsfIvoS2nOa6FLPj5pg5zXWcsKiVuc11zG2uo725jvamOkJ3BkcDBkYDWurTLGhrZGFrA3XpGgZGAwZHszTU1rBsTjOtTbWV+nlkEimAiEwz7k7P4BizGmrHm0bC0NnaN8z6rgH6hscYHI1qAcNjISNB1L7f2TvMs139bO4Zwh1qDJrr0wRZZzhTvJmnNmUsbGtkcXsjZx4zj3mz6pnXUk97cx2puAO4xqChNkVTXZrG2hSzG9PMbqilpSGtK/tDlAKISAXlgkH
v0BhjgZPJhgyOBWzri9r/dw6MjnfajmSyPNc9wLM7BugfjZp+OlrqmNNcx9beYQYT2vprU0Z9OsWC1gZOWNjK205ZRFNdiv6RqJkpVWN0tNQzt6WOOU11tDZFNYWokzdFQ20NDekUNVPQ6SoziwKISAm5kT4DIwFB3L6+s3+UzT1RZ2+und8MRjMhuwZH2TkwRnf/KF39I2SyyU1BsxvSpOOr91SNcURHM287NerI7R/JsGPPCDsHxjjjyA6OPnwWRx/ewtyWeprrX6wJTMVoG5FCFEDkkDQ4GjA4Go3TH8pEN2p19kbDOHcPZ8bb6l/oG2ZLz1Di1X9uVA+AA3WpGua21NPRUseKjmYOn93A4bPrmdNcR326htpU1KG7oK2RBa0NVRk5IzJZFEBkRtqxZ4SHNvXy9Pb+8fHxQ5ksG7oHWd89QHf/aMH31aaM1sba8eGmC9saOf2IuSyb20RbUy2pmmjYZ1tTbTzyp2G8BiFyqFEAkRlhcDTg1+t38qunu/j1+p109g4DUbNSbaqGdI1Rn65h2dxmXnv0PI6Y18zshtrxPoD5sxtY3N7EYbPq1Rcgsp8UQOSglMmGPNa5m/99bif/s34XD27qZSwb0lKf5pVHzeU9Zyzn5cvaOX5ha9Vv8hKZqRRAZNpzd17YPcKjW/p4ZEsfD2/u5bHO3YwGIQDHLZjNRWcs43XHHMbq5XMUMESmiAKITBuDowEPbOzh3ud72NI7xMBI1JG9adcQOweiPou6VA3HL5rNu09fxsuXtXP6EXOZ01xX5ZyLHJoUQKQqBkcDfvF0F09t28PmXUNs3DXI77b3E4RObcpY0t7ErIY0zfVpXnN0B6uWtHHS4jZetmAW9WmNXBKZDhRAZMqMBSG/eW4ntz7yAj97fDvDmSzpGmNxeyNL5zbzwdfM44wj5/LyZe001elPU2S60/9SqRh3p7N3mIe39HHnkzv41dNd9I8GzG5I89ZTFvG2UxZx6tI2DYMVOUgpgMikCkPnrme6uOm+LTy8uZddg2MAtDfVcs4J8zn7+Pm8amWHbqATmQEUQGS/BdmQrv5Rtu8ZYdfAGLsGRtkzkqEuVUNTXZqB0YAb79vEc92DzJ/dwOuOPYyTl7Rx8uJWjlswWzUNkRlGAUQS7R7KcNczXfziqS7WbexhR/9oySm+T1g0m69esIpzT1ygGVpFZjgFEBn34KZebvjfjWzbPcKOPdHcUNnQmdtcxxlHdbBsThML4zmcOlrqmdNSR2tjLWNByHAmSxg6i9sbx9egEJGZTQFExn3xZ0/z+NbdHL+wlZMWt3HeyQt57TGHsWpJW/EZX+unLo8iMn0ogBxCdg9n+PkT2/mv327jpMVt/O0bjx7ftnNglAc29vDh1x3F377pmCrmUkQOFgogM1AmG7Jh5yDP7Ohn066h8Rv1Ht7cx1g2pKG2ht+s38V7zlg+fhf3HU/uIHQ454QFVc69iBwsFECmscHRgK19w4xmQjJhyFgQsnNglK49o+wcGCWTDcmGkA1DeocydPWP0NU/ypaeob0WMepoqWfpnEbeffoy/uDkBTTVpTn7K/fww4c6+cCrjwDgZ49vZ9ncJl62YFa1DldEDjIKINPIWBDy40e28qOHtvJc9wBdCWtWQLR6XV2qJl6vGtqb65jXUs+x82dx9vHzOfrwFlYeNosj5jUXvKv71KVt3HT/Zt7/qhXsGQn4zXM7ed8rV6gDXET2mwJIFWzYOcj1v9nII1v6OOqwFo6dPwt3uO5/NrBt9wgrD2vhNUfPY0VHM0vmNNFYmyKdigLGnOY6Dp/dQFtj7QGtW3HBmqVc+v3HeGBjL529UY3lnBPmT+JRishMpwAyhe7f0MM3736OXz7dRW3KWLWkjbuf6eb7D3YCsGb5HD73Rydy5tHzKl4TeMtJC/jMT57klvs3MzAaMH92Aycvbqvod4rIzKIAMgUe6+zjSz9/hnue6aajpY6/esNK3nX6Ug6b1QBEI6B2D2c4cl7
LlOWpqS7NH65aOB683rFmqVbiE5GXpCoBxMz+CfgDYAx4Dnivu/fF2y4D3g9kgY+4++1x+jnAV4EUcK27f74aed8f7s4zOwb41e+6+OXTXdy/oYf2plouP/dY/uz05TTW7T0PVEdLPR0tU38zxTvWLOXG+zYDcPbxar4SkZemWjWQO4DL3D0wsy8AlwEfM7PjgAuA44GFwJ1mlrtZ4RvAG4FO4AEzW+vuT1Yh7/vYM5Lh1oe38tDmPp7fOciG7gH2jARAtFre/z37GC58xTJmNdRWOad7O2FRKycuauWFvmHWrJhT7eyIyEGmKgHE3X+e9/Je4Pz4+XnALe4+Cmwws/XAmnjbend/HsDMbon3rVoAcXce2dLHzfdv5iePbmM4k2VhawNHzGvhvFWLOH7hbM485jDmtzZUK4v75V/fcQr9I0HxO81FRAqYDn0g7wO+Ez9fRBRQcjrjNIAtE9JPK/RhZnYxcDHA0qVLJzWjADv2jPD9Bzv5wUOdPN89SFNdireespB3rlnGiYtbJ/37Km15R3O1syAiB6mKBRAzuxMo1LD+CXe/Nd7nE0AA3DhZ3+vu1wDXAKxevbr41LEvUVf/CGd/5R76hjKsWTGHP3/NEZx74oJp1zQlIjIVKhZA3P2sYtvN7D3AW4A3uHuuoN8KLMnbbXGcRpH0KfPZnz7F0GiWn/7lqzhh0cFX2xARmUxVWbAhHlF1KfCH7j6Ut2ktcIGZ1ZvZCmAlcD/wALDSzFaYWR1RR/vaqczzPc90s/bRF/iLM49U8BARoXp9IF8nmgT8jviGuXvd/UPu/oSZfZeoczwALnH3LICZfRi4nWgY73Xu/sRUZXYkk+WTP36cIzqa+Yszj5yqrxURmdaqNQrrqCLbrgKuKpB+G3BbJfOV5F9/+Sybe4a46YOnaS1vEZGY1hwtIQydb//3Bv7g5IWccWRHtbMjIjJtKICUMJaNplE/dr6mORcRyacAUkIQRgPEalO60U5EJJ8CSAlBNgQgXaOfSkQkn0rFEnIr+6kGIiKyNwWQEoIwroGk9FOJiORTqVhCENdA0ppsUERkLwogJWRyfSBqwhIR2YsCSAm5UVjqRBcR2ZtKxRICdaKLiBSkAFLCeCe6aiAiIntRqVhCbhiv+kBERPamAFJC7kbCWg3jFRHZi0rFEl7sRFcNREQknwJICS8O49VPJSKST6ViCRqFJSJSmAJICRqFJSJSmErFEjSZoohIYQogJWgyRRGRwlQqlpDRZIoiIgUpgJTwYie6fioRkXwqFUt4sQlLNRARkXwKICWMd6JrFJaIyF5UKpYQaD0QEZGCFEBKyE1lklInuojIXhRASlAnuohIYSoVSwjCEDPVQEREJlIAKSGTdXWgi4gUoJKxhCAbqgNdRKQABZASgtB1F7qISAEKICVksqE60EVEClDJWEKQdTVhiYgUoABSQiYMtRaIiEgBKhlLCLKutUBERApQACkhCEOtBSIiUoBKxhIyWY3CEhEpRAGkhECjsEREClLJWEIQahSWiEghCiAlZLKhpjIRESmgKiWjmX3GzB4zs0fM7OdmtjBONzP7mpmtj7efmveei8zs2fhx0VTlVfeBiIgUVq1L639y95PcfRXwU+Af4vQ3Ayvjx8XA1QBmNge4AjgNWANcYWbtU5HRTOiaiVdEpICqBBB335P3shnw+Pl5wA0euRdoM7MFwNnAHe7e4+69wB3AOVORV3Wii4gUlq7WF5vZVcCFwG7gdXHyImBL3m6dcVpSesVlNZmiiEhBFbu0NrM7zezxAo/zANz9E+6+BLgR+PAkfu/FZrbOzNZ1d3cf8OdpMkURkcIqVgNx97P2c9cbgduI+ji2Akvyti2O07YCZ05Ivyvhe68BrgFYvXq1F9rnpdAwXhGRwqo1Cmtl3svzgKfj52uBC+PRWKcDu919G3A78CYza487z98Up1VckHVNpigiUkC1+kA+b2bHACGwCfhQnH4bcC6wHhgC3gvg7j1m9hn
ggXi/K929ZyoyGjVhqQYiIjJRVQKIu/9xQroDlyRsuw64rpL5KkRNWCIihaltpoRMVuuBiIgUopKxBK0HIiJSmAJICVoPRESksJIlo5mdYGY35O6tMLPrzeykqchctbk7maxTqxsJRUT2UTSAxDf9/Yjonov3xY+7gR/kbgicybJhdBuJaiAiIvsqNQrrSuCN7r4xL+0xM/slcGv8mLGC8QCiGoiIyESlLq3TE4IHAHFabSUyNJ1ksiGA1gMRESmgVMkYmNnSiYlmtgwIKpOl6SPIqgYiIpKkVBPWFcCdZvY54ME4bTXwceBjlczYdJAJoxqIZuMVEdlX0QDi7j82sw3AR4G/jJOfBN7u7o9WOnPV9mINRE1YIiITlZzKJA4UF05BXqad8VFYqoGIiOyj1DDeDjO7wsw+YmYtZnZ1vKbHrWZ21FRlslrGO9FVAxER2UepkvEmoJ5ojfL7gQ3A+UTrmF9b2axVn4bxiogkK9WEdbi7X25mBmxy9y/G6U+bWcFZc2eSXA1EkymKiOyrVMmYhfFp1ndO2BZWJEfTSK4TXZMpiojsq1QN5AgzWwtY3nPi1ysqmrNpIMgN41UfiIjIPkoFkPz5rr40YdvE1zNOJlcD0SgsEZF9lLoP5O6kbWb2HaKJFWcs3QciIpLsQErGV0xaLqap8TvR1QciIrIPXVoXMd6JrlFYIiL7KNqEZWanJm3iEJiNN8iqBiIikqRUJ/o/F9n29GRmZDrKhBrGKyKSpFQn+uumKiPTUaAbCUVEEpWaC+vSvOd/MmHb5yqVqelC64GIiCQrdWl9Qd7zyyZsO2eS8zLt5EZhaTJFEZF9lSoZLeF5odczTq4GktKNhCIi+ygVQDzheaHXM05uNl4N4xUR2VepUVgnm9keotpGY/yc+HVDRXM2DWgYr4hIslKjsFJTlZHpSOuBiIgkU9tMEeMrEqoJS0RkHyoZiwiyTo1BjTrRRUT2oQBSRCYMNROviEgClY5FBFnXWiAiIgkUQIoIsqqBiIgkUelYRCZ0TaQoIpJAAaSIIBtqIkURkQQqHYsIsq57QEREEiiAFBE1YeknEhEpRKVjEVETlmogIiKFKIAUkcm6RmGJiCSoauloZh81Mzezjvi1mdnXzGy9mT2Wvya7mV1kZs/Gj4umIn9BGGoUlohIglKz8VaMmS0B3gRszkt+M7AyfpwGXA2cZmZzgCuA1UTTyD9oZmvdvbeSeQyyrrVAREQSVLMG8mXgUvZeV+Q84AaP3Au0mdkC4GzgDnfviYPGHUzBioiZbKiJFEVEElSldDSz84Ct7v7ohE2LgC15rzvjtKT0Qp99sZmtM7N13d3dB5TPbKhhvCIiSSrWhGVmdwLzC2z6BHA5UfPVpHP3a4BrAFavXn1AqyZmQqdJnegiIgVVLIC4+1mF0s3sRGAF8KiZASwGHjKzNcBWYEne7ovjtK3AmRPS75r0TE8QZENNpigikmDKL6/d/bfufpi7L3f35UTNUae6+3ZgLXBhPBrrdGC3u28DbgfeZGbtZtZOVHu5vdJ51Z3oIiLJqjYKK8FtwLnAemAIeC+Au/eY2WeAB+L9rnT3nkpnRuuBiIgkq3oAiWshuecOXJKw33XAdVOULUDrgYiIFKPL6yK0HoiISDKVjkVoPRARkWQKIEVoPRARkWQqHYvQKCwRkWQKIEVkwlDrgYiIJFDpWESQda0HIiKSQAEkgbsThFoPREQkiUrHBEEYTaOl+0BERApTAEkQZKMAohqIiEhhKh0TZMIQQH0gIiIJFEASZMdrIAogIiKFKIAkGK+BqAlLRKQglY4Jcn0g6kQXESlMASSBOtFFRIpT6Zgg14SlyRRFRApTAEkwXgPRZIoiIgWpdEyQyeY60VUDEREpRAEkwfid6AogIiIFKYAkCHI1EDVhiYgUpNIxQUY3EoqIFKUAkiAYH4Wln0hEpBCVjgleHIWlGoiISCEKIAlyo7BUAxERKUylY4LcKCz1gYiIFKYAkiC
jUVgiIkWpdEygPhARkeIUQBJk1YQlIlKUAkiCjIbxiogUpdIxgZqwRESKUwBJ8OJkivqJREQKUemYQJMpiogUpwCSQJMpiogUp9IxQW4yRdVAREQKUwBJEIQhqRrDTAFERKQQBZAEQdY1AktEpAgFkASZrOseEBGRIlRCJgjCUHehi4gUoQCSIJN1jcASESlCJWSCIBtqBJaISBFVCSBm9ikz22pmj8SPc/O2XWZm683sd2Z2dl76OXHaejP7eKXzGISuJiwRkSLSVfzuL7v7l/ITzOw44ALgeGAhcKeZHR1v/gbwRqATeMDM1rr7k5XKXCYbUqsmLBGRRNUMIIWcB9zi7qPABjNbD6yJt6139+cBzOyWeN+KBZAg66Q0jFdEJFE1L7E/bGaPmdl1ZtYepy0CtuTt0xmnJaXvw8wuNrN1Zrauu7u77MxFo7BUAxERSVKxEtLM7jSzxws8zgOuBo4EVgHbgH+erO9192vcfbW7r543b17ZnxOErk50EZEiKtaE5e5n7c9+ZvZt4Kfxy63AkrzNi+M0iqRXhO5EFxEprlqjsBbkvXwb8Hj8fC1wgZnVm9kKYCVwP/AAsNLMVphZHVFH+9pK5jGTVROWiEgx1epE/6KZrQIc2Aj8OYC7P2Fm3yXqHA+AS9w9C2BmHwZuB1LAde7+RCUzGIROQ60CiIhIkqoEEHf/syLbrgKuKpB+G3BbJfOVL8iGpOun2yA1EZHpQ5fYCaLJFNUHIiKSRAEkQRCGmgtLRKQIlZAJgqymMhERKUYBJEEmDLUeiIhIESohE+g+EBGR4hRAEmSyrvtARESKUAmZIAi1HoiISDEKIAkCrUgoIlKUSsgEGa1IKCJSlAJIAq1IKCJSnAJIAe5ONnRSasISEUmkErKAIHQAajWMV0QkkQJIAUE2CiAaxisikkwlZAGZMARQJ7qISBEKIAWM10DUhCUikkgBpIBUjfH7Jy5gxbyWamdFRGTa0opJBbQ21vKNd51a7WyIiExrqoGIiEhZFEBERKQsCiAiIlIWBRARESmLAoiIiJRFAURERMqiACIiImVRABERkbKYu1c7DxVjZt3ApgP4iA5g5yRl52BxKB4zHJrHfSgeMxyax/0mnpXeAAAFlUlEQVRSj3mZu88rtdOMDiAHyszWufvqaudjKh2KxwyH5nEfiscMh+ZxV+qY1YQlIiJlUQAREZGyKIAUd021M1AFh+Ixw6F53IfiMcOhedwVOWb1gYiISFlUAxERkbIogIiISFkUQAows3PM7Hdmtt7MPl7t/FSKmS0xs1+Z2ZNm9oSZ/VWcPsfM7jCzZ+N/26ud18lmZikze9jMfhq/XmFm98Xn/DtmVlftPE42M2szs++b2dNm9pSZvWKmn2sz+5v4b/txM7vZzBpm4rk2s+vMrMvMHs9LK3huLfK1+PgfM7OyV89TAJnAzFLAN4A3A8cB7zCz46qbq4oJgI+6+3HA6cAl8bF+HPiFu68EfhG/nmn+Cngq7/UXgC+7+1FAL/D+quSqsr4K/MzdjwVOJjr+GXuuzWwR8BFgtbufAKSAC5iZ5/rfgXMmpCWd2zcDK+PHxcDV5X6pAsi+1gDr3f15dx8DbgHOq3KeKsLdt7n7Q/HzfqICZRHR8V4f73Y98Nbq5LAyzGwx8PvAtfFrA14PfD/eZSYecyvwGuDfANx9zN37mOHnmmjZ7kYzSwNNwDZm4Ll293uAngnJSef2POAGj9wLtJnZgnK+VwFkX4uALXmvO+O0Gc3MlgOnAPcBh7v7tnjTduDwKmWrUr4CXAqE8eu5QJ+7B/HrmXjOVwDdwP+Lm+6uNbNmZvC5dvetwJeAzUSBYzfwIDP/XOckndtJK+MUQAQzawF+APy1u+/J3+bROO8ZM9bbzN4CdLn7g9XOyxRLA6cCV7v7KcAgE5qrZuC5bie62l4BLASa2beZ55BQqXOrALKvrcCSvNeL47QZycxqiYLHje7+wzh5R65KG//bVa38VcArgT8
0s41EzZOvJ+obaIubOWBmnvNOoNPd74tff58ooMzkc30WsMHdu909A/yQ6PzP9HOdk3RuJ62MUwDZ1wPAynikRh1Rp9vaKuepIuK2/38DnnL3f8nbtBa4KH5+EXDrVOetUtz9Mndf7O7Lic7tL939XcCvgPPj3WbUMQO4+3Zgi5kdEye9AXiSGXyuiZquTjezpvhvPXfMM/pc50k6t2uBC+PRWKcDu/Oaul4S3YlegJmdS9ROngKuc/erqpylijCzVwH/DfyWF/sDLifqB/kusJRoOvy3u/vEDrqDnpmdCfydu7/FzI4gqpHMAR4G3u3uo9XM32Qzs1VEAwfqgOeB9xJdRM7Yc21mnwb+lGjE4cPAB4ja+2fUuTazm4EziaZt3wFcAfyYAuc2DqZfJ2rOGwLe6+7ryvpeBRARESmHmrBERKQsCiAiIlIWBRARESmLAoiIiJRFAURERMqiACIyTZnZmbnZgkWmIwUQEREpiwKIyAEys3eb2f1m9oiZfStea2TAzL4cr0XxCzObF++7yszujddh+FHeGg1HmdmdZvaomT1kZkfGH9+St4bHjfFNYCLTggKIyAEws5cR3en8SndfBWSBdxFN3LfO3Y8H7ia6MxjgBuBj7n4S0QwAufQbgW+4+8nAGUSzx0I0Q/JfE61NcwTRXE4i00K69C4iUsQbgJcDD8SVg0aiSetC4DvxPv8J/DBek6PN3e+O068Hvmdms4BF7v4jAHcfAYg/735374xfPwIsB35d+cMSKU0BROTAGHC9u1+2V6LZ30/Yr9w5g/LnaMqi/7MyjagJS+TA/AI438wOg/F1qJcR/d/Kzfj6TuDX7r4b6DWzV8fpfwbcHa8G2Wlmb40/o97Mmqb0KETKoKsZkQPg7k+a2SeBn5tZDZABLiFasGlNvK2LqJ8Eomm1vxkHiNyMuBAFk2+Z2ZXxZ/zJFB6GSFk0G69IBZjZgLu3VDsfIpWkJiwRESmLaiAiIlIW1UBERKQsCiAiIlIWBRARESmLAoiIiJRFAURERMry/wGCuVFXyLXDyQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "ELBO = [-training_loss[i] for i in range(len(training_loss))]\n", + "plt.plot(ELBO)\n", + "plt.ylabel('ELBO');plt.xlabel('epoch');plt.title(\"training curve for mini batches\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As expected, the ELBO is monotonically increasing over epoch, and we reproduced the results given in the paper [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114/). Now we can extract/load the parameters and then feed the network forward to calculate $y$ which is the reconstructed image, and we can also calculate the ELBO for the test set. " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "arg_params = model.get_params()[0]\n", + "nd_iter_test.reset()\n", + "test_batch = nd_iter_test.next()\n", + "\n", + "# if saved the parameters, can load them using `load_checkpoint` method at e.g. 
100th epoch\n", + "# sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 100)\n", + "# assert sym.tojson() == output.tojson()\n", + "\n", + "e = y.bind(mx.cpu(), {'data': test_batch.data[0],\n", + " 'encoder_h_weight': arg_params['encoder_h_weight'],\n", + " 'encoder_h_bias': arg_params['encoder_h_bias'],\n", + " 'mu_weight': arg_params['mu_weight'],\n", + " 'mu_bias': arg_params['mu_bias'],\n", + " 'logvar_weight':arg_params['logvar_weight'],\n", + " 'logvar_bias':arg_params['logvar_bias'],\n", + " 'decoder_z_weight':arg_params['decoder_z_weight'],\n", + " 'decoder_z_bias':arg_params['decoder_z_bias'],\n", + " 'decoder_x_weight':arg_params['decoder_x_weight'],\n", + " 'decoder_x_bias':arg_params['decoder_x_bias'], \n", + " 'loss_label':label})\n", + "\n", + "x_fit = e.forward()\n", + "x_construction = x_fit[0].asnumpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsMAAADACAYAAADhh27FAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XmUXWWZ7/HfAxnIPJC6SQgJCRAwYQzUihG4t72SjggyZNnQAkKiMngXAi7TKtpqR1qQq40IF5cNNpNCqyBCEIIKNAi0QFPYQYaQhEwmUCEVMlUChCS894+zo4e8z0l21Rn32d/PWrVy6tnTu8956q03u/azXwshCAAAAMijPerdAAAAAKBeGAwDAAAgtxgMAwAAILcYDAMAACC3GAwDAAAgtxgMAwAAILcYDDcAM/s3M/tavdsBlMPMxppZMLMeJZa/ZGYfrnGzABf9LrKOPrdyjOcMS2a2qejbvpK2SNqefH9hCOGO2rcKeWZmyySdF0J4uN5tScvMxkpaKqlnCGFbfVuDRke/i0ZCn5tv7v8m8iaE0H/H6zQ/EGbWg8RD1pHHqCf6XeQNOdy4uE0iBTP7tpn9wsx+Zmadkj5lZreb2eyidaYmHfqO7/c1s3vMrMPMlprZRbvY/1/2tWM/ZvbVZNvXzexkM/u4mS0ys7Vm9uWibT9kZk+b2Xozazez68ysZ9Hyj5nZQjPbYGb/z8z+08xmFi0/z8xeMbN1ZvagmY2u0NuGKklyYV7ymf/BzA4vWnaZmS02s04ze9nMphctm5l8/teY2ZuSZiexJ83sX5IcWGpmHyvaZpCZ3ZTk1mvJz8KeybI9k+3WmNkSSSftpt3LzGxq8nq2md2V5H6nmb1gZgcleb/azFaY2bSibT9tZvOTdZeY2YU77fvLSRtfT3I6mNmBybLeSTv/bGZvmNm/mlmf8j4FVBv9LhoFfW7z97kMhtObLunfJQ2S9ItdrWhme0i6X
9KzkkZJ+ltJXzKz41Mea18VPpt9JP2zpJskfVLSJEkflnS5mY1J1t0m6VJJwyQdK+kESRcm7fgfku6U9KVk+VJJk4va+Ylk2amSWiQ9k5wjGpSZTZJ0swqf8d6SbpB0n5n1TlZZLOl/qpCn35J0u5mNLNrFByUtkTRc0hVFsQUq5Mh3Jd1kZpYsu1WFHDtQhfybJum8ZNn5kj6exFsl/V0XT+dkST+VNETSf0v6rQp5P0rS5cm57bA6OdZASZ+WdI2ZHZW8JydI+qKkqUk7P7zTca6SdJCkI5PloyR9s4ttRX3Q76Ku6HNz0ueGEPgq+pK0TNLUnWLflvQfO8VulzS76PupkpYlr4+VtGSn9b8h6ccljvmXfSX72SRpz+T7IZKCpKOL1n9e0sdL7OsfJN2VvP6MpCeKlpmkdkkzk+8fkjSjaHkPFe7bG1XvzyHvX14eJvEfSfrnnWILJP1Nif3Mk3Rq8nqmpD/vtHympFeLvu+b5NsIFTrvLZL6FC0/U9Kjyev/kPS5omXTkm177O6cJM2W9FDRspN3yvsByb4Gl9jXvZIuTV7fLOk7RcsOTLY9MMn5zZIOKFr+IUlL6/0Z87XrfKff5aveOZjE6XND8/e53DOc3oourLufpDFmtr4otqekx1JuvyaEsKOQ5O3k3zeKlr8tqb8kmdkHJF0t6WgVfqh6qHClQSpc4fhLu0MIwcxW7tTOH5rZtUWx91S4QvJayraitvaTNMPMLi6K9VLhs5aZnavC/9jHJsv6q3D1YQcvj1fteBFCeCu5QNFf0lBJPSW1//WihfYo2sf78kvS8i6ey8457eV9f0nrkz8j/pMKVxv2UCHXXyhqR1vRvorb1JKs+1zROZgKP49ofPS7qDf63Bz0uQyG09v5sRubVfjAdxhR9HqFpEUhhAlVb1XhzxpPS/r7EMImM/sHFf60IRWuRhTfA2Qq/LmiuJ3fCCHs8s+PaCgrJF0RQrhi5wVmtp+kH0s6XtJTIYTtZjZPhY5oh648PmaFClcphgW/6KNdUvG9jmOcdcqW/DnybknnSpoTQthqZvfqr+fVrsJAYofiNq1RoZM/JITAQCN76HdRb/S5OehzuWe4++ZJOsnMhiT3B11StOwpSe+a2Swz2yu56f0wMzu6Cu0YIGmDpM1mNkHJfWuJ+yUdZYVCkB4q3OPWUrT8XyX9Y7KdzGywmXX1HiRUT88kf3Z89VCh4/2cmX3QCvqZ2UlmNkBSPxU63g6pUAAh6dDuHjyE0C7pd5KuNrOBZraHmR1gZn+TrHKnpEusULQ0RNJlZZzrrvSS1FuF89qWXLGYVrT8TkmfNrMJZtZXhT+N7ziH91R4z65J7uWUmY0ys49Wqa2oLvpdVBN9bkHu+lwGw913q6T5KvyZ4jeSfr5jQfI/uhNVKJpYpsL/lG5Q4Ub0SpslaYakzuQYf7naEEJ4Q9LfS/q+pDclHaDCTfNbkuV3JcvuMrONkv4kqaETNmfmqvA/7B1fs0MIbSoUUVwvaZ2kV1W4B00hhJdV+NPtUyr8OewwSf9ZZhvOVaFjfDk53i8l7SgO+bEKBRjPS/qjpF+VeSxXCKFThUHPnUkbzpJ0X9HyByVdJ+lRFd6Pp5NFW5J/v7IjnuT5w5IOrkZbUXW3in4X1UOfq3z2uUy6kSNWeDzL65L+LoTwRL3bA1RDcsXtRUm9S/ypEagZ+l00u2boc7ky3OTM7ITkz3C9VfhTxlZJ/1XnZgEVZWbTrfBsyyGS/q+kX2e1U0b20e+i2TVbn8tguPkdp8IzDjtU+FPc9BDCll1vAmTOhSo8F3OxClP6/p/6Ngc5R7+LZtdUfS63SQAAACC3uDIMAACA3CrrOcNWmJLvWhUepvxvIYSrdrX+sGHDwtixY8s5JKDnnntuTQihZfdrVg65i3ItW7ZMa9assd2vWTnkLSqBPhdZlTZ3uz0YTipkf6jC/O8rJT1rZvcljxpxjR07Vm1tbaUWA6mYWVdn3SkbuYtyt
ba21vyY5C0qgT4XWZU2d8u5TWKyCvNrLwkhvKvC8x5PLWN/AAAAQE2VMxgepffPR71S759yUpJkZheYWZuZtXV0dJRxOKC2yF1kEXmLrCJ3US9VL6ALIdwYQmgNIbS2tNT0liOgLOQusoi8RVaRu6iXcgbDr0kaXfT9vkkMAAAAyIRyBsPPShpvZuPMrJekT6po7moAAACg0XX7aRIhhG1m9nlJv1Xh0Wo3hxBeqljLAAAAgCor6znDIYS5kuZWqC0AAABATTEDHQAAAHKrrCvDAAAA+KsQQr2b8D5mNZ34MpO4MgwAAIDcYjAMAACA3GIwDAAAgNxiMAwAAIDcYjAMAACA3OJpEgAAAN1QjSdH8PSH2uPKMAAAAHKLwTAAAAByi8EwAAAAcovBMAAAAHKLAjoAAIBdqOUUy2mPRaFd5XBlGAAAALnFYBgAAAC5xWAYAAAAucVgGAAAALlVVgGdmS2T1Clpu6RtIYTWSjQKQP7UskCl0ihkAbIpbb9Taj0vvn379lSxUrz+pEePeLjmrefFSrX9vffeS7XuHnvE1029WFf6wUbrMyvxNIn/HUJYU4H9AAAAADXFbRIAAADIrXIHw0HS78zsOTO7wFvBzC4wszYza+vo6CjzcEDtkLvIIvIWWUXuol7KHQwfF0I4StLHJF1kZv9r5xVCCDeGEFpDCK0tLS1lHg6oHXIXWUTeIqvIXdRLWfcMhxBeS/5dbWb3SJos6fFKNAxoduUUjHWlmMMrkvBUo6DBa081iizqXbhR7nki5r2nb775ZhSbM2eOu/0ll1wSxd56661Ux+7Xr18Uu+6669x1zz333CjmFTuh/rycShsrVQC3devWKLZhw4ZUsVL9eO/evVPFevXqFcW8fqfU7wAv3rNnzyjWt2/fVMf2iupKtanRdPvKsJn1M7MBO15LmibpxUo1DAAAAKi2cv77OlzSPcmIv4ekfw8h/KYirQIAAABqoNuD4RDCEklHVLAtAAAAQE3xaDUAAADkVm7u8n/66aej2LXXXuuuO2rUqCjWp0+fKDZjxowoNnToUHefpeLIB69QwYt5RRpbtmyJYp2dne5xNm3alGr7rhSIpG3TO++8E8XefffdKLbnnnu6x/EKMrxijr322iuKjRgxIooNGzbMPY63z7QzOaE6lixZEsW+/e1vR7HbbrutrOOUKvDZ2dtvvx3Fzj//fHfdhQsXRrErr7yy28dGZaQtjEvbN3v9myStWLEiij377LNRbOXKlVHM6/MkaeDAgVFs7733jmLeuMIr3vT6Ycnv271+c/To0VFs8ODBUSzLOZ7dlgMAAABlYjAMAACA3GIwDAAAgNxiMAwAAIDcYjAMAACA3MrN0yS8Jz8sWrSorH1eccUVUWzQoEHuulOmTCnrWLUwduzYKPbVr37VXXfMmDFVbk02lZr2ctu2bVHMm8Zz3bp1UeyVV16JYk8++aR7nD//+c9RbOPGjamOU2qq2s2bN6eKlTPVreRXUHtPfvB+xiZPnhzFZs6c6R7Hy12mz62NUtPPfulLX4pi9957b8WP7z3JxIuVqr73fO9734tixxxzTBQ75ZRTUu8T6XVlavq0T5Pwnpbj9a2SdOedd0axF154IYp5fZn3O1fyn9Tg9ZvePr0noZRqu/d7YMiQIVHMe+qF155S/WgWnszDlWEAAADkFoNhAAAA5BaDYQAAAOQWg2EAAADkVm6qRrxijHnz5rnrHnLIIVHspZdeimLPPPNMFJszZ467z9/+9rdRbNy4cVFs6dKl7vZpeTewjxw5Mop5U0h6St3g/5WvfKVL7cqLUsUc3rSXXsGZV+jwu9/9Loo9+uij7nHa29tTHSftVNCleNt75967d+8o5k2nLElr166NYhs2bIhiXju9IsHjjjvOPY433ToFdJXn5cMNN9zgrpu2WK5Pnz5R7NBDD3XXnTVrVhT7yEc+EsW8aW4vu+yyKOYVypVyxx13RLGPfvSjUcz7+UD1eP2WV9zsF
ZbNnTvX3ef9998fxbwitkmTJkWxAw880N2nl9NeUZ1X2Ob1owsWLHCP4z1EwOufW1paotg+++yTatus4MowAAAAcovBMAAAAHKLwTAAAAByi8EwAAAAcmu3VSNmdrOkj0taHUI4NIkNlfQLSWMlLZN0RgghvuO8gUyYMCFVrJTDDz88ip155plR7KqrrnK3X7ZsWRTzCuiWLFmSuk0e74Z6r4DOO3ZHR0cU+8AHPlBWe1CaV8yxxx7x/0+9Ipthw4a5+/SKlrziMK8YwyuSkPwCI2/WLm+WIW+2OG+GI8kvaH3ggQeimFcgkrZIsFqyMMNSrXnv/0UXXZR6ey/HvvnNb0axL3/5y11rWAqzZ8+OYt5MY5K0fPnyKPbLX/4yinkz0J199tldbxxSSTvbnDfboPf7+vHHH3eP4xU9jx49OopNnDgxinkzZ0p+/963b98o5vU7XSmEXr16dRTzZt8bMWJEFPMKAgcMGOAex/u91mh9Zporw7dKOmGn2GWSHgkhjJf0SPI9AAAAkCm7HQyHEB6XtPOlmFMl3Za8vk3SaRVuFwAAAFB13b1neHgIYccDTVdJGl5qRTO7wMzazKzN+zM80KjIXWQReYusIndRL2UX0IXCjTn+TAOF5TeGEFpDCK2l7kkEGhG5iywib5FV5C7qpbvTLr1hZiNDCO1mNlJSfBd2TpWagSVtIVpXivrS8mbKW7NmTRT74Ac/GMWmTZtW8fbkkVdA4BVEeLOjHX/88anWk6TNmzdHMa8Yw8vHUr98vHam5RXvebMzSdLWrVuj2G9+85so5r2Xw4fHf5zyij6k9MV/qK/LL788ilWjWM7j9eN33323u25ra2uqfV555ZVR7NRTT41i/fv3T7U/7Fran2mv3/EK2UvNDtuzZ88otv/++0exD33oQ1HM67ckPwe8fittsZxXFCdJ69evj2LerJ/eeKGzszOKlfod4v0e8Aoc69kPd/fK8H2SZiSvZ0jy5yAGAAAAGthuB8Nm9jNJT0k62MxWmtlnJV0l6W/NbJGkqcn3AAAAQKbs9jaJEEL8MN2C+G+3AAAAQIYwAx0AAAByi8EwAAAAcqu7T5NAg/KeJjB9+vQo5k1L+YMf/CCK9enTpzINywnvSQeSXwnsTZ3tTX3sPc1hzJgx7nG86uKBAwdGMW/aTK/iV0pf4Zt2+tNSlc0rV66MYl4+e+/HUUcdFcW8acgl/zx5mkTlvfjii6nX9abovvjiiyvZnC7xcvSss84qa5+vvPJKFPvud78bxbynaKB6vD7Km/J927Zt7vZeP+7lsxfr3bu3u89Sv0fStGnjxo1RzHtqhCStW7cuinlT23uxd955J4qVerqF97vBi3lq1TdzZRgAAAC5xWAYAAAAucVgGAAAALnFYBgAAAC5RQFdk7n11luj2KpVq6KYV6i13377VaNJuVLqZv+0BRFeoZ0XK1XY6BUleAVj5U5JnLb4wSuoKDWt6R/+8Ico9u6770axww8/PIqdeOKJUWzw4MHucSiWq42XX3459br1LGr0iotmzpwZxRYuXFjxY3v99ezZs9110/YhedOVPPHW9fpCr6jOW0/y+7hNmzZFMe/3sFfcLPn9u9cmb5rk+fPnR7H29nb3OB0dHVHM69u9Yjmvby5VZOi9R+X+fFe6f+CnCwAAALnFYBgAAAC5xWAYAAAAucVgGAAAALlFAV1GLV682I1/8YtfTLX9U089FcVGjBhRVptQmnezf9qCGG+9UsUcaY9dDV6Bx4YNG6LYnDlz3O2XLFkSxbwCE69Y7uCDD45iPXv2dI+D2mhtbU29rlfI85Of/CSKfe5zn0u9T68QyCs4+vrXvx7Ffv3rX6c+TjnOOOOMKEaBZ/V4763XTwwbNiyKlSp282are+mll6LYgw8+GMVeffVVd5977bVXFNu6dWsU6+zsjGLez9Ly5cvd43jFo9579Pbbb0cxb3bQUrOLejPteb/XvBgz0AEAAABVxmAYAAAAucVgG
AAAALnFYBgAAAC5tdsCOjO7WdLHJa0OIRyaxGZLOl/Sjju1vxZCmFutRiJWqsDDu8n+9NNPj2L7779/xduErimnMKBUAV3ameEqva3kzz70wgsvRLHHHnvM3d6bpWjy5MlR7OSTT45igwYNimKlChQpUKqNcvuYW265JYqNHz8+ih1wwAHu9tdff30Uu+aaa8pqUzm8HD3nnHOiGPlZPV6f0KtXryh26KGHpopJfgGdV6jp9XsLFixw9zlgwIAo5hWh9e3bN4p551NqZjhvZjkv/7y+2Suq88Yfkl9cXe7vm0pLc2X4VkknOPFrQghHJl8MhAEAAJA5ux0MhxAelxT/1wcAAADIuHLuGf68mf3JzG42syGlVjKzC8yszczavOffAY2K3EUWkbfIKnIX9dLdwfCPJB0g6UhJ7ZKuLrViCOHGEEJrCKG1paWlm4cDao/cRRaRt8gqchf10q0Z6EIIb+x4bWY/lnR/xVqEiHdT+j333OOu691k/53vfCeKdWUGM5TPK0rwCgjSFs+UKj5Ie5y025ba3iuIWLVqVRS7/fbbo9iKFSvc44wcOTKKfeITn4hi48aNi2LeLFIUItWXV6x09dX+dZNZs2ZFsba2tig2bdq08hvWTUcccYQbf/7551NtP3Xq1NT7RPnSzvrZo0c8DBo7dmwUO+2009zjeL9zX3nllSjmFat5hWml2uT952D06NGp2vPmm2+6x0k745s3I16fPn2iWJbHFd26Mmxmxb+1pkt6sTLNAQAAAGonzaPVfibpw5KGmdlKSf8k6cNmdqSkIGmZpAur2EYAAACgKnY7GA4hnOmEb6pCWwAAAICaYgY6AAAA5BaDYQAAAORWt54mgdq66ab4rpQnnnjCXfess86KYky93JjKedpBV578UM56pdZ96623otgdd9wRxR5++OHUxzn++OOjmFeB701BmrYqGrXjvf+XXHKJu6431e3Pf/7zVDFvWthSxx8+fHgU86b87spTUA455BA3vrNLL7001XqoHi8nvCcgDBw4MIpNmTLF3aeXU4sWLYpi3hMdSvVRw4YNi2LeU3S89To7O6PY0qVL3eP069cvinnvx5Ah8VQS3nvkPdWn1D4brX/myjAAAAByi8EwAAAAcovBMAAAAHKLwTAAAAByiwK6BjNv3rwodvHFF0exwYMHu9tffvnlFW8Tmk9Xpm3etm1bFHvuueeimFdAt379+ih29NFHu8c555xzopg3BWkWijHg8wodJb9Q0ot94xvfiGKvvfaau09vSluvWC6tMWPGuPEDDzwwim3YsCGKHXbYYd0+NqrHy0mvEGzQoEHu9hMnToxi3nTOW7ZsiWLetPaSP/1x2sJhbz2vAE7yC+i86Zy9ftgroPO2LdXORit65sowAAAAcovBMAAAAHKLwTAAAAByi8EwAAAAcosCujryZk4688wzo9j27duj2Nlnn+3uk9nm8i1tAYJXLFeqmGPVqlVR7Prrr49i3ixHXiHI9OnT3eMcdNBBUcwrgvLOkQK6fNhvv/1SxaqhVIGp1z97RUxewREak9efeH2RlL4Ar1T/mvb43nHS7tMr3it1HC9PvVifPn2iWLkz0NWzb+fKMAAAAHKLwTAAAAByi8EwAAAAcovBMAAAAHJrtwV0ZjZa0k8kDZcUJN0YQrjWzIZK+oWksZKWSTojhLCuek3NNu9G95NOOimKLViwIIpNmDAhin3rW9+qTMOQS14xkFfQKUl33XVXFPv9738fxbZu3RrFvNnmTjnlFPc43uxF5RQEUlSHSlq8eLEb9wpHb7jhhmo3BzVWqj9JO5NaVwro0vL6vc2bN0exdev8oZlX8Ja2WC5tcXOpeKP1z2muDG+TNCuEMFHSFEkXmdlESZdJeiSEMF7SI8n3AAAAQGbsdjAcQmgPIfwxed0pab6kUZJOlXRbstptkk6rViMBAACAaujSPcNmNlbSJEnPSBoeQmhPFq1S4TYKb5sLzKzNzNo6OjrKaCpQW
+Qusoi8RVaRu6iX1INhM+sv6W5JXwghbCxeFgo3rrhPJA8h3BhCaA0htLa0tJTVWKCWyF1kEXmLrCJ3US+pZqAzs54qDITvCCH8Kgm/YWYjQwjtZjZS0upqNbIZrF27Noo99thjqbb96U9/GsWGDh1abpOQE16RhVfsNn/+fHf7uXPnRrFNmzZFscGDB0exT33qU1Fsn332cY+TdpYioB66UhT3wAMPRLHzzjuvks1Bg0jbR3mFdl3hFeB5sXfeeSeKlWqjV7Ts9cOetIWDu4p3d71q2O2nY4XW3SRpfgjh+0WL7pM0I3k9Q9KcyjcPAAAAqJ40V4aPlXSOpBfMbF4S+5qkqyTdaWaflbRc0hnVaSIAAABQHbsdDIcQnpRU6tr18ZVtDgAAAFA7zEAHAACA3GIwDAAAgNxK9TQJpLdhwwY3PmXKlFTb33777VFs0qRJZbUJ+eE9OWL79u1RzMvTxx9/3N3nihUrolj//v2j2LRp06KYN/Vyr1693ONUupLYey9K4akVqKQnn3wyiq1fvz6KeU9gQfaV25+UM7289zSIUk+I8J5G4U2z7D05oiv9aznnUytcGQYAAEBuMRgGAABAbjEYBgAAQG4xGAYAAEBuUUBXYbfccosbX7JkSartjzvuuCjWaDeao/5KFS94BRFbtmyJYsuXL49iCxcudPfpTdl52GGHRbHTTz89ig0ZMiSKlTstaTWKMbJQ4IHsWLt2bRTzfr4mT55ci+agAZTbn3h9u9dvbd26NYqV6nP33nvvKDZw4MAo5hXgecfuSlFdo+HKMAAAAHKLwTAAAAByi8EwAAAAcovBMAAAAHKLAroyLFq0KIrNnj279g1BJqUtNuhKoYJXPOHNfPX8889HsXXr1rn79IrgjjjiiCh28MEHR7GuzFzkFZhQ2AYA6fXs2TOKjRkzxl1348aNUaylpSWKjRs3Lop5hdWlCvWy0GdzZRgAAAC5xWAYAAAAucVgGAAAALnFYBgAAAC5tdsCOjMbLeknkoZLCpJuDCFca2azJZ0vqSNZ9WshhLnVamgjeuKJJ6KYd0N6KRMmTIhiffr0KatNyI60BWNebNu2be4+N2/eHMVWrFgRxRYvXhzFNm3a5O6zb9++Ucybucgrnkg7a1K9ZaHAA/U1ffp0N37PPfdEsbfeeiuK7bvvvhVvE/LD66N69IiHcMOHD49iU6dOdfd51FFHRbH+/ftHsdGjR0exAQMGRDFvpjrJb3uj9blpniaxTdKsEMIfzWyApOfM7KFk2TUhhH+pXvMAAACA6tntYDiE0C6pPXndaWbzJY2qdsMAAACAauvSPcNmNlbSJEnPJKHPm9mfzOxmM4sfRlrY5gIzazOzto6ODm8VoCGRu8gi8hZZRe6iXlIPhs2sv6S7JX0hhLBR0o8kHSDpSBWuHF/tbRdCuDGE0BpCaPUe5gw0KnIXWUTeIqvIXdRLqsGwmfVUYSB8RwjhV5IUQngjhLA9hPCepB9Lmly9ZgIAAACVl+ZpEibpJknzQwjfL4qPTO4nlqTpkl6sThObwzHHHBPFHnrooSjG0ySwM6/qttS0l95UnN50ygcddFDq4/fr1y+KHXvssVFs8ODBUaxUOz1pp14ud4rmRqtiRjaUqsh//fXXa9wS5JHXb3lPb/Ce/rP//vu7+/SmWfaeUOEdx4uV6luz0OemeZrEsZLOkfSCmc1LYl+TdKaZHanC49aWSbqwKi0EAAAAqiTN0ySelOQN63P1TGEAAAA0H2agAwAAQG4xGAYAAEBupblnGCV85jOfSRUD0ko7bWWpggRvikyvWG78+PFRzJs6WUo/fXK5bS93ewBoVmkL6EpNiVzpYzcbrgwDAAAgtxgMAwAAILcYDAMAACC3GAwDAAAgtyxtcUxFDmbWIWl58u0wSWtqdvDq43xqZ78QQk0nri/K3UZ+X7qD86mdeuat1NjvTXdwPrVD7lYW51M7qXK3poPh9x3YrC2E0FqXg1cB55MPzfa+c
D750WzvDeeTH8323nA+jYfbJAAAAJBbDIYBAACQW/UcDN9Yx2NXA+eTD832vnA++dFs7w3nkx/N9t5wPg2mbvcMAwAAAPXGbRIAAADILQbDAAAAyK2aD4Zu3yMbAAACn0lEQVTN7AQzW2Bmr5rZZbU+frnM7GYzW21mLxbFhprZQ2a2KPl3SD3b2BVmNtrMHjWzl83sJTO7NIln9pyqhdxtLORuOlnPW6m5cpe8TS/rudtMeSs1d+7WdDBsZntK+qGkj0maKOlMM5tYyzZUwK2STtgpdpmkR0II4yU9knyfFdskzQohTJQ0RdJFyWeS5XOqOHK3IZG7u9EkeSs1V+6Styk0Se7equbJW6mJc7fWV4YnS3o1hLAkhPCupJ9LOrXGbShLCOFxSWt3Cp8q6bbk9W2STqtpo8oQQmgPIfwxed0pab6kUcrwOVUJudtgyN1UMp+3UnPlLnmbWuZzt5nyVmru3K31YHiUpBVF369MYlk3PITQnrxeJWl4PRvTXWY2VtIkSc+oSc6pgsjdBkbultSseSs1wedM3u5Ss+ZuU3zOzZa7FNBVWCg8qy5zz6szs/6S7pb0hRDCxuJlWT0ndE1WP2dyF1n8nMlbZPVzbsbcrfVg+DVJo4u+3zeJZd0bZjZSkpJ/V9e5PV1iZj1VSOw7Qgi/SsKZPqcqIHcbELm7W82at1KGP2fyNpVmzd1Mf87Nmru1Hgw/K2m8mY0zs16SPinpvhq3oRrukzQjeT1D0pw6tqVLzMwk3SRpfgjh+0WLMntOVULuNhhyN5VmzVspo58zeZtas+ZuZj/nps7dEEJNvySdKGmhpMWS/rHWx69A+38mqV3SVhXuYfqspL1VqKBcJOlhSUPr3c4unM9xKvxJ40+S5iVfJ2b5nKr4XpG7DfRF7qZ+nzKdt8k5NE3ukrddeq8ynbvNlLfJ+TRt7jIdMwAAAHKLAjoAAADkFoNhAAAA5BaDYQAAAOQWg2EAAADkFoNhAAAA5BaDYQAAAOQWg2EAAADk1v8HLgAFn4S3bUIAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# learning images on the test set\n", + "f, ((ax1, ax2, ax3, ax4)) = plt.subplots(1,4, sharex='col', sharey='row',figsize=(12,3))\n", + "ax1.imshow(np.reshape(image_test[0,:],(28,28)), interpolation='nearest', cmap=cm.Greys)\n", + "ax1.set_title('True image')\n", + "ax2.imshow(np.reshape(x_construction[0,:],(28,28)), interpolation='nearest', cmap=cm.Greys)\n", + "ax2.set_title('Learned image')\n", + "ax3.imshow(np.reshape(image_test[99,:],(28,28)), interpolation='nearest', cmap=cm.Greys)\n", + "ax3.set_title('True image')\n", + "ax4.imshow(np.reshape(x_construction[99,:],(28,28)), interpolation='nearest', cmap=cm.Greys)\n", + "ax4.set_title('Learned image')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('loss', 140.17346005859375)]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# calculate the ELBO which is minus the loss for test set\n", + "metric = mx.metric.Loss()\n", + "model.score(nd_iter_test, metric)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. All together: MXNet-based class VAE" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from VAE import VAE" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One can directly call the class `VAE` to do the training:\n", + "\n", + "```VAE(n_latent=5,num_hidden_ecoder=400,num_hidden_decoder=400,x_train=None,x_valid=None,\n", + "batch_size=100,learning_rate=0.001,weight_decay=0.01,num_epoch=100,optimizer='sgd',model_prefix=None,\n", + "initializer = mx.init.Normal(0.01),likelihood=Bernoulli)```\n", + "\n", + "The outputs are the learned model and training loss." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Epoch[0] Train-loss=383.478870\n", + "INFO:root:Epoch[0] Time cost=5.075\n", + "INFO:root:Epoch[1] Train-loss=211.923867\n", + "INFO:root:Epoch[1] Time cost=4.741\n", + "INFO:root:Epoch[2] Train-loss=206.789445\n", + "INFO:root:Epoch[2] Time cost=4.601\n", + "INFO:root:Epoch[3] Train-loss=204.428186\n", + "INFO:root:Epoch[3] Time cost=4.865\n", + "INFO:root:Epoch[4] Train-loss=202.417322\n", + "INFO:root:Epoch[4] Time cost=4.606\n", + "INFO:root:Epoch[5] Train-loss=200.635136\n", + "INFO:root:Epoch[5] Time cost=4.711\n", + "INFO:root:Epoch[6] Train-loss=199.009614\n", + "INFO:root:Epoch[6] Time cost=5.159\n", + "INFO:root:Epoch[7] Train-loss=197.565788\n", + "INFO:root:Epoch[7] Time cost=4.588\n", + "INFO:root:Epoch[8] Train-loss=196.524507\n", + "INFO:root:Epoch[8] Time cost=4.905\n", + "INFO:root:Epoch[9] Train-loss=195.725745\n", + "INFO:root:Epoch[9] Time cost=4.426\n", + "INFO:root:Epoch[10] Train-loss=194.902025\n", + "INFO:root:Epoch[10] Time cost=4.685\n", + "INFO:root:Epoch[11] Train-loss=194.026873\n", + "INFO:root:Epoch[11] Time cost=4.622\n", + "INFO:root:Epoch[12] Train-loss=193.350646\n", + "INFO:root:Epoch[12] Time cost=4.712\n", + "INFO:root:Epoch[13] Train-loss=192.737502\n", + "INFO:root:Epoch[13] Time cost=4.618\n", + "INFO:root:Epoch[14] Train-loss=192.338165\n", + "INFO:root:Epoch[14] Time cost=4.763\n", + "INFO:root:Epoch[15] Train-loss=191.888625\n", + "INFO:root:Epoch[15] Time cost=5.168\n", + "INFO:root:Epoch[16] Train-loss=191.170650\n", + "INFO:root:Epoch[16] Time cost=4.809\n", + "INFO:root:Epoch[17] Train-loss=190.307264\n", + "INFO:root:Epoch[17] Time cost=4.622\n", + "INFO:root:Epoch[18] Train-loss=188.988063\n", + "INFO:root:Epoch[18] Time cost=4.543\n", + "INFO:root:Epoch[19] Train-loss=187.616311\n", + "INFO:root:Epoch[19] Time cost=5.154\n", + "INFO:root:Epoch[20] 
Train-loss=186.352783\n", + "INFO:root:Epoch[20] Time cost=4.661\n", + "INFO:root:Epoch[21] Train-loss=185.428020\n", + "INFO:root:Epoch[21] Time cost=5.193\n", + "INFO:root:Epoch[22] Train-loss=184.543097\n", + "INFO:root:Epoch[22] Time cost=4.519\n", + "INFO:root:Epoch[23] Train-loss=184.029907\n", + "INFO:root:Epoch[23] Time cost=4.732\n", + "INFO:root:Epoch[24] Train-loss=183.643270\n", + "INFO:root:Epoch[24] Time cost=5.011\n", + "INFO:root:Epoch[25] Train-loss=183.246912\n", + "INFO:root:Epoch[25] Time cost=4.706\n", + "INFO:root:Epoch[26] Train-loss=183.065233\n", + "INFO:root:Epoch[26] Time cost=4.673\n", + "INFO:root:Epoch[27] Train-loss=182.680542\n", + "INFO:root:Epoch[27] Time cost=4.628\n", + "INFO:root:Epoch[28] Train-loss=182.428677\n", + "INFO:root:Epoch[28] Time cost=4.772\n", + "INFO:root:Epoch[29] Train-loss=182.219946\n", + "INFO:root:Epoch[29] Time cost=4.571\n", + "INFO:root:Epoch[30] Train-loss=182.070927\n", + "INFO:root:Epoch[30] Time cost=4.603\n", + "INFO:root:Epoch[31] Train-loss=181.837968\n", + "INFO:root:Epoch[31] Time cost=4.559\n", + "INFO:root:Epoch[32] Train-loss=181.624303\n", + "INFO:root:Epoch[32] Time cost=5.069\n", + "INFO:root:Epoch[33] Train-loss=181.534547\n", + "INFO:root:Epoch[33] Time cost=4.654\n", + "INFO:root:Epoch[34] Train-loss=181.239556\n", + "INFO:root:Epoch[34] Time cost=4.776\n", + "INFO:root:Epoch[35] Train-loss=181.098188\n", + "INFO:root:Epoch[35] Time cost=4.571\n", + "INFO:root:Epoch[36] Train-loss=180.820560\n", + "INFO:root:Epoch[36] Time cost=4.815\n", + "INFO:root:Epoch[37] Train-loss=180.828095\n", + "INFO:root:Epoch[37] Time cost=4.455\n", + "INFO:root:Epoch[38] Train-loss=180.495569\n", + "INFO:root:Epoch[38] Time cost=5.096\n", + "INFO:root:Epoch[39] Train-loss=180.389106\n", + "INFO:root:Epoch[39] Time cost=4.797\n", + "INFO:root:Epoch[40] Train-loss=180.200965\n", + "INFO:root:Epoch[40] Time cost=5.054\n", + "INFO:root:Epoch[41] Train-loss=179.851014\n", + "INFO:root:Epoch[41] Time 
cost=4.642\n", + "INFO:root:Epoch[42] Train-loss=179.719933\n", + "INFO:root:Epoch[42] Time cost=4.603\n", + "INFO:root:Epoch[43] Train-loss=179.431740\n", + "INFO:root:Epoch[43] Time cost=4.341\n", + "INFO:root:Epoch[44] Train-loss=179.235384\n", + "INFO:root:Epoch[44] Time cost=4.638\n", + "INFO:root:Epoch[45] Train-loss=179.108771\n", + "INFO:root:Epoch[45] Time cost=4.754\n", + "INFO:root:Epoch[46] Train-loss=178.714163\n", + "INFO:root:Epoch[46] Time cost=4.457\n", + "INFO:root:Epoch[47] Train-loss=178.508338\n", + "INFO:root:Epoch[47] Time cost=4.960\n", + "INFO:root:Epoch[48] Train-loss=178.288002\n", + "INFO:root:Epoch[48] Time cost=4.562\n", + "INFO:root:Epoch[49] Train-loss=178.083288\n", + "INFO:root:Epoch[49] Time cost=4.619\n", + "INFO:root:Epoch[50] Train-loss=177.791330\n", + "INFO:root:Epoch[50] Time cost=4.580\n", + "INFO:root:Epoch[51] Train-loss=177.570741\n", + "INFO:root:Epoch[51] Time cost=4.704\n", + "INFO:root:Epoch[52] Train-loss=177.287114\n", + "INFO:root:Epoch[52] Time cost=5.172\n", + "INFO:root:Epoch[53] Train-loss=177.122645\n", + "INFO:root:Epoch[53] Time cost=4.678\n", + "INFO:root:Epoch[54] Train-loss=176.816022\n", + "INFO:root:Epoch[54] Time cost=4.819\n", + "INFO:root:Epoch[55] Train-loss=176.670484\n", + "INFO:root:Epoch[55] Time cost=4.568\n", + "INFO:root:Epoch[56] Train-loss=176.459671\n", + "INFO:root:Epoch[56] Time cost=4.450\n", + "INFO:root:Epoch[57] Train-loss=176.174175\n", + "INFO:root:Epoch[57] Time cost=4.579\n", + "INFO:root:Epoch[58] Train-loss=175.935856\n", + "INFO:root:Epoch[58] Time cost=4.552\n", + "INFO:root:Epoch[59] Train-loss=175.739928\n", + "INFO:root:Epoch[59] Time cost=4.385\n", + "INFO:root:Epoch[60] Train-loss=175.579695\n", + "INFO:root:Epoch[60] Time cost=4.496\n", + "INFO:root:Epoch[61] Train-loss=175.403871\n", + "INFO:root:Epoch[61] Time cost=5.088\n", + "INFO:root:Epoch[62] Train-loss=175.157114\n", + "INFO:root:Epoch[62] Time cost=4.628\n", + "INFO:root:Epoch[63] Train-loss=174.953950\n", + 
"INFO:root:Epoch[63] Time cost=4.826\n", + "INFO:root:Epoch[64] Train-loss=174.743393\n", + "INFO:root:Epoch[64] Time cost=4.832\n", + "INFO:root:Epoch[65] Train-loss=174.554056\n", + "INFO:root:Epoch[65] Time cost=4.375\n", + "INFO:root:Epoch[66] Train-loss=174.366719\n", + "INFO:root:Epoch[66] Time cost=4.583\n", + "INFO:root:Epoch[67] Train-loss=174.160622\n", + "INFO:root:Epoch[67] Time cost=4.586\n", + "INFO:root:Epoch[68] Train-loss=173.981699\n", + "INFO:root:Epoch[68] Time cost=5.149\n", + "INFO:root:Epoch[69] Train-loss=173.751617\n", + "INFO:root:Epoch[69] Time cost=4.495\n", + "INFO:root:Epoch[70] Train-loss=173.548732\n", + "INFO:root:Epoch[70] Time cost=4.588\n", + "INFO:root:Epoch[71] Train-loss=173.380950\n", + "INFO:root:Epoch[71] Time cost=5.042\n", + "INFO:root:Epoch[72] Train-loss=173.158519\n", + "INFO:root:Epoch[72] Time cost=4.817\n", + "INFO:root:Epoch[73] Train-loss=172.970726\n", + "INFO:root:Epoch[73] Time cost=4.791\n", + "INFO:root:Epoch[74] Train-loss=172.782357\n", + "INFO:root:Epoch[74] Time cost=4.377\n", + "INFO:root:Epoch[75] Train-loss=172.581992\n", + "INFO:root:Epoch[75] Time cost=4.518\n", + "INFO:root:Epoch[76] Train-loss=172.385020\n", + "INFO:root:Epoch[76] Time cost=4.863\n", + "INFO:root:Epoch[77] Train-loss=172.198309\n", + "INFO:root:Epoch[77] Time cost=5.104\n", + "INFO:root:Epoch[78] Train-loss=172.022333\n", + "INFO:root:Epoch[78] Time cost=4.571\n", + "INFO:root:Epoch[79] Train-loss=171.816585\n", + "INFO:root:Epoch[79] Time cost=4.557\n", + "INFO:root:Epoch[80] Train-loss=171.643714\n", + "INFO:root:Epoch[80] Time cost=4.567\n", + "INFO:root:Epoch[81] Train-loss=171.460581\n", + "INFO:root:Epoch[81] Time cost=4.735\n", + "INFO:root:Epoch[82] Train-loss=171.284854\n", + "INFO:root:Epoch[82] Time cost=5.012\n", + "INFO:root:Epoch[83] Train-loss=171.113129\n", + "INFO:root:Epoch[83] Time cost=4.877\n", + "INFO:root:Epoch[84] Train-loss=170.947790\n", + "INFO:root:Epoch[84] Time cost=4.487\n", + "INFO:root:Epoch[85] 
Train-loss=170.766223\n", + "INFO:root:Epoch[85] Time cost=4.723\n", + "INFO:root:Epoch[86] Train-loss=170.602559\n", + "INFO:root:Epoch[86] Time cost=4.803\n", + "INFO:root:Epoch[87] Train-loss=170.448713\n", + "INFO:root:Epoch[87] Time cost=4.636\n", + "INFO:root:Epoch[88] Train-loss=170.273053\n", + "INFO:root:Epoch[88] Time cost=4.562\n", + "INFO:root:Epoch[89] Train-loss=170.099485\n", + "INFO:root:Epoch[89] Time cost=4.567\n", + "INFO:root:Epoch[90] Train-loss=169.934289\n", + "INFO:root:Epoch[90] Time cost=4.905\n", + "INFO:root:Epoch[91] Train-loss=169.768920\n", + "INFO:root:Epoch[91] Time cost=4.636\n", + "INFO:root:Epoch[92] Train-loss=169.620803\n", + "INFO:root:Epoch[92] Time cost=4.429\n", + "INFO:root:Epoch[93] Train-loss=169.448189\n", + "INFO:root:Epoch[93] Time cost=4.985\n", + "INFO:root:Epoch[94] Train-loss=169.295794\n", + "INFO:root:Epoch[94] Time cost=4.649\n", + "INFO:root:Epoch[95] Train-loss=169.143627\n", + "INFO:root:Epoch[95] Time cost=4.602\n", + "INFO:root:Epoch[96] Train-loss=168.989410\n", + "INFO:root:Epoch[96] Time cost=4.904\n", + "INFO:root:Epoch[97] Train-loss=168.841089\n", + "INFO:root:Epoch[97] Time cost=4.602\n", + "INFO:root:Epoch[98] Train-loss=168.694906\n", + "INFO:root:Epoch[98] Time cost=4.589\n", + "INFO:root:Epoch[99] Train-loss=168.527604\n", + "INFO:root:Epoch[99] Time cost=4.560\n", + "INFO:root:Epoch[100] Train-loss=168.385596\n", + "INFO:root:Epoch[100] Time cost=4.835\n", + "INFO:root:Epoch[101] Train-loss=168.246526\n", + "INFO:root:Epoch[101] Time cost=4.558\n", + "INFO:root:Epoch[102] Train-loss=168.093663\n", + "INFO:root:Epoch[102] Time cost=4.609\n", + "INFO:root:Epoch[103] Train-loss=167.938807\n", + "INFO:root:Epoch[103] Time cost=4.599\n", + "INFO:root:Epoch[104] Train-loss=167.814916\n", + "INFO:root:Epoch[104] Time cost=4.394\n", + "INFO:root:Epoch[105] Train-loss=167.676473\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:root:Epoch[105] Time cost=4.724\n", + 
"INFO:root:Epoch[106] Train-loss=167.560241\n", + "INFO:root:Epoch[106] Time cost=4.316\n", + "INFO:root:Epoch[107] Train-loss=167.424132\n", + "INFO:root:Epoch[107] Time cost=4.646\n", + "INFO:root:Epoch[108] Train-loss=167.284482\n", + "INFO:root:Epoch[108] Time cost=4.472\n", + "INFO:root:Epoch[109] Train-loss=167.184511\n", + "INFO:root:Epoch[109] Time cost=4.768\n", + "INFO:root:Epoch[110] Train-loss=167.037793\n", + "INFO:root:Epoch[110] Time cost=4.717\n", + "INFO:root:Epoch[111] Train-loss=166.916652\n", + "INFO:root:Epoch[111] Time cost=4.803\n", + "INFO:root:Epoch[112] Train-loss=166.796803\n", + "INFO:root:Epoch[112] Time cost=4.617\n", + "INFO:root:Epoch[113] Train-loss=166.655028\n", + "INFO:root:Epoch[113] Time cost=4.420\n", + "INFO:root:Epoch[114] Train-loss=166.561129\n", + "INFO:root:Epoch[114] Time cost=4.333\n", + "INFO:root:Epoch[115] Train-loss=166.434593\n", + "INFO:root:Epoch[115] Time cost=4.526\n", + "INFO:root:Epoch[116] Train-loss=166.322805\n", + "INFO:root:Epoch[116] Time cost=4.310\n", + "INFO:root:Epoch[117] Train-loss=166.195452\n", + "INFO:root:Epoch[117] Time cost=4.458\n", + "INFO:root:Epoch[118] Train-loss=166.073792\n", + "INFO:root:Epoch[118] Time cost=4.333\n", + "INFO:root:Epoch[119] Train-loss=165.967437\n", + "INFO:root:Epoch[119] Time cost=4.459\n", + "INFO:root:Epoch[120] Train-loss=165.876094\n", + "INFO:root:Epoch[120] Time cost=5.070\n", + "INFO:root:Epoch[121] Train-loss=165.748064\n", + "INFO:root:Epoch[121] Time cost=4.782\n", + "INFO:root:Epoch[122] Train-loss=165.656283\n", + "INFO:root:Epoch[122] Time cost=4.640\n", + "INFO:root:Epoch[123] Train-loss=165.540462\n", + "INFO:root:Epoch[123] Time cost=4.522\n", + "INFO:root:Epoch[124] Train-loss=165.448734\n", + "INFO:root:Epoch[124] Time cost=4.858\n", + "INFO:root:Epoch[125] Train-loss=165.347751\n", + "INFO:root:Epoch[125] Time cost=4.842\n", + "INFO:root:Epoch[126] Train-loss=165.230048\n", + "INFO:root:Epoch[126] Time cost=4.495\n", + "INFO:root:Epoch[127] 
Train-loss=165.147932\n", + "INFO:root:Epoch[127] Time cost=4.766\n", + "INFO:root:Epoch[128] Train-loss=165.036021\n", + "INFO:root:Epoch[128] Time cost=4.526\n", + "INFO:root:Epoch[129] Train-loss=164.977613\n", + "INFO:root:Epoch[129] Time cost=5.091\n", + "INFO:root:Epoch[130] Train-loss=164.881467\n", + "INFO:root:Epoch[130] Time cost=5.223\n", + "INFO:root:Epoch[131] Train-loss=164.785627\n", + "INFO:root:Epoch[131] Time cost=4.165\n", + "INFO:root:Epoch[132] Train-loss=164.707629\n", + "INFO:root:Epoch[132] Time cost=4.527\n", + "INFO:root:Epoch[133] Train-loss=164.598039\n", + "INFO:root:Epoch[133] Time cost=4.167\n", + "INFO:root:Epoch[134] Train-loss=164.502932\n", + "INFO:root:Epoch[134] Time cost=4.354\n", + "INFO:root:Epoch[135] Train-loss=164.422286\n", + "INFO:root:Epoch[135] Time cost=4.387\n", + "INFO:root:Epoch[136] Train-loss=164.344749\n", + "INFO:root:Epoch[136] Time cost=4.662\n", + "INFO:root:Epoch[137] Train-loss=164.264898\n", + "INFO:root:Epoch[137] Time cost=4.671\n", + "INFO:root:Epoch[138] Train-loss=164.178707\n", + "INFO:root:Epoch[138] Time cost=4.776\n", + "INFO:root:Epoch[139] Train-loss=164.109071\n", + "INFO:root:Epoch[139] Time cost=4.787\n", + "INFO:root:Epoch[140] Train-loss=163.993291\n", + "INFO:root:Epoch[140] Time cost=4.726\n", + "INFO:root:Epoch[141] Train-loss=163.956234\n", + "INFO:root:Epoch[141] Time cost=4.337\n", + "INFO:root:Epoch[142] Train-loss=163.845638\n", + "INFO:root:Epoch[142] Time cost=4.787\n", + "INFO:root:Epoch[143] Train-loss=163.790882\n", + "INFO:root:Epoch[143] Time cost=5.563\n", + "INFO:root:Epoch[144] Train-loss=163.723495\n", + "INFO:root:Epoch[144] Time cost=4.529\n", + "INFO:root:Epoch[145] Train-loss=163.634262\n", + "INFO:root:Epoch[145] Time cost=5.028\n", + "INFO:root:Epoch[146] Train-loss=163.552854\n", + "INFO:root:Epoch[146] Time cost=4.933\n", + "INFO:root:Epoch[147] Train-loss=163.501429\n", + "INFO:root:Epoch[147] Time cost=4.912\n", + "INFO:root:Epoch[148] Train-loss=163.444245\n", 
+ "INFO:root:Epoch[148] Time cost=5.034\n", + "INFO:root:Epoch[149] Train-loss=163.348476\n", + "INFO:root:Epoch[149] Time cost=4.600\n", + "INFO:root:Epoch[150] Train-loss=163.256955\n", + "INFO:root:Epoch[150] Time cost=4.704\n", + "INFO:root:Epoch[151] Train-loss=163.216139\n", + "INFO:root:Epoch[151] Time cost=4.670\n", + "INFO:root:Epoch[152] Train-loss=163.144691\n", + "INFO:root:Epoch[152] Time cost=4.678\n", + "INFO:root:Epoch[153] Train-loss=163.050236\n", + "INFO:root:Epoch[153] Time cost=4.595\n", + "INFO:root:Epoch[154] Train-loss=162.991225\n", + "INFO:root:Epoch[154] Time cost=5.307\n", + "INFO:root:Epoch[155] Train-loss=162.907200\n", + "INFO:root:Epoch[155] Time cost=4.684\n", + "INFO:root:Epoch[156] Train-loss=162.838075\n", + "INFO:root:Epoch[156] Time cost=4.686\n", + "INFO:root:Epoch[157] Train-loss=162.759286\n", + "INFO:root:Epoch[157] Time cost=4.750\n", + "INFO:root:Epoch[158] Train-loss=162.725998\n", + "INFO:root:Epoch[158] Time cost=4.637\n", + "INFO:root:Epoch[159] Train-loss=162.635852\n", + "INFO:root:Epoch[159] Time cost=4.498\n", + "INFO:root:Epoch[160] Train-loss=162.563777\n", + "INFO:root:Epoch[160] Time cost=5.048\n", + "INFO:root:Epoch[161] Train-loss=162.527387\n", + "INFO:root:Epoch[161] Time cost=5.040\n", + "INFO:root:Epoch[162] Train-loss=162.395881\n", + "INFO:root:Epoch[162] Time cost=4.764\n", + "INFO:root:Epoch[163] Train-loss=162.353654\n", + "INFO:root:Epoch[163] Time cost=4.561\n", + "INFO:root:Epoch[164] Train-loss=162.285584\n", + "INFO:root:Epoch[164] Time cost=5.051\n", + "INFO:root:Epoch[165] Train-loss=162.204332\n", + "INFO:root:Epoch[165] Time cost=4.455\n", + "INFO:root:Epoch[166] Train-loss=162.147100\n", + "INFO:root:Epoch[166] Time cost=5.021\n", + "INFO:root:Epoch[167] Train-loss=162.051296\n", + "INFO:root:Epoch[167] Time cost=4.551\n", + "INFO:root:Epoch[168] Train-loss=161.978708\n", + "INFO:root:Epoch[168] Time cost=4.744\n", + "INFO:root:Epoch[169] Train-loss=161.927990\n", + "INFO:root:Epoch[169] 
Time cost=4.821\n", + "INFO:root:Epoch[170] Train-loss=161.883088\n", + "INFO:root:Epoch[170] Time cost=4.365\n", + "INFO:root:Epoch[171] Train-loss=161.785367\n", + "INFO:root:Epoch[171] Time cost=4.448\n", + "INFO:root:Epoch[172] Train-loss=161.716386\n", + "INFO:root:Epoch[172] Time cost=4.622\n", + "INFO:root:Epoch[173] Train-loss=161.656391\n", + "INFO:root:Epoch[173] Time cost=4.500\n", + "INFO:root:Epoch[174] Train-loss=161.598127\n", + "INFO:root:Epoch[174] Time cost=4.677\n", + "INFO:root:Epoch[175] Train-loss=161.518613\n", + "INFO:root:Epoch[175] Time cost=4.958\n", + "INFO:root:Epoch[176] Train-loss=161.418783\n", + "INFO:root:Epoch[176] Time cost=4.607\n", + "INFO:root:Epoch[177] Train-loss=161.407767\n", + "INFO:root:Epoch[177] Time cost=4.427\n", + "INFO:root:Epoch[178] Train-loss=161.319552\n", + "INFO:root:Epoch[178] Time cost=4.930\n", + "INFO:root:Epoch[179] Train-loss=161.234087\n", + "INFO:root:Epoch[179] Time cost=4.240\n", + "INFO:root:Epoch[180] Train-loss=161.187404\n", + "INFO:root:Epoch[180] Time cost=4.484\n", + "INFO:root:Epoch[181] Train-loss=161.123118\n", + "INFO:root:Epoch[181] Time cost=4.937\n", + "INFO:root:Epoch[182] Train-loss=160.999420\n", + "INFO:root:Epoch[182] Time cost=4.489\n", + "INFO:root:Epoch[183] Train-loss=160.955369\n", + "INFO:root:Epoch[183] Time cost=4.894\n", + "INFO:root:Epoch[184] Train-loss=160.908542\n", + "INFO:root:Epoch[184] Time cost=4.269\n", + "INFO:root:Epoch[185] Train-loss=160.846908\n", + "INFO:root:Epoch[185] Time cost=4.998\n", + "INFO:root:Epoch[186] Train-loss=160.765964\n", + "INFO:root:Epoch[186] Time cost=4.467\n", + "INFO:root:Epoch[187] Train-loss=160.687773\n", + "INFO:root:Epoch[187] Time cost=4.609\n", + "INFO:root:Epoch[188] Train-loss=160.652674\n", + "INFO:root:Epoch[188] Time cost=5.327\n", + "INFO:root:Epoch[189] Train-loss=160.551175\n", + "INFO:root:Epoch[189] Time cost=4.267\n", + "INFO:root:Epoch[190] Train-loss=160.477424\n", + "INFO:root:Epoch[190] Time cost=4.798\n", + 
"INFO:root:Epoch[191] Train-loss=160.501221\n", + "INFO:root:Epoch[191] Time cost=4.695\n", + "INFO:root:Epoch[192] Train-loss=160.370335\n", + "INFO:root:Epoch[192] Time cost=4.640\n", + "INFO:root:Epoch[193] Train-loss=160.279749\n", + "INFO:root:Epoch[193] Time cost=4.653\n", + "INFO:root:Epoch[194] Train-loss=160.242415\n", + "INFO:root:Epoch[194] Time cost=5.044\n", + "INFO:root:Epoch[195] Train-loss=160.197063\n", + "INFO:root:Epoch[195] Time cost=4.684\n", + "INFO:root:Epoch[196] Train-loss=160.132983\n", + "INFO:root:Epoch[196] Time cost=4.460\n", + "INFO:root:Epoch[197] Train-loss=160.083149\n", + "INFO:root:Epoch[197] Time cost=4.713\n", + "INFO:root:Epoch[198] Train-loss=160.025012\n", + "INFO:root:Epoch[198] Time cost=4.779\n", + "INFO:root:Epoch[199] Train-loss=159.945513\n", + "INFO:root:Epoch[199] Time cost=4.659\n" + ] + } + ], + "source": [ + "# can initilize weights and biases with the learned parameters as follows: \n", + "# init = mx.initializer.Load(params)\n", + "\n", + "# call the VAE, output model contains the learned model and training loss\n", + "out = VAE(n_latent=2, x_train=image, x_valid=None, num_epoch=200) " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# encode test images to obtain mu and logvar which are used for sampling\n", + "[mu,logvar] = VAE.encoder(out,image_test)\n", + "# sample in the latent space\n", + "z = VAE.sampler(mu,logvar)\n", + "# decode from the latent space to obtain reconstructed images\n", + "x_construction = VAE.decoder(out,z)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "f, ((ax1, ax2, ax3, ax4)) = plt.subplots(1,4, sharex='col', sharey='row',figsize=(12,3))\n", + "ax1.imshow(np.reshape(image_test[0,:],(28,28)), interpolation='nearest', cmap=cm.Greys)\n", + "ax1.set_title('True image')\n", + "ax2.imshow(np.reshape(x_construction[0,:],(28,28)), interpolation='nearest', cmap=cm.Greys)\n", + "ax2.set_title('Learned image')\n", + "ax3.imshow(np.reshape(image_test[146,:],(28,28)), interpolation='nearest', cmap=cm.Greys)\n", + "ax3.set_title('True image')\n", + "ax4.imshow(np.reshape(x_construction[146,:],(28,28)), interpolation='nearest', cmap=cm.Greys)\n", + "ax4.set_title('Learned image')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "DEBUG:matplotlib.font_manager:findfont: Matching :family=sans-serif:style=normal:variant=normal:weight=normal:stretch=normal:size=15.0 to DejaVu Sans ('/usr/local/lib/python3.5/dist-packages/matplotlib/mpl-data/fonts/ttf/DejaVuSans.ttf') with score of 0.050000\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXYAAAEICAYAAABLdt/UAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJztnXuQJFd15r9T1VXqqepBgmx5tUbT2RIWbLQxL7URhLyWlsK2GMkrO0wIyy0Y1oTbKtiNMQFhHh1reTeiYQkcYFgZRIM1zKpqWWQEy2MF2EiyMCC09IDQogeyzHSPhBCaaUkITQ2a0fTZP7KylZWdj5uZNx+VeX4RJ2a6KjPrZlXmd0+ee+65xMwQBEEQykMt7wYIgiAIehFhFwRBKBki7IIgCCVDhF0QBKFkiLALgiCUDBF2QRCEkiHCLuQKEa0R0avzbocglAkRdmFsICImol/RdKwLiehBHccShKIhwi4IglAyRNiFwkBELyei24jocSL6CRFdTUTN4XtfH272fSJ6koheN3z9EiK6Y7jPt4joRY7jrRHR24noTiL6GRF9mogmiagN4MsAfnl4rCeJ6Jc92rObiO4mop8T0Y+J6O3D1y8kogeJ6N1EdGT4OQuO/S4mou8R0RNE9AAR/aXruL8xbOvjw/ffOHz9FCL6KyI6REQ/JaJriGiH1i9ZqAQi7EKROAngrQCmAbwSQAfAmwGAmX9zuM2LmXmKmT9NRC8FcC2APwVgAPgYgC8Q0SmOY14G4CIAZwF4EYA3MvNRAK8B8NDwWFPM/JBHe/4WwJ8y804ALwRws+O9M4btfC6APQBWiOgFw/eOAngDgNMAXAygS0S/BwBEZMLqVP47gNMBvATAHcP9/huA5w9f+5Xhsf9C8bsThC1E2IXCwMwHmPnbzPw0M6/BEuoLAnZZBPAxZr6dmU8y834ATwF4hWObDzPzQ8z8KIAvwhJNVU4AmCOiZzHzY8z8Xdf7/5mZn2LmWwH8H1idCJj5H5n5/zHzJjPfCeBTjvP4IwBfY+ZPMfMJZt5g5juIiIbn81ZmfpSZfw7gPQD+MEJ7BQGACLtQIIjo+UT0JSJ6mIiegCVs0wG7mADeNgxpPE5EjwPYBcAZVnnY8f8BgKkITfoDALsBrBPRrUT0Ssd7jw09f5t1+3OJ6DwiuoWIDhPRzwBc6TiPXQD+xeOzTgfQAnDAcS5fGb4uCJEQYReKxEcB3AvgHGZ+FoB3A6CA7R8AsMzMpzmsxcyfUvis0LKmzPwdZr4UwC8B+N8Arne8/exhrN5mBoAdzvmfAL4AYBcznwrgGsd5PADgeR4fdwTAMQC/6jiXU5k5SkckCABE2IVisRPAEwCeJKJ/A6Drev+nAM52/P1xAFcOPWQiovZw4HKnwmf9FIBBRKd6vUlETSJaIKJTmfnEsF2brs3+y3C7fwvgEgB/5ziPR5n5F0T0cljhF5s+gFcT0WVENEFEBhG9hJk3h+fzQSL6pWEbnktEv6NwLoIwggi7UCTeDksEfw5L5D7tev8vAewfhiouY+ZVAH8C4GoAjwG4H8AbVT6Ime+FFfv+0fB427JiALwewNowLHQlgAXHew8PP/MhWGJ95fCYgDXg+1+J6OewBj+3PH1mPgQrvPM2AI/CGjh98fDtdwzP4dvDz/waAHtAVhCUIVloQxCiQUQXAugx85l5t0UQvBCPXRAEoWSIsAuCIJQMCcUIgiCUDPHYBUEQSsaEjoMQ0WkAPgFr2jUD+GNmvs1v++npaZ6dndXx0YIgCJXhwIEDR5g5dNKaFmEH8CEAX2Hm1w6LNrWCNp6dncXq6qqmjxYEQagGRLSusl1iYR9O8PhNDPOHmfk4gONJjysIgiDEQ0eM/SwAhwHsG5Yq/YRrqjUAgIgWiWiViFYPHz6s4WMFQRAEL3QI+wSAlwH4KDO/FFbJ0ne6N2LmFWaeZ+b500+XukaCIAhpoUPYHwTwIDPfPvz7M7C
EXhAEQciBxMLOzA8DeMCxyEAHwN1JjysIgiDEQ1ce+38C0CeiO2EtZPAeTccVhBH6/T5mZ2dRq9UwOzuLfr+fd5MEoXBoSXdk5jsAzOs4liD40e/3sbi4iMFgAABYX1/H4uIiAGBhYSFoV0GoFDLzVBgblpaWtkTdZjAYYGlpKacWCUIxEWEXCo0z9LK+7j0349ChQxm3ShCKjQi7UFjs0Mv6+jqCitXNzMxsbS/x92Ijv1FGMHPmdu6557IghGGaJsOqPeRrrVaLe70e93o9brVanu8J+uj1emyaJhMRm6YZ6fuV3yg5AFZZQWNF2IXCQkS+gu4WFr9OwDTNfE+iRCQVZvmNkqMq7LnUY5+fn2cpAiaEMTs76xlXN00Ta2trI6/VajXPcA0RYXPTvQa1EIcov4cX8hslh4gOMHNoBqLE2IXCsry8jFZrtFBoq9XC8vLytm3tOLvq60J0/AapVQev5TfKDhF2obAsLCxgZWUFpmmCiGCaJlZWVjxz1qN0AoJF1IHMpMIsv1GGqMRrdJvE2IU0SDKwVzX84uXdbtf3O/Tap9FosGEYyt+5/EbJgMTYBUHwwy9eTkQjcfBWqzXylNTv97F3715sbGx4Hte9vaAXibELguCLX1zc7eh5zew9duyY73EHgwH27t0rueo5I8IuCBUkyoDl+vr6ljh7lXVws7GxsTWpzK7nI+KeLSLsgpAiecy0dH7m9PQ0pqent33+7t27QUQj+7n/dmKLs19ZhyAGgwH27Nkj4p4lKoF43SaDp0Ke9Ho9NgxjawDQMIxUBvHSnGnpNwjp9Zlum5yc9JzwNTc3F7ifPUgatE2QOc9dBlHjAZl5KpSdOOLQ6/W40WhsE51ms6ldXNKaaRnUYaiUYfCzer0ee19Vs38nKS0QDxF2odR4CXSj0QgVhyDhs0XH7iwMw/BN5VPpVPy8WyLadi5ROqigDiNtYU5q9jmm0eFVARF2oRCk9cjtDKU4zTCMwP3CQglBYYyoBcf8BMwwjJHOw91BhYUskoRDwkQ3bWG3z8Xv84VgRNiF3NH9yK0aaggiaH+VUETQNm6P029CT7PZVBJAv++v3W5rF9xGo8ETExOpinpYuEg89nBE2IXc0XkDqwwKuoXdKSK2IPt5+jrMK8TiHqRV/fygkEUa7Y7SrrifEzTAKzF2NUTYhdzR+citKnJ2KCZKR2Bb0sFDZxjI6/OjiGJQyEK3dTqdzMIwdskC5/ctWTHqiLALuaPTY1cRHmdmS1xvN4nAOYU9ibdte69TU1OZCHueJp56NFSFXSYoCamhs5pf2EzJer2Oa6+9dqtGSdx1UJl5a6KOYRgwDANEhHq9HrrvxsbG1kSgKBN5ms0m2u321t87duwAABw9ejRi68cPWYw8JVTUX7eJx14ddGXFBIVWnF5ft9tVCqmEeeb2U0XS3HAV63Q6kcNGZTLJhlEHEooRyoaXaDs7i263q/z474z1BgluFrHnWq2Wu7jmaZINo46qsEsoRhgL+v0+9u/fj5MnT269Zod17PDLysqK0rH27NmD888/P3S7m266yfJ+UqbMy8IF1Z8BZKGN1FBRf90mHnt1iRuaURmI9XrfywzDqHToI2tz/t7OhTyCZvYK3kBCMULRSJK/rJI6mVV6oNh28xvX8AuzSC57PDIXdgB1AN8D8KWwbUXYq0mS9EeVfU855ZTcBa6qNjc3t61jDRJqmX0aD1Vh1xlj3wvgHo3HE0pGklXuw1In+/0+nnrqqeSNFGJxzz33jIxHEBH27NkDANvq0QfVdY+bpiq4UFH/MANwJoCbALwK4rELPiT10oLi8+NQ2bBqVqvVPCtwBtXKEY89GGTssf81gD8H4Du8T0SLRLRKRKuHDx/W9LHCOJF0wtLCwgLW1tawubmJtbW1kQWT46zsI6TL5uYmTpw4MfLaiRMncPz4cc/tJUNGH4mFnYguAfAIMx8I2o6ZV5h5npnnTz/99KQfK4whCwsLWFlZgWmaICK
YpqltRXuVmaFCsdF1LQgAsSMuFusARO8F8HoATwOYBPAsAJ9l5iv89pmfn+fV1dVEnysITsLypYV0ISIk0RLTNLG2tqavQSWFiA4w83zYdok9dmZ+FzOfycyzAP4QwM1Boi4Iuun3+yLsOaMq6o1GA81mc+Q1CcHoR2aeCmPP0tJSIm9R8Ea1+Jl7HyfNZnOrkJppmti3bx+uvfbaVMJxwjMkDsXEQUIxgk5qtZoIu2ZM08Ty8jL27t2LjY2NyPseOnQIMzMzIyUfhOSohmImsmiMIKTJzMyMZMVopNVqYffu3VhcXMRgMIi0r8TKi4GEYgQAVpzaPZFkXPBKoxTiYU8suvHGGz1FPWgsQ2LlxUGEXUC/38fi4iLW19fBzFhfX8fi4uJYibu9OIWQDGbGjTfe6DsD1C/kFRQrH2enYWxRmcWk22TmabHwm7VZr9cLX3kvztqmYuEWZdHvoIUypNiXXiDVHQVVVKoi6rwZda2qxCylBNKyZrOpvABIUBkA1TISOq+JMiPCLnjidQOpiqOOOh66PTgp1ZudTUxMbKv9EvbbqZRb9rsmnLXbRewtRNgFZh4VcsMwthVgsm8g1XBGWh523E5DPPZszTCMbWIbpzib8/f22yZKGeCqIMIuKMef7ZtRNa6ahoftF6cNe0RXPceqryuqy9y/U9gTmMoTWpSnrqpXfxRhrwhu4XM+vvqtauNlUbz2JDdYFI9dNWzT6/WUzjXK9yHm/7s7rzmVlZPCOucoT11BA7VVACLs5UdnRkicWLWuNvs9AYR1AmFhJqfV63Vut9u5C+M4mVcoRNUBiCLAXteE3/UoHrsIe+nJO74cJRzjFmGVRYyDOhuvsJGItz7zG7xMMtAe5Ll7PXlKmuR2IMJeLrxuirwzQgzDUG57nKwH1UE1d5tUQzNi/qIcp7N1/rZe4TKvJ6put8vMzN1ud+s3q9fr3O12JQXSA4iwlwc/YYwyiSQt82uv84b0a2dY1kOUR3R3m/Lu9MbRGo1GqHiGeex2x+om6FrtdDqer9uiLzwDRNjLg9/NNDk56XuzvATgpwE+nLIYtNvtkbBK1EFYt7kf4d2dhMoxgkIGdrpe3iJaRHM/gXl5zGHjOkQ0IshRwjduq9frWdxeYwVE2PND9yNkVO+zXqvxNwB+mCh1YddtYYNuKk8ptVrNM9Zuf4/2b5P3uRbNVCYNOfPWg45l/05Jv2cJv4wCEfZ8SKM2RlSP5wqAD05M8Ad27Bg7YQ/LeihC+GlczTCMkVh20HevkpaaRecoA6ajQIQ9H6LkaavS6/W40WiwCVg/mYddMPycKYB/DPClAF+F9EMxOs1+jA962hFPO7q5s5BUMk6Cjhd2rafRfsECisIuZXs141fu1O91VYgIPwHwCpf9HYBjAB4YbvcXAO4B8PmQYxURZsb+/ft9ywf3+33UanLJRsEwDBw7dgwbGxtb3+n+/fuxZ88eGIaxtZ277LHf9+xcKm95eTnRtTQxobbOz8bGhpT6jYqK+us28dhH8Ur1UjnmxQCfBPgNw7+fD/BRgF84/PsqFNNj9wun+IUIbI8z73aPmwVlThmGERhDDzquTa/X46mpqUzOpeoTk2wgoZh8iBpj73a7nhdyp9MJFLNzAH4M4Ksdr30Z4L9x/H0ViifstVrN95zF9FncuQ6maQaGWJyzfr2yY9KaIGYP7Lo7E3cWTtmBCHt+RMmKiTORZgrguwD+BsATw9cuAvgJgJ8H8KlDey/AR4b/b6YsJFfBO/b/O67tOp2O9tisePPeIsisPw5uX8t+37kdw8/6nKempnzvM3eBO79c+3EAIuzFIEzkwy7YywE+CCvkcnD49w0APwTwGY7t9sJbWG1bSvnGugrWE8R5LnuWazt7EE/nZwfl81fRnGELnTOU7UHMsFBN2PtpmZf37tfJNJvNsRR3iLDnT9Djqi30QeVkLwf4SYwK9FMAnwD4/OHFaW/7XFiZMU7bB/Djw//PpHxTXQX1sI9
42OnZxMREZOdBxZzhxLBwS56zoolopGRwUKc2jnF7iLDnj8pjcFAo5iC8ve8nsN0z3umx/1UIF1vVUJDp0xaG1XGofJbz5svjpi+7OSdgxS3f7Gd2+CKKN57X72wLdljnMo4lgCHCnj+qF3a73d66+YiIJyYmGLDCL6xoF3gc9yoEiy2z/6Oq25rY3plcD/AA4LOHn3Vi+HnHAf4uwL+fw01dNbPDe7Kg9zNme+0q3924ARH2/FEduHKO+Dtv0IPwFvGDmm6ATqcTWxDcqZYLAL8V4AsB/l2AvzRsq4h7upakFktZTWW1LImxi7DHRtUb9itM5RVjf3L4ep43jleqpZd9C+DvFeBGL6u1221mjjYbVzx7S9THdaFsZCXsAHYBuAXA3QDuArA3bJ+yCrtqudoo5pUVk+dN4ZVq6WdvH7a7VoCbuYxme52qHrvUqrdsYmKCG43GyGvjkg+PDIX9XwN42fD/OwHcB2AuaJ+iC3vQOqLu3t15Y1VhUNAr1dLP3gardLAIe3rmHJsJ29YW9rzbXHQrsgePvEIxsMqU/FbQNkUWdpWBKOfU6zI/2rqfFq6HlW55vuL+3wL4QAHOoypWBcciKytqVUkoCjtZ2+qBiGYBfB3AC5n5Cdd7iwAWAWBmZubc9fV1bZ+rk9nZWai0zTAMTE1NKW07jlwO4OMA2o7XGMA/AniXa9u7AXwRwA0A7h3u8ycALgLwe8P3hGwgIui8p6uMaZpYW1vLuxkjENEBZp4P3U7XRUBEUwBuBbDMzJ8N2nZ+fp5XV1e1fK5uarWa3BgADgKYVdz2QgCvB/DvYMXlNgF8F8B7AHwlhbYJQhYQETY3N/Nuxgiqwq5WNzP8wxqwHLZ+mKgXnZmZmdJ64VGY8Xl9E0Dd4/VbU2yLkC31eh0nT57MuxmpMTk5iV/84heh2z3nOc/JoDXpkLi4NVkFmf8WwD3M/IHkTcqX5eVltFqtXNtQhHrpftXjk1WVF4pOq9XC/v37R+qulw0VUR93dKxacD6sJ/FXEdEdQ9ut4bip0O/3MTs7i1qthtnZ2W0F/BcWFrCysgLTNEFEME0T7Xbb52jp4BcKIqLM2vJuAEddrx0dvi6Ul8FggKWlJSwuLubdlNx59NFH825CfFRGWHVbXlkxKgv0+qU0qsxmy8I6nU5mn1W0HHqxbMyekl+Uaz4vK2LJAcjM01GCJmYErSZjU4SLPI2St2JibgtbbKMKNu7pjpUQ9rj55s7VYvK+0AApdyuWvtneehUdCGd1zCKKOrMI+whxvQ/7Ii/KJKQq3mxi2ZlzWn2Ue8a5LsA4W7fbjbT6WR5AhP0ZggQxaFEAeSQVK7NNTk5uiZi9YLizjEbe7cvaarXatk6qaDVkIML+DH7iXK/XfUsD2DG2cfOSbS9DwjZiYWYLutd7rVZra12AqptzVaa8gQj7MwQJt3Mbr0cwv06hCIOpfhdhULvFxFRt3JyaNK1erxei1C+qKOxe4uysvmhnxUT5Ufw6hbB1H/Mye8FhuSnFxNK1PDJnUDVhVx3kjPJj+HUKRY4/Rq3RLSYmFt+yznVH1YQ9ipCp/BhB4ZtxEE3DMAobLhIbbytLFoyfnQvwPoDvhTU5b1/AtlkviA1FYddRUqAQHDqkXsUkaFu75MAVV1yBwWAw8t5gMMCePXvGokjYxsZG4SrTCeNNvV5Ht9vFU089hV6vl3dzUuN8AL8B4DsAHg7ZdmbGr1xeCN/8JnDeecDkJHDWWcCHPxzvOH6oqL9uK4rH7rVSkko4R+LXYlW1RqMRmlgw7kaO/38H/h67agLGNv75n5nbbebXvY75ppuY3/te5nqd+eMfD9U5VC0Uo5qa2Gq1uNvtJk4HFHEXq6rZA/RFmZEd1UzAkj4Pu8C1rZ+w12q1LeH2Sy/2Hc9bXGQ+5xzmEyeeea3bZT7zTObNzUCdQ9WEfXjSoRekqleudIGU1GMREwuzcRlr8rImwOe57HqABwCf7drWT9idnZtbT0z
4dxx8yy3Mu3Yxv+Mdo+J1663W+3feGaZx1RP2sIlIQdtENTucU6QMmbIPaomJpWEXwxokfYPHe0GhGD89cXYcrwCYb7uN+bWvZZ6cZP7BDyzZ3bdvVLweecR6/frrAzUOVRs8BYDdu73LwC8uLmJhYQEAtAx8tlotLC8vAwA+8pGPJD6eLt70pjfl3QRBGCvOAdAD8FEA/yPivv1+3zMR4ziA24f2E9MEjhwBbrgBuOYa4LTTrI3sf22e/Wzr38cei9gKH1TUX7dl7bE7B0vjxMYNwwgcFCnKI2lR2iE2XqZjzCmKERF3Op3cr9cpgO8C+BsAT/hsE+Sx+5UBt63VavHn3/9+5lNPZX7zmy2xePBByzP/3OdGxevECev1j30sUONQxVBMkGgTUegP4bdfWBGgoFrvYmJFNjtMmbXI2gOLWd437oVjbgf4IYDPCNgnSNjDbArg+xoNfuScc5iPH7fE4sknLdn95CdHRURzKKZUwp7WxakyW1WKbomNq+ksSx3lPsha1J8ELMkb2ibAfxmyXxRh9+s4zt6xY1Q/du1ifuc7RwXk61+32iWDp9tJu3Z6UG6qpD+KiWErZJl3O9x2EKOibttDGM2O2QnwNMB/MLT7Ab7Z8bff8cM6jpHZ7ouLzC94AfPTTz/z2lveYgm+pDt645wkkMYF4ue95x0vFBMrihXxXjgJb2F32wVD83vf7/gHfbZ3dhx8223MP/vZMxOULr+c+eabmd/3PuaJCZmgpEpQ+qM9EDo1NRXrwrUZx3xeSYsUS9M6nU5hVh2z7SC8hfegpuOrdhx8yy2WcPzTPzH/+q8zn3IKs2kyf+hDSpoGEXa1OuxxPHu78H5RS/eKiemyRqMRecENe0A277Y7zStU8uTwdR3HP+g6tm0HHdvoACLsFmH1G+J424ZhcKPRyP1ijdPuvNsgNj5Wq9W40+nEutaZOff2u809uKlL1O1jh3UcOmq3Q4RdjagDrkFrpBbZ5ubmCjmoJVYc8yrzHPeJdtzCkzosrOOwyxAkASLs6ji9eq9Ffd0evwikWNksjsPiNz4la6X6W1Igwp4eVfRGxMprdkw8isNie5/dbncrH71er8dKRqiSJV0zFSLs3ijXTA45hsw0FSuLxVkA3S+sIE+z6hZnzVRkKewALgLwQwD3A3hn2PZ5CbtKlowquuLs4xivFyuX2VleUceb/BaNF1O3qGumIithB1AH8C8AzgbQBPB9AHNB++Ql7GFFwsLQPfnJMAyJ2YsVwuyVkeIIdLPZHMsssSJY1DVTkaGwvxLAVx1/vwvAu4L2yUvY/QRU5cvVXa7A+aRQpJruYtU123NnlnEk3eY356XIHvtrAXzC8ffrAVztsd0igFUAqzMzM5FORhdJPHadF7rtqTuRkIxYUUyuRf3Wbre1hIFRNGF32rjF2KPMogvrAPwGncKeCBqNRiFKATSbTe50Orm3Qyxdk/CgftORuAEJxXgT9uW634+yRqpTtL3EL6wTCcqnd8Y/7fe9JpRkYXLTV9ckGyy+6QAZCvsEgB8BOAvPDJ7+atA+Rc1j9/Kao4hYs9n0zS7wW7AjSS8usXkxsfEwHbNOmTMUduuzsBvAfbCyY5bCti+qsOuIowd5NO5YftL0SxngEhMrvtVqNS11YpgzFvaoVlRhTzvEYGcd2B66XyegMpirEvefm5vL/aIOu+AlTU6szOaVKJEEiLBHx88D1iX4qkLmTL/0CtWopF7aC3DnfWGrXPgSsxcrm8Wd+BgGRNij4xcasQuBAdkMHDpF2f15KsWa7Fj/uAjmuLRTTCzMkmS8qAAR9ngEDWZm4QEnTWt0PvqNg8cuJlYmSxuIsOsnzLOM63naS/UlTWFUGZwVExNLx6LOIo0DFIW9BkGZmZkZ3/darRauvPJKmKYJIoJpmjAMI/SYrVYL+/fvx3XXXYdjx45hc3MzdJ92uw0i2nac5eXlrb/7/T6WlpYwGAxQr9cBAKZpotPpbP0tCIIe3Pd
f7qiov24bN4/dWRjJyyv3G/n28pibzea2iUfM6mETv1DNKaecsvX/dru9bRt7rEA8eDExfZZ2TN0NJBSjh6BJSyo/qOoEJJUwjixkoM8WAf57gB8G+HGAvwHwbxWgXWLjY1mEXtxAhF0PSUv9Jv0csXRsHeCPAXwpwK8G+JOw1qr83QK0Taz4Zmee2eioA6MCRNj1kKTUbxRkoDNbMzxe+ybANxegbWLFtlqtNlIeROcCPmFAhF0PWXnszNYFIiVT9ZgJWJe3h13gs8/VAN9bgLYX3ewxJXeN8SotYu0U7iw1AiLsekirN/Z7dFMNyeRV2XFcrAnweS67HuABwGf77HMA4C8UoO1FNr9BePdEviqYLdxZPdUzi7D7EicWFmUflW2DZrjmfbGW1S6GFUN/g8/7/wGWN39hAdpaZAt6orRnTBPR2Dkecdobtgi4eOwpoFpjxcv79tvXS7DdNdLdtWDcx+/1elLTOmM7B+DHYIVavN5/GcBPAvzBArS1zGZ7tvb1X4T7wA4rBWlDmHBLjD0jYff7ov28DWfP6rWvX+64V7541AtALJ51Af4OwI8CfBTgO4evubebAvguWOmMEx7vnwXwTwD+PMC1ApxX2cyeVW0OF61JEn6MY41GYyREZHcmXk/TQc5bmHBLVkwGwh71QnHGwtK4yMIe2cSi27uHdgnArwJ4GVaoZWP470GALwf4BoAfAvgMj2OcDvB9AH8b4B0FOKcymn3tB4lj2ES+uJ69zpK5WQl3GKiysEet2eL02NOoNBg2yCKW3C4H+ARgXdJDe2r42vke27dhefz3ATxdgPaPu/ld2/bKQSrhDD/h7PV6nuWu7ZRDdzaZ7hroRQJlE/YoPabfRWQYRugjlW6vWiUtSszjhseoSDvtAo/tD/ps+wS2Z8fsBPirsIT/co/38z73Mlm73WZm9cwRv/Etd8jTPUGoKqBMwh51cCLssS+og4gSYw8z9zqnEmNXN9V0xTos7/sk/DsCt10Q8n7e5142C3JobI/ebw6H6thYVUCZhD1OOlGSmJif1+B1gTUajcAUMHcbJaUxnnmlK/4rhIv4wQK0Xczf7MHNOA5PGnniRQdlEvYsJwCE4ddhqLZRwjG5FRJlAAAR4UlEQVTRzS9dsQ7wubA88M8AvIlRUX8SVqgl7/aLBVvcnHfx2Mdc2LOcAOCFivev2sY8bhx3RcharcadTmfknLrd7kjpXxXLovxBWLqi066H5dU7s2LyEisxfaYyNlYVUCZhz3ICgMpn2+GXqJOfirQO6cTEhGdd+F6vp7R/rVZLtISfqgWlK7rtYlie+vMK8P2K6THVsbGqgDIJO3N+eaQqoRP3xQd4T4QochgmSvZOs9lMpYO6HJanbXvc18PKXPFKV/Sy9wB8DJKTXhYrc9piXFA2Yc8LVQELml1qi2beN0qSc0jbLocVE2eHbcIqo+uVrvh/AX4LrFrqrwH4AwAfB/i9BfgexZKbnS0jjAIRdj2oetkqBYGKUBsjzGyCamSkYQcxKupBdgHAKwD/EFY5gcMAfwvghQJ8f2L6TNgORNj1oOppF2V2qdcMPVUjom2Pvlmdj18e+skCCIxY9lav17fdhxJjF2H3Jc4FEpb9oRqfTmu9UvcgaBJP253Fk5XXfhDewn6wACIjlo8579kk6w6XCWQh7ADeD+BeAHcC+ByA01T2y0vY42bXRLmwgjx8ItIeu/ZK+UwSI7e9drvzc6+Sk5Z5xdglD7265vTYw5yLKqU+IiNh/20AE8P/vw/A+1T2y0vYk+TDq9Rjt18PughVPGrTNJVyxMPKKsQRZ6+c4azMnRUjol5ts+8nlXBgVSYrIetQDIDfB9BX2TYvYdc9g9XvCcBvJp3TCwm6SP2O7TT7M9yLFng9mkYJp0xOTuZ+Q4uJ2RZUK8ZpVSkvgByE/YsArlDZdhw99ijH8/OQnQXB/C5WZ5qX07uPMojp9OTHIc1STCzIVJ4ixWOPKOwAvgbgBx52qWO
bJVgxdgo4ziKAVQCrMzMzsU8saXEvnTNYg54Aut3uiDftFHW7Le4MlkajEalcgcoNMQ4plmJiQeYc97H/dr4vMfYUPHYAbwRwG4CW6j5xPXYdwqwzbSrpE4BqW/JOoRQTy8L8nBD3/VTl1EdkNHh6EYC7AZweZb+4wq47lJKUtGvYZD1JSEwsb3M7Me41DaoOMhL2+wE8AOCOoV2jsl9cYS9S+V6btLyHLKb1y5OAWNYWdM35vVelUEsYUBR2srbNlvn5eV5dXY283+zsLNbX17e9bpom1tbWNLSsOPidKwDUajXs2LEDR48eTfQZzWYTjUYj8XEEQQUiQq1Ww8mTJyPvW8Z7PA5EdICZ58O2q2XRGF0sLy+j1WqNvNZqtbC8vJxTi9Lj0KFDvu9tbm5CR4d8/PhxTE9PwzTNxMcShDCYOZaoA8H3g7CdsRL2hYUFrKyswDRNEBFM08TKygoWFhbybpp2ZmZmAt8fDAbKx2o2m77vHTp0CLt371Y+liDkQdj9ILhQidfotnEqApYXXiuzxzE77h93+TExsSKY1Ga3gGKMfaw89iqxsLCAnTt3JjpGq9XC7t27sbS0hM3NTU0tE4Ts2djYwBVXXIHp6Wn0+/28m1N4RNgLQr/fx+zsLGq1GmZnZ9Hv97GxsRH5OHa8vF6vYzAY4JprrvEdhBWEIkJEvu9tbGxgcXFxm7h73T+VRsWt120SihnFLx8+ajqiYRiZLDAtJpa3Oeeu5LkmctagjOmOZWV6ejqWd+6k2WyCmXHixAlNrRKE4kJEW+HFKqVBlzLdsYzEDbk0Gg0YhrGVHbRz504RdWHsqdfrSts5s2T8Qo1VTpEUYc+ZpaUl3/cMw9iWt2+/vm/fPhw5cgTXXXcdACh3Do1GA+12O15jBSFlTp48GZiea3PkyBFMT08HxuMrnSKpEq/RbeMcY9ddQgABccRerxf4eaplB9yrPWVRfybJ2qti1TXDMLZdO7VaLfKykl7r95YByJqn+kljkMavol2tVgvdV0WgvfJ/064RY3cgUjK4mhZ3ScWgRTXiXEtlBCLs+kmjumSSCzNIoIOeJtL02JvN5sgiH3ktsxckHlmt41plsx0e1e1rtZrWjK6yLrwBEXb9pFFdMkln4bdvvV4PDBWpriMZZu70Snt2q2ma3O12t9pXFM/dfnrJu8OJGlYYV8sr9basqY7MIuypkIbHniS8oyJQfsfqdrvKN4qXh9tsNtkwDCYiz7ioVzt03OhEFLmjqNfr276DXq+Xi/BURdSzNvc4UlmBCLt+0poIkXS5P3tf1RVonPuGiZthGJ41a+LUnlHpAILMPo+oHnfQE5WquOuo2yOWjlWpjgxE2NMhqghnuYxXklCRX6el28OMK5B2Bxonq8cOTdmhI+dvoRKScoaVxNRtcnIyk88pc+jFDUTY8yfrqc5prMGq8waMW2HSbku329Wa0dNoNEKP1263mVnWnS26lXWw1A1E2PMn6zVa0+hIdN14Ojx1VXG1t9MxaGsYBjOzxMbHwKrgtUOEPX/yWKNVd+gnSgzaGeawwxf230kGKk3TjBwKsT87qVjYE13yFi2xcKtCSAYi7PmTtseeRfy+1+ttG/BsNBrbhNsr68T5ftKbNo5I6/DY43QqYslscnIydjpq2UMyEGHPnzRj7FnG7+MMGKtmvxiGoSScUUM5umLinU4nd6GrmnW73dhPSWk+DRcBiLAXg7S86qzj91FQDbs44+c6a8tkPdCpa8KXnzWbzUjzDopojUYj8nUR1OH7DcQX4fpPE4iwl5s84vdeRM2k8erkdNaVUQ2b6FoDtt1upx6qabfbY72AirP9qh2gfX34zaHodruVWVzDCUTYy00RPHa/cFDQDWsLuFPYdQmj3amFHU9SF/VaGpO37N/SPYnOORmp2+1uXU/1ep273W5m135eQIS93BRhOTA/AVX1hu32Rk1j9DP7xh7HLJZxLXNsC63usYgwB6UI138eQIS9/ITVak8
7YyZIaFVDK6qpkK1WayQTp91ub3UgtVqN2+32SKplEbzyiYkJ5e9gHDsjpwDrDEcRUaj3XYQn1jyACHt1ycqbCbqZ7bx2lRs5zMO3i3g5OytnZUlVEW80Gtpi6zpFTOX7TNOSfifM0Wfm2tlQROQ5+Svsei3KGFPWQFHYZWm8ErK0tITBYDDy2mAw8F2Gr9/vY3Z2FrVaDbOzs+j3+0qfs7y87Ll0HwAcP34cAAKXLgOAWq22tSixH/v37wcALC4uYn19HcyMjY2NreUAres9nBMnToR+Vhj1eh1EpLw2ZxjO5dt2794d+n2lQavVQqPRSHwMVYgIl1122dbfx44d27bNYDDAnj17fK9Fv2XvKr0cnhMV9Q8zAG+D1WNOq2wvHnu6RPFmonr37hBP2ml4uhdgSGLO70VHqKfRaBR6URJVi3INEBF3Oh3lMQW/a1Fi7CmHYgDsAvBVAOsQYS8EUeKPUbbNqgJkUc0pGjrCJs7VpsZ1dmutVouUqtrtdiN31H5x8ywrpxYFZCjsnwHwYgBrEGEvBFG8mSjevZ/4JPFe2+32WGSE2MXAgr7jOGaPH+ga7C3KalV+FqcDK3vcPArIQtgBXArgQ8P/ryFA2AEsAlgFsDozM5PBV1BtVLyZoIlBXl5SGpkm9qzKooRb/MxrMQddqzDpWl1qHCzONVT2TJcoQJewA/gagB942KUAbgdwKisIu9PEY8+fII/Tz7v387aSeon2jRslVuteb9XP5ubmtImS+3vRGRc3DGNsY+xRf+sonVgV4uZRQNoeO4BfA/AILEFfA/A0gEMAzgjbV4Q9f4Ieif2WGvML8XhN745idt6yijfnvtGz9nSd3qPuuHgVVmqy01ZVZqtWJW4eBWSdxw7x2AuDShgmTESDshH86r3EFSW7jnvUG1138TAVc8Z7dYemnEXRyui9O8cpwq4XCb94AxH2aqI6cKoiwnFuLlVvzNk2Fa87SkZPFtbpdFL5fPs8k3SUft9z3mMZkraYHMjM02qimr6o4hWGZSN4ee9Ra7GrZIREzejJyubm5rR71u7vPMmx7EW8nb+NfX2kMQO33W4H/tZRriPBGxH2ihJ1clLcx2E/T0vFI3TH8MM8U7+6ITo82qSdg87BWa/vPO7AtH1eTlFPowqjiokHrg8R9ooSpzhSnMfhOKLqV1o17OkhaILKOOTAq5p73dgohcGctXncnVWj0cjt6cbO0xf0IMJeUeLGLKM+DscViqBBWb99/EJCUbzQoPYWoTBYrVbb1kmpPgE5xVNnXF7H9yKTi/Qiwl5hsohZJhEQPw886tNGlDaE5ci7OwidYm8/qcQJq6hkDDnFU1eb3QuWB7U9KDddslv0IsIupEpY+CRICII88ChPG1EW6AiaJWrn0Ts7Q93ZIyrfWVDbgs7VKZ46Swo4jxv220h2SzaIsAupExYDjlKuwHlM1aeNKB67fSw/gXS3SWdMul6vb52bs8NQeSpwpj96dTZu8dTZGbk74LDfRrJb0keEXciEqCGZqBOfgojiAdsipSpiqudlL2Yd1hHEnXTkHmwOW+dTZ4xdwijFQ4RdyAQVwbLzqe36Ll6zVuM+xrs7hLBYb5Q8f5WMG2eH4PfZ9mpBScVV5Xvy2sYr28a9GpXXwK143MVDhF3IjLB8eDvG7SdKUQdNg7x7nbFgleqNzjYGCXvc0I7K0nlenVKcpx8JoxQfEXYhc4KEJ+i9KJOqvAYSvbxW3bFglQ4h6Dz8zj/Mm3fO2KzqOp/CM4iwC5kTJH5xRM/LE1Ud/AxrZxzvNGy/oPNQeZLwysl3Lp8X9clGKB8i7EIu+IlfEtGzCQv3qLYvrbQ8FfEO6hjCxgckpVAQYRcKRVgIRcWLVs3lDiJtrzdJrFol1CKx8Gojwi4UBi9P0554EwU/UbYHZ1UocpxaQi1CGKrCXoMgpMzS0hIGg8HIa8yMG2+8MdJxlpeX0Wq1Rl4jIlx55ZVYWFhQOsbMzEyk17PE6/xarRa
Wl5dzapEwtqiov24Tj71a6PSSk4YiksbB0ybvzxeKDSQUIxSFooUY/MRTBieFoqMq7GRtmy3z8/O8urqa+ecK+dDv97G4uDgSjmm1WlhZWVEOoWTB7Ows1tfXt71umibW1tayb5AguCCiA8w8H7adxNiF1FlYWMDKygpM0wQRwTTNwok6ABw6dCjS64JQVMRjF4Qh4rELRUc8dkGIiGSlCGVBhF0QhoxLyEgQwpBQjCAIwpggoRhBEISKIsIuCIJQMkTYBUEQSoYIuyAIQskQYRcEQSgZuWTFENFhANtngpSLaQBH8m5Ejsj5y/nL+evHZObTwzbKRdirABGtqqQllRU5fzl/Of/8zl9CMYIgCCVDhF0QBKFkiLCnx0reDcgZOf9qI+efIxJjFwRBKBnisQuCIJQMEXZBEISSIcKeMkT0NiJiIprOuy1ZQkTvJ6J7iehOIvocEZ2Wd5uygIguIqIfEtH9RPTOvNuTJUS0i4huIaK7ieguItqbd5vygIjqRPQ9IvpSXm0QYU8RItoF4LcBVHFttX8A8EJmfhGA+wC8K+f2pA4R1QH8DYDXAJgDcDkRzeXbqkx5GsDbmHkOwCsAvKVi52+zF8A9eTZAhD1dPgjgz2GteF8pmPnvmfnp4Z/fBnBmnu3JiJcDuJ+Zf8TMxwH8LwCX5tymzGDmnzDzd4f//zkscXtuvq3KFiI6E8DFAD6RZztE2FOCiC4F8GNm/n7ebSkAfwzgy3k3IgOeC+ABx98PomLCZkNEswBeCuD2fFuSOX8Ny5nbzLMRE3l++LhDRF8DcIbHW0sA3g0rDFNags6fmT8/3GYJ1iN6P8u2CflBRFMAbgDwZ8z8RN7tyQoiugTAI8x8gIguzLMtIuwJYOZXe71ORL8G4CwA3yciwApDfJeIXs7MD2fYxFTxO38bInojgEsAdLgaEyZ+DGCX4+8zh69VBiJqwBL1PjN/Nu/2ZMz5AP49Ee0GMAngWUTUY+Yrsm6ITFDKACJaAzDPzJWpdkdEFwH4AIALmPlw3u3JAiKagDVQ3IEl6N8B8EfMfFeuDcsIsryY/QAeZeY/y7s9eTL02N/OzJfk8fkSYxfS4moAOwH8AxHdQUTX5N2gtBkOFv9HAF+FNXB4fVVEfcj5AF4P4FXD3/yOofcqZIx47IIgCCVDPHZBEISSIcIuCIJQMkTYBUEQSoYIuyAIQskQYRcEQSgZIuyCIAglQ4RdEAShZPx/7EEwhkbXQPkAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "z1 = z[:,0]\n", + "z2 = z[:,1]\n", + "\n", + "fig = plt.figure()\n", + "ax = fig.add_subplot(111)\n", + "ax.plot(z1,z2,'ko')\n", + "plt.title(\"latent space\")\n", + "\n", + "#np.where((z1>3) & (z2<2) & (z2>0))\n", + "#select the points from the latent space\n", + "a_vec = [2,5,7,789,25,9993]\n", + "for i in range(len(a_vec)):\n", + " ax.plot(z1[a_vec[i]],z2[a_vec[i]],'ro') \n", + " ax.annotate('z%d' %i, xy=(z1[a_vec[i]],z2[a_vec[i]]), \n", + " xytext=(z1[a_vec[i]],z2[a_vec[i]]),color = 'r',fontsize=15)\n", + "\n", + "\n", + "f, ((ax0, ax1, ax2, ax3, ax4,ax5)) = plt.subplots(1,6, sharex='col', sharey='row',figsize=(12,2.5))\n", + "for i in range(len(a_vec)):\n", + " eval('ax%d' %(i)).imshow(np.reshape(x_construction[a_vec[i],:],(28,28)), interpolation='nearest', cmap=cm.Greys)\n", + " eval('ax%d' %(i)).set_title('z%d'%i)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Above is a plot of points in the 2D latent space and their corresponding decoded images, it can be seen that points that are close in the latent space get mapped to the same digit from the decoder, and we can see how it evolves from left to right." 
+ ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example/caffe/caffe_net.py b/example/caffe/caffe_net.py new file mode 100644 index 000000000000..803efda9b68e --- /dev/null +++ b/example/caffe/caffe_net.py @@ -0,0 +1,145 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""Generate helper functions to load Caffe into MXNet""" +import argparse +import mxnet as mx +from data import get_iterator +import train_model + + +def get_mlp(): + """Get multi-layer perceptron""" + data = mx.symbol.Variable('data') + fc1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }") + act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") + fc2 = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }") + act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}") + fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}") + if use_caffe_loss: + label = mx.symbol.Variable('softmax_label') + mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', + prototxt="layer{type:\"SoftmaxWithLoss\"}") + else: + mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax') + return mlp + + +def get_lenet(): + """LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick + Haffner. "Gradient-based learning applied to document recognition." 
+ Proceedings of the IEEE (1998) + """ + data = mx.symbol.Variable('data') + + # first conv + conv1 = mx.symbol.CaffeOp(data_0=data, num_weight=2, + prototxt="layer{type:\"Convolution\" " + "convolution_param { num_output: 20 kernel_size: 5 stride: 1} }") + act1 = mx.symbol.CaffeOp(data_0=conv1, prototxt="layer{type:\"TanH\"}") + pool1 = mx.symbol.CaffeOp(data_0=act1, + prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") + + # second conv + conv2 = mx.symbol.CaffeOp(data_0=pool1, num_weight=2, + prototxt="layer{type:\"Convolution\" " + "convolution_param { num_output: 50 kernel_size: 5 stride: 1} }") + act2 = mx.symbol.CaffeOp(data_0=conv2, prototxt="layer{type:\"TanH\"}") + pool2 = mx.symbol.CaffeOp(data_0=act2, + prototxt="layer{type:\"Pooling\" pooling_param { pool: MAX kernel_size: 2 stride: 2}}") + + fc1 = mx.symbol.CaffeOp(data_0=pool2, num_weight=2, + prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 500} }") + act3 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}") + + # second fullc + fc2 = mx.symbol.CaffeOp(data_0=act3, num_weight=2, + prototxt="layer{type:\"InnerProduct\"inner_product_param{num_output: 10} }") + if use_caffe_loss: + label = mx.symbol.Variable('softmax_label') + lenet = mx.symbol.CaffeLoss(data=fc2, label=label, grad_scale=1, name='softmax', + prototxt="layer{type:\"SoftmaxWithLoss\"}") + else: + lenet = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') + return lenet + + +def get_network_from_json_file(file_name): + network = mx.sym.load(file_name) + return network + + +def parse_args(): + """Parse the arguments""" + parser = argparse.ArgumentParser(description='train an image classifier on mnist') + parser.add_argument('--network', type=str, default='lenet', + help='the cnn to use (mlp | lenet | ') + parser.add_argument('--caffe-loss', type=int, default=0, + help='Use CaffeLoss symbol') + parser.add_argument('--caffe-data', action='store_true', + help='Use Caffe 
input-data layer only if specified') + parser.add_argument('--data-dir', type=str, default='mnist/', + help='the input data directory') + parser.add_argument('--gpus', type=str, + help='the gpus will be used, e.g "0,1,2,3"') + parser.add_argument('--num-examples', type=int, default=60000, + help='the number of training examples') + parser.add_argument('--batch-size', type=int, default=128, + help='the batch size') + parser.add_argument('--lr', type=float, default=.1, + help='the initial learning rate') + parser.add_argument('--model-prefix', type=str, + help='the prefix of the model to load/save') + parser.add_argument('--save-model-prefix', type=str, + help='the prefix of the model to save') + parser.add_argument('--num-epochs', type=int, default=10, + help='the number of training epochs') + parser.add_argument('--load-epoch', type=int, + help="load the model on an epoch using the model-prefix") + parser.add_argument('--kv-store', type=str, default='local', + help='the kvstore type') + parser.add_argument('--lr-factor', type=float, default=1, + help='times the lr with a factor for every lr-factor-epoch epoch') + parser.add_argument('--lr-factor-epoch', type=float, default=1, + help='the number of epoch to factor the lr, could be .5') + return parser.parse_args() + + +if __name__ == '__main__': + args = parse_args() + use_caffe_loss = args.caffe_loss + use_caffe_data = args.caffe_data + + data_shape = () + if args.network == 'mlp': + data_shape = (784, ) + net = get_mlp() + elif args.network == 'lenet': + if not use_caffe_data: + data_shape = (1, 28, 28) + net = get_lenet() + else: + net = get_network_from_json_file(args.network) + + # train + if use_caffe_loss: + train_model.fit(args, net, get_iterator(data_shape, use_caffe_data), mx.metric.Caffe()) + else: + train_model.fit(args, net, get_iterator(data_shape, use_caffe_data)) diff --git a/example/caffe/train_model.py b/example/caffe/train_model.py new file mode 100644 index 000000000000..d7dfd5d7a31e --- 
/dev/null +++ b/example/caffe/train_model.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Train module using Caffe operator in MXNet""" +import os +import logging +import mxnet as mx + + +def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): + """Train the model using Caffe operator in MXNet""" + # kvstore + kv = mx.kvstore.create(args.kv_store) + + # logging + head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' + if 'log_file' in args and args.log_file is not None: + log_file = args.log_file + log_dir = args.log_dir + log_file_full_name = os.path.join(log_dir, log_file) + if not os.path.exists(log_dir): + os.mkdir(log_dir) + logger = logging.getLogger() + handler = logging.FileHandler(log_file_full_name) + formatter = logging.Formatter(head) + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.DEBUG) + logger.info('start with arguments %s', args) + else: + logging.basicConfig(level=logging.DEBUG, format=head) + logging.info('start with arguments %s', args) + + # load model + model_prefix = args.model_prefix + if model_prefix is not None: + model_prefix += "-%d" % (kv.rank) + model_args = {} + if args.load_epoch is not None: + assert 
model_prefix is not None + tmp = mx.model.FeedForward.load(model_prefix, args.load_epoch) + model_args = {'arg_params' : tmp.arg_params, + 'aux_params' : tmp.aux_params, + 'begin_epoch' : args.load_epoch} + # save model + save_model_prefix = args.save_model_prefix + if save_model_prefix is None: + save_model_prefix = model_prefix + checkpoint = None if save_model_prefix is None else mx.callback.do_checkpoint(save_model_prefix) + + # data + (train, val) = data_loader(args, kv) + + # train + devs = mx.cpu() if args.gpus is None else [ + mx.gpu(int(i)) for i in args.gpus.split(',')] + + epoch_size = args.num_examples / args.batch_size + + if args.kv_store == 'dist_sync': + epoch_size /= kv.num_workers + model_args['epoch_size'] = epoch_size + + if 'lr_factor' in args and args.lr_factor < 1: + model_args['lr_scheduler'] = mx.lr_scheduler.FactorScheduler( + step=max(int(epoch_size * args.lr_factor_epoch), 1), + factor=args.lr_factor) + + if 'clip_gradient' in args and args.clip_gradient is not None: + model_args['clip_gradient'] = args.clip_gradient + + # disable kvstore for single device + if 'local' in kv.type and ( + args.gpus is None or len(args.gpus.split(',')) is 1): + kv = None + + mod = mx.mod.Module(network, context=devs) + + if eval_metrics is None: + eval_metrics = ['accuracy'] + # TopKAccuracy only allows top_k > 1 + for top_k in [5, 10, 20]: + eval_metrics.append(mx.metric.create('top_k_accuracy', top_k=top_k)) + + if batch_end_callback is not None: + if not isinstance(batch_end_callback, list): + batch_end_callback = [batch_end_callback] + else: + batch_end_callback = [] + batch_end_callback.append(mx.callback.Speedometer(args.batch_size, 50)) + + mod.fit(train_data=train, eval_metric=eval_metrics, eval_data=val, optimizer='sgd', + optimizer_params={'learning_rate':args.lr, 'momentum': 0.9, 'wd': 0.00001}, + num_epoch=args.num_epochs, batch_end_callback=batch_end_callback, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), + kvstore=kv, 
epoch_end_callback=checkpoint, **model_args) diff --git a/example/capsnet/capsulenet.py b/example/capsnet/capsulenet.py new file mode 100644 index 000000000000..4d455dbc504c --- /dev/null +++ b/example/capsnet/capsulenet.py @@ -0,0 +1,373 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Generate MXNet implementation of CapsNet +Reference 1: https://www.cs.toronto.edu/~fritz/absps/transauto6.pdf +Reference 2: https://arxiv.org/pdf/1710.09829.pdf +""" +import os +import re +import gzip +import struct +import numpy as np +import scipy.ndimage as ndi +import mxnet as mx +from capsulelayers import primary_caps, CapsuleLayer + +from mxboard import SummaryWriter + + +def margin_loss(y_true, y_pred): + loss = y_true * mx.sym.square(mx.sym.maximum(0., 0.9 - y_pred)) +\ + 0.5 * (1 - y_true) * mx.sym.square(mx.sym.maximum(0., y_pred - 0.1)) + return mx.sym.mean(data=mx.sym.sum(loss, 1)) + + +def capsnet(batch_size, n_class, num_routing, recon_loss_weight): + """Create CapsNet""" + # data.shape = [batch_size, 1, 28, 28] + data = mx.sym.Variable('data') + + input_shape = (1, 28, 28) + # Conv2D layer + # net.shape = [batch_size, 256, 20, 20] + conv1 = mx.sym.Convolution(data=data, + num_filter=256, + kernel=(9, 9), + layout='NCHW', + name='conv1') + conv1 = mx.sym.Activation(data=conv1, act_type='relu', name='conv1_act') + # net.shape = [batch_size, 256, 6, 6] + + primarycaps = primary_caps(data=conv1, + dim_vector=8, + n_channels=32, + kernel=(9, 9), + strides=[2, 2], + name='primarycaps') + primarycaps.infer_shape(data=(batch_size, 1, 28, 28)) + # CapsuleLayer + kernel_initializer = mx.init.Xavier(rnd_type='uniform', factor_type='avg', magnitude=3) + bias_initializer = mx.init.Zero() + digitcaps = CapsuleLayer(num_capsule=10, + dim_vector=16, + batch_size=batch_size, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + num_routing=num_routing)(primarycaps) + + # out_caps : (batch_size, 10) + out_caps = mx.sym.sqrt(data=mx.sym.sum(mx.sym.square(digitcaps), 2)) + out_caps.infer_shape(data=(batch_size, 1, 28, 28)) + + y = mx.sym.Variable('softmax_label', shape=(batch_size,)) + y_onehot = mx.sym.one_hot(y, n_class) + y_reshaped = mx.sym.Reshape(data=y_onehot, shape=(batch_size, -4, n_class, -1)) + 
y_reshaped.infer_shape(softmax_label=(batch_size,)) + + # inputs_masked : (batch_size, 16) + inputs_masked = mx.sym.linalg_gemm2(y_reshaped, digitcaps, transpose_a=True) + inputs_masked = mx.sym.Reshape(data=inputs_masked, shape=(-3, 0)) + x_recon = mx.sym.FullyConnected(data=inputs_masked, num_hidden=512, name='x_recon') + x_recon = mx.sym.Activation(data=x_recon, act_type='relu', name='x_recon_act') + x_recon = mx.sym.FullyConnected(data=x_recon, num_hidden=1024, name='x_recon2') + x_recon = mx.sym.Activation(data=x_recon, act_type='relu', name='x_recon_act2') + x_recon = mx.sym.FullyConnected(data=x_recon, num_hidden=np.prod(input_shape), name='x_recon3') + x_recon = mx.sym.Activation(data=x_recon, act_type='sigmoid', name='x_recon_act3') + + data_flatten = mx.sym.flatten(data=data) + squared_error = mx.sym.square(x_recon-data_flatten) + recon_error = mx.sym.mean(squared_error) + recon_error_stopped = recon_error + recon_error_stopped = mx.sym.BlockGrad(recon_error_stopped) + loss = mx.symbol.MakeLoss((1-recon_loss_weight)*margin_loss(y_onehot, out_caps)+recon_loss_weight*recon_error) + + out_caps_blocked = out_caps + out_caps_blocked = mx.sym.BlockGrad(out_caps_blocked) + return mx.sym.Group([out_caps_blocked, loss, recon_error_stopped]) + + +def download_data(url, force_download=False): + fname = url.split("/")[-1] + if force_download or not os.path.exists(fname): + mx.test_utils.download(url, fname) + return fname + + +def read_data(label_url, image_url): + with gzip.open(download_data(label_url)) as flbl: + magic, num = struct.unpack(">II", flbl.read(8)) + label = np.fromstring(flbl.read(), dtype=np.int8) + with gzip.open(download_data(image_url), 'rb') as fimg: + magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16)) + image = np.fromstring(fimg.read(), dtype=np.uint8) + np.reshape(image, len(label), (rows, cols)) + return label, image + + +def to4d(img): + return img.reshape(img.shape[0], 1, 28, 28).astype(np.float32)/255 + + +class 
LossMetric(mx.metric.EvalMetric): + """Evaluate the loss function""" + def __init__(self, batch_size, num_gpus): + super(LossMetric, self).__init__('LossMetric') + self.batch_size = batch_size + self.num_gpu = num_gpus + self.sum_metric = 0 + self.num_inst = 0 + self.loss = 0.0 + self.batch_sum_metric = 0 + self.batch_num_inst = 0 + self.batch_loss = 0.0 + self.recon_loss = 0.0 + self.n_batch = 0 + + def update(self, labels, preds): + """Update the hyper-parameters and loss of CapsNet""" + batch_sum_metric = 0 + batch_num_inst = 0 + for label, pred_outcaps in zip(labels[0], preds[0]): + label_np = int(label.asnumpy()) + pred_label = int(np.argmax(pred_outcaps.asnumpy())) + batch_sum_metric += int(label_np == pred_label) + batch_num_inst += 1 + batch_loss = preds[1].asnumpy() + recon_loss = preds[2].asnumpy() + self.sum_metric += batch_sum_metric + self.num_inst += batch_num_inst + self.loss += batch_loss + self.recon_loss += recon_loss + self.batch_sum_metric = batch_sum_metric + self.batch_num_inst = batch_num_inst + self.batch_loss = batch_loss + self.n_batch += 1 + + def get_name_value(self): + acc = float(self.sum_metric)/float(self.num_inst) + mean_loss = self.loss / float(self.n_batch) + mean_recon_loss = self.recon_loss / float(self.n_batch) + return acc, mean_loss, mean_recon_loss + + def get_batch_log(self, n_batch): + print("n_batch :"+str(n_batch)+" batch_acc:" + + str(float(self.batch_sum_metric) / float(self.batch_num_inst)) + + ' batch_loss:' + str(float(self.batch_loss)/float(self.batch_num_inst))) + self.batch_sum_metric = 0 + self.batch_num_inst = 0 + self.batch_loss = 0.0 + + def reset(self): + self.sum_metric = 0 + self.num_inst = 0 + self.loss = 0.0 + self.recon_loss = 0.0 + self.n_batch = 0 + + +class SimpleLRScheduler(mx.lr_scheduler.LRScheduler): + """A simple lr schedule that simply return `dynamic_lr`. We will set `dynamic_lr` + dynamically based on performance on the validation set. 
+ """ + + def __init__(self, learning_rate=0.001): + super(SimpleLRScheduler, self).__init__() + self.learning_rate = learning_rate + + def __call__(self, num_update): + return self.learning_rate + + +def do_training(num_epoch, optimizer, kvstore, learning_rate, model_prefix, decay): + """Perform CapsNet training""" + summary_writer = SummaryWriter(args.tblog_dir) + lr_scheduler = SimpleLRScheduler(learning_rate) + optimizer_params = {'lr_scheduler': lr_scheduler} + module.init_params() + module.init_optimizer(kvstore=kvstore, + optimizer=optimizer, + optimizer_params=optimizer_params) + n_epoch = 0 + while True: + if n_epoch >= num_epoch: + break + train_iter.reset() + val_iter.reset() + loss_metric.reset() + for n_batch, data_batch in enumerate(train_iter): + module.forward_backward(data_batch) + module.update() + module.update_metric(loss_metric, data_batch.label) + loss_metric.get_batch_log(n_batch) + train_acc, train_loss, train_recon_err = loss_metric.get_name_value() + loss_metric.reset() + for n_batch, data_batch in enumerate(val_iter): + module.forward(data_batch) + module.update_metric(loss_metric, data_batch.label) + loss_metric.get_batch_log(n_batch) + val_acc, val_loss, val_recon_err = loss_metric.get_name_value() + + summary_writer.add_scalar('train_acc', train_acc, n_epoch) + summary_writer.add_scalar('train_loss', train_loss, n_epoch) + summary_writer.add_scalar('train_recon_err', train_recon_err, n_epoch) + summary_writer.add_scalar('val_acc', val_acc, n_epoch) + summary_writer.add_scalar('val_loss', val_loss, n_epoch) + summary_writer.add_scalar('val_recon_err', val_recon_err, n_epoch) + + print('Epoch[%d] train acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, train_acc, train_loss, + train_recon_err)) + print('Epoch[%d] val acc: %.4f loss: %.6f recon_err: %.6f' % (n_epoch, val_acc, val_loss, val_recon_err)) + print('SAVE CHECKPOINT') + + module.save_checkpoint(prefix=model_prefix, epoch=n_epoch) + n_epoch += 1 + lr_scheduler.learning_rate = 
learning_rate * (decay ** n_epoch)


def apply_transform(x, transform_matrix, fill_mode='nearest', cval=0.):
    """Apply an affine transform to every slice of ``x`` along axis 0.

    Parameters
    ----------
    x : numpy.ndarray
        Input array; each ``x[i]`` is transformed independently.
    transform_matrix : numpy.ndarray
        Its ``[:2, :2]`` block is used as the affine matrix and its
        ``[:2, 2]`` column as the offset.
    fill_mode : str
        Forwarded as ``mode`` to ``affine_transform``.
    cval : float
        Forwarded as ``cval`` to ``affine_transform``.

    Returns
    -------
    numpy.ndarray
        The transformed slices stacked back along axis 0.
    """
    affine_matrix = transform_matrix[:2, :2]
    offset = transform_matrix[:2, 2]
    # order=0 selects nearest-neighbour interpolation, so no new pixel values
    # are invented by the resampling itself.
    transformed = [
        ndi.interpolation.affine_transform(
            channel,
            affine_matrix,
            offset,
            order=0,
            mode=fill_mode,
            cval=cval)
        for channel in x
    ]
    # Rolling axis 0 to position 0 (before) or 1 (after) is an identity, so the
    # stacked result is returned directly.
    return np.stack(transformed, axis=0)


def random_shift(x, width_shift_fraction, height_shift_fraction):
    """Translate ``x`` by a random offset, filling with the nearest value.

    ``tx`` is drawn uniformly from ``+-height_shift_fraction`` and scaled by
    ``x.shape[2]``; ``ty`` is drawn from ``+-width_shift_fraction`` and scaled
    by ``x.shape[1]``.
    """
    # NOTE(review): the height fraction is paired with shape[2] and the width
    # fraction with shape[1]; harmless for square inputs -- confirm axis order.
    tx = np.random.uniform(-height_shift_fraction, height_shift_fraction) * x.shape[2]
    ty = np.random.uniform(-width_shift_fraction, width_shift_fraction) * x.shape[1]
    shift_matrix = np.array([[1, 0, tx],
                             [0, 1, ty],
                             [0, 0, 1]])
    return apply_transform(x, shift_matrix, 'nearest')


def _shuffle(data, idx):
    """Return ``data`` with every array re-ordered by ``idx``.

    ``data`` is iterated as ``(name, ndarray)`` pairs; each array is copied to
    NumPy, indexed with ``idx`` and rebuilt on its original context.
    """
    return [(name, mx.ndarray.array(arr.asnumpy()[idx], arr.context))
            for name, arr in data]


class MNISTCustomIter(mx.io.NDArrayIter):
    """NDArrayIter for MNIST that reshuffles and randomly shifts training data."""

    def __init__(self, data, label, batch_size, shuffle):
        # reset() reads is_train; define it before the base constructor runs
        # in case that constructor calls reset().
        self.is_train = False
        # The base constructor builds the state the methods below rely on
        # (idx, num_data, cursor, last_batch_handle and the (name, array) list
        # form of data/label).
        super(MNISTCustomIter, self).__init__(
            data=data, label=label, batch_size=batch_size, shuffle=shuffle)

    def reset(self):
        """Rewind the iterator, reshuffling the samples when training."""
        if self.is_train:
            np.random.shuffle(self.idx)
            self.data = _shuffle(self.data, self.idx)
            self.label = _shuffle(self.label, self.idx)

        if self.last_batch_handle == 'roll_over' and self.cursor > self.num_data:
            # Carry the overshoot of the previous epoch into the next one.
            self.cursor = -self.batch_size + (self.cursor % self.num_data) % self.batch_size
        else:
            self.cursor = -self.batch_size

    def set_is_train(self, is_train):
        """Set training flag"""
        self.is_train = is_train

    def next(self):
        """Return the next batch; training batches get a random shift per sample.

        Raises
        ------
        StopIteration
            When ``iter_next()`` reports that no batch is left.
        """
        if not self.iter_next():
            raise StopIteration

        if not self.is_train:
            return mx.io.DataBatch(data=self.getdata(), label=self.getlabel(),
                                   pad=self.getpad(), index=None)

        data_raw_list = self.getdata()
        data_shifted = [random_shift(data_raw.asnumpy(), 0.1, 0.1)
                        for data_raw in data_raw_list[0]]
        return mx.io.DataBatch(data=[mx.nd.array(data_shifted)], label=self.getlabel(),
                               pad=self.getpad(), index=None)


if __name__ == "__main__":
    # Read mnist data set
    path = 'http://yann.lecun.com/exdb/mnist/'
    (train_lbl, train_img) = read_data(path + 'train-labels-idx1-ubyte.gz', path + 'train-images-idx3-ubyte.gz')
    (val_lbl, val_img) = read_data(path + 't10k-labels-idx1-ubyte.gz', path + 't10k-images-idx3-ubyte.gz')

    # set batch size
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', default=100, type=int)
    parser.add_argument('--devices', default='gpu0', type=str)
    parser.add_argument('--num_epoch', default=100, type=int)
    parser.add_argument('--lr', default=0.001, type=float)
    parser.add_argument('--num_routing', default=3, type=int)
    parser.add_argument('--model_prefix', default='capsnet', type=str)
    parser.add_argument('--decay', default=0.9, type=float)
    parser.add_argument('--tblog_dir', default='tblog', type=str)
    parser.add_argument('--recon_loss_weight', default=0.392, type=float)
    args = parser.parse_args()
    for k, v in sorted(vars(args).items()):
        print("{0}: {1}".format(k, v))

    # Turn names such as "gpu0,gpu1" or "cpu" into mxnet contexts.
    contexts = re.split(r'\W+', args.devices)
    for i, ctx in enumerate(contexts):
        if ctx[:3] == 'gpu':
            contexts[i] = mx.context.gpu(int(ctx[3:]))
        else:
            contexts[i] = mx.context.cpu()
    num_gpu = len(contexts)

    if args.batch_size % num_gpu != 0:
        raise Exception('num_gpu should be positive divisor of batch_size')
    # Exact because of the divisibility check above; keeps the value an int.
    batch_per_device = args.batch_size // num_gpu

    # generate train_iter, val_iter
    train_iter = MNISTCustomIter(data=to4d(train_img), label=train_lbl, batch_size=int(args.batch_size), shuffle=True)
    train_iter.set_is_train(True)
    val_iter = MNISTCustomIter(data=to4d(val_img), label=val_lbl, batch_size=int(args.batch_size), shuffle=True)
    val_iter.set_is_train(False)
    # define capsnet
    final_net = capsnet(batch_size=batch_per_device,
                        n_class=10,
                        num_routing=args.num_routing,
                        recon_loss_weight=args.recon_loss_weight)
    # set metric
    loss_metric = LossMetric(batch_per_device, 1)

    # run model
    module = mx.mod.Module(symbol=final_net, context=contexts, data_names=('data',), label_names=('softmax_label',))
    # Data and label shapes both come from the training iterator.
    module.bind(data_shapes=train_iter.provide_data,
                label_shapes=train_iter.provide_label,
                for_training=True)

    do_training(num_epoch=args.num_epoch, optimizer='adam', kvstore='device', learning_rate=args.lr,
                model_prefix=args.model_prefix, decay=args.decay)
# diff --git a/example/ctc/lstm_ocr_train.py b/example/ctc/lstm_ocr_train.py new file mode 100644 index 000000000000..49d9531920ae --- /dev/null +++ b/example/ctc/lstm_ocr_train.py @@ -0,0 +1,125 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
+""" An example of using WarpCTC loss for an OCR problem using LSTM and CAPTCHA image data""" + +from __future__ import print_function + +import argparse +import logging +import os + +from captcha_generator import MPDigitCaptcha +from hyperparams import Hyperparams +from ctc_metrics import CtcMetrics +import lstm +import mxnet as mx +from ocr_iter import OCRIter + + +def get_fonts(path): + fonts = list() + if os.path.isdir(path): + for filename in os.listdir(path): + if filename.endswith('.ttf'): + fonts.append(os.path.join(path, filename)) + else: + fonts.append(path) + return fonts + + +def parse_args(): + """Parse command line arguments""" + parser = argparse.ArgumentParser() + parser.add_argument("font_path", help="Path to ttf font file or directory containing ttf files") + parser.add_argument("--loss", help="'ctc' or 'warpctc' loss [Default 'ctc']", default='ctc') + parser.add_argument("--cpu", + help="Number of CPUs for training [Default 8]. Ignored if --gpu is specified.", + type=int, default=8) + parser.add_argument("--gpu", help="Number of GPUs for training [Default 0]", type=int) + parser.add_argument("--num_proc", help="Number CAPTCHA generating processes [Default 4]", type=int, default=4) + parser.add_argument("--prefix", help="Checkpoint prefix [Default 'ocr']", default='ocr') + return parser.parse_args() + + +def main(): + """Program entry point""" + args = parse_args() + if not any(args.loss == s for s in ['ctc', 'warpctc']): + raise ValueError("Invalid loss '{}' (must be 'ctc' or 'warpctc')".format(args.loss)) + + hp = Hyperparams() + + # Start a multiprocessor captcha image generator + mp_captcha = MPDigitCaptcha( + font_paths=get_fonts(args.font_path), h=hp.seq_length, w=30, + num_digit_min=3, num_digit_max=4, num_processes=args.num_proc, max_queue_size=hp.batch_size * 2) + try: + # Must call start() before any call to mxnet module (https://github.com/apache/incubator-mxnet/issues/9213) + mp_captcha.start() + + if args.gpu: + contexts = 
[mx.context.gpu(i) for i in range(args.gpu)] + else: + contexts = [mx.context.cpu(i) for i in range(args.cpu)] + + init_states = lstm.init_states(hp.batch_size, hp.num_lstm_layer, hp.num_hidden) + + data_train = OCRIter( + hp.train_epoch_size // hp.batch_size, hp.batch_size, init_states, captcha=mp_captcha, name='train') + data_val = OCRIter( + hp.eval_epoch_size // hp.batch_size, hp.batch_size, init_states, captcha=mp_captcha, name='val') + + symbol = lstm.lstm_unroll( + num_lstm_layer=hp.num_lstm_layer, + seq_len=hp.seq_length, + num_hidden=hp.num_hidden, + num_label=hp.num_label, + loss_type=args.loss) + + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + + module = mx.mod.Module( + symbol, + data_names=['data', 'l0_init_c', 'l0_init_h', 'l1_init_c', 'l1_init_h'], + label_names=['label'], + context=contexts) + + metrics = CtcMetrics(hp.seq_length) + module.fit(train_data=data_train, + eval_data=data_val, + # use metrics.accuracy or metrics.accuracy_lcs + eval_metric=mx.metric.np(metrics.accuracy, allow_extra_outputs=True), + optimizer='sgd', + optimizer_params={'learning_rate': hp.learning_rate, + 'momentum': hp.momentum, + 'wd': 0.00001, + }, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), + num_epoch=hp.num_epoch, + batch_end_callback=mx.callback.Speedometer(hp.batch_size, 50), + epoch_end_callback=mx.callback.do_checkpoint(args.prefix), + ) + except KeyboardInterrupt: + print("W: interrupt received, stopping...") + finally: + # Reset multiprocessing captcha generator to stop processes + mp_captcha.reset() + + +if __name__ == '__main__': + main() diff --git a/example/deep-embedded-clustering/autoencoder.py b/example/deep-embedded-clustering/autoencoder.py new file mode 100644 index 000000000000..c75634475e3a --- /dev/null +++ b/example/deep-embedded-clustering/autoencoder.py @@ -0,0 +1,205 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
# See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=missing-docstring, arguments-differ
from __future__ import print_function

import logging
import numpy as np
import mxnet as mx
import model
from solver import Solver, Monitor


class AutoEncoderModel(model.MXModel):
    """Stacked autoencoder built from per-layer encoder/decoder pairs.

    ``setup`` creates one single-layer autoencoder symbol per entry of
    ``dims`` (kept in ``self.stacks``) plus the full encoder, decoder and loss
    symbols. ``self.args``, ``self.args_grad``, ``self.args_mult``,
    ``self.auxs`` and ``self.xpu`` are used but not created here -- presumably
    provided by ``model.MXModel``; confirm against that class.
    """

    def setup(self, dims, sparseness_penalty=None, pt_dropout=None,
              ft_dropout=None, input_act=None, internal_act='relu', output_act=None):
        """Build all symbols and parameter arrays for the layer sizes ``dims``.

        ``dims[i]`` -> ``dims[i+1]`` is the i-th encoder layer. ``pt_dropout``
        is used in the per-layer stacks, ``ft_dropout`` in the full
        encoder/decoder. ``input_act`` / ``output_act`` are the activations at
        the reconstructed-input end and the code end; ``internal_act`` is used
        everywhere else.
        """
        self.N = len(dims) - 1
        self.dims = dims
        self.stacks = []
        self.pt_dropout = pt_dropout
        self.ft_dropout = ft_dropout
        self.input_act = input_act
        self.internal_act = internal_act
        self.output_act = output_act

        self.data = mx.symbol.Variable('data')
        for i in range(self.N):
            # The first stack reconstructs the raw input: it decodes with
            # input_act and gets no input dropout.
            if i == 0:
                decoder_act = input_act
                idropout = None
            else:
                decoder_act = internal_act
                idropout = pt_dropout
            # The last stack produces the final code: it encodes with
            # output_act and gets no output dropout.
            if i == self.N-1:
                encoder_act = output_act
                odropout = None
            else:
                encoder_act = internal_act
                odropout = pt_dropout
            istack, iargs, iargs_grad, iargs_mult, iauxs = self.make_stack(
                i, self.data, dims[i], dims[i+1], sparseness_penalty,
                idropout, odropout, encoder_act, decoder_act
            )
            self.stacks.append(istack)
            self.args.update(iargs)
            self.args_grad.update(iargs_grad)
            self.args_mult.update(iargs_mult)
            self.auxs.update(iauxs)
        self.encoder, self.internals = self.make_encoder(
            self.data, dims, sparseness_penalty, ft_dropout, internal_act, output_act)
        self.decoder = self.make_decoder(
            self.encoder, dims, sparseness_penalty, ft_dropout, internal_act, input_act)
        # With a softmax input activation the decoder symbol is used as the
        # loss directly; otherwise a regression loss against the input is added.
        if input_act == 'softmax':
            self.loss = self.decoder
        else:
            self.loss = mx.symbol.LinearRegressionOutput(data=self.decoder, label=self.data)

    def make_stack(self, istack, data, num_input, num_hidden, sparseness_penalty=None,
                   idropout=None, odropout=None, encoder_act='relu', decoder_act='relu'):
        """Build the ``istack``-th single-layer autoencoder and its parameters.

        Returns
        -------
        tuple
            ``(symbol, args, args_grad, args_mult, auxs)``: the loss symbol and
            dicts keyed by ``encoder_<i>_*`` / ``decoder_<i>_*`` parameter
            names (``auxs`` by ``sparse_*_moving_avg`` names).
        """
        x = data
        if idropout:
            x = mx.symbol.Dropout(data=x, p=idropout)
        x = mx.symbol.FullyConnected(name='encoder_%d'%istack, data=x, num_hidden=num_hidden)
        if encoder_act:
            x = mx.symbol.Activation(data=x, act_type=encoder_act)
            # The KL sparsity regulariser is attached only after sigmoid.
            if encoder_act == 'sigmoid' and sparseness_penalty:
                x = mx.symbol.IdentityAttachKLSparseReg(
                    data=x, name='sparse_encoder_%d' % istack, penalty=sparseness_penalty)
        if odropout:
            x = mx.symbol.Dropout(data=x, p=odropout)
        x = mx.symbol.FullyConnected(name='decoder_%d'%istack, data=x, num_hidden=num_input)
        if decoder_act == 'softmax':
            x = mx.symbol.Softmax(data=x, label=data, prob_label=True, act_type=decoder_act)
        elif decoder_act:
            x = mx.symbol.Activation(data=x, act_type=decoder_act)
            if decoder_act == 'sigmoid' and sparseness_penalty:
                x = mx.symbol.IdentityAttachKLSparseReg(
                    data=x, name='sparse_decoder_%d' % istack, penalty=sparseness_penalty)
            x = mx.symbol.LinearRegressionOutput(data=x, label=data)
        else:
            x = mx.symbol.LinearRegressionOutput(data=x, label=data)

        # Parameter and gradient buffers, allocated on self.xpu.
        args = {'encoder_%d_weight' % istack: mx.nd.empty((num_hidden, num_input), self.xpu),
                'encoder_%d_bias' % istack: mx.nd.empty((num_hidden,), self.xpu),
                'decoder_%d_weight' % istack: mx.nd.empty((num_input, num_hidden), self.xpu),
                'decoder_%d_bias' % istack: mx.nd.empty((num_input,), self.xpu), }
        args_grad = {'encoder_%d_weight' % istack: mx.nd.empty((num_hidden, num_input), self.xpu),
                     'encoder_%d_bias' % istack: mx.nd.empty((num_hidden,), self.xpu),
                     'decoder_%d_weight' % istack: mx.nd.empty((num_input, num_hidden), self.xpu),
                     'decoder_%d_bias' % istack: mx.nd.empty((num_input,), self.xpu), }
        # Per-parameter multipliers: 1.0 for weights, 2.0 for biases.
        # NOTE(review): how these are consumed is not visible here -- confirm.
        args_mult = {'encoder_%d_weight' % istack: 1.0,
                     'encoder_%d_bias' % istack: 2.0,
                     'decoder_%d_weight' % istack: 1.0,
                     'decoder_%d_bias' % istack: 2.0, }
        auxs = {}
        # Moving-average state for each sparsity regulariser starts at 0.5.
        if encoder_act == 'sigmoid' and sparseness_penalty:
            auxs['sparse_encoder_%d_moving_avg' % istack] = mx.nd.ones(num_hidden, self.xpu) * 0.5
        if decoder_act == 'sigmoid' and sparseness_penalty:
            auxs['sparse_decoder_%d_moving_avg' % istack] = mx.nd.ones(num_input, self.xpu) * 0.5
        init = mx.initializer.Uniform(0.07)
        for k, v in args.items():
            init(mx.initializer.InitDesc(k), v)

        return x, args, args_grad, args_mult, auxs

    def make_encoder(self, data, dims, sparseness_penalty=None, dropout=None, internal_act='relu',
                     output_act=None):
        """Chain all encoder layers on ``data``.

        Returns
        -------
        tuple
            ``(x, internals)``: the final encoder symbol and the list of the
            symbols after each layer (``internals[-1]`` is ``x``).
        """
        x = data
        internals = []
        N = len(dims) - 1
        for i in range(N):
            # Layer names match make_stack so the same parameters are shared.
            x = mx.symbol.FullyConnected(name='encoder_%d'%i, data=x, num_hidden=dims[i+1])
            if internal_act and i < N-1:
                x = mx.symbol.Activation(data=x, act_type=internal_act)
                if internal_act == 'sigmoid' and sparseness_penalty:
                    x = mx.symbol.IdentityAttachKLSparseReg(
                        data=x, name='sparse_encoder_%d' % i, penalty=sparseness_penalty)
            elif output_act and i == N-1:
                x = mx.symbol.Activation(data=x, act_type=output_act)
                if output_act == 'sigmoid' and sparseness_penalty:
                    x = mx.symbol.IdentityAttachKLSparseReg(
                        data=x, name='sparse_encoder_%d' % i, penalty=sparseness_penalty)
            if dropout:
                x = mx.symbol.Dropout(data=x, p=dropout)
            internals.append(x)
        return x, internals

    def make_decoder(self, feature, dims, sparseness_penalty=None, dropout=None,
                     internal_act='relu', input_act=None):
        """Chain all decoder layers on ``feature``, from the last layer to the first.

        Returns the symbol after ``decoder_0``.
        """
        x = feature
        N = len(dims) - 1
        for i in reversed(range(N)):
            x = mx.symbol.FullyConnected(name='decoder_%d'%i, data=x, num_hidden=dims[i])
            if internal_act and i > 0:
                x = mx.symbol.Activation(data=x, act_type=internal_act)
                if internal_act == 'sigmoid' and sparseness_penalty:
                    x = mx.symbol.IdentityAttachKLSparseReg(
                        data=x, name='sparse_decoder_%d' % i, penalty=sparseness_penalty)
            elif input_act and i == 0:
                x = mx.symbol.Activation(data=x, act_type=input_act)
                if input_act == 'sigmoid' and sparseness_penalty:
                    x = mx.symbol.IdentityAttachKLSparseReg(
                        data=x, name='sparse_decoder_%d' % i, penalty=sparseness_penalty)
            # No dropout after the final (i == 0) reconstruction layer.
            if dropout and i > 0:
                x = mx.symbol.Dropout(data=x, p=dropout)
        return x

    def layerwise_pretrain(self, X, batch_size, n_iter, optimizer, l_rate, decay,
                           lr_scheduler=None, print_every=1000):
        """Train each per-layer stack in turn.

        Stack 0 is trained on ``X``; stack ``i > 0`` is trained on the output
        of encoder layer ``i-1`` extracted with the current parameters.
        ``decay`` is passed to the solver as weight decay (``wd``).
        """
        def l2_norm(label, pred):
            return np.mean(np.square(label-pred))/2.0
        solver = Solver(optimizer, momentum=0.9, wd=decay, learning_rate=l_rate,
                        lr_scheduler=lr_scheduler)
        solver.set_metric(mx.metric.CustomMetric(l2_norm))
        solver.set_monitor(Monitor(print_every))
        data_iter = mx.io.NDArrayIter({'data': X}, batch_size=batch_size, shuffle=True,
                                      last_batch_handle='roll_over')
        for i in range(self.N):
            if i == 0:
                data_iter_i = data_iter
            else:
                # Features of the previous layer become this stack's input.
                X_i = list(model.extract_feature(
                    self.internals[i-1], self.args, self.auxs, data_iter, X.shape[0],
                    self.xpu).values())[0]
                data_iter_i = mx.io.NDArrayIter({'data': X_i}, batch_size=batch_size,
                                                last_batch_handle='roll_over')
            logging.info('Pre-training layer %d...', i)
            solver.solve(self.xpu, self.stacks[i], self.args, self.args_grad, self.auxs,
                         data_iter_i, 0, n_iter, {}, False)

    def finetune(self, X, batch_size, n_iter, optimizer, l_rate, decay, lr_scheduler=None,
                 print_every=1000):
        """Train the full autoencoder (``self.loss``) end to end on ``X``."""
        def l2_norm(label, pred):
            return np.mean(np.square(label-pred))/2.0
        solver = Solver(optimizer, momentum=0.9, wd=decay, learning_rate=l_rate,
                        lr_scheduler=lr_scheduler)
        solver.set_metric(mx.metric.CustomMetric(l2_norm))
        solver.set_monitor(Monitor(print_every))
        data_iter = mx.io.NDArrayIter({'data': X}, batch_size=batch_size, shuffle=True,
                                      last_batch_handle='roll_over')
        logging.info('Fine tuning...')
        solver.solve(self.xpu, self.loss, self.args, self.args_grad, self.auxs, data_iter,
                     0, n_iter, {}, False)

    def eval(self, X):
        """Return half the mean squared reconstruction error of ``X``."""
        batch_size = 100
        data_iter = mx.io.NDArrayIter({'data': X}, batch_size=batch_size, shuffle=False,
                                      last_batch_handle='pad')
        Y = list(model.extract_feature(
            self.loss, self.args, self.auxs, data_iter, X.shape[0], self.xpu).values())[0]
        return np.mean(np.square(Y-X))/2.0
# diff --git a/example/deep-embedded-clustering/dec.py b/example/deep-embedded-clustering/dec.py new file mode 100644 index 000000000000..8fb3891e3e99 --- /dev/null +++ b/example/deep-embedded-clustering/dec.py @@ -0,0 +1,178 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
+ +# pylint: skip-file +from __future__ import print_function + +import os +import logging +import numpy as np +from sklearn.cluster import KMeans +from scipy.spatial.distance import cdist +import mxnet as mx +import data +import model +from autoencoder import AutoEncoderModel +from solver import Solver, Monitor + + +def cluster_acc(Y_pred, Y): + from sklearn.utils.linear_assignment_ import linear_assignment + assert Y_pred.size == Y.size + D = max(Y_pred.max(), Y.max())+1 + w = np.zeros((D, D), dtype=np.int64) + for i in range(Y_pred.size): + w[Y_pred[i], int(Y[i])] += 1 + ind = linear_assignment(w.max() - w) + return sum([w[i, j] for i, j in ind])*1.0/Y_pred.size, w + + +class DECModel(model.MXModel): + class DECLoss(mx.operator.NumpyOp): + def __init__(self, num_centers, alpha): + super(DECModel.DECLoss, self).__init__(need_top_grad=False) + self.num_centers = num_centers + self.alpha = alpha + + def forward(self, in_data, out_data): + z = in_data[0] + mu = in_data[1] + q = out_data[0] + self.mask = 1.0/(1.0+cdist(z, mu)**2/self.alpha) + q[:] = self.mask**((self.alpha+1.0)/2.0) + q[:] = (q.T/q.sum(axis=1)).T + + def backward(self, out_grad, in_data, out_data, in_grad): + q = out_data[0] + z = in_data[0] + mu = in_data[1] + p = in_data[2] + dz = in_grad[0] + dmu = in_grad[1] + self.mask *= (self.alpha+1.0)/self.alpha*(p-q) + dz[:] = (z.T*self.mask.sum(axis=1)).T - self.mask.dot(mu) + dmu[:] = (mu.T*self.mask.sum(axis=0)).T - self.mask.T.dot(z) + + def infer_shape(self, in_shape): + assert len(in_shape) == 3 + assert len(in_shape[0]) == 2 + input_shape = in_shape[0] + label_shape = (input_shape[0], self.num_centers) + mu_shape = (self.num_centers, input_shape[1]) + out_shape = (input_shape[0], self.num_centers) + return [input_shape, mu_shape, label_shape], [out_shape] + + def list_arguments(self): + return ['data', 'mu', 'label'] + + def setup(self, X, num_centers, alpha, save_to='dec_model'): + sep = X.shape[0]*9//10 + X_train = X[:sep] + X_val = X[sep:] + 
ae_model = AutoEncoderModel(self.xpu, [X.shape[1], 500, 500, 2000, 10], pt_dropout=0.2) + if not os.path.exists(save_to+'_pt.arg'): + ae_model.layerwise_pretrain(X_train, 256, 50000, 'sgd', l_rate=0.1, decay=0.0, + lr_scheduler=mx.lr_scheduler.FactorScheduler(20000, 0.1)) + ae_model.finetune(X_train, 256, 100000, 'sgd', l_rate=0.1, decay=0.0, + lr_scheduler=mx.lr_scheduler.FactorScheduler(20000, 0.1)) + ae_model.save(save_to+'_pt.arg') + logging.log(logging.INFO, "Autoencoder Training error: %f"%ae_model.eval(X_train)) + logging.log(logging.INFO, "Autoencoder Validation error: %f"%ae_model.eval(X_val)) + else: + ae_model.load(save_to+'_pt.arg') + self.ae_model = ae_model + + self.dec_op = DECModel.DECLoss(num_centers, alpha) + label = mx.sym.Variable('label') + self.feature = self.ae_model.encoder + self.loss = self.dec_op(data=self.ae_model.encoder, label=label, name='dec') + self.args.update({k: v for k, v in self.ae_model.args.items() if k in self.ae_model.encoder.list_arguments()}) + self.args['dec_mu'] = mx.nd.empty((num_centers, self.ae_model.dims[-1]), ctx=self.xpu) + self.args_grad.update({k: mx.nd.empty(v.shape, ctx=self.xpu) for k, v in self.args.items()}) + self.args_mult.update({k: k.endswith('bias') and 2.0 or 1.0 for k in self.args}) + self.num_centers = num_centers + + def cluster(self, X, y=None, update_interval=None): + N = X.shape[0] + if not update_interval: + update_interval = N + batch_size = 256 + test_iter = mx.io.NDArrayIter({'data': X}, batch_size=batch_size, shuffle=False, + last_batch_handle='pad') + args = {k: mx.nd.array(v.asnumpy(), ctx=self.xpu) for k, v in self.args.items()} + z = list(model.extract_feature(self.feature, args, None, test_iter, N, self.xpu).values())[0] + kmeans = KMeans(self.num_centers, n_init=20) + kmeans.fit(z) + args['dec_mu'][:] = kmeans.cluster_centers_ + solver = Solver('sgd', momentum=0.9, wd=0.0, learning_rate=0.01) + + def ce(label, pred): + return np.sum(label*np.log(label/(pred+0.000001)))/label.shape[0] 
+ solver.set_metric(mx.metric.CustomMetric(ce)) + + label_buff = np.zeros((X.shape[0], self.num_centers)) + train_iter = mx.io.NDArrayIter({'data': X}, {'label': label_buff}, batch_size=batch_size, + shuffle=False, last_batch_handle='roll_over') + self.y_pred = np.zeros((X.shape[0])) + + def refresh(i): + if i%update_interval == 0: + z = list(model.extract_feature(self.feature, args, None, test_iter, N, self.xpu).values())[0] + p = np.zeros((z.shape[0], self.num_centers)) + self.dec_op.forward([z, args['dec_mu'].asnumpy()], [p]) + y_pred = p.argmax(axis=1) + print(np.std(np.bincount(y_pred)), np.bincount(y_pred)) + print(np.std(np.bincount(y.astype(np.int))), np.bincount(y.astype(np.int))) + if y is not None: + print(cluster_acc(y_pred, y)[0]) + weight = 1.0/p.sum(axis=0) + weight *= self.num_centers/weight.sum() + p = (p**2)*weight + train_iter.data_list[1][:] = (p.T/p.sum(axis=1)).T + print(np.sum(y_pred != self.y_pred), 0.001*y_pred.shape[0]) + if np.sum(y_pred != self.y_pred) < 0.001*y_pred.shape[0]: + self.y_pred = y_pred + return True + self.y_pred = y_pred + solver.set_iter_start_callback(refresh) + solver.set_monitor(Monitor(50)) + + solver.solve(self.xpu, self.loss, args, self.args_grad, None, + train_iter, 0, 1000000000, {}, False) + self.end_args = args + if y is not None: + return cluster_acc(self.y_pred, y)[0] + else: + return -1 + + +def mnist_exp(xpu): + X, Y = data.get_mnist() + if not os.path.isdir('data'): + os.makedirs('data') + dec_model = DECModel(xpu, X, 10, 1.0, 'data/mnist') + acc = [] + for i in [10*(2**j) for j in range(9)]: + acc.append(dec_model.cluster(X, Y, i)) + logging.log(logging.INFO, 'Clustering Acc: %f at update interval: %d'%(acc[-1], i)) + logging.info(str(acc)) + logging.info('Best Clustering ACC: %f at update_interval: %d'%(np.max(acc), 10*(2**np.argmax(acc)))) + + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO) + mnist_exp(mx.gpu(0)) diff --git a/example/distributed_training-horovod/gluon_mnist.py 
b/example/distributed_training-horovod/gluon_mnist.py index c2e6f0bdc533..7b39f5776a42 100644 --- a/example/distributed_training-horovod/gluon_mnist.py +++ b/example/distributed_training-horovod/gluon_mnist.py @@ -104,7 +104,7 @@ def conv_nets(): # Function to evaluate accuracy for a model def evaluate(model, data_iter, context): data_iter.reset() - metric = mx.gluon.metric.Accuracy() + metric = mx.metric.Accuracy() for _, batch in enumerate(data_iter): data = batch.data[0].as_in_context(context) label = batch.label[0].as_in_context(context) @@ -149,7 +149,7 @@ def evaluate(model, data_iter, context): # Create loss function and train metric loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() -metric = mx.gluon.metric.Accuracy() +metric = mx.metric.Accuracy() # Train model for epoch in range(args.epochs): diff --git a/example/distributed_training-horovod/module_mnist.py b/example/distributed_training-horovod/module_mnist.py new file mode 100644 index 000000000000..4fcb02a46996 --- /dev/null +++ b/example/distributed_training-horovod/module_mnist.py @@ -0,0 +1,166 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 

import argparse
import logging
import os
import zipfile

import horovod.mxnet as hvd
import mxnet as mx
from mxnet.test_utils import download

# Training settings
# NOTE(review): --dtype and --momentum are parsed but not read anywhere in the
# visible part of this script -- confirm whether they should reach the optimizer.
parser = argparse.ArgumentParser(description='MXNet MNIST Example')
parser.add_argument('--batch-size', type=int, default=64,
                    help='training batch size (default: 64)')
parser.add_argument('--dtype', type=str, default='float32',
                    help='training data type (default: float32)')
parser.add_argument('--epochs', type=int, default=5,
                    help='number of training epochs (default: 5)')
parser.add_argument('--lr', type=float, default=0.05,
                    help='learning rate (default: 0.05)')
parser.add_argument('--momentum', type=float, default=0.5,
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--no-cuda', action='store_true', default=False,
                    help='disables CUDA training (default: False)')
args = parser.parse_args()

if not args.no_cuda:
    # Disable CUDA if there are no GPUs.
    if mx.context.num_gpus() == 0:
        args.no_cuda = True

logging.basicConfig(level=logging.INFO)
logging.info(args)


# Function to get mnist iterator given a rank
def get_mnist_iterator(rank):
    """Download MNIST into ``data-<rank>`` and return ``(train_iter, val_iter)``.

    ``rank`` only selects the download directory; the data partition of both
    iterators is taken from ``hvd.size()`` / ``hvd.rank()``. The batch size
    comes from the module-level ``args``.
    """
    data_dir = "data-%d" % rank
    if not os.path.isdir(data_dir):
        os.makedirs(data_dir)
    zip_file_path = download('http://data.mxnet.io/mxnet/data/mnist.zip',
                             dirname=data_dir)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(data_dir)

    input_shape = (1, 28, 28)
    batch_size = args.batch_size

    train_iter = mx.io.MNISTIter(
        image="%s/train-images-idx3-ubyte" % data_dir,
        label="%s/train-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        shuffle=True,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    val_iter = mx.io.MNISTIter(
        image="%s/t10k-images-idx3-ubyte" % data_dir,
        label="%s/t10k-labels-idx1-ubyte" % data_dir,
        input_shape=input_shape,
        batch_size=batch_size,
        flat=False,
        num_parts=hvd.size(),
        part_index=hvd.rank()
    )

    return train_iter, val_iter

# Step 1: initialize Horovod
hvd.init()

# Horovod: pin context to process
context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank())

# Step 2: load data
train_iter, val_iter = get_mnist_iterator(hvd.rank())

# Step 3: define network
def conv_net():
    """Return the symbol of a two-conv, two-FC network ending in SoftmaxOutput."""
    # placeholder for data
    data = mx.sym.var('data')
    # first conv layer
    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=10)
    relu1 = mx.sym.Activation(data=conv1, act_type='relu')
    pool1 = mx.sym.Pooling(data=relu1, pool_type='max', kernel=(2, 2),
                           stride=(2, 2))
    # second conv layer
    conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=20)
    relu2 = mx.sym.Activation(data=conv2, act_type='relu')
    pool2 = mx.sym.Pooling(data=relu2, pool_type='max', kernel=(2, 2),
                           stride=(2, 2))
    # first fully connected layer
    flatten = mx.sym.flatten(data=pool2)
    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=50)
    relu3 = mx.sym.Activation(data=fc1, act_type='relu')
    # second fully connected layer
    fc2 = mx.sym.FullyConnected(data=relu3, num_hidden=10)
    # softmax loss
    loss = mx.sym.SoftmaxOutput(data=fc2, name='softmax')
    return loss

net = conv_net()
model = mx.mod.Module(symbol=net, context=context)

# Step 4: initialize parameters
initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in",
                             magnitude=2)
model.bind(data_shapes=train_iter.provide_data,
           label_shapes=train_iter.provide_label)
model.init_params(initializer)

# Horovod: fetch and broadcast parameters
# Every worker takes rank 0's values so all replicas start identically.
(arg_params, aux_params) = model.get_params()
if arg_params is not None:
    hvd.broadcast_parameters(arg_params, root_rank=0)
if aux_params is not None:
    hvd.broadcast_parameters(aux_params, root_rank=0)
model.set_params(arg_params=arg_params, aux_params=aux_params)

# Step 5: create optimizer
# The learning rate is scaled by the number of workers; gradients are rescaled
# by the per-worker batch size.
optimizer_params = {'learning_rate': args.lr * hvd.size(),
                    'rescale_grad': 1.0 / args.batch_size}
opt = mx.optimizer.create('sgd', **optimizer_params)

# Horovod: wrap optimizer with DistributedOptimizer
opt = hvd.DistributedOptimizer(opt)

# Step 6: fit and train model
batch_cb = None
# Only rank 0 reports throughput.
if hvd.rank() == 0:
    batch_cb = mx.callback.Speedometer(args.batch_size * hvd.size())
model.fit(train_iter,  # train data
          kvstore=None,  # no kvstore
          eval_data=val_iter,  # validation data
          optimizer=opt,  # use SGD to train
          eval_metric='acc',  # report accuracy during training
          batch_end_callback=batch_cb,  # report training speed
          num_epoch=args.epochs)  # train for args.epochs dataset passes

# Step 7: evaluate model accuracy
acc = mx.metric.Accuracy()
model.score(val_iter, acc)

if hvd.rank() == 0:
    print(acc)
    assert acc.get()[1] > 0.96, "Achieved accuracy (%f) is lower than \
                                expected (0.96)" % acc.get()[1]
# diff --git a/example/distributed_training-horovod/resnet50_imagenet.py b/example/distributed_training-horovod/resnet50_imagenet.py index cdf17a8912e0..251e64f0749a 100644 --- a/example/distributed_training-horovod/resnet50_imagenet.py +++ b/example/distributed_training-horovod/resnet50_imagenet.py @@ -283,8 +283,8 @@ def evaluate(epoch): return val_data.reset() - acc_top1 = mx.gluon.metric.Accuracy() - acc_top5 = mx.gluon.metric.TopKAccuracy(5) + acc_top1 = mx.metric.Accuracy() + acc_top5 = mx.metric.TopKAccuracy(5) for _, batch in enumerate(val_data): data, label = batch_fn(batch, context) output = net(data.astype(args.dtype, copy=False)) @@ -318,7 +318,7 @@ def evaluate(epoch): # Create loss function and train metric loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() - metric = mx.gluon.metric.Accuracy() + metric = mx.metric.Accuracy() # Train model for epoch in range(args.num_epochs): @@ -368,6 +368,92 @@ def evaluate(epoch): evaluate(epoch)
def train_module():
    """Train the network with the Module API and Horovod.

    Reads module-level state defined outside this view (``args``, ``net``,
    ``context``, ``train_data``, ``val_data``, ``initializer``, ``batch_size``,
    ``lr_sched``, ``rank``, ``num_workers``).
    """
    # Create input symbol
    data = mx.sym.var('data')
    if args.dtype == 'float16':
        data = mx.sym.Cast(data=data, dtype=np.float16)
        net.cast(np.float16)

    # Create output symbol
    out = net(data)
    if args.dtype == 'float16':
        # The softmax is computed on float32 values.
        out = mx.sym.Cast(data=out, dtype=np.float32)
    softmax = mx.sym.SoftmaxOutput(out, name='softmax')

    # Create model
    mod = mx.mod.Module(softmax, context=context)

    # Initialize parameters
    if args.use_pretrained:
        arg_params = {}
        for x in net.collect_params().values():
            x.reset_ctx(mx.cpu())
            arg_params[x.name] = x.data()
    else:
        arg_params = None
    aux_params = None
    mod.bind(data_shapes=train_data.provide_data,
             label_shapes=train_data.provide_label)
    mod.init_params(initializer, arg_params=arg_params, aux_params=aux_params)

    # Horovod: fetch and broadcast parameters
    (arg_params, aux_params) = mod.get_params()
    if arg_params is not None:
        hvd.broadcast_parameters(arg_params, root_rank=0)
    if aux_params is not None:
        hvd.broadcast_parameters(aux_params, root_rank=0)
    mod.set_params(arg_params=arg_params, aux_params=aux_params)

    # Create optimizer
    # Note that when using Module API, we need to specify rescale_grad since
    # we create optimizer first and wrap it with DistributedOptimizer. For
    # Gluon API, it is handled in Trainer.step() function so there is no need
    # to specify rescale_grad (see above train_gluon() function).
    optimizer_params = {'wd': args.wd,
                        'momentum': args.momentum,
                        'rescale_grad': 1.0 / batch_size,
                        'lr_scheduler': lr_sched}
    if args.dtype == 'float16':
        optimizer_params['multi_precision'] = True
    opt = mx.optimizer.create('sgd', **optimizer_params)

    # Horovod: wrap optimizer with DistributedOptimizer
    opt = hvd.DistributedOptimizer(opt)

    # Setup validation data and callback during training
    eval_data = None
    if args.eval_epoch:
        eval_data = val_data
    batch_callback = None
    # Only rank 0 logs throughput.
    if args.log_interval > 0 and rank == 0:
        batch_callback = mx.callback.Speedometer(batch_size * num_workers,
                                                 args.log_interval)

    epoch_callback = None
    if args.save_frequency > 0:
        # Checkpoint prefix is "<model>-<rank>".
        epoch_callback = mx.callback.do_checkpoint(
            '%s-%d' % (args.model, rank),
            period=args.save_frequency)

    # Train model
    mod.fit(train_data,
            eval_data=eval_data,
            num_epoch=args.num_epochs,
            kvstore=None,
            batch_end_callback=batch_callback,
            epoch_end_callback=epoch_callback,
            optimizer=opt)

    # Evaluate performance if not using synthetic data
    if args.use_rec:
        acc_top1 = mx.metric.Accuracy()
        acc_top5 = mx.metric.TopKAccuracy(5)
        res = mod.score(val_data, [acc_top1, acc_top5])
        for name, val in res:
            logging.info('Epoch[%d] Rank[%d] Validation-%s=%f',
                         args.num_epochs - 1, rank, name, val)

if __name__ == '__main__':
    train_gluon()
# diff --git a/example/distributed_training/cifar10_dist.py b/example/distributed_training/cifar10_dist.py index c89619d595f2..b66845702137 100644 --- a/example/distributed_training/cifar10_dist.py +++ b/example/distributed_training/cifar10_dist.py @@ -121,7 +121,7 @@ def evaluate_accuracy(data_iterator, network): ---------- tuple of array element """ - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() # Iterate through data and label for i, (data, label) in enumerate(data_iterator): diff --git a/example/distributed_training/cifar10_kvstore_hvd.py b/example/distributed_training/cifar10_kvstore_hvd.py index
ff679864f7c3..e6780e5db85e 100644 --- a/example/distributed_training/cifar10_kvstore_hvd.py +++ b/example/distributed_training/cifar10_kvstore_hvd.py @@ -123,7 +123,7 @@ def evaluate(data_iterator, network, context): ---------- tuple of array element """ - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() # Iterate through data and label for i, (data, label) in enumerate(data_iterator): @@ -208,7 +208,7 @@ def __len__(self): optimizer_params={'learning_rate': args.lr}, kvstore=store) -train_metric = mx.gluon.metric.Accuracy() +train_metric = mx.metric.Accuracy() # Run as many epochs as required for epoch in range(args.epochs): diff --git a/example/fcn-xs/solver.py b/example/fcn-xs/solver.py new file mode 100644 index 000000000000..e99b31a13055 --- /dev/null +++ b/example/fcn-xs/solver.py @@ -0,0 +1,143 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# pylint: skip-file +import numpy as np +import mxnet as mx +import time +import logging +from collections import namedtuple +from mxnet import optimizer as opt +from mxnet.optimizer import get_updater +from mxnet import metric + +# Parameter to pass to batch_end_callback +BatchEndParam = namedtuple('BatchEndParams', ['epoch', 'nbatch', 'eval_metric']) +class Solver(object): + def __init__(self, symbol, ctx=None, + begin_epoch=0, num_epoch=None, + arg_params=None, aux_params=None, + optimizer='sgd', **kwargs): + self.symbol = symbol + if ctx is None: + ctx = mx.cpu() + self.ctx = ctx + self.begin_epoch = begin_epoch + self.num_epoch = num_epoch + self.arg_params = arg_params + self.aux_params = aux_params + self.optimizer = optimizer + self.kwargs = kwargs.copy() + + def fit(self, train_data, eval_data=None, + eval_metric='acc', + grad_req='write', + epoch_end_callback=None, + batch_end_callback=None, + kvstore='local', + logger=None): + if logger is None: + logger = logging + logging.info('Start training with %s', str(self.ctx)) + arg_shapes, out_shapes, aux_shapes = self.symbol.infer_shape(data=train_data.provide_data[0][1]) + arg_names = self.symbol.list_arguments() + if grad_req != 'null': + self.grad_params = {} + for name, shape in zip(arg_names, arg_shapes): + if not (name.endswith('data') or name.endswith('label')): + self.grad_params[name] = mx.nd.zeros(shape, self.ctx) + else: + self.grad_params = None + aux_names = self.symbol.list_auxiliary_states() + self.aux_params = {k: mx.nd.zeros(s) for k, s in zip(aux_names, aux_shapes)} + data_name = train_data.data_name + label_name = train_data.label_name + input_names = [data_name, label_name] + self.optimizer = opt.create(self.optimizer, rescale_grad=(1.0/train_data.get_batch_size()), **(self.kwargs)) + self.updater = get_updater(self.optimizer) + eval_metric = metric.create(eval_metric) + # begin training + for epoch in range(self.begin_epoch, self.num_epoch): + nbatch = 0 + train_data.reset() + 
eval_metric.reset() + for data in train_data: + nbatch += 1 + label_shape = data[label_name].shape + self.arg_params[data_name] = mx.nd.array(data[data_name], self.ctx) + self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \ + label_shape[1]*label_shape[2]), self.ctx) + output_names = self.symbol.list_outputs() + self.exector = self.symbol.bind(self.ctx, self.arg_params, + args_grad=self.grad_params, + grad_req=grad_req, + aux_states=self.aux_params) + assert len(self.symbol.list_arguments()) == len(self.exector.grad_arrays) + update_dict = {name: nd for name, nd in zip(self.symbol.list_arguments(), \ + self.exector.grad_arrays) if nd is not None} + output_dict = {} + output_buff = {} + for key, arr in zip(self.symbol.list_outputs(), self.exector.outputs): + output_dict[key] = arr + output_buff[key] = mx.nd.empty(arr.shape, ctx=mx.cpu()) + self.exector.forward(is_train=True) + for key in output_dict: + output_dict[key].copyto(output_buff[key]) + self.exector.backward() + for key, arr in update_dict.items(): + if key != "bigscore_weight": + self.updater(key, arr, self.arg_params[key]) + pred_shape = self.exector.outputs[0].shape + label = mx.nd.array(data[label_name].reshape(label_shape[0], label_shape[1]*label_shape[2])) + pred = mx.nd.array(output_buff["softmax_output"].asnumpy().reshape(pred_shape[0], \ + pred_shape[1], pred_shape[2]*pred_shape[3])) + eval_metric.update([label], [pred]) + self.exector.outputs[0].wait_to_read() + batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, eval_metric=eval_metric) + batch_end_callback(batch_end_params) + if epoch_end_callback is not None: + epoch_end_callback(epoch, self.symbol, self.arg_params, self.aux_params) + name, value = eval_metric.get() + logger.info(" --->Epoch[%d] Train-%s=%f", epoch, name, value) + # evaluation + if eval_data: + logger.info(" in eval process...") + nbatch = 0 + eval_data.reset() + eval_metric.reset() + for data in eval_data: + nbatch += 1 + label_shape = 
data[label_name].shape + self.arg_params[data_name] = mx.nd.array(data[data_name], self.ctx) + self.arg_params[label_name] = mx.nd.array(data[label_name].reshape(label_shape[0], \ + label_shape[1]*label_shape[2]), self.ctx) + exector = self.symbol.bind(self.ctx, self.arg_params, + args_grad=self.grad_params, + grad_req=grad_req, + aux_states=self.aux_params) + cpu_output_array = mx.nd.zeros(exector.outputs[0].shape) + exector.forward(is_train=False) + exector.outputs[0].copyto(cpu_output_array) + pred_shape = cpu_output_array.shape + label = mx.nd.array(data[label_name].reshape(label_shape[0], \ + label_shape[1]*label_shape[2])) + pred = mx.nd.array(cpu_output_array.asnumpy().reshape(pred_shape[0], \ + pred_shape[1], pred_shape[2]*pred_shape[3])) + eval_metric.update([label], [pred]) + exector.outputs[0].wait_to_read() + name, value = eval_metric.get() + logger.info('batch[%d] Validation-%s=%f', nbatch, name, value) diff --git a/example/gluon/audio/urban_sounds/train.py b/example/gluon/audio/urban_sounds/train.py index 8a55c5b5bc67..c88f9fb55187 100644 --- a/example/gluon/audio/urban_sounds/train.py +++ b/example/gluon/audio/urban_sounds/train.py @@ -28,7 +28,7 @@ def evaluate_accuracy(data_iterator, net): """Function to evaluate accuracy of any data iterator passed to it as an argument""" - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() for data, label in data_iterator: output = net(data) predictions = nd.argmax(output, axis=1) diff --git a/example/gluon/dc_gan/dcgan.py b/example/gluon/dc_gan/dcgan.py index d7c36a0a3a67..6e03aae8bed6 100644 --- a/example/gluon/dc_gan/dcgan.py +++ b/example/gluon/dc_gan/dcgan.py @@ -259,7 +259,7 @@ def main(): real_label = mx.nd.ones((opt.batch_size,), ctx=ctx) fake_label = mx.nd.zeros((opt.batch_size,), ctx=ctx) - metric = mx.gluon.metric.Accuracy() + metric = mx.metric.Accuracy() print('Training... 
') stamp = datetime.now().strftime('%Y_%m_%d-%H_%M') diff --git a/example/gluon/image_classification.py b/example/gluon/image_classification.py index 33583ff20175..7a845bf6f8eb 100644 --- a/example/gluon/image_classification.py +++ b/example/gluon/image_classification.py @@ -27,7 +27,7 @@ from mxnet.gluon.model_zoo import vision as models from mxnet import autograd as ag from mxnet.test_utils import get_mnist_iterator -from mxnet.gluon.metric import Accuracy, TopKAccuracy, CompositeEvalMetric +from mxnet.metric import Accuracy, TopKAccuracy, CompositeEvalMetric import numpy as np from data import (get_cifar10_iterator, get_imagenet_iterator, diff --git a/example/gluon/mnist/mnist.py b/example/gluon/mnist/mnist.py index 81259db8b939..5acaf143ca60 100644 --- a/example/gluon/mnist/mnist.py +++ b/example/gluon/mnist/mnist.py @@ -70,7 +70,7 @@ def transformer(data, label): # train def test(ctx): - metric = mx.gluon.metric.Accuracy() + metric = mx.metric.Accuracy() for data, label in val_data: data = data.as_in_context(ctx) label = label.as_in_context(ctx) @@ -86,7 +86,7 @@ def train(epochs, ctx): # Trainer is for updating parameters with gradient. 
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': opt.lr, 'momentum': opt.momentum}) - metric = mx.gluon.metric.Accuracy() + metric = mx.metric.Accuracy() loss = gluon.loss.SoftmaxCrossEntropyLoss() for epoch in range(epochs): diff --git a/example/gluon/sn_gan/train.py b/example/gluon/sn_gan/train.py index fc4e87d632fe..46e44791cebd 100644 --- a/example/gluon/sn_gan/train.py +++ b/example/gluon/sn_gan/train.py @@ -102,7 +102,7 @@ def facc(label, pred): g_net.collect_params().zero_grad() d_net.collect_params().zero_grad() # define evaluation metric -metric = mx.gluon.metric.CustomMetric(facc) +metric = mx.metric.CustomMetric(facc) # initialize labels real_label = nd.ones(BATCH_SIZE, CTX) fake_label = nd.zeros(BATCH_SIZE, CTX) diff --git a/example/gluon/super_resolution/super_resolution.py b/example/gluon/super_resolution/super_resolution.py index 52bfc2241f82..4a3e8d92aa39 100644 --- a/example/gluon/super_resolution/super_resolution.py +++ b/example/gluon/super_resolution/super_resolution.py @@ -156,7 +156,7 @@ def hybrid_forward(self, F, x): return x net = SuperResolutionNet(upscale_factor) -metric = mx.gluon.metric.MSE() +metric = mx.metric.MSE() def test(ctx): val_data.reset() diff --git a/example/gluon/tree_lstm/main.py b/example/gluon/tree_lstm/main.py index 41e4f4f13ed8..53af3fa019e9 100644 --- a/example/gluon/tree_lstm/main.py +++ b/example/gluon/tree_lstm/main.py @@ -96,7 +96,7 @@ net = SimilarityTreeLSTM(sim_hidden_size, rnn_hidden_size, vocab.size, vocab.embed.shape[1], num_classes) # use pearson correlation and mean-square error for evaluation -metric = mx.gluon.metric.create(['pearsonr', 'mse']) +metric = mx.metric.create(['pearsonr', 'mse']) def to_target(x): target = np.zeros((1, num_classes)) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py new file mode 100644 index 000000000000..38ca296cf986 --- /dev/null +++ b/example/image-classification/common/fit.py @@ -0,0 +1,340 @@ +# 
Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" example train fit utility """ +import logging +import os +import time +import re +import math +import mxnet as mx + +def get_epoch_size(args, kv): + return math.ceil(int(args.num_examples / kv.num_workers) / args.batch_size) + +def _get_lr_scheduler(args, kv): + if 'lr_factor' not in args or args.lr_factor >= 1: + return (args.lr, None) + epoch_size = get_epoch_size(args, kv) + begin_epoch = args.load_epoch if args.load_epoch else 0 + if 'pow' in args.lr_step_epochs: + lr = args.lr + max_up = args.num_epochs * epoch_size + pwr = float(re.sub('pow[- ]*', '', args.lr_step_epochs)) + poly_sched = mx.lr_scheduler.PolyScheduler(max_up, lr, pwr) + return (lr, poly_sched) + step_epochs = [int(l) for l in args.lr_step_epochs.split(',')] + lr = args.lr + for s in step_epochs: + if begin_epoch >= s: + lr *= args.lr_factor + if lr != args.lr: + logging.info('Adjust learning rate to %e for epoch %d', + lr, begin_epoch) + + steps = [epoch_size * (x - begin_epoch) + for x in step_epochs if x - begin_epoch > 0] + if steps: + return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor, + base_lr=args.lr)) + else: + return (lr, None) + +def _load_model(args, rank=0): + if 
'load_epoch' not in args or args.load_epoch is None: + return (None, None, None) + assert args.model_prefix is not None + model_prefix = args.model_prefix + if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)): + model_prefix += "-%d" % (rank) + sym, arg_params, aux_params = mx.model.load_checkpoint( + model_prefix, args.load_epoch) + logging.info('Loaded model %s_%04d.params', model_prefix, args.load_epoch) + return (sym, arg_params, aux_params) + + +def _save_model(args, rank=0): + if args.model_prefix is None: + return None + return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % ( + args.model_prefix, rank), period=args.save_period) + + +def add_fit_args(parser): + """ + parser : argparse.ArgumentParser + return a parser added with args required by fit + """ + train = parser.add_argument_group('Training', 'model training') + train.add_argument('--network', type=str, + help='the neural network to use') + train.add_argument('--num-layers', type=int, + help='number of layers in the neural network, \ + required by some networks such as resnet') + train.add_argument('--gpus', type=str, + help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu') + train.add_argument('--kv-store', type=str, default='device', + help='key-value store type') + train.add_argument('--num-epochs', type=int, default=100, + help='max num of epochs') + train.add_argument('--lr', type=float, default=0.1, + help='initial learning rate') + train.add_argument('--lr-factor', type=float, default=0.1, + help='the ratio to reduce lr on each step') + train.add_argument('--lr-step-epochs', type=str, + help='the epochs to reduce the lr, e.g. 
30,60') + train.add_argument('--initializer', type=str, default='default', + help='the initializer type') + train.add_argument('--optimizer', type=str, default='sgd', + help='the optimizer type') + train.add_argument('--mom', type=float, default=0.9, + help='momentum for sgd') + train.add_argument('--wd', type=float, default=0.0001, + help='weight decay for sgd') + train.add_argument('--batch-size', type=int, default=128, + help='the batch size') + train.add_argument('--disp-batches', type=int, default=20, + help='show progress for every n batches') + train.add_argument('--model-prefix', type=str, + help='model prefix') + train.add_argument('--save-period', type=int, default=1, help='params saving period') + parser.add_argument('--monitor', dest='monitor', type=int, default=0, + help='log network parameters every N iters if larger than 0') + train.add_argument('--load-epoch', type=int, + help='load the model on an epoch using the model-load-prefix') + train.add_argument('--top-k', type=int, default=0, + help='report the top-k accuracy. 0 means no report.') + train.add_argument('--loss', type=str, default='', + help='show the cross-entropy or nll loss. 
ce strands for cross-entropy, nll-loss stands for likelihood loss') + train.add_argument('--test-io', type=int, default=0, + help='1 means test reading speed without training') + train.add_argument('--dtype', type=str, default='float32', + help='precision: float32 or float16') + train.add_argument('--gc-type', type=str, default='none', + help='type of gradient compression to use, \ + takes `2bit` or `none` for now') + train.add_argument('--gc-threshold', type=float, default=0.5, + help='threshold for 2bit gradient compression') + # additional parameters for large batch sgd + train.add_argument('--macrobatch-size', type=int, default=0, + help='distributed effective batch size') + train.add_argument('--warmup-epochs', type=int, default=5, + help='the epochs to ramp-up lr to scaled large-batch value') + train.add_argument('--warmup-strategy', type=str, default='linear', + help='the ramping-up strategy for large batch sgd') + train.add_argument('--profile-worker-suffix', type=str, default='', + help='profile workers actions into this file. 
During distributed training\ + filename saved will be rank1_ followed by this suffix') + train.add_argument('--profile-server-suffix', type=str, default='', + help='profile server actions into a file with name like rank1_ followed by this suffix \ + during distributed training') + train.add_argument('--use-imagenet-data-augmentation', type=int, default=0, + help='enable data augmentation of ImageNet data, default disabled') + return train + + +def fit(args, network, data_loader, **kwargs): + """ + train a model + args : argparse returns + network : the symbol definition of the nerual network + data_loader : function that returns the train and val data iterators + """ + # kvstore + kv = mx.kvstore.create(args.kv_store) + if args.gc_type != 'none': + kv.set_gradient_compression({'type': args.gc_type, + 'threshold': args.gc_threshold}) + if args.profile_server_suffix: + mx.profiler.set_config(filename=args.profile_server_suffix, profile_all=True, profile_process='server') + mx.profiler.set_state(state='run', profile_process='server') + + if args.profile_worker_suffix: + if kv.num_workers > 1: + filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix + else: + filename = args.profile_worker_suffix + mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker') + mx.profiler.set_state(state='run', profile_process='worker') + + # logging + head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + logging.info('start with arguments %s', args) + + epoch_size = get_epoch_size(args, kv) + + # data iterators + (train, val) = data_loader(args, kv) + if 'dist' in args.kv_store and not 'async' in args.kv_store: + logging.info('Resizing training data to %d batches per machine', epoch_size) + # resize train iter to ensure each machine has same number of batches per epoch + # if not, dist_sync can hang at the end with one machine waiting for other machines + train = 
mx.io.ResizeIter(train, epoch_size) + + if args.test_io: + tic = time.time() + for i, batch in enumerate(train): + if isinstance(batch, list): + for b in batch: + for j in b.data: + j.wait_to_read() + else: + for j in batch.data: + j.wait_to_read() + if (i + 1) % args.disp_batches == 0: + logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i, + args.disp_batches * args.batch_size / (time.time() - tic)) + tic = time.time() + return + + # load model + if 'arg_params' in kwargs and 'aux_params' in kwargs: + arg_params = kwargs['arg_params'] + aux_params = kwargs['aux_params'] + else: + sym, arg_params, aux_params = _load_model(args, kv.rank) + if sym is not None: + assert sym.tojson() == network.tojson() + + # save model + checkpoint = _save_model(args, kv.rank) + + # devices for training + devs = mx.cpu() if args.gpus is None or args.gpus == "" else [ + mx.gpu(int(i)) for i in args.gpus.split(',')] + + # learning rate + lr, lr_scheduler = _get_lr_scheduler(args, kv) + + # create model + model = mx.mod.Module( + context=devs, + symbol=network + ) + + lr_scheduler = lr_scheduler + optimizer_params = { + 'learning_rate': lr, + 'wd': args.wd, + 'lr_scheduler': lr_scheduler, + 'multi_precision': True} + + # Only a limited number of optimizers have 'momentum' property + has_momentum = {'sgd', 'dcasgd', 'nag', 'signum'} + if args.optimizer in has_momentum: + optimizer_params['momentum'] = args.mom + + monitor = mx.mon.Monitor( + args.monitor, pattern=".*") if args.monitor > 0 else None + + # A limited number of optimizers have a warmup period + has_warmup = {'lbnag'} + if args.optimizer in has_warmup: + nworkers = kv.num_workers + if epoch_size < 1: + epoch_size = 1 + macrobatch_size = args.macrobatch_size + if macrobatch_size < args.batch_size * nworkers: + macrobatch_size = args.batch_size * nworkers + #batch_scale = round(float(macrobatch_size) / args.batch_size / nworkers +0.4999) + batch_scale = math.ceil( + float(macrobatch_size) / args.batch_size / nworkers) + 
optimizer_params['updates_per_epoch'] = epoch_size + optimizer_params['begin_epoch'] = args.load_epoch if args.load_epoch else 0 + optimizer_params['batch_scale'] = batch_scale + optimizer_params['warmup_strategy'] = args.warmup_strategy + optimizer_params['warmup_epochs'] = args.warmup_epochs + optimizer_params['num_epochs'] = args.num_epochs + + if args.initializer == 'default': + if args.network == 'alexnet': + # AlexNet will not converge using Xavier + initializer = mx.init.Normal() + # VGG will not trend to converge using Xavier-Gaussian + elif args.network and 'vgg' in args.network: + initializer = mx.init.Xavier() + else: + initializer = mx.init.Xavier( + rnd_type='gaussian', factor_type="in", magnitude=2) + # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), + elif args.initializer == 'xavier': + initializer = mx.init.Xavier() + elif args.initializer == 'msra': + initializer = mx.init.MSRAPrelu() + elif args.initializer == 'orthogonal': + initializer = mx.init.Orthogonal() + elif args.initializer == 'normal': + initializer = mx.init.Normal() + elif args.initializer == 'uniform': + initializer = mx.init.Uniform() + elif args.initializer == 'one': + initializer = mx.init.One() + elif args.initializer == 'zero': + initializer = mx.init.Zero() + + # evaluation metrices + eval_metrics = ['accuracy'] + if args.top_k > 0: + eval_metrics.append(mx.metric.create( + 'top_k_accuracy', top_k=args.top_k)) + + supported_loss = ['ce', 'nll_loss'] + if len(args.loss) > 0: + # ce or nll loss is only applicable to softmax output + loss_type_list = args.loss.split(',') + if 'softmax_output' in network.list_outputs(): + for loss_type in loss_type_list: + loss_type = loss_type.strip() + if loss_type == 'nll': + loss_type = 'nll_loss' + if loss_type not in supported_loss: + logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' \ + 'negative likelihood loss is supported!') + else: + eval_metrics.append(mx.metric.create(loss_type)) + else: 
+ logging.warning("The output is not softmax_output, loss argument will be skipped!") + + # callbacks that run after each batch + batch_end_callbacks = [mx.callback.Speedometer( + args.batch_size, args.disp_batches)] + if 'batch_end_callback' in kwargs: + cbs = kwargs['batch_end_callback'] + batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs] + + # run + model.fit(train, + begin_epoch=args.load_epoch if args.load_epoch else 0, + num_epoch=args.num_epochs, + eval_data=val, + eval_metric=eval_metrics, + kvstore=kv, + optimizer=args.optimizer, + optimizer_params=optimizer_params, + initializer=initializer, + arg_params=arg_params, + aux_params=aux_params, + batch_end_callback=batch_end_callbacks, + epoch_end_callback=checkpoint, + allow_missing=True, + monitor=monitor) + + if args.profile_server_suffix: + mx.profiler.set_state(state='run', profile_process='server') + if args.profile_worker_suffix: + mx.profiler.set_state(state='run', profile_process='worker') diff --git a/example/image-classification/score.py b/example/image-classification/score.py new file mode 100644 index 000000000000..f40e649f1f42 --- /dev/null +++ b/example/image-classification/score.py @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import argparse +from common import modelzoo, find_mxnet +import mxnet as mx +import time +import os +import logging + +def score(model, data_val, metrics, gpus, batch_size, rgb_mean=None, mean_img=None, + image_shape='3,224,224', data_nthreads=4, label_name='softmax_label', max_num_examples=None): + # create data iterator + data_shape = tuple([int(i) for i in image_shape.split(',')]) + if mean_img is not None: + mean_args = {'mean_img':mean_img} + elif rgb_mean is not None: + rgb_mean = [float(i) for i in rgb_mean.split(',')] + mean_args = {'mean_r':rgb_mean[0], 'mean_g':rgb_mean[1], + 'mean_b':rgb_mean[2]} + + data = mx.io.ImageRecordIter( + path_imgrec = data_val, + label_width = 1, + preprocess_threads = data_nthreads, + batch_size = batch_size, + data_shape = data_shape, + label_name = label_name, + rand_crop = False, + rand_mirror = False, + **mean_args) + + if isinstance(model, str): + # download model + dir_path = os.path.dirname(os.path.realpath(__file__)) + (prefix, epoch) = modelzoo.download_model( + model, os.path.join(dir_path, 'model')) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + elif isinstance(model, tuple) or isinstance(model, list): + assert len(model) == 3 + (sym, arg_params, aux_params) = model + else: + raise TypeError('model type [%s] is not supported' % str(type(model))) + + # create module + if gpus == '': + devs = mx.cpu() + else: + devs = [mx.gpu(int(i)) for i in gpus.split(',')] + + mod = mx.mod.Module(symbol=sym, context=devs, label_names=[label_name,]) + mod.bind(for_training=False, + data_shapes=data.provide_data, + label_shapes=data.provide_label) + mod.set_params(arg_params, aux_params) + if not isinstance(metrics, list): + metrics = [metrics,] + tic = time.time() + num = 0 + for batch in data: + mod.forward(batch, is_train=False) + for m in metrics: + mod.update_metric(m, batch.label) + num += batch_size + if max_num_examples is not None and num > max_num_examples: + break + return (num / 
(time.time() - tic), ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='score a model on a dataset') + parser.add_argument('--model', type=str, required=True, + help = 'the model name.') + parser.add_argument('--gpus', type=str, default='0') + parser.add_argument('--batch-size', type=int, default=64) + parser.add_argument('--rgb-mean', type=str, default='0,0,0') + parser.add_argument('--data-val', type=str, required=True) + parser.add_argument('--image-shape', type=str, default='3,224,224') + parser.add_argument('--data-nthreads', type=int, default=4, + help='number of threads for data decoding') + args = parser.parse_args() + + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) + + metrics = [mx.metric.create('acc'), + mx.metric.create('top_k_accuracy', top_k = 5)] + + (speed,) = score(metrics = metrics, **vars(args)) + logging.info('Finished with %f images per second', speed) + + for m in metrics: + logging.info(m.get()) diff --git a/example/image-classification/test_score.py b/example/image-classification/test_score.py new file mode 100644 index 000000000000..58c5c66a7f1f --- /dev/null +++ b/example/image-classification/test_score.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +test pretrained models +""" +from __future__ import print_function +import mxnet as mx +from common import find_mxnet, modelzoo +from score import score +import pytest + +@pytest.fixture(scope="session") +def imagenet_val_5k_settings(): + mx.test_utils.download( + 'http://data.mxnet.io/data/val-5k-256.rec', 'data/val-5k-256.rec') + num_gpus = mx.context.num_gpus() + assert num_gpus > 0 + gpus = ','.join(map(str, range(num_gpus))) + batch_size = 16 * num_gpus + kwargs = {'gpus':gpus, 'batch_size':batch_size, 'max_num_examples':500} + return 'data/val-5k-256.rec', kwargs + +def test_imagenet1k_resnet(imagenet_val_5k_settings): + imagenet_val_5k, kwargs = imagenet_val_5k_settings + models = ['imagenet1k-resnet-50', 'imagenet1k-resnet-152'] + accs = [.77, .78] + for (m, g) in zip(models, accs): + acc = mx.metric.create('acc') + (speed,) = score(model=m, data_val=imagenet_val_5k, + rgb_mean='0,0,0', metrics=acc, **kwargs) + r = acc.get()[1] + print('Tested %s, acc = %f, speed = %f img/sec' % (m, r, speed)) + assert r > g and r < g + .1 + +def test_imagenet1k_inception_bn(imagenet_val_5k_settings): + imagenet_val_5k, kwargs = imagenet_val_5k_settings + acc = mx.metric.create('acc') + m = 'imagenet1k-inception-bn' + g = 0.75 + (speed,) = score(model=m, + data_val=imagenet_val_5k, + rgb_mean='123.68,116.779,103.939', metrics=acc, **kwargs) + r = acc.get()[1] + print('Tested %s acc = %f, speed = %f img/sec' % (m, r, speed)) + assert r > g and r < g + .1 + diff --git a/example/kaggle-ndsb2/Train.py b/example/kaggle-ndsb2/Train.py new file mode 100644 index 000000000000..51e308a2e21c --- /dev/null +++ b/example/kaggle-ndsb2/Train.py @@ -0,0 +1,234 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
def get_lenet():
    """Build the LeNet-style symbol that is trained on frame differences.

    The "data" variable is rescaled as ``(x - 128) * (1.0 / 128)``, sliced
    into 30 channel outputs, and the 29 consecutive differences are
    concatenated. Two convolution / batch-norm / relu / max-pool stages
    follow (5x5 then 3x3 kernels, 40 filters each), then flatten, dropout and
    a 600-unit fully connected layer.

    Returns the ``LogisticRegressionOutput`` symbol, named 'softmax'.
    """
    raw = mx.sym.Variable("data")
    scaled = (raw - 128) * (1.0/128)
    frames = mx.sym.SliceChannel(scaled, num_outputs=30)
    # difference between every frame and its predecessor
    deltas = [frames[k+1] - frames[k] for k in range(29)]
    net = mx.sym.Concat(*deltas)
    # the two stages differ only in kernel size; operators are created in
    # the same order as before so auto-generated symbol names do not change
    for kernel in ((5, 5), (3, 3)):
        net = mx.sym.Convolution(net, kernel=kernel, num_filter=40)
        net = mx.sym.BatchNorm(net, fix_gamma=True)
        net = mx.sym.Activation(net, act_type="relu")
        net = mx.sym.Pooling(net, pool_type="max", kernel=(2,2), stride=(2,2))
    # first fullc
    flat = mx.symbol.Dropout(mx.symbol.Flatten(net))
    fc1 = mx.symbol.FullyConnected(data=flat, num_hidden=600)
    # Name the final layer as softmax so it auto matches the naming of data iterator
    # Otherwise we can also change the provide_data in the data iter
    return mx.symbol.LogisticRegressionOutput(data=fc1, name='softmax')


def CRPS(label, pred):
    """Custom evaluation metric on CRPS.

    Each row of ``pred`` is first made non-decreasing from left to right
    (an entry smaller than its left neighbour is overwritten with that
    neighbour). This modifies ``pred`` in place. The result is the sum of
    squared differences between ``label`` and the adjusted ``pred``, divided
    by ``label.size``.
    """
    n_rows, n_cols = pred.shape[0], pred.shape[1]
    for r in range(n_rows):
        row = pred[r]  # writes below land in ``pred`` itself
        for c in range(1, n_cols):
            if row[c - 1] > row[c]:
                row[c] = row[c - 1]
    squared_error = np.square(label - pred)
    return np.sum(squared_error) / label.size


def encode_label(label_data):
    """Run encoding to encode the label into the CDF target.

    Column 1 of ``label_data`` is read as systole and column 2 as diastole.
    Every value ``x`` becomes a 600-entry uint8 row whose entry ``k`` is 1
    when ``x < k``.

    Returns the pair ``(systole_encode, diastole_encode)``.
    """
    thresholds = np.arange(600)

    def _step_cdf(volumes):
        # one 0/1 step row per value
        return np.array([v < thresholds for v in volumes], dtype=np.uint8)

    systole_encode = _step_cdf(label_data[:, 1])
    diastole_encode = _step_cdf(label_data[:, 2])
    return systole_encode, diastole_encode


def encode_csv(label_csv, systole_csv, diastole_csv):
    """Load ``label_csv``, encode it and write the two encodings as CSV files.

    The systole encoding goes to ``systole_csv`` and the diastole encoding to
    ``diastole_csv``; both are comma-delimited and formatted with "%g".
    """
    labels = np.loadtxt(label_csv, delimiter=",")
    encoded = encode_label(labels)
    for path, table in zip((systole_csv, diastole_csv), encoded):
        np.savetxt(path, table, delimiter=",", fmt="%g")
def accumulate_result(validate_lst, prob):
    """Average the rows of ``prob`` that share the same id in ``validate_lst``.

    Row ``i`` of ``prob`` is paired with line ``i`` of the CSV file
    ``validate_lst``; the integer in that line's first column is the group id.

    Parameters
    ----------
    validate_lst : path of the CSV file, read from its first line onwards.
    prob : 2-D array; one row is consumed per CSV line.

    Returns
    -------
    dict mapping each id to a ``(1, prob.shape[1])`` array holding the mean
    of the rows assigned to that id.

    Raises
    ------
    StopIteration
        If the file has fewer lines than ``prob`` has rows.
    """
    sum_result = {}
    cnt_result = {}
    # The context manager closes the file; the previous version left the
    # handle open for the lifetime of the process.
    with open(validate_lst) as handle:
        reader = csv.reader(handle)
        for i in range(prob.shape[0]):
            line = next(reader)
            idx = int(line[0])
            if idx not in cnt_result:
                cnt_result[idx] = 0.
                sum_result[idx] = np.zeros((1, prob.shape[1]))
            cnt_result[idx] += 1
            sum_result[idx] += prob[i, :]
    for idx, count in cnt_result.items():
        sum_result[idx][:] /= count
    return sum_result


def doHist(data):
    """Return a 600-bin empirical CDF of ``data``.

    Every value is rounded up to an integer ``j`` and adds one to all bins
    from ``j`` onwards; the counts are then divided by ``len(data)``.
    """
    h = np.zeros(600)
    for j in np.ceil(data).astype(int):
        h[j:] += 1
    h /= len(data)
    return h


def submission_helper(pred):
    """Return a non-decreasing 600-entry copy of ``pred``.

    ``pred`` is resized in place to shape ``(600,)`` (so the caller's array
    changes shape), then each output entry is the larger of the current
    prediction and the previous output entry.
    """
    p = np.zeros(600)
    pred.resize(p.shape)
    p[0] = pred[0]
    for j in range(1, 600):
        a = p[j - 1]
        b = pred[j]
        # keep the running maximum so the curve never decreases
        if b < a:
            p[j] = a
        else:
            p[j] = b
    return p
idx.split('_') + key = int(key) + out = [idx] + if key in systole_result: + if target == 'Diastole': + out.extend(list(submission_helper(diastole_result[key]))) + else: + out.extend(list(submission_helper(systole_result[key]))) + else: + print("Miss: %s" % idx) + if target == 'Diastole': + out.extend(hDiastole) + else: + out.extend(hSystole) + fo.writerow(out) +f.close() diff --git a/example/model-parallel/matrix_factorization/train.py b/example/model-parallel/matrix_factorization/train.py new file mode 100644 index 000000000000..591dab3a6534 --- /dev/null +++ b/example/model-parallel/matrix_factorization/train.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import argparse +import logging +import time +import mxnet as mx +import numpy as np +from get_data import get_movielens_iter, get_movielens_data +from model import matrix_fact_model_parallel_net + + +logging.basicConfig(level=logging.DEBUG) + +parser = argparse.ArgumentParser(description="Run model parallel version of matrix factorization", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-epoch', type=int, default=3, + help='number of epochs to train') +parser.add_argument('--batch-size', type=int, default=256, + help='number of examples per batch') +parser.add_argument('--print-every', type=int, default=100, + help='logging interval') +parser.add_argument('--factor-size', type=int, default=128, + help="the factor size of the embedding operation") +parser.add_argument('--num-gpus', type=int, default=2, + help="number of gpus to use") + +MOVIELENS = { + 'dataset': 'ml-10m', + 'train': './ml-10M100K/r1.train', + 'val': './ml-10M100K/r1.test', + 'max_user': 71569, + 'max_movie': 65135, +} + +if __name__ == '__main__': + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.INFO, format=head) + + # arg parser + args = parser.parse_args() + logging.info(args) + num_epoch = args.num_epoch + batch_size = args.batch_size + optimizer = 'sgd' + factor_size = args.factor_size + print_every = args.print_every + num_gpus = args.num_gpus + + momentum = 0.9 + learning_rate = 0.1 + + # prepare dataset and iterators + max_user = MOVIELENS['max_user'] + max_movies = MOVIELENS['max_movie'] + get_movielens_data(MOVIELENS['dataset']) + train_iter = get_movielens_iter(MOVIELENS['train'], batch_size) + val_iter = get_movielens_iter(MOVIELENS['val'], batch_size) + + # construct the model + net = matrix_fact_model_parallel_net(factor_size, factor_size, max_user, max_movies) + + # construct the module + # map the ctx_group attribute to the context assignment + group2ctxs={'dev1':[mx.cpu()]*num_gpus, 'dev2':[mx.gpu(i) for i in 
range(num_gpus)]} + + # Creating a module by passing group2ctxs attribute which maps + # the ctx_group attribute to the context assignment + mod = mx.module.Module(symbol=net, context=[mx.cpu()]*num_gpus, data_names=['user', 'item'], + label_names=['score'], group2ctxs=group2ctxs) + + # the initializer used to initialize the parameters + initializer = mx.init.Xavier(factor_type="in", magnitude=2.34) + + # the parameters for the optimizer constructor + optimizer_params = { + 'learning_rate': learning_rate, + 'wd': 1e-4, + 'momentum': momentum, + 'rescale_grad': 1.0/batch_size} + + # use MSE as the metric + metric = mx.metric.create(['MSE']) + + speedometer = mx.callback.Speedometer(batch_size, print_every) + + # start training + mod.fit(train_iter, + val_iter, + eval_metric = metric, + num_epoch = num_epoch, + optimizer = optimizer, + optimizer_params = optimizer_params, + initializer = initializer, + batch_end_callback = speedometer) diff --git a/example/module/mnist_mlp.py b/example/module/mnist_mlp.py new file mode 100644 index 000000000000..7d63a584aec9 --- /dev/null +++ b/example/module/mnist_mlp.py @@ -0,0 +1,108 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# pylint: skip-file +import os, sys +from utils import get_data +import mxnet as mx +import numpy as np +import logging + +data = mx.symbol.Variable('data') +fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128) +act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") +fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64) +act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") +fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) +softmax = mx.symbol.SoftmaxOutput(fc3, name = 'softmax') + +n_epoch = 2 +batch_size = 100 + +basedir = os.path.dirname(__file__) +get_data.get_mnist(os.path.join(basedir, "data")) + +train_dataiter = mx.io.MNISTIter( + image=os.path.join(basedir, "data", "train-images-idx3-ubyte"), + label=os.path.join(basedir, "data", "train-labels-idx1-ubyte"), + data_shape=(784,), + batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10) +val_dataiter = mx.io.MNISTIter( + image=os.path.join(basedir, "data", "t10k-images-idx3-ubyte"), + label=os.path.join(basedir, "data", "t10k-labels-idx1-ubyte"), + data_shape=(784,), + batch_size=batch_size, shuffle=True, flat=True, silent=False) + +################################################################################ +# Intermediate-level API +################################################################################ +mod = mx.mod.Module(softmax) +mod.bind(data_shapes=train_dataiter.provide_data, label_shapes=train_dataiter.provide_label) +mod.init_params() + +mod.init_optimizer(optimizer_params={'learning_rate':0.01, 'momentum': 0.9}) +metric = mx.metric.create('acc') + +for i_epoch in range(n_epoch): + for i_iter, batch in enumerate(train_dataiter): + mod.forward(batch) + mod.update_metric(metric, batch.label) + + mod.backward() + mod.update() + + for name, val in metric.get_name_value(): + print('epoch %03d: %s=%f' % (i_epoch, name, val)) + metric.reset() + train_dataiter.reset() + + 
+################################################################################ +# High-level API +################################################################################ +logging.basicConfig(level=logging.DEBUG) +train_dataiter.reset() +mod = mx.mod.Module(softmax) +mod.fit(train_dataiter, eval_data=val_dataiter, + optimizer_params={'learning_rate':0.01, 'momentum': 0.9}, num_epoch=n_epoch) + +# prediction iterator API +for preds, i_batch, batch in mod.iter_predict(val_dataiter): + pred_label = preds[0].asnumpy().argmax(axis=1) + label = batch.label[0].asnumpy().astype('int32') + if i_batch % 20 == 0: + print('batch %03d acc: %.3f' % (i_batch, (label == pred_label).sum() / float(len(pred_label)))) + +# a dummy call just to test if the API works for merge_batches=True +preds = mod.predict(val_dataiter) + +# perform prediction and calculate accuracy manually +preds = mod.predict(val_dataiter, merge_batches=False) +val_dataiter.reset() +acc_sum = 0.0; acc_cnt = 0 +for i, batch in enumerate(val_dataiter): + pred_label = preds[i][0].asnumpy().argmax(axis=1) + label = batch.label[0].asnumpy().astype('int32') + acc_sum += (label == pred_label).sum() + acc_cnt += len(pred_label) +print('validation Accuracy: %.3f' % (acc_sum / acc_cnt)) + +# evaluate on validation set with a evaluation metric +mod.score(val_dataiter, metric) +for name, val in metric.get_name_value(): + print('%s=%f' % (name, val)) + diff --git a/example/multi-task/multi-task-learning.ipynb b/example/multi-task/multi-task-learning.ipynb index e615559441f6..048d6d9862b8 100644 --- a/example/multi-task/multi-task-learning.ipynb +++ b/example/multi-task/multi-task-learning.ipynb @@ -267,8 +267,8 @@ "outputs": [], "source": [ "def evaluate_accuracy(net, data_iterator):\n", - " acc_digits = mx.gluon.metric.Accuracy(name='digits')\n", - " acc_odd_even = mx.gluon.metric.Accuracy(name='odd_even')\n", + " acc_digits = mx.metric.Accuracy(name='digits')\n", + " acc_odd_even = 
mx.metric.Accuracy(name='odd_even')\n", " \n", " for i, (data, label_digit, label_odd_even) in enumerate(data_iterator):\n", " data = data.as_in_context(ctx)\n", @@ -335,8 +335,8 @@ "source": [ "for e in range(epochs):\n", " # Accuracies for each task\n", - " acc_digits = mx.gluon.metric.Accuracy(name='digits')\n", - " acc_odd_even = mx.gluon.metric.Accuracy(name='odd_even')\n", + " acc_digits = mx.metric.Accuracy(name='digits')\n", + " acc_odd_even = mx.metric.Accuracy(name='odd_even')\n", " # Accumulative losses\n", " l_digits_ = 0.\n", " l_odd_even_ = 0. \n", diff --git a/example/multivariate_time_series/src/metrics.py b/example/multivariate_time_series/src/metrics.py new file mode 100644 index 000000000000..4818591068f8 --- /dev/null +++ b/example/multivariate_time_series/src/metrics.py @@ -0,0 +1,55 @@ +# !/usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
def rse(label, pred):
    """Root relative squared error.

    Root of the mean squared difference between ``label`` and ``pred`` (over
    all entries), divided by the standard deviation of ``label``.
    """
    residual = label - pred
    rmse = np.sqrt(np.mean(np.square(residual)))
    spread = np.std(label)
    return rmse / spread


def rae(label, pred):
    """Relative absolute error.

    Mean absolute difference between ``label`` and ``pred``, divided by the
    mean absolute deviation of ``label`` from its overall mean.
    """
    mean_abs_error = np.mean(np.abs(label - pred))
    centred = label - np.mean(label)
    return mean_abs_error / np.mean(np.abs(centred))


def corr(label, pred):
    """Empirical correlation coefficient, averaged over columns.

    Statistics are taken along axis 0, so each column yields one
    coefficient; the mean of those coefficients is returned.
    """
    label_dev = label - np.mean(label, axis=0)
    pred_dev = pred - np.mean(pred, axis=0)
    covariance = np.mean(label_dev * pred_dev, axis=0)
    scale = np.std(label, axis=0) * np.std(pred, axis=0)
    return np.mean(covariance / scale)


def get_custom_metrics():
    """
    :return: mxnet metric object combining the rae, rse and corr metrics
    """
    # wrap in the same order as before (rse, rae, corr) ...
    wrapped = {fn.__name__: mx.metric.create(fn) for fn in (rse, rae, corr)}
    # ... but report them as rae, rse, corr
    return mx.metric.create([wrapped['rae'], wrapped['rse'], wrapped['corr']])


def evaluate(pred, label):
    """Return the three scores in a dict keyed "RAE", "RSE" and "CORR".

    Note the argument order: predictions first, labels second.
    """
    scorers = (("RAE", rae), ("RSE", rse), ("CORR", corr))
    return {key: scorer(label, pred) for key, scorer in scorers}
def load_obj(name):
    """Unpickle and return the object stored in the file ``name + '.pkl'``."""
    # NOTE(review): pickle.load can execute code embedded in the file; only
    # point this at trusted data.
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


def classifer_metrics(label, pred):
    """
    computes f1, precision and recall on the entity class

    ``pred`` is reduced with argmax along axis 1 and ``label`` is cast to int.
    Anything different from the module-level ``not_entity_index`` counts as
    an entity.

    Returns the tuple ``(precision, recall, f1)``. Precision is NaN when no
    entity was predicted and recall is NaN when the labels contain no entity;
    f1 is 0 when precision + recall is 0 and NaN when either input is NaN.
    """
    prediction = np.argmax(pred, axis=1)
    label = label.astype(int)

    pred_is_entity = prediction != not_entity_index
    label_is_entity = label != not_entity_index
    is_correct = prediction == label

    # how many entities are there?
    # cast to float so the ratios below are true divisions
    num_entities = float(np.sum(label_is_entity))
    entity_preds = float(np.sum(pred_is_entity))
    # how many times did we correctly predict an entity?
    correct_entitites = float(np.sum(is_correct[pred_is_entity]))

    # precision: when we predict entity, how often are we right?
    if entity_preds == 0:
        precision = np.nan
    else:
        precision = correct_entitites / entity_preds

    # recall: of the things that were an entity, how many did we catch?
    # The zero check has to come first: these are Python floats, so the
    # division raises ZeroDivisionError instead of producing NaN.
    if num_entities == 0:
        recall = np.nan
    else:
        recall = correct_entitites / num_entities

    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    logging.debug("Metrics results: precision=%f recall=%f f1=%f", precision, recall, f1)
    return precision, recall, f1


def entity_precision(label, pred):
    """Precision component of ``classifer_metrics``."""
    return classifer_metrics(label, pred)[0]


def entity_recall(label, pred):
    """Recall component of ``classifer_metrics``."""
    return classifer_metrics(label, pred)[1]


def entity_f1(label, pred):
    """F1 component of ``classifer_metrics``."""
    return classifer_metrics(label, pred)[2]


def composite_classifier_metrics():
    """Return a composite metric: accuracy, then entity precision, recall and f1."""
    metric1 = mx.metric.CustomMetric(feval=entity_precision, name='entity precision')
    metric2 = mx.metric.CustomMetric(feval=entity_recall, name='entity recall')
    metric3 = mx.metric.CustomMetric(feval=entity_f1, name='entity f1 score')
    metric4 = mx.metric.Accuracy()

    return mx.metric.CompositeEvalMetric([metric4, metric1, metric2, metric3])
def nce_loss(data, label, label_weight, embed_weight, vocab_size, num_hidden):
    """NCE-Loss layer.

    Embeds ``label`` with the shared ``embed_weight`` (``vocab_size`` rows,
    ``num_hidden`` columns), reshapes ``data`` to ``(-1, 1, num_hidden)``,
    multiplies the two with broadcasting and sums over axis 2. The resulting
    scores are fed to ``LogisticRegressionOutput`` against ``label_weight``.
    """
    label_embed = mx.sym.Embedding(data=label, input_dim=vocab_size,
                                   weight=embed_weight,
                                   output_dim=num_hidden, name='label_embed')
    data = mx.sym.Reshape(data=data, shape=(-1, 1, num_hidden))
    pred = mx.sym.broadcast_mul(data, label_embed)
    pred = mx.sym.sum(data=pred, axis=2)
    return mx.sym.LogisticRegressionOutput(data=pred,
                                           label=label_weight)


def nce_loss_subwords(
        data, label, label_mask, label_weight, embed_weight, vocab_size, num_hidden):
    """NCE-Loss layer under subword-units input.
    """
    # get subword-units embedding.
    label_units_embed = mx.sym.Embedding(data=label,
                                         input_dim=vocab_size,
                                         weight=embed_weight,
                                         output_dim=num_hidden)
    # get valid subword-units embedding with the help of label_mask
    # it's achieved by multiplying zeros to useless units in order to handle variable-length input.
    label_units_embed = mx.sym.broadcast_mul(lhs=label_units_embed,
                                             rhs=label_mask,
                                             name='label_units_embed')
    # sum over them to get label word embedding.
    label_embed = mx.sym.sum(label_units_embed, axis=2, name='label_embed')

    # broadcast_mul followed by sum yields the prediction scores for every
    # label_embed input at once, ready to feed into LogisticRegressionOutput.
    data = mx.sym.Reshape(data=data, shape=(-1, 1, num_hidden))
    pred = mx.sym.broadcast_mul(data, label_embed)
    pred = mx.sym.sum(data=pred, axis=2)

    return mx.sym.LogisticRegressionOutput(data=pred,
                                           label=label_weight)


def _rank_auc(label_weight, preds):
    """Rank-sum AUC of 2-D ``preds`` against the targets in ``label_weight``.

    Entries are ordered by descending score; a target above 0.5 counts as
    positive. Returns ``None`` when the batch holds no positive or no
    negative entry, because the ratio is undefined in that case.
    """
    rows, cols = preds.shape[0], preds.shape[1]
    pairs = [(label_weight[i][j], preds[i][j])
             for i in range(rows) for j in range(cols)]
    pairs.sort(key=itemgetter(1), reverse=True)

    total = len(pairs)
    num_pos = 0.0
    num_neg = 0.0
    rank_sum = 0.0
    for position, (target, _) in enumerate(pairs):
        if target > 0.5:
            num_pos += 1.0
            # rank counted from the bottom of the ordering
            rank_sum += total - position
        else:
            num_neg += 1.0
    if num_pos == 0.0 or num_neg == 0.0:
        return None
    rank_sum -= num_pos * (num_pos + 1.0) / 2.0
    return rank_sum / num_pos / num_neg


class NceAccuracy(mx.metric.EvalMetric):
    """Counts rows whose highest score sits at the position of the highest label weight."""

    def __init__(self):
        super(NceAccuracy, self).__init__('nce-accuracy')

    def update(self, labels, preds):
        """Accumulate hits from ``labels[1]`` and ``preds[0]``, one instance per row."""
        label_weight = labels[1].asnumpy()
        preds = preds[0].asnumpy()
        for i in range(preds.shape[0]):
            if np.argmax(label_weight[i]) == np.argmax(preds[i]):
                self.sum_metric += 1
            self.num_inst += 1


class NceAuc(mx.metric.EvalMetric):
    """Per-batch AUC of ``preds[0]`` against the label weights in ``labels[1]``."""

    def __init__(self):
        super(NceAuc, self).__init__('nce-auc')

    def update(self, labels, preds):
        """Add one batch AUC; batches without both classes are skipped."""
        label_weight = labels[1].asnumpy()
        preds = preds[0].asnumpy()
        auc = _rank_auc(label_weight, preds)
        if auc is None:
            return
        self.sum_metric += auc
        self.num_inst += 1


class NceLSTMAuc(mx.metric.EvalMetric):
    """Per-batch AUC where ``preds`` is a list of outputs that is stacked first."""

    def __init__(self):
        super(NceLSTMAuc, self).__init__('nce-lstm-auc')

    def update(self, labels, preds):
        """Add one batch AUC; batches without both classes are skipped."""
        preds = np.array([x.asnumpy() for x in preds])
        # merge the first two axes of the stacked outputs
        preds = preds.reshape((preds.shape[0] * preds.shape[1], preds.shape[2]))
        label_weight = labels[1].asnumpy()
        # swap the first two axes so the rows line up with the stacked preds
        label_weight = label_weight.transpose((1, 0, 2))
        label_weight = label_weight.reshape((preds.shape[0], preds.shape[1]))

        auc = _rank_auc(label_weight, preds)
        if auc is None:
            return
        self.sum_metric += auc
        self.num_inst += 1
+# +import os +import time +import argparse +import logging +import math +import random +import numpy as np +import mxnet as mx +from mxnet import gluon +from core.model import get_model +from core.dataset import NCFTrainData, NCFTestData +from core.evaluate import * + + +logging.basicConfig(level=logging.DEBUG) + +parser = argparse.ArgumentParser(description="Run matrix factorization with embedding", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--path', nargs='?', default='./data/', + help='Input data path.') +parser.add_argument('--dataset', nargs='?', default='ml-20m', + help='The dataset name.') +parser.add_argument('--batch-size', type=int, default=2048, + help='number of training examples per batch') +parser.add_argument('--eval-batch-size', type=int, default=1000, + help='number of evaluate examples per batch') +parser.add_argument('--model-type', type=str, default='neumf', choices=['neumf', 'gmf', 'mlp'], + help="mdoel type") +parser.add_argument('--num-negative', type=int, default=4, + help="number of negative samples per positive sample while training.") +parser.add_argument('--layers', default='[256, 256, 128, 64]', + help="list of number hiddens of fc layers in mlp model.") +parser.add_argument('--factor-size-gmf', type=int, default=64, + help="outdim of gmf embedding layers.") +parser.add_argument('--num-hidden', type=int, default=1, + help="num-hidden of neumf fc layer") +parser.add_argument('--log-interval', type=int, default=100, + help='logging interval') +parser.add_argument('--learning-rate', type=float, default=0.0005, + help='learning rate for optimizer') +parser.add_argument('--beta1', '-b1', type=float, default=0.9, + help='beta1 for Adam') +parser.add_argument('--beta2', '-b2', type=float, default=0.999, + help='beta1 for Adam') +parser.add_argument('--eps', type=float, default=1e-8, + help='eps for Adam') +parser.add_argument('--topk', type=int, default=10, + help="topk for accuracy evaluation.") 
def cross_entropy(label, pred, eps=1e-12):
    """Summed binary cross-entropy over paired entries of ``label`` and ``pred``.

    ``eps`` is added inside both logarithms so that a prediction of exactly
    0 or 1 does not produce ``log(0)``. Returns 0 for empty input.
    """
    def _term(target, prob):
        # negative log-likelihood of one (label, prediction) pair
        return -(target*np.log(prob+eps) + (1-target)*np.log(1-prob+eps))

    return sum((_term(l, p) for l, p in zip(label, pred)), 0)
+ + # initialize the module + mod = mx.module.Module(net, context=ctx, data_names=['user', 'item'], label_names=['softmax_label']) + provide_data = [mx.io.DataDesc(name='item', shape=((batch_size,))), + mx.io.DataDesc(name='user', shape=((batch_size,)))] + provide_label = [mx.io.DataDesc(name='softmax_label', shape=((batch_size,)))] + mod.bind(for_training=True, data_shapes=provide_data, label_shapes=provide_label) + mod.init_params() + mod.init_optimizer(optimizer='adam', optimizer_params=[('learning_rate', learning_rate), ('beta1',beta1), ('beta2',beta2), ('epsilon',eps)]) + + metric = mx.metric.create(cross_entropy) + speedometer = mx.callback.Speedometer(batch_size, log_interval) + best_hr, best_ndcg, best_iter = -1, -1, -1 + logging.info('Training started ...') + for epoch in range(epoch): + metric.reset() + for nbatch, seqs in enumerate(train_dataloader): + user_id, item_id, labels = seqs + batch = mx.io.DataBatch(data = [item_id.astype('int32').as_in_context(ctx), + user_id.astype('int32').as_in_context(ctx)], + label = [labels.as_in_context(ctx)]) + mod.forward(batch) + mod.backward() + mod.update() + predicts=mod.get_outputs()[0] + metric.update(labels = labels, preds = predicts) + speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch, + eval_metric=metric, locals=locals()) + speedometer(speedometer_param) + + # save model + dir_path = os.path.dirname(os.path.realpath(__file__)) + model_path = os.path.join(dir_path, 'model', args.dataset) + if not os.path.exists(model_path): + os.makedirs(model_path) + mod.save_checkpoint(os.path.join(model_path, model_type), epoch) + # compute hit ratio + (hits, ndcgs) = evaluate_model(mod, test_data.testRatings, test_data.testNegatives, topK, eval_batch_size, ctx, logging) + hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() + logging.info('Iteration %d: HR = %.4f, NDCG = %.4f' % (epoch, hr, ndcg)) + # best hit ratio + if hr > best_hr: + best_hr, best_ndcg, best_iter = hr, ndcg, epoch + + 
logging.info("End. Best Iteration %d: HR = %.4f, NDCG = %.4f. " % (best_iter, best_hr, best_ndcg)) + logging.info('Training completed.') + diff --git a/example/quantization/imagenet_inference.py b/example/quantization/imagenet_inference.py new file mode 100644 index 000000000000..4d690d37d00c --- /dev/null +++ b/example/quantization/imagenet_inference.py @@ -0,0 +1,307 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
import argparse
import logging
import os
import time
import numpy as np
import mxnet as mx
from mxnet import nd
from mxnet.contrib.quantization import *
from mxnet.contrib import amp


def download_dataset(dataset_url, dataset_dir, logger=None):
    """Fetch ``dataset_url`` with ``mx.test_utils.download``.

    ``dataset_dir`` is forwarded unchanged as the second positional argument
    of the download helper. ``logger`` is optional; when given, the transfer
    is announced at INFO level before it starts.
    """
    if logger is not None:
        logger.info('Downloading dataset for inference from %s to %s' % (dataset_url, dataset_dir))
    mx.test_utils.download(dataset_url, dataset_dir)


def load_model(symbol_file, param_file, logger=None):
    """Load a symbol and its parameters from files located next to this script.

    Both file names are resolved relative to the directory of this file.
    Parameter keys are expected in the ``'arg:<name>'`` / ``'aux:<name>'``
    form; keys with any other prefix are ignored.

    Returns
    -------
    tuple
        ``(symbol, arg_params, aux_params)``.
    """
    cur_path = os.path.dirname(os.path.realpath(__file__))
    symbol_file_path = os.path.join(cur_path, symbol_file)
    if logger is not None:
        logger.info('Loading symbol from file %s' % symbol_file_path)
    symbol = mx.sym.load(symbol_file_path)

    param_file_path = os.path.join(cur_path, param_file)
    if logger is not None:
        logger.info('Loading params from file %s' % param_file_path)
    save_dict = nd.load(param_file_path)
    arg_params = {}
    aux_params = {}
    for key, value in save_dict.items():
        kind, name = key.split(':', 1)
        if kind == 'arg':
            arg_params[name] = value
        elif kind == 'aux':
            aux_params[name] = value
    return symbol, arg_params, aux_params


def advance_data_iter(data_iter, n):
    """Skip up to ``n`` batches of ``data_iter`` and return the iterator.

    The iterator is always returned, including when it is exhausted before
    ``n`` batches were consumed. (Previously that case returned ``None``,
    which the caller then tried to use as a data iterator.)
    """
    assert n >= 0
    for _ in range(n):
        try:
            data_iter.next()
        except StopIteration:
            # Fewer than n batches were available; nothing more to skip.
            break
    return data_iter


def score(sym, arg_params, aux_params, data, devs, label_name, max_num_examples, logger=None):
    """Run inference over ``data`` and log top-1 / top-5 accuracy and throughput.

    Parameters
    ----------
    sym, arg_params, aux_params
        Model symbol and parameters handed to ``mx.mod.Module``.
    data
        Iterable of batches that also exposes ``provide_data`` and
        ``provide_label``.
    devs
        Context(s) passed to the module.
    label_name : str
        Name of the label input.
    max_num_examples : int or None
        Stop once at least this many examples were processed; ``None`` means
        consume the whole iterator.
    logger : logging.Logger, optional
        Receives the summary; nothing is reported when omitted.
    """
    metrics = [mx.metric.create('acc'),
               mx.metric.create('top_k_accuracy', top_k=5)]
    mod = mx.mod.Module(symbol=sym, context=devs, label_names=[label_name, ])
    mod.bind(for_training=False,
             data_shapes=data.provide_data,
             label_shapes=data.provide_label)
    mod.set_params(arg_params, aux_params)

    tic = time.time()
    num = 0
    for batch in data:
        mod.forward(batch, is_train=False)
        for m in metrics:
            mod.update_metric(m, batch.label)
        # Count from the batch itself instead of the module-level
        # ``batch_size`` that only exists when this file runs as a script.
        num += batch.data[0].shape[0]
        if max_num_examples is not None and num >= max_num_examples:
            break

    speed = num / (time.time() - tic)

    if logger is not None:
        logger.info('Finished inference with %d images' % num)
        logger.info('Finished with %f images per second', speed)
        logger.warning('Note: GPU performance is expected to be slower than CPU. Please refer quantization/README.md for details')
        for m in metrics:
            logger.info(m.get())


def low_precison_convert(model_name, low_precision, sym, arg_params, aux_params, excluded_sym_names=None):
    """Convert a model to ``low_precision`` with ``amp.convert_model``.

    For ``'bfloat16'`` the first convolution of a few known model families
    (selected by substring match on ``model_name``) is added to the list of
    symbols excluded from conversion.

    ``excluded_sym_names`` is copied before use, so neither the caller's list
    nor a shared default is modified between calls.

    Returns whatever ``amp.convert_model`` returns (the callers in this file
    unpack it as ``(sym, arg_params, aux_params)``).
    """
    excluded = list(excluded_sym_names) if excluded_sym_names else []
    if low_precision == 'bfloat16':
        # Order matters: the more specific names are tested first.
        if 'imagenet1k-resnet-152' in model_name:
            excluded.append('conv0')
        elif 'imagenet1k-inception-bn' in model_name:
            excluded.append('conv_1')
        elif 'resnet' in model_name and 'v1' in model_name:
            excluded.append('resnetv10_conv0_fwd')
        elif 'resnet' in model_name and 'v2' in model_name:
            excluded.append('resnetv20_conv0_fwd')
        elif 'vgg' in model_name:
            excluded.append('vgg0_conv0_fwd')
        elif 'squeezenet1' in model_name:
            excluded.append('squeezenet0_conv0_fwd')
        elif 'mobilenet' in model_name and 'v2' not in model_name:
            excluded.append('mobilenet0_conv0_fwd')
        elif 'mobilenet' in model_name and 'v2' in model_name:
            excluded.append('mobilenetv20_conv0_fwd')
        elif 'inceptionv3' in model_name:
            excluded.append('inception30_conv0_fwd')
    return amp.convert_model(sym,
                             arg_params,
                             aux_params,
                             target_dtype=low_precision,
                             excluded_sym_names=excluded,
                             cast_optional_params=True)


def benchmark_score(symbol_file, ctx, batch_size, num_batches, data_layer_type, low_precision, logger=None):
    """Measure forward throughput (images per second) on synthetic data.

    The symbol is loaded from ``symbol_file`` (relative to this script),
    initialised with Xavier weights and fed one constant batch repeatedly.
    Five warm-up iterations run before timing starts.

    NOTE: the per-image shape is read from the module-level ``data_shape``
    that the ``__main__`` section defines; it is not a parameter.
    """
    # get mod
    cur_path = os.path.dirname(os.path.realpath(__file__))
    symbol_file_path = os.path.join(cur_path, symbol_file)
    if logger is not None:
        logger.info('Loading symbol from file %s' % symbol_file_path)
    sym = mx.sym.load(symbol_file_path)
    mod = mx.mod.Module(symbol=sym, context=ctx)
    # Anything that is not int8/uint8 is treated as float32.
    np_dtype = {'int8': np.int8, 'uint8': np.uint8}.get(data_layer_type, np.float32)
    dshape = mx.io.DataDesc(name='data', shape=(batch_size,) + data_shape, dtype=np_dtype)
    mod.bind(for_training=False,
             inputs_need_grad=False,
             data_shapes=[dshape])
    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))

    if low_precision:
        arg_params, aux_params = mod.get_params()
        sym, arg_params, aux_params = low_precison_convert(symbol_file,
                                                           low_precision,
                                                           sym, arg_params,
                                                           aux_params)
        mod = mx.mod.Module(symbol=sym, context=ctx)
        mod.bind(for_training=False,
                 inputs_need_grad=False,
                 data_shapes=[dshape],
                 label_shapes=[['softmax_label', (batch_size,)]])
        mod.set_params(arg_params, aux_params)

    # get data
    if data_layer_type == "float32":
        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx, dtype=data_layer_type)
                for _, shape in mod.data_shapes]
    else:
        data = [mx.nd.full(shape=shape, val=127, ctx=ctx, dtype=data_layer_type)
                for _, shape in mod.data_shapes]
    batch = mx.io.DataBatch(data, [])  # empty label

    # run
    dry_run = 5  # use 5 iterations to warm up
    tic = time.time()  # keeps ``tic`` bound even when num_batches <= 0
    for i in range(dry_run + num_batches):
        if i == dry_run:
            tic = time.time()  # start timing after the warm-up
        mod.forward(batch, is_train=False)
        for output in mod.get_outputs():
            output.wait_to_read()

    # return num images per second
    return num_batches * batch_size / (time.time() - tic)


def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` cannot be used with argparse because ``bool('False')`` is
    ``True``; this accepts the usual spellings instead.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('1', 'true', 't', 'yes', 'y'):
        return True
    if text in ('', '0', 'false', 'f', 'no', 'n'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % value)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Score a model on a dataset')
    parser.add_argument('--ctx', type=str, default='gpu')
    # Accepts "--benchmark", "--benchmark=True" and "--benchmark=False".
    parser.add_argument('--benchmark', type=_str2bool, nargs='?', const=True, default=False,
                        help='dummy data benchmark')
    parser.add_argument('--symbol-file', type=str, required=True, help='symbol file path')
    parser.add_argument('--param-file', type=str, required=False, help='param file path')
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--label-name', type=str, default='softmax_label')
    parser.add_argument('--dataset', type=str, required=False, help='dataset path')
    parser.add_argument('--rgb-mean', type=str, default='0,0,0')
    parser.add_argument('--rgb-std', type=str, default='1,1,1')
    parser.add_argument('--image-shape', type=str, default='3,224,224')
    parser.add_argument('--data-nthreads', type=int, default=60, help='number of threads for data decoding')
    parser.add_argument('--num-skipped-batches', type=int, default=0, help='skip the number of batches for inference')
    parser.add_argument('--num-inference-batches', type=int, required=True, help='number of images used for inference')
    parser.add_argument('--shuffle-dataset', action='store_true', default=True,
                        help='shuffle the calibration dataset')
    # The flag above defaults to True and so could never be switched off.
    parser.add_argument('--no-shuffle-dataset', dest='shuffle_dataset', action='store_false',
                        help='do not shuffle the calibration dataset')
    parser.add_argument('--shuffle-chunk-seed', type=int, default=3982304,
                        help='shuffling chunk seed, see'
                             ' https://mxnet.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter'
                             ' for more details')
    parser.add_argument('--shuffle-seed', type=int, default=48564309,
                        help='shuffling seed, see'
                             ' https://mxnet.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter'
                             ' for more details')
    parser.add_argument('--data-layer-type', type=str, default='float32',
                        choices=['float32', 'int8', 'uint8'],
                        help='data type for data layer')
    parser.add_argument('--low-precision', type=str, default='',
                        choices=['', 'float16', 'bfloat16'],
                        help='enable low precision')

    args = parser.parse_args()

    if args.ctx == 'gpu':
        ctx = mx.gpu(0)
    elif args.ctx == 'cpu':
        ctx = mx.cpu(0)
    else:
        raise ValueError('ctx %s is not supported in this script' % args.ctx)

    logging.basicConfig()
    logger = logging.getLogger('logger')
    logger.setLevel(logging.INFO)

    symbol_file = args.symbol_file
    param_file = args.param_file
    data_nthreads = args.data_nthreads

    batch_size = args.batch_size
    logger.info('batch size = %d for inference' % batch_size)

    rgb_mean = args.rgb_mean
    logger.info('rgb_mean = %s' % rgb_mean)
    rgb_mean = [float(i) for i in rgb_mean.split(',')]
    mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': rgb_mean[2]}
    rgb_std = args.rgb_std
    logger.info('rgb_std = %s' % rgb_std)
    rgb_std = [float(i) for i in rgb_std.split(',')]
    std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]}
    combine_mean_std = {}
    combine_mean_std.update(mean_args)
    combine_mean_std.update(std_args)

    label_name = args.label_name
    logger.info('label_name = %s' % label_name)

    image_shape = args.image_shape
    # ``data_shape`` is also read as a global by benchmark_score().
    data_shape = tuple([int(i) for i in image_shape.split(',')])
    logger.info('Input data shape = %s' % str(data_shape))

    data_layer_type = args.data_layer_type

    if args.low_precision:
        if args.ctx == 'gpu':
            assert args.low_precision == 'float16', "Not supported low-precision options for GPU."
        elif args.ctx == 'cpu':
            assert args.low_precision == 'bfloat16', "Not supported low-precision options for CPU."

    if not args.benchmark:
        dataset = args.dataset
        download_dataset('http://data.mxnet.io/data/val_256_q90.rec', dataset, logger)
        logger.info('Dataset for inference: %s' % dataset)

        # creating data iterator
        data = mx.io.ImageRecordIter(
            path_imgrec=dataset,
            label_width=1,
            preprocess_threads=data_nthreads,
            batch_size=batch_size,
            data_shape=data_shape,
            label_name=label_name,
            rand_crop=False,
            rand_mirror=False,
            shuffle=args.shuffle_dataset,
            shuffle_chunk_seed=args.shuffle_chunk_seed,
            seed=args.shuffle_seed,
            dtype=data_layer_type,
            ctx=args.ctx,
            **combine_mean_std)

        # loading model
        sym, arg_params, aux_params = load_model(symbol_file, param_file, logger)

        if args.low_precision:
            sym, arg_params, aux_params = low_precison_convert(symbol_file,
                                                               args.low_precision,
                                                               sym, arg_params,
                                                               aux_params)
        # make sure that fp32 inference works on the same images as calibrated quantized model
        logger.info('Skipping the first %d batches' % args.num_skipped_batches)
        data = advance_data_iter(data, args.num_skipped_batches)

        num_inference_images = args.num_inference_batches * batch_size
        logger.info('Running model %s for inference' % symbol_file)
        score(sym, arg_params, aux_params, data, [ctx], label_name,
              max_num_examples=num_inference_images, logger=logger)
    else:
        logger.info('Running model %s for inference' % symbol_file)
        speed = benchmark_score(symbol_file, ctx, batch_size,
                                args.num_inference_batches, data_layer_type, args.low_precision, logger)
        logger.info('batch size %2d, image/sec: %f', batch_size, speed)

# Patch header and licence opening of the next file in the collapsed diff,
# kept as comments so that no content is dropped:
# diff --git a/example/rcnn/symnet/metric.py b/example/rcnn/symnet/metric.py
# new file mode 100644
# index 000000000000..fa8d7919e919
# --- /dev/null
# +++ b/example/rcnn/symnet/metric.py
# @@ -0,0 +1,147 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.
# The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import mxnet as mx
import numpy as np


def get_names():
    """Return ``(pred_names, label_names)`` in the order the metrics index them."""
    pred = ['rpn_cls_prob', 'rpn_bbox_loss', 'rcnn_cls_prob', 'rcnn_bbox_loss', 'rcnn_label']
    label = ['rpn_label', 'rpn_bbox_target', 'rpn_bbox_weight']
    return pred, label


class RPNAccMetric(mx.metric.EvalMetric):
    """Classification accuracy of the RPN over entries whose label is not -1."""

    def __init__(self):
        super(RPNAccMetric, self).__init__('RPNAcc')
        self.pred, self.label = get_names()

    def update(self, labels, preds):
        scores = preds[self.pred.index('rpn_cls_prob')]
        target = labels[self.label.index('rpn_label')]

        # scores are (b, c, p) or (b, c, h, w); take the arg-max over the
        # channel axis and flatten everything after the batch axis
        guess = mx.ndarray.argmax_channel(scores).asnumpy().astype('int32')
        guess = guess.reshape((guess.shape[0], -1))
        # target is (b, p)
        target = target.asnumpy().astype('int32')

        # drop the entries labelled -1
        valid = np.where(target != -1)
        guess = guess[valid]
        target = target[valid]

        self.sum_metric += np.sum(guess.flat == target.flat)
        self.num_inst += len(guess.flat)


class RCNNAccMetric(mx.metric.EvalMetric):
    """Classification accuracy of the R-CNN head.

    The label is read from the network outputs (``rcnn_label``), not from
    the ``labels`` argument.
    """

    def __init__(self):
        super(RCNNAccMetric, self).__init__('RCNNAcc')
        self.pred, self.label = get_names()

    def update(self, labels, preds):
        scores = preds[self.pred.index('rcnn_cls_prob')]
        target = preds[self.pred.index('rcnn_label')]

        num_classes = scores.shape[-1]
        guess = scores.asnumpy().reshape(-1, num_classes).argmax(axis=1).astype('int32')
        target = target.asnumpy().reshape(-1,).astype('int32')

        self.sum_metric += np.sum(guess.flat == target.flat)
        self.num_inst += len(guess.flat)


class RPNLogLossMetric(mx.metric.EvalMetric):
    """Negative log-likelihood of the RPN classifier over entries whose label is not -1."""

    def __init__(self):
        super(RPNLogLossMetric, self).__init__('RPNLogLoss')
        self.pred, self.label = get_names()

    def update(self, labels, preds):
        scores = preds[self.pred.index('rpn_cls_prob')]
        target = labels[self.label.index('rpn_label')]

        # target (b, p) --> (b*p,)
        target = target.asnumpy().astype('int32').reshape((-1))
        # scores (b, c, p) or (b, c, h, w) --> (b, p, c) --> (b*p, c)
        scores = scores.asnumpy().reshape((scores.shape[0], scores.shape[1], -1)).transpose((0, 2, 1))
        scores = scores.reshape((target.shape[0], -1))

        # drop the entries labelled -1
        valid = np.where(target != -1)[0]
        target = target[valid]
        prob = scores[valid, target]

        # small offset so log() never sees an exact zero
        prob += 1e-14
        self.sum_metric += np.sum(-1 * np.log(prob))
        self.num_inst += target.shape[0]


class RCNNLogLossMetric(mx.metric.EvalMetric):
    """Negative log-likelihood of the R-CNN classifier (label taken from the outputs)."""

    def __init__(self):
        super(RCNNLogLossMetric, self).__init__('RCNNLogLoss')
        self.pred, self.label = get_names()

    def update(self, labels, preds):
        scores = preds[self.pred.index('rcnn_cls_prob')]
        target = preds[self.pred.index('rcnn_label')]

        num_classes = scores.shape[-1]
        scores = scores.asnumpy().reshape(-1, num_classes)
        target = target.asnumpy().reshape(-1,).astype('int32')
        # probability assigned to the labelled class of every row
        prob = scores[np.arange(target.shape[0]), target]

        # small offset so log() never sees an exact zero
        prob += 1e-14
        self.sum_metric += np.sum(-1 * np.log(prob))
        self.num_inst += target.shape[0]


class RPNL1LossMetric(mx.metric.EvalMetric):
    """Summed RPN box loss, normalised by the count of positive box weights / 4."""

    def __init__(self):
        super(RPNL1LossMetric, self).__init__('RPNL1Loss')
        self.pred, self.label = get_names()

    def update(self, labels, preds):
        loss = preds[self.pred.index('rpn_bbox_loss')].asnumpy()
        weight = labels[self.label.index('rpn_bbox_weight')].asnumpy()

        # calculate num_inst (average on those fg anchors)
        positives = np.sum(weight > 0) / 4

        self.sum_metric += np.sum(loss)
        self.num_inst += positives


class RCNNL1LossMetric(mx.metric.EvalMetric):
    """Summed R-CNN box loss, normalised by the number of non-zero labels."""

    def __init__(self):
        super(RCNNL1LossMetric, self).__init__('RCNNL1Loss')
        self.pred, self.label = get_names()

    def update(self, labels, preds):
        loss = preds[self.pred.index('rcnn_bbox_loss')].asnumpy()
        target = preds[self.pred.index('rcnn_label')].asnumpy()

        # calculate num_inst: entries whose label differs from 0
        foreground = np.where(target != 0)[0]

        self.sum_metric += np.sum(loss)
        self.num_inst += len(foreground)

# Patch header and licence of the next file in the collapsed diff, kept as
# comments so that no content is dropped:
# diff --git a/example/rcnn/train.py b/example/rcnn/train.py
# new file mode 100644
# index 000000000000..7b1f2f7f31a5
# --- /dev/null
# +++ b/example/rcnn/train.py
# @@ -0,0 +1,303 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import argparse
import ast
import pprint

import mxnet as mx
from mxnet.module import Module

from symdata.loader import AnchorGenerator, AnchorSampler, AnchorLoader
from symnet.logger import logger
from symnet.model import load_param, infer_data_shape, check_shape, initialize_frcnn, get_fixed_params
from symnet.metric import RPNAccMetric, RPNLogLossMetric, RPNL1LossMetric, RCNNAccMetric, RCNNLogLossMetric, RCNNL1LossMetric


def train_net(sym, roidb, args):
    """Train the Faster R-CNN symbol ``sym`` on ``roidb`` with ``Module.fit``.

    ``args`` is the namespace produced by :func:`parse_args` (after the
    dataset / network getters have filled in their defaults).
    """
    # print config
    logger.info('called with args\n{}'.format(pprint.pformat(vars(args))))

    # setup multi-gpu
    ctx = [mx.cpu()] if not args.gpus else [mx.gpu(int(i)) for i in args.gpus.split(',')]
    batch_size = args.rcnn_batch_size * len(ctx)

    # load training data
    feat_sym = sym.get_internals()['rpn_cls_score_output']
    ag = AnchorGenerator(feat_stride=args.rpn_feat_stride,
                         anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios)
    asp = AnchorSampler(allowed_border=args.rpn_allowed_border, batch_rois=args.rpn_batch_rois,
                        fg_fraction=args.rpn_fg_fraction, fg_overlap=args.rpn_fg_overlap,
                        bg_overlap=args.rpn_bg_overlap)
    train_data = AnchorLoader(roidb, batch_size, args.img_short_side, args.img_long_side,
                              args.img_pixel_means, args.img_pixel_stds, feat_sym, ag, asp, shuffle=True)

    # produce shape max possible
    _, out_shape, _ = feat_sym.infer_shape(data=(1, 3, args.img_long_side, args.img_long_side))
    feat_height, feat_width = out_shape[0][-2:]
    rpn_num_anchors = len(args.rpn_anchor_scales) * len(args.rpn_anchor_ratios)
    data_names = ['data', 'im_info', 'gt_boxes']
    label_names = ['label', 'bbox_target', 'bbox_weight']
    data_shapes = [('data', (batch_size, 3, args.img_long_side, args.img_long_side)),
                   ('im_info', (batch_size, 3)),
                   ('gt_boxes', (batch_size, 100, 5))]
    label_shapes = [('label', (batch_size, 1, rpn_num_anchors * feat_height, feat_width)),
                    ('bbox_target', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width)),
                    ('bbox_weight', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width))]

    # print shapes
    data_shape_dict, out_shape_dict = infer_data_shape(sym, data_shapes + label_shapes)
    logger.info('max input shape\n%s' % pprint.pformat(data_shape_dict))
    logger.info('max output shape\n%s' % pprint.pformat(out_shape_dict))

    # load and initialize params
    if args.resume:
        arg_params, aux_params = load_param(args.resume)
    else:
        arg_params, aux_params = load_param(args.pretrained)
        arg_params, aux_params = initialize_frcnn(sym, data_shapes, arg_params, aux_params)

    # check parameter shapes
    check_shape(sym, data_shapes + label_shapes, arg_params, aux_params)

    # check fixed params
    fixed_param_names = get_fixed_params(sym, args.net_fixed_params)
    logger.info('locking params\n%s' % pprint.pformat(fixed_param_names))

    # metric
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [RPNAccMetric(), RPNLogLossMetric(), RPNL1LossMetric(),
                         RCNNAccMetric(), RCNNLogLossMetric(), RCNNL1LossMetric()]:
        eval_metrics.add(child_metric)

    # callback
    batch_end_callback = mx.callback.Speedometer(batch_size, frequent=args.log_interval, auto_reset=False)
    epoch_end_callback = mx.callback.do_checkpoint(args.save_prefix)

    # learning schedule
    base_lr = args.lr
    lr_factor = 0.1
    lr_epoch = [int(epoch) for epoch in args.lr_decay_epoch.split(',')]
    # decay points that are still ahead of the starting epoch
    lr_epoch_diff = [epoch - args.start_epoch for epoch in lr_epoch if epoch > args.start_epoch]
    # apply the decays that already happened before start_epoch
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff]
    logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters))
    # When resuming at or after the last decay epoch no steps remain;
    # MultiFactorScheduler needs at least one step, so train with the
    # already-decayed constant rate instead.
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) if lr_iters else None
    # optimizer
    optimizer_params = {'momentum': 0.9,
                        'wd': 0.0005,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': (1.0 / batch_size),
                        'clip_gradient': 5}

    # train
    mod = Module(sym, data_names=data_names, label_names=label_names,
                 logger=logger, context=ctx, work_load_list=None,
                 fixed_param_names=fixed_param_names)
    mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback, kvstore='device',
            optimizer='sgd', optimizer_params=optimizer_params,
            arg_params=arg_params, aux_params=aux_params, begin_epoch=args.start_epoch, num_epoch=args.epochs)


def parse_args():
    """Parse the command line; tuple/list valued options are given as Python literals."""
    parser = argparse.ArgumentParser(description='Train Faster R-CNN network',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--network', type=str, default='vgg16', help='base network')
    parser.add_argument('--pretrained', type=str, default='', help='path to pretrained model')
    parser.add_argument('--dataset', type=str, default='voc', help='training dataset')
    parser.add_argument('--imageset', type=str, default='', help='imageset splits')
    parser.add_argument('--gpus', type=str, help='GPU devices, eg: "0,1,2,3" , not set to use CPU')
    parser.add_argument('--epochs', type=int, default=10, help='training epochs')
    parser.add_argument('--lr', type=float, default=0.001, help='base learning rate')
    parser.add_argument('--lr-decay-epoch', type=str, default='7', help='epoch to decay lr')
    parser.add_argument('--resume', type=str, default='', help='path to last saved model')
    parser.add_argument('--start-epoch', type=int, default=0, help='start epoch for resuming')
    parser.add_argument('--log-interval', type=int, default=100, help='logging mini batch interval')
    parser.add_argument('--save-prefix', type=str, default='', help='saving params prefix')
    # faster rcnn params
    parser.add_argument('--img-short-side', type=int, default=600)
    parser.add_argument('--img-long-side', type=int, default=1000)
    parser.add_argument('--img-pixel-means', type=str, default='(0.0, 0.0, 0.0)')
    parser.add_argument('--img-pixel-stds', type=str, default='(1.0, 1.0, 1.0)')
    parser.add_argument('--net-fixed-params', type=str, default='["conv0", "stage1", "gamma", "beta"]')
    parser.add_argument('--rpn-feat-stride', type=int, default=16)
    parser.add_argument('--rpn-anchor-scales', type=str, default='(8, 16, 32)')
    parser.add_argument('--rpn-anchor-ratios', type=str, default='(0.5, 1, 2)')
    parser.add_argument('--rpn-pre-nms-topk', type=int, default=12000)
    parser.add_argument('--rpn-post-nms-topk', type=int, default=2000)
    parser.add_argument('--rpn-nms-thresh', type=float, default=0.7)
    parser.add_argument('--rpn-min-size', type=int, default=16)
    parser.add_argument('--rpn-batch-rois', type=int, default=256)
    parser.add_argument('--rpn-allowed-border', type=int, default=0)
    parser.add_argument('--rpn-fg-fraction', type=float, default=0.5)
    parser.add_argument('--rpn-fg-overlap', type=float, default=0.7)
    parser.add_argument('--rpn-bg-overlap', type=float, default=0.3)
    parser.add_argument('--rcnn-num-classes', type=int, default=21)
    parser.add_argument('--rcnn-feat-stride', type=int, default=16)
    parser.add_argument('--rcnn-pooled-size', type=str, default='(14, 14)')
    parser.add_argument('--rcnn-batch-size', type=int, default=1)
    parser.add_argument('--rcnn-batch-rois', type=int, default=128)
    parser.add_argument('--rcnn-fg-fraction', type=float, default=0.25)
    parser.add_argument('--rcnn-fg-overlap', type=float, default=0.5)
    parser.add_argument('--rcnn-bbox-stds', type=str, default='(0.1, 0.1, 0.2, 0.2)')
    args = parser.parse_args()
    # literal_eval only accepts Python literals, so these strings cannot run code
    for literal_option in ('img_pixel_means', 'img_pixel_stds', 'net_fixed_params',
                           'rpn_anchor_scales', 'rpn_anchor_ratios',
                           'rcnn_pooled_size', 'rcnn_bbox_stds'):
        setattr(args, literal_option, ast.literal_eval(getattr(args, literal_option)))
    return args


def get_voc(args):
    """Build the roidb for the '+'-separated Pascal VOC image sets in ``args.imageset``."""
    from symimdb.pascal_voc import PascalVOC
    if not args.imageset:
        args.imageset = '2007_trainval'
    args.rcnn_num_classes = len(PascalVOC.classes)

    roidb = []
    for iset in args.imageset.split('+'):
        imdb = PascalVOC(iset, 'data', 'data/VOCdevkit')
        imdb.append_flipped_images()
        roidb.extend(imdb.roidb)
    return roidb


def get_coco(args):
    """Build the roidb for the '+'-separated COCO image sets in ``args.imageset``."""
    from symimdb.coco import coco
    if not args.imageset:
        args.imageset = 'train2017'
    args.rcnn_num_classes = len(coco.classes)

    roidb = []
    for iset in args.imageset.split('+'):
        imdb = coco(iset, 'data', 'data/coco')
        imdb.filter_roidb()
        imdb.append_flipped_images()
        roidb.extend(imdb.roidb)
    return roidb


def _shared_symbol_kwargs(args):
    """Collect the keyword arguments every ``get_*_train`` symbol builder receives."""
    return dict(anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios,
                rpn_feature_stride=args.rpn_feat_stride, rpn_pre_topk=args.rpn_pre_nms_topk,
                rpn_post_topk=args.rpn_post_nms_topk, rpn_nms_thresh=args.rpn_nms_thresh,
                rpn_min_size=args.rpn_min_size, rpn_batch_rois=args.rpn_batch_rois,
                num_classes=args.rcnn_num_classes, rcnn_feature_stride=args.rcnn_feat_stride,
                rcnn_pooled_size=args.rcnn_pooled_size, rcnn_batch_size=args.rcnn_batch_size,
                rcnn_batch_rois=args.rcnn_batch_rois, rcnn_fg_fraction=args.rcnn_fg_fraction,
                rcnn_fg_overlap=args.rcnn_fg_overlap, rcnn_bbox_stds=args.rcnn_bbox_stds)


def _get_resnet_symbol(args, pretrained, save_prefix, units):
    """Apply the ResNet defaults to ``args`` and build the symbol with ``units`` per stage."""
    from symnet.symbol_resnet import get_resnet_train
    if not args.pretrained:
        args.pretrained = pretrained
    if not args.save_prefix:
        args.save_prefix = save_prefix
    # these settings are fixed for the network and replace any CLI value
    args.img_pixel_means = (0.0, 0.0, 0.0)
    args.img_pixel_stds = (1.0, 1.0, 1.0)
    args.net_fixed_params = ['conv0', 'stage1', 'gamma', 'beta']
    args.rpn_feat_stride = 16
    args.rcnn_feat_stride = 16
    args.rcnn_pooled_size = (14, 14)
    return get_resnet_train(units=units, filter_list=(256, 512, 1024, 2048),
                            **_shared_symbol_kwargs(args))


def get_vgg16_train(args):
    """Apply the VGG-16 defaults to ``args`` and return the training symbol."""
    from symnet.symbol_vgg import get_vgg_train
    if not args.pretrained:
        args.pretrained = 'model/vgg16-0000.params'
    if not args.save_prefix:
        args.save_prefix = 'model/vgg16'
    # these settings are fixed for the network and replace any CLI value
    args.img_pixel_means = (123.68, 116.779, 103.939)
    args.img_pixel_stds = (1.0, 1.0, 1.0)
    args.net_fixed_params = ['conv1', 'conv2']
    args.rpn_feat_stride = 16
    args.rcnn_feat_stride = 16
    args.rcnn_pooled_size = (7, 7)
    return get_vgg_train(**_shared_symbol_kwargs(args))


def get_resnet50_train(args):
    """Apply the ResNet-50 defaults to ``args`` and return the training symbol."""
    return _get_resnet_symbol(args, 'model/resnet-50-0000.params', 'model/resnet50',
                              units=(3, 4, 6, 3))


def get_resnet101_train(args):
    """Apply the ResNet-101 defaults to ``args`` and return the training symbol."""
    return _get_resnet_symbol(args, 'model/resnet-101-0000.params', 'model/resnet101',
                              units=(3, 4, 23, 3))


def get_dataset(dataset, args):
    """Return the roidb for ``dataset`` ('voc' or 'coco'); raise ValueError otherwise."""
    datasets = {
        'voc': get_voc,
        'coco': get_coco
    }
    if dataset not in datasets:
        raise ValueError("dataset {} not supported".format(dataset))
    return datasets[dataset](args)


def get_network(network, args):
    """Return the training symbol for ``network``; raise ValueError if it is unknown."""
    networks = {
        'vgg16': get_vgg16_train,
        'resnet50': get_resnet50_train,
        'resnet101': get_resnet101_train
    }
    if network not in networks:
        raise ValueError("network {} not supported".format(network))
    return networks[network](args)


def main():
    """Script entry point: parse arguments, load data, build the network, train."""
    args = parse_args()
    roidb = get_dataset(args.dataset, args)
    sym = get_network(args.network, args)
    train_net(sym, roidb, args)


if __name__ == '__main__':
    main()

# Patch header and licence opening of the next file in the collapsed diff,
# kept as comments so that no content is dropped:
# diff --git a/example/rnn/bucketing/cudnn_rnn_bucketing.py b/example/rnn/bucketing/cudnn_rnn_bucketing.py
# new file mode 100644
# index 000000000000..38275ae3dfb8
# --- /dev/null
# +++ b/example/rnn/bucketing/cudnn_rnn_bucketing.py
# @@ -0,0 +1,272 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.
See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import mxnet as mx +import argparse +from mxnet.contrib.amp import amp + +parser = argparse.ArgumentParser(description="Train RNN on Sherlock Holmes", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--test', default=False, action='store_true', + help='whether to do testing instead of training') +parser.add_argument('--model-prefix', type=str, default=None, + help='path to save/load model') +parser.add_argument('--load-epoch', type=int, default=0, + help='load from epoch') +parser.add_argument('--num-layers', type=int, default=2, + help='number of stacked RNN layers') +parser.add_argument('--num-hidden', type=int, default=200, + help='hidden layer size') +parser.add_argument('--num-embed', type=int, default=200, + help='embedding layer size') +parser.add_argument('--bidirectional', action='store_true', + help='uses bidirectional layers if specified') +parser.add_argument('--gpus', type=str, + help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. 
' \ + 'Increase batch size when using multiple gpus for best performance.') +parser.add_argument('--kv-store', type=str, default='device', + help='key-value store type') +parser.add_argument('--num-epochs', type=int, default=25, + help='max num of epochs') +parser.add_argument('--lr', type=float, default=0.01, + help='initial learning rate') +parser.add_argument('--optimizer', type=str, default='sgd', + help='the optimizer type') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--wd', type=float, default=0.00001, + help='weight decay for sgd') +parser.add_argument('--batch-size', type=int, default=32, + help='the batch size.') +parser.add_argument('--disp-batches', type=int, default=50, + help='show progress for every n batches') +# When training a deep, complex model *on multiple GPUs* it's recommended to +# stack fused RNN cells (one layer per cell) together instead of one with all +# layers. The reason is that fused RNN cells don't set gradients to be ready +# until the computation for the entire layer is completed. Breaking a +# multi-layer fused RNN cell into several one-layer ones allows gradients to be +# processed earlier. This reduces communication overhead, especially with +# multiple GPUs. 
+parser.add_argument('--stack-rnn', default=False, + help='stack fused RNN cells to reduce communication overhead') +parser.add_argument('--dropout', type=float, default='0.0', + help='dropout probability (1.0 - keep probability)') +parser.add_argument('--rnntype', type=str, default='lstm', + help='rnn type: gru, lstm, rnn_tanh and rnn_relu are supported') +parser.add_argument('--dtype', type=str, default='float32', + help='if float16 is provided AMP convert model' + 'is used to convert model to mixed precision model' + 'before running inference') + +#buckets = [32] +buckets = [10, 20, 30, 40, 50, 60] + +start_label = 1 +invalid_label = 0 + +def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0): + lines = open(fname).readlines() + lines = [filter(None, i.split(' ')) for i in lines] + sentences, vocab = mx.rnn.encode_sentences(lines, vocab=vocab, invalid_label=invalid_label, start_label=start_label) + return sentences, vocab + +def get_data(layout): + train_sent, vocab = tokenize_text("./data/sherlockholmes.train.txt", start_label=start_label, + invalid_label=invalid_label) + val_sent, _ = tokenize_text("./data/sherlockholmes.test.txt", vocab=vocab, start_label=start_label, + invalid_label=invalid_label) + + data_train = mx.rnn.BucketSentenceIter(train_sent, args.batch_size, buckets=buckets, + invalid_label=invalid_label, layout=layout) + data_val = mx.rnn.BucketSentenceIter(val_sent, args.batch_size, buckets=buckets, + invalid_label=invalid_label, layout=layout) + return data_train, data_val, vocab + + +def train(args): + data_train, data_val, vocab = get_data('TN') + if args.stack_rnn: + cell = mx.rnn.SequentialRNNCell() + for i in range(args.num_layers): + cell.add(mx.rnn.FusedRNNCell(args.num_hidden, num_layers=1, + mode=args.rnntype, prefix='%s_l%d'%(args.rnntype,i), + bidirectional=args.bidirectional)) + if args.dropout > 0 and i < args.num_layers - 1 and args.rnntype == 'lstm': + cell.add(mx.rnn.DropoutCell(args.dropout, 
prefix='%s_d%d'%(args.rnntype,i))) + else: + cell = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, dropout=args.dropout, + mode=args.rnntype, bidirectional=args.bidirectional) + + def sym_gen(seq_len): + data = mx.sym.Variable('data') + label = mx.sym.Variable('softmax_label') + embed = mx.sym.Embedding(data=data, input_dim=len(vocab), output_dim=args.num_embed,name='embed') + + output, _ = cell.unroll(seq_len, inputs=embed, merge_outputs=True, layout='TNC') + + pred = mx.sym.Reshape(output, + shape=(-1, args.num_hidden*(1+args.bidirectional))) + pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred') + + label = mx.sym.Reshape(label, shape=(-1,)) + pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax') + + return pred, ('data',), ('softmax_label',) + + if args.gpus: + contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] + else: + contexts = mx.cpu(0) + + model = mx.mod.BucketingModule( + sym_gen = sym_gen, + default_bucket_key = data_train.default_bucket_key, + context = contexts) + + if args.load_epoch: + _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint( + cell, args.model_prefix, args.load_epoch) + else: + arg_params = None + aux_params = None + + opt_params = { + 'learning_rate': args.lr, + 'wd': args.wd + } + + if args.optimizer not in ['adadelta', 'adagrad', 'adam', 'rmsprop']: + opt_params['momentum'] = args.mom + + model.fit( + train_data = data_train, + eval_data = data_val, + eval_metric = mx.metric.Perplexity(invalid_label), + kvstore = args.kv_store, + optimizer = args.optimizer, + optimizer_params = opt_params, + initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), + arg_params = arg_params, + aux_params = aux_params, + begin_epoch = args.load_epoch, + num_epoch = args.num_epochs, + batch_end_callback = mx.callback.Speedometer(args.batch_size, args.disp_batches, auto_reset=False), + epoch_end_callback = mx.rnn.do_rnn_checkpoint(cell, args.model_prefix, 1) + if args.model_prefix 
else None) + +def test(args): + assert args.model_prefix, "Must specifiy path to load from" + _, data_val, vocab = get_data('NT') + + if not args.stack_rnn: + stack = mx.rnn.FusedRNNCell(args.num_hidden, num_layers=args.num_layers, + mode=args.rnntype, bidirectional=args.bidirectional).unfuse() + else: + stack = mx.rnn.SequentialRNNCell() + for i in range(args.num_layers): + if args.rnntype == 'lstm': + cell = mx.rnn.LSTMCell(num_hidden=args.num_hidden, prefix='%s_%dl0_'%(args.rnntype,i)) + if args.bidirectional: + cell = mx.rnn.BidirectionalCell( + cell, + mx.rnn.LSTMCell(num_hidden=args.num_hidden, prefix='%s_%dr0_'%(args.rnntype,i)), + output_prefix='bi_%s_%d'%(args.rnntype,i)) + elif args.rnntype == 'gru': + cell = mx.rnn.GRUCell(num_hidden=args.num_hidden, prefix='%s_%dl0_'%(args.rnntype,i)) + if args.bidirectional: + cell = mx.rnn.BidirectionalCell( + cell, + mx.rnn.GRUCell(num_hidden=args.num_hidden, prefix='%s_%dr0_'%(args.rnntype,i)), + output_prefix='bi_%s_%d'%(args.rnntype,i)) + elif args.rnntype == 'rnn_tanh': + cell = mx.rnn.RNNCell(num_hidden=args.num_hidden, activation='tanh', prefix='%s_%dl0_'%(args.rnntype,i)) + if args.bidirectional: + cell = mx.rnn.BidirectionalCell( + cell, + mx.rnn.RNNCell(num_hidden=args.num_hidden, activation='tanh', prefix='%s_%dr0_'%(args.rnntype,i)), + output_prefix='bi_%s_%d'%(args.rnntype,i)) + elif args.rnntype == 'rnn_relu': + cell = mx.rnn.RNNCell(num_hidden=args.num_hidden, activation='relu', prefix='%s_%dl0_'%(args.rnntype,i)) + if args.bidirectional: + cell = mx.rnn.BidirectionalCell( + cell, + mx.rnn.RNNCell(num_hidden=args.num_hidden, activation='relu', prefix='%s_%dr0_'%(args.rnntype,i)), + output_prefix='bi_%s_%d'%(args.rnntype,i)) + + stack.add(cell) + + def sym_gen(seq_len): + data = mx.sym.Variable('data') + label = mx.sym.Variable('softmax_label') + embed = mx.sym.Embedding(data=data, input_dim=len(vocab), + output_dim=args.num_embed, name='embed') + + stack.reset() + outputs, states = stack.unroll(seq_len, 
inputs=embed, merge_outputs=True) + + pred = mx.sym.Reshape(outputs, + shape=(-1, args.num_hidden*(1+args.bidirectional))) + pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred') + + label = mx.sym.Reshape(label, shape=(-1,)) + pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax') + + return pred, ('data',), ('softmax_label',) + + if args.gpus: + contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] + else: + contexts = mx.cpu(0) + + model = mx.mod.BucketingModule( + sym_gen = sym_gen, + default_bucket_key = data_val.default_bucket_key, + context = contexts) + model.bind(data_val.provide_data, data_val.provide_label, for_training=False) + + _, arg_params, aux_params = mx.rnn.load_rnn_checkpoint(stack, args.model_prefix, args.load_epoch) + model.set_params(arg_params, aux_params) + + if args.dtype == "float32": + model.set_params(arg_params, aux_params) + model.score(data_val, mx.metric.Perplexity(invalid_label), + batch_end_callback=mx.callback.Speedometer(args.batch_size, 5)) + else: + assert args.dtype == "float16", "Only float32 and float16 are supported currently" + model = amp.convert_bucketing_module(model, target_dtype="float16") + model.bind(data_val.provide_data, data_val.provide_label, + for_training=False) + model.score(data_val, mx.metric.Perplexity(invalid_label), + batch_end_callback=mx.callback.Speedometer(args.batch_size, 5)) + +if __name__ == '__main__': + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + + args = parser.parse_args() + + if args.num_layers >= 4 and len(args.gpus.split(',')) >= 4 and not args.stack_rnn: + print('WARNING: stack-rnn is recommended to train complex model on multiple GPUs') + + if args.test: + # Demonstrates how to load a model trained with CuDNN RNN and predict + # with non-fused MXNet symbol + test(args) + else: + train(args) diff --git a/example/rnn/bucketing/lstm_bucketing.py b/example/rnn/bucketing/lstm_bucketing.py 
new file mode 100644 index 000000000000..7f150104f458 --- /dev/null +++ b/example/rnn/bucketing/lstm_bucketing.py @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import mxnet as mx +import argparse +import os + +parser = argparse.ArgumentParser(description="Train RNN on Sherlock Holmes", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-layers', type=int, default=2, + help='number of stacked RNN layers') +parser.add_argument('--num-hidden', type=int, default=200, + help='hidden layer size') +parser.add_argument('--num-embed', type=int, default=200, + help='embedding layer size') +parser.add_argument('--gpus', type=str, + help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu. 
' \ + 'Increase batch size when using multiple gpus for best performance.') +parser.add_argument('--kv-store', type=str, default='device', + help='key-value store type') +parser.add_argument('--num-epochs', type=int, default=25, + help='max num of epochs') +parser.add_argument('--lr', type=float, default=0.01, + help='initial learning rate') +parser.add_argument('--optimizer', type=str, default='sgd', + help='the optimizer type') +parser.add_argument('--mom', type=float, default=0.0, + help='momentum for sgd') +parser.add_argument('--wd', type=float, default=0.00001, + help='weight decay for sgd') +parser.add_argument('--batch-size', type=int, default=32, + help='the batch size.') +parser.add_argument('--disp-batches', type=int, default=50, + help='show progress for every n batches') + +def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0): + if not os.path.isfile(fname): + raise IOError("Please use get_sherlockholmes_data.sh to download requied file (data/sherlockholmes.train.txt)") + lines = open(fname).readlines() + lines = [filter(None, i.split(' ')) for i in lines] + sentences, vocab = mx.rnn.encode_sentences(lines, vocab=vocab, invalid_label=invalid_label, + start_label=start_label) + return sentences, vocab + + +if __name__ == '__main__': + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + + args = parser.parse_args() + + #buckets = [] + buckets = [10, 20, 30, 40, 50, 60] + + start_label = 1 + invalid_label = 0 + + train_sent, vocab = tokenize_text("./data/sherlockholmes.train.txt", start_label=start_label, + invalid_label=invalid_label) + val_sent, _ = tokenize_text("./data/sherlockholmes.test.txt", vocab=vocab, start_label=start_label, + invalid_label=invalid_label) + + data_train = mx.rnn.BucketSentenceIter(train_sent, args.batch_size, buckets=buckets, + invalid_label=invalid_label) + data_val = mx.rnn.BucketSentenceIter(val_sent, args.batch_size, buckets=buckets, + 
invalid_label=invalid_label) + + stack = mx.rnn.SequentialRNNCell() + for i in range(args.num_layers): + stack.add(mx.rnn.LSTMCell(num_hidden=args.num_hidden, prefix='lstm_l%d_'%i)) + + def sym_gen(seq_len): + data = mx.sym.Variable('data') + label = mx.sym.Variable('softmax_label') + embed = mx.sym.Embedding(data=data, input_dim=len(vocab), + output_dim=args.num_embed, name='embed') + + stack.reset() + outputs = stack.unroll(seq_len, inputs=embed, merge_outputs=True)[0] + + pred = mx.sym.Reshape(outputs, shape=(-1, args.num_hidden)) + pred = mx.sym.FullyConnected(data=pred, num_hidden=len(vocab), name='pred') + + label = mx.sym.Reshape(label, shape=(-1,)) + pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax') + + return pred, ('data',), ('softmax_label',) + + if args.gpus: + contexts = [mx.gpu(int(i)) for i in args.gpus.split(',')] + else: + contexts = mx.cpu(0) + + model = mx.mod.BucketingModule( + sym_gen = sym_gen, + default_bucket_key = data_train.default_bucket_key, + context = contexts) + + model.fit( + train_data = data_train, + eval_data = data_val, + eval_metric = mx.metric.Perplexity(invalid_label), + kvstore = args.kv_store, + optimizer = args.optimizer, + optimizer_params = { 'learning_rate': args.lr, + 'momentum': args.mom, + 'wd': args.wd }, + initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), + num_epoch = args.num_epochs, + batch_end_callback = mx.callback.Speedometer(args.batch_size, args.disp_batches, auto_reset=False)) diff --git a/example/rnn/old/char-rnn.ipynb b/example/rnn/old/char-rnn.ipynb new file mode 100644 index 000000000000..1ec56cd9aa8c --- /dev/null +++ b/example/rnn/old/char-rnn.ipynb @@ -0,0 +1,549 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import mxnet as mx\n", + "import numpy as np\n", + "import random\n", + "import bisect" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + 
"collapsed": true + }, + "outputs": [], + "source": [ + "# set up logging\n", + "import logging\n", + "reload(logging)\n", + "logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.DEBUG, datefmt='%I:%M:%S')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# A Glance of LSTM structure and embedding layer\n", + "\n", + "We will build a LSTM network to learn from char only. At each time, input is a char. We will see this LSTM is able to learn words and grammers from sequence of chars.\n", + "\n", + "The following figure is showing an unrolled LSTM network, and how we generate embedding of a char. The one-hot to embedding operation is a special case of fully connected network.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from lstm import lstm_unroll, lstm_inference_symbol\n", + "from bucket_io import BucketSentenceIter\n", + "from rnn_model import LSTMInferenceModel" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Read from doc\n", + "def read_content(path):\n", + " with open(path) as ins:\n", + " content = ins.read()\n", + " return content\n", + "\n", + "# Build a vocabulary of what char we have in the content\n", + "def build_vocab(path):\n", + " content = read_content(path)\n", + " content = list(content)\n", + " idx = 1 # 0 is left for zero-padding\n", + " the_vocab = {}\n", + " for word in content:\n", + " if len(word) == 0:\n", + " continue\n", + " if not word in the_vocab:\n", + " the_vocab[word] = idx\n", + " idx += 1\n", + " return the_vocab\n", + "\n", + "# We will assign each char with a special numerical id\n", + "def text2id(sentence, the_vocab):\n", + " words = 
list(sentence)\n", + " words = [the_vocab[w] for w in words if len(w) > 0]\n", + " return words" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Evaluation \n", + "def Perplexity(label, pred):\n", + " label = label.T.reshape((-1,))\n", + " loss = 0.\n", + " for i in range(pred.shape[0]):\n", + " loss += -np.log(max(1e-10, pred[i][int(label[i])]))\n", + " return np.exp(loss / label.size)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Get Data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "data_url = \"http://data.mxnet.io/mxnet/data/char_lstm.zip\"\n", + "os.system(\"wget %s\" % data_url)\n", + "os.system(\"unzip -o char_lstm.zip\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sample training data:\n", + "```\n", + "all to Renewal Keynote Address Call to Renewal Pt 1Call to Renewal Part 2 TOPIC: Our Past, Our Future & Vision for America June\n", + "28, 2006 Call to Renewal' Keynote Address Complete Text Good morning. I appreciate the opportunity to speak here at the Call to R\n", + "enewal's Building a Covenant for a New America conference. I've had the opportunity to take a look at your Covenant for a New Ame\n", + "rica. It is filled with outstanding policies and prescriptions for much of what ails this country. 
So I'd like to congratulate yo\n", + "u all on the thoughtful presentations you've given so far about poverty and justice in America, and for putting fire under the fe\n", + "et of the political leadership here in Washington.But today I'd like to talk about the connection between religion and politics a\n", + "nd perhaps offer some thoughts about how we can sort through some of the often bitter arguments that we've been seeing over the l\n", + "ast several years.I do so because, as you all know, we can affirm the importance of poverty in the Bible; and we can raise up and\n", + " pass out this Covenant for a New America. We can talk to the press, and we can discuss the religious call to address poverty and\n", + " environmental stewardship all we want, but it won't have an impact unless we tackle head-on the mutual suspicion that sometimes\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LSTM Hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# The batch size for training\n", + "batch_size = 32\n", + "# We can support various length input\n", + "# For this problem, we cut each input sentence to length of 129\n", + "# So we only need fix length bucket\n", + "buckets = [129]\n", + "# hidden unit in LSTM cell\n", + "num_hidden = 512\n", + "# embedding dimension, which is, map a char to a 256 dim vector\n", + "num_embed = 256\n", + "# number of lstm layer\n", + "num_lstm_layer = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# we will show a quick demo in 2 epoch\n", + "# and we will see result by training 75 epoch\n", + "num_epoch = 2\n", + "# learning rate \n", + "learning_rate = 0.01\n", + "# we will use pure sgd without momentum\n", + "momentum = 0.0" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + 
"collapsed": true + }, + "outputs": [], + "source": [ + "# we can select multi-gpu for training\n", + "# for this demo we only use one\n", + "devs = [mx.context.gpu(i) for i in range(1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# build char vocabluary from input\n", + "vocab = build_vocab(\"./obama.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# generate symbol for a length\n", + "def sym_gen(seq_len):\n", + " return lstm_unroll(num_lstm_layer, seq_len, len(vocab) + 1,\n", + " num_hidden=num_hidden, num_embed=num_embed,\n", + " num_label=len(vocab) + 1, dropout=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# initalize states for LSTM\n", + "init_c = [('l%d_init_c'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]\n", + "init_h = [('l%d_init_h'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]\n", + "init_states = init_c + init_h" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary of dataset ==================\n", + "bucket of len 129 : 8290 samples\n" + ] + } + ], + "source": [ + "# we can build an iterator for text\n", + "data_train = BucketSentenceIter(\"./obama.txt\", vocab, buckets, batch_size,\n", + " init_states, seperate_char='\\n',\n", + " text2id=text2id, read_content=read_content)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# the network symbol\n", + "symbol = sym_gen(buckets[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train model" + ] + }, + { + "cell_type": "code", + 
"execution_count": 14, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Train a LSTM network as simple as feedforward network\n", + "model = mx.model.FeedForward(ctx=devs,\n", + " symbol=symbol,\n", + " num_epoch=num_epoch,\n", + " learning_rate=learning_rate,\n", + " momentum=momentum,\n", + " wd=0.0001,\n", + " initializer=mx.init.Xavier(factor_type=\"in\", magnitude=2.34))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "05:01:35 INFO:Start training with [gpu(0)]\n" + ] + } + ], + "source": [ + "# Fit it\n", + "model.fit(X=data_train,\n", + " eval_metric = mx.metric.np(Perplexity),\n", + " batch_end_callback=mx.callback.Speedometer(batch_size, 50),\n", + " epoch_end_callback=mx.callback.do_checkpoint(\"obama\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inference from model" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# helper strcuture for prediction\n", + "def MakeRevertVocab(vocab):\n", + " dic = {}\n", + " for k, v in vocab.items():\n", + " dic[v] = k\n", + " return dic" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# make input from char\n", + "def MakeInput(char, vocab, arr):\n", + " idx = vocab[char]\n", + " tmp = np.zeros((1,))\n", + " tmp[0] = idx\n", + " arr[:] = tmp" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# helper function for random sample \n", + "def _cdf(weights):\n", + " total = sum(weights)\n", + " result = []\n", + " cumsum = 0\n", + " for w in weights:\n", + " cumsum += w\n", + " result.append(cumsum / total)\n", + " return result\n", + "\n", + "def _choice(population, 
weights):\n", + " assert len(population) == len(weights)\n", + " cdf_vals = _cdf(weights)\n", + " x = random.random()\n", + " idx = bisect.bisect(cdf_vals, x)\n", + " return population[idx]\n", + "\n", + "# we can use random output or fixed output by choosing largest probability\n", + "def MakeOutput(prob, vocab, sample=False, temperature=1.):\n", + " if sample == False:\n", + " idx = np.argmax(prob, axis=1)[0]\n", + " else:\n", + " fix_dict = [\"\"] + [vocab[i] for i in range(1, len(vocab) + 1)]\n", + " scale_prob = np.clip(prob, 1e-6, 1 - 1e-6)\n", + " rescale = np.exp(np.log(scale_prob) / temperature)\n", + " rescale[:] /= rescale.sum()\n", + " return _choice(fix_dict, rescale[0, :])\n", + " try:\n", + " char = vocab[idx]\n", + " except:\n", + " char = ''\n", + " return char" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# load from check-point\n", + "_, arg_params, __ = mx.model.load_checkpoint(\"obama\", 75)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# build an inference model\n", + "model = LSTMInferenceModel(num_lstm_layer, len(vocab) + 1,\n", + " num_hidden=num_hidden, num_embed=num_embed,\n", + " num_label=len(vocab) + 1, arg_params=arg_params, ctx=mx.gpu(), dropout=0.2)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# generate a sequence of 1200 chars\n", + "\n", + "seq_length = 1200\n", + "input_ndarray = mx.nd.zeros((1,))\n", + "revert_vocab = MakeRevertVocab(vocab)\n", + "# Feel free to change the starter sentence\n", + "output ='The joke'\n", + "random_sample = True\n", + "new_sentence = True\n", + "\n", + "ignore_length = len(output)\n", + "\n", + "for i in range(seq_length):\n", + " if i <= ignore_length - 1:\n", + " MakeInput(output[i], vocab, input_ndarray)\n", + " else:\n", + " 
MakeInput(output[-1], vocab, input_ndarray)\n", + " prob = model.forward(input_ndarray, new_sentence)\n", + " new_sentence = False\n", + " next_char = MakeOutput(prob, revert_vocab, random_sample)\n", + " if next_char == '':\n", + " new_sentence = True\n", + " if i >= ignore_length - 1:\n", + " output += next_char\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The joke learning to be struggle for our daughter. We are the ones who can't pay their relationship. The Judiciary Commencement ce designed to deficit to the party of almost unemployment instead, just to look at home, little proof for America, Carguin are showing struggle against our pride. That if you came from tharger by a party that would increase the pervasive sense of new global warming against the challenge of governments - to get a corporation.As a highealth care, your own retirement security information about his family decided to get a job or aspect what will allow cannot simply by sagging high school system and stin twenty-five years. But led my faith designed to leave all their buddets and responsibility. But I sund this dangerous weapons, explain withdrawal oful -clears axdication in Iraq.What is the time for American policy became their efforts, and given them that a man doesn't make sure that that my own, you'll be faced with you. Four years, reforms illness all that kind of choose to understand is a broadeary. 
You instills in search of a reducithis recision, of us, with public services from using that barealies, but that must continue to limb line, they know th\n" + ] + } + ], + "source": [ + "# Let's see what we can learned from char in Obama's speech.\n", + "print(output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/example/rnn/old/gru_bucketing.py b/example/rnn/old/gru_bucketing.py new file mode 100644 index 000000000000..b9f651a90dc0 --- /dev/null +++ b/example/rnn/old/gru_bucketing.py @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 

# pylint: disable=C0111,too-many-arguments,too-many-instance-attributes,too-many-locals,redefined-outer-name,fixme
# pylint: disable=superfluous-parens, no-member, invalid-name
import sys
sys.path.insert(0, "../../python")
import numpy as np
import mxnet as mx

from gru import gru_unroll
from bucket_io import BucketSentenceIter, default_build_vocab, DummyIter

def Perplexity(label, pred):
    """Return exp of the summed negative log-probability divided by label.size.

    `label` is transposed and flattened; for each row ``i`` of `pred` the
    entry at column ``int(label[i])`` is taken, floored at 1e-10 before the
    logarithm so a zero probability does not produce ``inf``.
    """
    label = label.T.reshape((-1,))
    loss = 0.
    for i in range(pred.shape[0]):
        loss += -np.log(max(1e-10, pred[i][int(label[i])]))
    return np.exp(loss / label.size)

if __name__ == '__main__':
    batch_size = 32
    #buckets = [10, 20, 30, 40, 50, 60]
    #buckets = [32]
    buckets = []
    num_hidden = 200
    num_embed = 200
    num_lstm_layer = 2

    num_epoch = 25
    learning_rate = 0.01
    momentum = 0.0

    # dummy data is used to test speed without IO
    dummy_data = False

    #contexts = [mx.context.gpu(i) for i in range(1)]
    contexts = mx.context.cpu()

    vocab = default_build_vocab("./data/sherlockholmes.train.txt")

    def sym_gen(seq_len):
        """Build the unrolled GRU symbol for sequences of length `seq_len`."""
        return gru_unroll(num_lstm_layer, seq_len, len(vocab),
                          num_hidden=num_hidden, num_embed=num_embed,
                          num_label=len(vocab))

    # One (name, shape) entry per layer for the initial hidden state.
    init_h = [('l%d_init_h'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]

    data_train = BucketSentenceIter("./data/sherlockholmes.train.txt", vocab,
                                    buckets, batch_size, init_h)
    data_val = BucketSentenceIter("./data/sherlockholmes.valid.txt", vocab,
                                  buckets, batch_size, init_h)

    if dummy_data:
        data_train = DummyIter(data_train)
        data_val = DummyIter(data_val)

    if len(buckets) == 1:
        # only 1 bucket, disable bucketing
        symbol = sym_gen(buckets[0])
    else:
        # Otherwise the generator function itself is handed to the model.
        symbol = sym_gen

    model = mx.model.FeedForward(ctx=contexts,
                                 symbol=symbol,
                                 num_epoch=num_epoch,
                                 learning_rate=learning_rate,
                                 momentum=momentum,
                                 wd=0.00001,
                                 initializer=mx.init.Xavier(factor_type="in", magnitude=2.34))

    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    model.fit(X=data_train, eval_data=data_val,
              eval_metric = mx.metric.np(Perplexity),
              batch_end_callback=mx.callback.Speedometer(batch_size, 50),)

diff --git a/example/rnn/old/lstm_bucketing.py b/example/rnn/old/lstm_bucketing.py new file mode 100644 index 000000000000..0fe4116250a2 --- /dev/null +++ b/example/rnn/old/lstm_bucketing.py @@ -0,0 +1,95 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=C0111,too-many-arguments,too-many-instance-attributes,too-many-locals,redefined-outer-name,fixme
# pylint: disable=superfluous-parens, no-member, invalid-name
import sys
sys.path.insert(0, "../../python")
import numpy as np
import mxnet as mx

from lstm import lstm_unroll
from bucket_io import BucketSentenceIter, default_build_vocab, DummyIter

def Perplexity(label, pred):
    """Return exp of the summed negative log-probability divided by label.size.

    `label` is transposed and flattened; for each row ``i`` of `pred` the
    entry at column ``int(label[i])`` is taken, floored at 1e-10 before the
    logarithm so a zero probability does not produce ``inf``.
    """
    label = label.T.reshape((-1,))
    loss = 0.
    for i in range(pred.shape[0]):
        loss += -np.log(max(1e-10, pred[i][int(label[i])]))
    return np.exp(loss / label.size)

if __name__ == '__main__':
    # N is used both as the batch-size multiplier and as the number of GPU
    # contexts created below.
    N = 8
    batch_size = 32*N
    #buckets = [10, 20, 30, 40, 50, 60]
    buckets = [32]
    #buckets = []
    num_hidden = 200
    num_embed = 200
    num_lstm_layer = 2

    num_epoch = 25
    learning_rate = 0.01
    momentum = 0.0

    # dummy data is used to test speed without IO
    dummy_data = False

    contexts = [mx.context.gpu(i) for i in range(N)]

    vocab = default_build_vocab("./data/sherlockholmes.train.txt")

    def sym_gen(seq_len):
        """Build the unrolled LSTM symbol for sequences of length `seq_len`."""
        return lstm_unroll(num_lstm_layer, seq_len, len(vocab),
                           num_hidden=num_hidden, num_embed=num_embed,
                           num_label=len(vocab))

    # One (name, shape) entry per layer for the initial cell and hidden states.
    init_c = [('l%d_init_c'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
    init_h = [('l%d_init_h'%l, (batch_size, num_hidden)) for l in range(num_lstm_layer)]
    init_states = init_c + init_h

    data_train = BucketSentenceIter("./data/sherlockholmes.train.txt", vocab,
                                    buckets, batch_size, init_states)
    data_val = BucketSentenceIter("./data/sherlockholmes.valid.txt", vocab,
                                  buckets, batch_size, init_states)

    if dummy_data:
        data_train = DummyIter(data_train)
        data_val = DummyIter(data_val)

    if len(buckets) == 1:
        # only 1 bucket, disable bucketing
        symbol = sym_gen(buckets[0])
    else:
        # Otherwise the generator function itself is handed to the model.
        symbol = sym_gen

    model = mx.model.FeedForward(ctx=contexts,
                                 symbol=symbol,
                                 num_epoch=num_epoch,
                                 learning_rate=learning_rate,
                                 momentum=momentum,
                                 wd=0.00001,
                                 initializer=mx.init.Xavier(factor_type="in", magnitude=2.34))

    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    model.fit(X=data_train, eval_data=data_val, kvstore='device',
              eval_metric = mx.metric.np(Perplexity),
              batch_end_callback=mx.callback.Speedometer(batch_size, 50),)

diff --git a/example/rnn/old/rnn_cell_demo.py b/example/rnn/old/rnn_cell_demo.py new file mode 100644 index 000000000000..c5772fa3a5b7 ---
# /dev/null
# +++ b/example/rnn/old/rnn_cell_demo.py
# @@ -0,0 +1,152 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""A simple demo of new RNN cell with sherlockholmes language model."""

import os

import numpy as np
import mxnet as mx

from bucket_io import BucketSentenceIter, default_build_vocab


# Data files are looked up in a 'data' directory next to this script.
data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data'))


def Perplexity(label, pred):
    """Return the perplexity of ``pred`` with respect to ``label``.

    The i-th entry of the transposed, flattened ``label`` selects the column
    of row ``i`` of ``pred``.  Selected values are floored at 1e-10 so the
    logarithm stays finite.

    NOTE(review): presumably ``pred`` is a (rows, classes) array of class
    probabilities and ``label`` holds class indices -- confirm with the
    data iterator.
    """
    # TODO(tofix): we make a transpose of label here, because when
    # using the RNN cell, we called swap axis to the data.
    label = label.T.reshape((-1,))
    num_rows = pred.shape[0]
    # Gather the target entry of every row in one indexing operation instead
    # of a Python-level loop; astype truncates like the int() it replaces.
    target = label[:num_rows].astype(np.int64)
    picked = pred[np.arange(num_rows), target]
    # Accumulate in float64, matching the precision of the former Python sum.
    loss = -np.log(np.maximum(1e-10, picked)).sum(dtype=np.float64)
    return np.exp(loss / label.size)


if __name__ == '__main__':
    batch_size = 128
    buckets = [10, 20, 30, 40, 50, 60]
    num_hidden = 200
    num_embed = 200
    num_lstm_layer = 2

    num_epoch = 2
    learning_rate = 0.01
    momentum = 0.0

    contexts = [mx.context.gpu(i) for i in range(4)]
    vocab = default_build_vocab(os.path.join(data_dir, 'sherlockholmes.train.txt'))

    # Initial states are declared batch-first here and swapped to
    # layer-first inside sym_gen.
    init_h = [('LSTM_init_h', (batch_size, num_lstm_layer, num_hidden))]
    init_c = [('LSTM_init_c', (batch_size, num_lstm_layer, num_hidden))]
    init_states = init_c + init_h

    data_train = BucketSentenceIter(os.path.join(data_dir, 'sherlockholmes.train.txt'),
                                    vocab, buckets, batch_size, init_states)
    data_val = BucketSentenceIter(os.path.join(data_dir, 'sherlockholmes.valid.txt'),
                                  vocab, buckets, batch_size, init_states)

    def sym_gen(seq_len):
        """Return ``(symbol, data_names, label_names)`` for one bucket.

        ``seq_len`` is accepted for the bucketing interface but is not used
        when building the symbol.
        """
        data = mx.sym.Variable('data')
        label = mx.sym.Variable('softmax_label')
        embed = mx.sym.Embedding(data=data, input_dim=len(vocab),
                                 output_dim=num_embed, name='embed')

        # TODO(tofix)
        # The inputs and labels from IO are all in batch-major.
        # We need to transform them into time-major to use RNN cells.
        embed_tm = mx.sym.SwapAxis(embed, dim1=0, dim2=1)
        label_tm = mx.sym.SwapAxis(label, dim1=0, dim2=1)

        # TODO(tofix)
        # Create transformed RNN initial states. Normally we do
        # not need to do this. But the RNN symbol expects the state
        # to be time-major shape layout, while the current mxnet
        # IO and high-level training logic assume everything from
        # the data iter have batch_size as the first dimension.
        # So until we have extended our IO and training logic to
        # support this more general case, this dummy axis swap is
        # needed.
        rnn_h_init = mx.sym.SwapAxis(mx.sym.Variable('LSTM_init_h'),
                                     dim1=0, dim2=1)
        rnn_c_init = mx.sym.SwapAxis(mx.sym.Variable('LSTM_init_c'),
                                     dim1=0, dim2=1)

        # TODO(tofix)
        # currently all the LSTM parameters are concatenated as
        # a huge vector, and named '_parameters'. By default
        # mxnet initializer does not know how to initialize this
        # guy because its name does not end with _weight or _bias
        # or anything familiar. Here we just use a temp workaround
        # to create a variable and name it as LSTM_bias to get
        # this demo running. Note by default bias is initialized
        # as zeros, so this is not a good scheme. But calling it
        # LSTM_weight is not good, as this is 1D vector, while
        # the initialization scheme of a weight parameter needs
        # at least two dimensions.
        rnn_params = mx.sym.Variable('LSTM_bias')

        # RNN cell takes input of shape (time, batch, feature)
        rnn = mx.sym.RNN(data=embed_tm, state_size=num_hidden,
                         num_layers=num_lstm_layer, mode='lstm',
                         name='LSTM',
                         # The following params can be omitted
                         # provided we do not need to apply the
                         # workarounds mentioned above
                         state=rnn_h_init,
                         state_cell=rnn_c_init,
                         parameters=rnn_params)

        # the RNN cell output is of shape (time, batch, dim)
        # if we need the states and cell states in the last time
        # step (e.g. when building encoder-decoder models), we
        # can set state_outputs=True, and the RNN cell will have
        # extra outputs: rnn['LSTM_output'], rnn['LSTM_state']
        # and for LSTM, also rnn['LSTM_state_cell']

        # now we collapse the time and batch dimension to do the
        # final linear logistic regression prediction
        hidden = mx.sym.Reshape(data=rnn, shape=(-1, num_hidden))
        label_cl = mx.sym.Reshape(data=label_tm, shape=(-1,))

        pred = mx.sym.FullyConnected(data=hidden, num_hidden=len(vocab),
                                     name='pred')
        sm = mx.sym.SoftmaxOutput(data=pred, label=label_cl, name='softmax')

        data_names = ['data', 'LSTM_init_h', 'LSTM_init_c']
        label_names = ['softmax_label']

        return (sm, data_names, label_names)

    if len(buckets) == 1:
        # a single bucket needs no bucketing module
        mod = mx.mod.Module(*sym_gen(buckets[0]), context=contexts)
    else:
        mod = mx.mod.BucketingModule(sym_gen, default_bucket_key=data_train.default_bucket_key,
                                     context=contexts)

    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)

    mod.fit(data_train, eval_data=data_val, num_epoch=num_epoch,
            eval_metric=mx.metric.np(Perplexity),
            batch_end_callback=mx.callback.Speedometer(batch_size, 50),
            initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
            optimizer='sgd',
            optimizer_params={'learning_rate': learning_rate,
                              'momentum': momentum, 'wd': 0.00001})

# diff --git a/example/sparse/factorization_machine/metric.py b/example/sparse/factorization_machine/metric.py
# new file mode 100644
# index 000000000000..a8c52c781c0f
# --- /dev/null
# +++ b/example/sparse/factorization_machine/metric.py
# @@ -0,0 +1,125 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.
# The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import mxnet as mx
import numpy as np
from operator import itemgetter

@mx.metric.register
@mx.metric.alias('log_loss')
class LogLossMetric(mx.metric.EvalMetric):
    """Computes the negative log-likelihood loss for binary predictions.

    Each prediction ``p`` is expanded to the two-class row ``(1 - p, p)``
    and the loss accumulated over a batch of :math:`N` samples is

    .. math::
        -\\sum_{n=1}^{N}\\sum_{k=1}^{K}t_{nk}\\log (y_{nk} + \\epsilon),

    where :math:`K = 2` is the number of classes, :math:`y_{nk}` is the predicted
    probability for :math:`k`-th class for :math:`n`-th sample. :math:`t_{nk}=1` if and
    only if sample :math:`n` belongs to class :math:`k`.

    Parameters
    ----------
    eps : float
        Negative log-likelihood loss is undefined for predicted value is 0,
        so predicted values are added with the small constant.
    name : str
        Name of this metric instance for display.
    output_names : list of str, or None
        Name of predictions that should be used when updating with update_dict.
        By default include all predictions.
    label_names : list of str, or None
        Name of labels that should be used when updating with update_dict.
        By default include all labels.

    Examples
    --------
    >>> predicts = [mx.nd.array([[0.3], [0.8], [0.4]])]
    >>> labels = [mx.nd.array([0, 1, 1])]
    >>> log_loss = LogLossMetric()
    >>> log_loss.update(labels, predicts)
    >>> print(log_loss.get())  # doctest: +ELLIPSIS
    ('log-loss', 0.4987...)
    """
    def __init__(self, eps=1e-12, name='log-loss',
                 output_names=None, label_names=None):
        super(LogLossMetric, self).__init__(
            name, eps=eps,
            output_names=output_names, label_names=label_names)
        self.eps = eps

    def update(self, labels, preds):
        """Updates the internal evaluation result.

        Parameters
        ----------
        labels : list of `NDArray`
            The labels of the data.

        preds : list of `NDArray`
            Predicted values.
        """
        mx.metric.check_label_shapes(labels, preds)

        for label, pred in zip(labels, preds):
            label = label.asnumpy()
            pred = pred.asnumpy()
            # Column 0 is the probability of class 0, column 1 of class 1.
            pred = np.column_stack((1 - pred, pred))

            label = label.ravel()
            num_examples = pred.shape[0]
            assert label.shape[0] == num_examples, (label.shape[0], num_examples)
            # Pick, for every sample, the probability of its labelled class.
            prob = pred[np.arange(num_examples, dtype=np.int64), np.int64(label)]
            self.sum_metric += (-np.log(prob + self.eps)).sum()
            self.num_inst += num_examples

@mx.metric.register
@mx.metric.alias('auc')
class AUCMetric(mx.metric.EvalMetric):
    """Area under the ROC curve, averaged over the batches seen.

    Only the first label array and the first prediction array of each
    update are used.  A label counts as positive when it equals 1.0.

    Parameters
    ----------
    eps : float
        Stored on the instance; not used by ``update``.
    """
    def __init__(self, eps=1e-12):
        super(AUCMetric, self).__init__(
            'auc')
        self.eps = eps

    def update(self, labels, preds):
        """Adds the AUC of one batch to the running average.

        Parameters
        ----------
        labels : list of `NDArray`
            The labels of the data; ``labels[0]`` is used.

        preds : list of `NDArray`
            Predicted scores; ``preds[0]`` is used.
            NOTE(review): assumes one score per sample (shape ``(N,)`` or
            ``(N, 1)``) -- confirm against the model output.

        Raises
        ------
        ValueError
            If the batch contains a single class only.
        """
        mx.metric.check_label_shapes(labels, preds)
        label_weight = labels[0].asnumpy()
        scores = preds[0].asnumpy().ravel()

        label_sum = label_weight.sum()
        if label_sum == 0 or label_sum == label_weight.size:
            raise ValueError("AUC with one class is undefined")

        label_one_num = np.count_nonzero(label_weight)
        label_zero_num = len(label_weight) - label_one_num
        total_area = label_zero_num * label_one_num

        # Rank samples by descending score.  mergesort is stable, so ties
        # keep their original order exactly as sorted(..., reverse=True) did.
        order = np.argsort(-scores, kind='mergesort')
        is_positive = label_weight.ravel()[order] == 1.0
        # height[i] = positives ranked at or above position i; every
        # non-positive sample contributes a column of that height.
        height = np.cumsum(is_positive)
        area = height[~is_positive].sum()

        # float() keeps true division regardless of the integer types above.
        self.sum_metric += float(area) / total_area
        self.num_inst += 1

# diff --git a/example/sparse/factorization_machine/train.py b/example/sparse/factorization_machine/train.py
# new file mode 100644
# index 000000000000..b30f9cc81acf
# --- /dev/null
# +++ b/example/sparse/factorization_machine/train.py
# @@ -0,0 +1,147 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import mxnet as mx
from metric import *
from mxnet.test_utils import *
from model import factorization_machine_model
import argparse, os

parser = argparse.ArgumentParser(description="Run factorization machine with criteo dataset",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--data-train', type=str, default=None,
                    help='training dataset in LibSVM format.')
parser.add_argument('--data-test', type=str, default=None,
                    help='test dataset in LibSVM format.')
parser.add_argument('--num-epoch', type=int, default=1,
                    help='number of epochs to train')
parser.add_argument('--batch-size', type=int, default=1000,
                    help='number of examples per batch')
parser.add_argument('--input-size', type=int, default=1000000,
                    help='number of features in the input')
parser.add_argument('--factor-size', type=int, default=16,
                    help='number of latent variables')
parser.add_argument('--factor-lr', type=float, default=0.0001,
                    help='learning rate for factor terms')
parser.add_argument('--linear-lr', type=float, default=0.001,
                    help='learning rate for linear terms')
parser.add_argument('--bias-lr', type=float, default=0.1,
                    help='learning rate for bias terms')
parser.add_argument('--factor-wd', type=float, default=0.00001,
                    help='weight decay rate for factor terms')
parser.add_argument('--linear-wd', type=float, default=0.001,
                    help='weight decay rate for linear terms')
parser.add_argument('--bias-wd', type=float, default=0.01,
                    help='weight decay rate for bias terms')
parser.add_argument('--factor-sigma', type=float, default=0.001,
                    help='standard deviation for initialization of factor terms')
parser.add_argument('--linear-sigma', type=float, default=0.01,
                    help='standard deviation for initialization of linear terms')
parser.add_argument('--bias-sigma', type=float, default=0.01,
                    help='standard deviation for initialization of bias terms')
parser.add_argument('--log-interval', type=int, default=100,
                    help='number of batches between logging messages')
parser.add_argument('--kvstore', type=str, default='local',
                    help='what kvstore to use', choices=["dist_async", "local"])


if __name__ == '__main__':
    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.INFO, format=head)

    # arg parser
    args = parser.parse_args()
    logging.info(args)
    num_epoch = args.num_epoch
    batch_size = args.batch_size
    kvstore = args.kvstore
    factor_size = args.factor_size
    num_features = args.input_size
    log_interval = args.log_interval
    assert(args.data_train is not None and args.data_test is not None), \
        "dataset for training or test is missing"

    def batch_row_ids(data_batch):
        """ Generate row ids based on the current mini-batch """
        idx = data_batch.data[0].indices
        return {'w': idx, 'v': idx}

    def all_row_ids(data_batch):
        """ Generate row ids for all rows """
        # data_batch is unused; the signature matches batch_row_ids.
        all_rows = mx.nd.arange(0, num_features, dtype='int64')
        return {'w': all_rows, 'v': all_rows}

    # create kvstore
    kv = mx.kvstore.create(kvstore)
    # data iterator
    train_data = mx.io.LibSVMIter(data_libsvm=args.data_train, data_shape=(num_features,),
                                  batch_size=batch_size)
    eval_data = mx.io.LibSVMIter(data_libsvm=args.data_test, data_shape=(num_features,),
                                 batch_size=batch_size)
    # model: per-parameter learning rate, weight decay and initializer,
    # keyed by 'v' (factor), 'w' (linear) and 'w0' (bias)
    lr_config = {'v': args.factor_lr, 'w': args.linear_lr, 'w0': args.bias_lr}
    wd_config = {'v': args.factor_wd, 'w': args.linear_wd, 'w0': args.bias_wd}
    init_config = {'v': mx.initializer.Normal(args.factor_sigma),
                   'w': mx.initializer.Normal(args.linear_sigma),
                   'w0': mx.initializer.Normal(args.bias_sigma)}
    model = factorization_machine_model(factor_size, num_features, lr_config, wd_config, init_config)

    # module
    mod = mx.mod.Module(symbol=model)
    mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
    mod.init_params()
    # NOTE(review): learning_rate and wd are 1 here, presumably so that the
    # per-parameter values in lr_config / wd_config act as the effective
    # rates -- confirm in factorization_machine_model.
    optimizer_params=(('learning_rate', 1), ('wd', 1), ('beta1', 0.9),
                      ('beta2', 0.999), ('epsilon', 1e-8))
    mod.init_optimizer(optimizer='adam', kvstore=kv, optimizer_params=optimizer_params)

    # metrics
    metric = mx.metric.create(['log_loss', 'auc'])
    speedometer = mx.callback.Speedometer(batch_size, log_interval)

    logging.info('Training started ...')
    train_iter = iter(train_data)
    eval_iter = iter(eval_data)
    for epoch in range(num_epoch):
        nbatch = 0
        metric.reset()
        for batch in train_iter:
            nbatch += 1
            try:
                # manually pull sparse weights from kvstore so that _square_sum
                # only computes the rows necessary
                mod.prepare(batch, sparse_row_id_fn=batch_row_ids)
                mod.forward_backward(batch)
                # update all parameters (including the weight parameter)
                mod.update()
                # update training metric
                mod.update_metric(metric, batch.label)
                speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                           eval_metric=metric, locals=locals())
                speedometer(speedometer_param)
            except Exception as err:  # pylint: disable=broad-except
                # Training stays best-effort: a failing batch is skipped, but
                # the failure is reported, and KeyboardInterrupt / SystemExit
                # are no longer swallowed as they were by a bare ``except``.
                logging.warning('epoch %d, batch %d skipped: %s', epoch, nbatch, err)
                continue

        # pull all updated rows before validation
        mod.prepare(None, all_row_ids)
        # evaluate metric on validation dataset
        score = mod.score(eval_iter, ['log_loss'])
        logging.info("epoch %d, eval log loss = %s" % (epoch, score[0][1]))
        # reset the iterator for next pass of data
        train_iter.reset()
        eval_iter.reset()
    logging.info('Training completed.')

# diff --git a/example/sparse/linear_classification/train.py b/example/sparse/linear_classification/train.py
# new file mode 100644
# index 000000000000..0a8acfd87bef
# --- /dev/null
# +++ b/example/sparse/linear_classification/train.py
# @@ -0,0 +1,139 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import mxnet as mx
from mxnet.test_utils import *
from data import get_avazu_data
from linear_model import *
import argparse
import os

parser = argparse.ArgumentParser(
    description="Run sparse linear classification " \
                "with distributed kvstore",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--num-epoch', type=int, default=5,
                    help='number of epochs to train')
parser.add_argument('--batch-size', type=int, default=8192,
                    help='number of examples per batch')
parser.add_argument('--kvstore', type=str, default=None,
                    help='what kvstore to use',
                    choices=["dist_sync", "dist_async", "local"])
parser.add_argument('--optimizer', type=str, default='sgd',
                    help='what optimizer to use',
                    choices=["adagrad", "sgd", "adam"])

# File names, download location and feature count of the dataset.
AVAZU = {
    'train': 'avazu-app',
    'test': 'avazu-app.t',
    'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/",
    # 1000000 + 1 since LibSVMIter uses zero-based indexing
    'num_features': 1000001,
}

def batch_row_ids(data_batch):
    """ Row ids of the weight rows touched by the given mini-batch """
    touched_rows = data_batch.data[0].indices
    return {'weight': touched_rows}

def all_row_ids(data_batch):
    """ Row ids covering every row of the weight (data_batch is unused) """
    return {'weight': mx.nd.arange(0, AVAZU['num_features'], dtype='int64')}

if __name__ == '__main__':
    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.INFO, format=head)

    # command line options
    args = parser.parse_args()
    logging.info(args)
    num_epoch = args.num_epoch
    kvstore = args.kvstore
    batch_size = args.batch_size
    optimizer = args.optimizer

    # without a kvstore the script runs as a single worker with rank 0
    kv = mx.kvstore.create(kvstore) if kvstore else None
    rank = kv.rank if kv else 0
    num_worker = kv.num_workers if kv else 1

    # dataset: fetch both files into ./data under the working directory
    num_features = AVAZU['num_features']
    data_dir = os.path.join(os.getcwd(), 'data')
    train_path = os.path.join(data_dir, AVAZU['train'])
    val_path = os.path.join(data_dir, AVAZU['test'])
    get_avazu_data(data_dir, AVAZU['train'], AVAZU['url'])
    get_avazu_data(data_dir, AVAZU['test'], AVAZU['url'])

    # data iterators: the training file is partitioned across workers,
    # the evaluation file is read in full by every worker
    train_data = mx.io.LibSVMIter(data_libsvm=train_path, data_shape=(num_features,),
                                  batch_size=batch_size, num_parts=num_worker,
                                  part_index=rank)
    eval_data = mx.io.LibSVMIter(data_libsvm=val_path, data_shape=(num_features,),
                                 batch_size=batch_size)

    # model
    # The positive class weight, says how much more we should upweight the importance of
    # positive instances in the objective function.
    # This is used to combat the extreme class imbalance.
    positive_class_weight = 2
    model = linear_model(num_features, positive_class_weight)

    # module
    mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['softmax_label'])
    mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
    mod.init_params()
    optim = mx.optimizer.create(optimizer, learning_rate=0.01,
                                rescale_grad=1.0/batch_size/num_worker)
    mod.init_optimizer(optimizer=optim, kvstore=kv)
    # negative log-likelihood is the training and evaluation metric
    metric = mx.metric.create(['nll_loss'])

    # report throughput every 100 batches
    speedometer = mx.callback.Speedometer(batch_size, 100)

    logging.info('Training started ...')
    for epoch in range(num_epoch):
        metric.reset()
        for nbatch, batch in enumerate(train_data, 1):
            # for distributed training, we need to manually pull sparse weights from kvstore
            mod.prepare(batch, sparse_row_id_fn=batch_row_ids)
            mod.forward_backward(batch)
            # update all parameters (including the weight parameter)
            mod.update()
            # update training metric
            mod.update_metric(metric, batch.label)
            speedometer(mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch,
                                               eval_metric=metric, locals=locals()))

        # prepare the module weight with all row ids for inference. Alternatively, one could
        # pass sparse_row_id_fn=batch_row_ids to mod.score to fetch the weight per mini-batch
        mod.prepare(None, all_row_ids)
        # evaluate metric on validation dataset
        score = mod.score(eval_data, ['nll_loss'])
        logging.info('epoch %d, eval nll = %s ' % (epoch, score[0][1]))

        # prepare the module weight with all row ids before making a checkpoint.
        mod.prepare(None, all_row_ids)
        mod.save_checkpoint("checkpoint", epoch)
        # reset the iterator for next pass of data
        train_data.reset()
        eval_data.reset()
    logging.info('Training completed.')

# diff --git a/example/sparse/matrix_factorization/train.py b/example/sparse/matrix_factorization/train.py
# new file mode 100644
# index 000000000000..44bab2c416ba
# --- /dev/null
# +++ b/example/sparse/matrix_factorization/train.py
# @@ -0,0 +1,132 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import argparse
import logging
import mxnet as mx
import numpy as np
from data import get_movielens_iter, get_movielens_data
from model import matrix_fact_net
import os

# NOTE: logging is configured once, inside the __main__ block. An earlier
# module-level logging.basicConfig(level=logging.DEBUG) call installed a root
# handler first, which turned the formatted INFO configuration below into a
# no-op (basicConfig does nothing when the root logger already has handlers).

parser = argparse.ArgumentParser(description="Run matrix factorization with sparse embedding",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--num-epoch', type=int, default=3,
                    help='number of epochs to train')
parser.add_argument('--seed', type=int, default=1,
                    help='random seed')
parser.add_argument('--batch-size', type=int, default=128,
                    help='number of examples per batch')
parser.add_argument('--log-interval', type=int, default=100,
                    help='logging interval')
parser.add_argument('--factor-size', type=int, default=128,
                    help="the factor size of the embedding operation")
parser.add_argument('--gpus', type=str,
                    help="list of gpus to run, e.g. 0 or 0,2. empty means using cpu().")
parser.add_argument('--dense', action='store_true', help="whether to use dense embedding")

# Dataset name, file locations and id upper bounds used by this script.
MOVIELENS = {
    'dataset': 'ml-10m',
    'train': './data/ml-10M100K/r1.train',
    'val': './data/ml-10M100K/r1.test',
    'max_user': 71569,
    'max_movie': 65135,
}

def batch_row_ids(data_batch):
    """ Generate row ids based on the current mini-batch """
    # NOTE(review): data[0] is read as the item ids and data[1] as the user
    # ids -- confirm this matches the order produced by get_movielens_iter.
    item = data_batch.data[0]
    user = data_batch.data[1]
    return {'user_weight': user.astype(np.int64),
            'item_weight': item.astype(np.int64)}

def all_row_ids(data_batch):
    """ Generate row ids for all rows """
    # data_batch is unused; the signature matches batch_row_ids.
    all_users = mx.nd.arange(0, MOVIELENS['max_user'], dtype='int64')
    all_movies = mx.nd.arange(0, MOVIELENS['max_movie'], dtype='int64')
    return {'user_weight': all_users, 'item_weight': all_movies}

if __name__ == '__main__':
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.INFO, format=head)

    # arg parser
    args = parser.parse_args()
    logging.info(args)
    num_epoch = args.num_epoch
    batch_size = args.batch_size
    optimizer = 'sgd'
    factor_size = args.factor_size
    log_interval = args.log_interval

    momentum = 0.9
    # one context per listed GPU id, or the CPU when --gpus is not given
    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] if args.gpus else [mx.cpu()]
    learning_rate = 0.1
    mx.random.seed(args.seed)
    np.random.seed(args.seed)

    # prepare dataset and iterators
    max_user = MOVIELENS['max_user']
    max_movies = MOVIELENS['max_movie']
    data_dir = os.path.join(os.getcwd(), 'data')
    get_movielens_data(data_dir, MOVIELENS['dataset'])
    train_iter = get_movielens_iter(MOVIELENS['train'], batch_size)
    val_iter = get_movielens_iter(MOVIELENS['val'], batch_size)

    # construct the model
    net = matrix_fact_net(factor_size, factor_size, max_user, max_movies, dense=args.dense)

    # initialize the module
    mod = mx.module.Module(net, context=ctx, data_names=['user', 'item'],
                           label_names=['score'])
    mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label)
    mod.init_params(initializer=mx.init.Xavier(factor_type="in", magnitude=2.34))
    optim = mx.optimizer.create(optimizer, learning_rate=learning_rate,
                                rescale_grad=1.0/batch_size)
    mod.init_optimizer(optimizer=optim, kvstore='device')
    # use MSE as the metric
    metric = mx.metric.create(['MSE'])
    speedometer = mx.callback.Speedometer(batch_size, log_interval)
    logging.info('Training started ...')
    for epoch in range(num_epoch):
        nbatch = 0
        metric.reset()
        for batch in train_iter:
            nbatch += 1
            mod.prepare(batch, sparse_row_id_fn=batch_row_ids)
            mod.forward_backward(batch)
            # update all parameters
            mod.update()
            # update training metric
            mod.update_metric(metric, batch.label)
            speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                       eval_metric=metric, locals=locals())
            speedometer(speedometer_param)

        # prepare the module weight with all row ids for inference. Alternatively, one could call
        # score = mod.score(val_iter, ['MSE'], sparse_row_id_fn=batch_row_ids)
        # to fetch the weight per mini-batch
        mod.prepare(None, sparse_row_id_fn=all_row_ids)
        # evaluate metric on validation dataset
        score = mod.score(val_iter, ['MSE'])
        logging.info('epoch %d, eval MSE = %s ' % (epoch, score[0][1]))
        # reset the iterator for next pass of data
        train_iter.reset()
        val_iter.reset()
    logging.info('Training completed.')

# diff --git a/example/sparse/wide_deep/inference.py b/example/sparse/wide_deep/inference.py
# new file mode 100644
# index 000000000000..e14396e50c15
# --- /dev/null
# +++ b/example/sparse/wide_deep/inference.py
# @@ -0,0 +1,106 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import mxnet as mx
from mxnet.test_utils import *
from config import *
from data import get_uci_adult
from model import wide_deep_model
import argparse
import os
import time

parser = argparse.ArgumentParser(description="Run sparse wide and deep inference",
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--num-infer-batch', type=int, default=100,
                    help='number of batches to inference')
parser.add_argument('--load-epoch', type=int, default=0,
                    help='loading the params of the corresponding training epoch.')
parser.add_argument('--batch-size', type=int, default=100,
                    help='number of examples per batch')
parser.add_argument('--benchmark', action='store_true', default=False,
                    help='run the script for benchmark mode, not set for accuracy test.')
parser.add_argument('--verbose', action='store_true', default=False,
                    help='accurcy for each batch will be logged if set')
parser.add_argument('--gpu', action='store_true', default=False,
                    help='Inference on GPU with CUDA')
parser.add_argument('--model-prefix', type=str, default='checkpoint',
                    help='the model prefix')

if __name__ == '__main__':
    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.INFO, format=head)

    # arg parser
    args = parser.parse_args()
    logging.info(args)
    num_iters = args.num_infer_batch
    batch_size = args.batch_size
    benchmark = args.benchmark
    verbose = args.verbose
    model_prefix = args.model_prefix
    load_epoch = args.load_epoch
    ctx = mx.gpu(0) if args.gpu else mx.cpu()
    # dataset
    data_dir = os.path.join(os.getcwd(), 'data')
    val_data = os.path.join(data_dir, ADULT['test'])
    val_csr, val_dns, val_label = get_uci_adult(data_dir, ADULT['test'], ADULT['url'])
    # load parameters and symbol
    sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, load_epoch)
    # data iterator
    eval_data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns},
                                  {'softmax_label': val_label}, batch_size,
                                  shuffle=True, last_batch_handle='discard')
    # module
    mod = mx.mod.Module(symbol=sym, context=ctx, data_names=['csr_data', 'dns_data'],
                        label_names=['softmax_label'])
    mod.bind(data_shapes=eval_data.provide_data, label_shapes=eval_data.provide_label)
    # install the parameters loaded from the checkpoint
    mod.set_params(arg_params=arg_params, aux_params=aux_params)

    data_iter = iter(eval_data)
    nbatch = 0
    if benchmark:
        logging.info('Inference benchmark started ...')
        tic = time.time()
        for _ in range(num_iters):
            try:
                batch = data_iter.next()
            except StopIteration:
                # Data exhausted: rewind. This iteration runs no forward pass,
                # so it is not counted in nbatch.
                data_iter.reset()
            else:
                mod.forward(batch, is_train=False)
                # block until every output is computed so the elapsed time
                # covers the actual work
                for output in mod.get_outputs():
                    output.wait_to_read()
                nbatch += 1
        score = (nbatch*batch_size)/(time.time() - tic)
        logging.info('batch size %d, process %s samples/s' % (batch_size, score))
    else:
        logging.info('Inference started ...')
        # use accuracy as the metric
        metric = mx.metric.create(['acc'])
        accuracy_avg = 0.0
        for batch in data_iter:
            nbatch += 1
            # reset per batch so metric.get() reports this batch only
            metric.reset()
            mod.forward(batch, is_train=False)
            mod.update_metric(metric, batch.label)
            accuracy_avg += metric.get()[1][0]
            if verbose:
                logging.info('batch %d, accuracy = %s' % (nbatch, metric.get()))
        if nbatch:
            logging.info('averged accuracy on eval set is %.5f' % (accuracy_avg/nbatch))
        else:
            # The iterator yielded nothing (e.g. fewer samples than one batch
            # with last_batch_handle='discard'); avoid dividing by zero.
            logging.warning('no batch was evaluated; average accuracy is undefined')

# diff --git a/example/sparse/wide_deep/train.py b/example/sparse/wide_deep/train.py
# new file mode 100644
# index 000000000000..eea70301660d
# --- /dev/null
# +++ b/example/sparse/wide_deep/train.py
# @@ -0,0 +1,114 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
import mxnet as mx
from mxnet.test_utils import *
from config import *
from data import get_uci_adult
from model import wide_deep_model
import argparse
import os


parser = argparse.ArgumentParser(
    description="Run sparse wide and deep classification ",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument('--num-epoch', type=int, default=10,
                    help='number of epochs to train')
parser.add_argument('--batch-size', type=int, default=100,
                    help='number of examples per batch')
parser.add_argument('--lr', type=float, default=0.001,
                    help='learning rate')
parser.add_argument('--gpu', action='store_true', default=False,
                    help='Train on GPU with CUDA')
parser.add_argument('--optimizer', type=str, default='adam',
                    help='what optimizer to use',
                    choices=["ftrl", "sgd", "adam"])
parser.add_argument('--log-interval', type=int, default=100,
                    help='number of batches to wait before logging training status')


if __name__ == '__main__':
    import logging
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.INFO, format=head)

    # Command-line options.
    args = parser.parse_args()
    logging.info(args)
    num_epoch, batch_size = args.num_epoch, args.batch_size
    optimizer, lr = args.optimizer, args.lr
    log_interval = args.log_interval
    ctx = mx.gpu(0) if args.gpu else mx.cpu()

    # Locate the UCI adult train/test splits under ./data and load them.
    data_dir = os.path.join(os.getcwd(), 'data')
    train_data = os.path.join(data_dir, ADULT['train'])
    val_data = os.path.join(data_dir, ADULT['test'])
    train_csr, train_dns, train_label = get_uci_adult(data_dir, ADULT['train'], ADULT['url'])
    val_csr, val_dns, val_label = get_uci_adult(data_dir, ADULT['test'], ADULT['url'])

    # Network symbol, configured entirely from the ADULT settings.
    model = wide_deep_model(ADULT['num_linear_features'], ADULT['num_embed_features'],
                            ADULT['num_cont_features'], ADULT['embed_input_dims'],
                            ADULT['hidden_units'])

    # Shuffled iterators for both splits; incomplete last batches are dropped.
    train_data = mx.io.NDArrayIter({'csr_data': train_csr, 'dns_data': train_dns},
                                   {'softmax_label': train_label}, batch_size,
                                   shuffle=True, last_batch_handle='discard')
    eval_data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns},
                                  {'softmax_label': val_label}, batch_size,
                                  shuffle=True, last_batch_handle='discard')

    # Module: bind to the training shapes, initialise weights and optimizer.
    mod = mx.mod.Module(symbol=model, context=ctx,
                        data_names=['csr_data', 'dns_data'],
                        label_names=['softmax_label'])
    mod.bind(data_shapes=train_data.provide_data,
             label_shapes=train_data.provide_label)
    mod.init_params()
    optim = mx.optimizer.create(optimizer, learning_rate=lr,
                                rescale_grad=1.0/batch_size)
    mod.init_optimizer(optimizer=optim)

    # Training accuracy plus a throughput logger fired every log_interval batches.
    metric = mx.metric.create(['acc'])
    speedometer = mx.callback.Speedometer(batch_size, log_interval)

    logging.info('Training started ...')

    data_iter = iter(train_data)
    for epoch in range(num_epoch):
        metric.reset()
        for nbatch, batch in enumerate(data_iter, start=1):
            mod.forward_backward(batch)
            # apply the optimizer step to every parameter
            mod.update()
            # fold this batch into the running training metric
            mod.update_metric(metric, batch.label)
            speedometer(mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch,
                                               eval_metric=metric, locals=locals()))

        # Validation accuracy after each epoch.
        score = mod.score(eval_data, ['acc'])
        logging.info('epoch %d, accuracy = %s' % (epoch, score[0][1]))

        mod.save_checkpoint("checkpoint", epoch, save_optimizer_states=True)
        # rewind the training iterator for the next epoch
        data_iter.reset()

    logging.info('Training completed.')
def check_label_shapes(labels, preds, shape=0):
    """Raise ValueError unless labels and predictions have matching size.

    With the default ``shape=0`` the two arguments are compared by ``len()``;
    with any other value their ``.shape`` attributes are compared instead.
    Returns None when they match.
    """
    by_length = shape == 0
    label_shape = len(labels) if by_length else labels.shape
    pred_shape = len(preds) if by_length else preds.shape

    if label_shape == pred_shape:
        return

    raise ValueError("Shape of labels {} does not match shape of "
                     "predictions {}".format(label_shape, pred_shape))
def pred_best(p):
    """Collapse a per-frame label sequence into its decoded label list.

    Walks ``p`` (a list) alongside a copy shifted right by one position
    (seeded with 0) and keeps an element only when it is non-zero and differs
    from its predecessor, i.e. zeros and immediate repeats are dropped.
    """
    shifted = [0] + p
    return [cur for prev, cur in zip(shifted, p) if cur != 0 and cur != prev]
def ctc_loss(label, prob, remainder, seq_length, batch_size, num_gpu=1, big_num=1e10):
    """Return the CTC negative log-likelihood of ``label`` for one sample.

    Parameters
    ----------
    label : sequence of int
        Target label ids; id 0 is used as the blank symbol.
    prob : numpy.ndarray
        2-d array of per-frame class probabilities. The frame for time step
        ``t`` of this sample is read from row
        ``t * int(batch_size / num_gpu) + remainder``.
    remainder : int
        Row offset of this sample inside each time step.
    seq_length : int
        Number of time steps to consume.
    batch_size, num_gpu : int
        Together give the number of rows per time step.
    big_num : float
        ``-big_num`` stands in for log(0) and ``1 / big_num`` is the floor
        applied to probabilities before taking the log.

    Returns
    -------
    float
        Negative log of the total probability of all alignments of ``label``.

    Notes
    -----
    ``prob`` is not modified: the floor is applied to a copy, so the caller's
    predictions stay intact. An empty ``label`` returns the cost of the
    all-blank alignment rather than raising IndexError.
    """
    # Clamp on a fresh array; indexing-assignment would write into the caller's data.
    log_prob = np.log(np.maximum(prob, 1 / big_num))
    rows_per_step = int(batch_size / num_gpu)

    l = len(label)
    if l == 0:
        # Only one alignment exists: blank at every time step.
        rows = rows_per_step * np.arange(seq_length) + remainder
        return -np.sum(log_prob[rows, 0])

    # Extended label: a leading pad entry, then blanks interleaved around
    # every symbol -> [pad, blank, l1, blank, l2, ..., blank].
    label_ = [0, 0]
    for sym in label:
        label_.append(int(sym))
        label_.append(0)

    l_ = 2 * l + 1
    # a[t][j]: log-probability of being at extended position j after step t.
    a = np.full((seq_length, l_ + 1), -big_num)
    a[0][1] = log_prob[remainder][0]
    a[0][2] = log_prob[remainder][label_[2]]
    for i in range(1, seq_length):
        row = i * rows_per_step + remainder
        a[i][1] = a[i - 1][1] + log_prob[row][0]
        a[i][2] = np.logaddexp(a[i - 1][2], a[i - 1][1]) + log_prob[row][label_[2]]
        for j in range(3, l_ + 1):
            # stay on j, or advance from j - 1
            a[i][j] = np.logaddexp(a[i - 1][j], a[i - 1][j - 1])
            # skipping the blank at j - 1 is allowed only between two
            # different non-blank symbols
            if label_[j] != 0 and label_[j] != label_[j - 2]:
                a[i][j] = np.logaddexp(a[i][j], a[i - 1][j - 2])
            a[i][j] += log_prob[row][label_[j]]

    # Valid endings: the last symbol or the trailing blank.
    return -np.logaddexp(a[seq_length - 1][l_], a[seq_length - 1][l_ - 1])
def char_match_2way(label, pred):
    """Score ``pred`` against ``label`` in both reading directions.

    For each tolerance in a fixed list, ``char_match_1way`` is run on the
    sequences as given and on both sequences reversed. The best accuracy seen
    per direction is tracked, and the forward result wins only when it is
    strictly greater than the backward one.

    Returns
    -------
    tuple
        ``(accuracy, n_matched, n_whole_label)`` where ``n_whole_label`` is
        the length of ``label`` after ``remove_space``.
    """
    tolerances = (0.98, 0.96, 0.93, 0.9, 0.85, 0.8, 0.7)
    pred_reversed = pred[::-1]
    label_reversed = label[::-1]
    n_whole_label = len(remove_space(label))

    fwd_best, fwd_best_matched = 0., 0.
    bwd_best, bwd_best_matched = 0., 0.
    for tolerance in tolerances:
        fwd, fwd_matched = char_match_1way(label, pred, tolerance, n_whole_label)
        bwd, bwd_matched = char_match_1way(label_reversed, pred_reversed,
                                           tolerance, n_whole_label)

        if fwd > fwd_best:
            fwd_best, fwd_best_matched = fwd, fwd_matched
        if bwd > bwd_best:
            bwd_best, bwd_best_matched = bwd, bwd_matched

    if fwd_best > bwd_best:
        return fwd_best, fwd_best_matched, n_whole_label
    return bwd_best, bwd_best_matched, n_whole_label
class MApMetric(mx.metric.EvalMetric):
    """
    Calculate mean AP for object detection task

    Parameters:
    ---------
    ovp_thresh : float
        overlap threshold for TP
    use_difficult : boolean
        use difficult ground-truths if applicable, otherwise just ignore
    class_names : list or tuple of str
        optional, if provided, will print out AP for each class
    pred_idx : int
        prediction index in network output list
    """
    def __init__(self, ovp_thresh=0.5, use_difficult=False, class_names=None, pred_idx=0):
        super(MApMetric, self).__init__('mAP')
        if class_names is None:
            self.num = None
        else:
            assert isinstance(class_names, (list, tuple))
            for name in class_names:
                assert isinstance(name, str), "must provide names as str"
            num = len(class_names)
            # list(...) so that a tuple of names (accepted by the assert above)
            # can be extended with the trailing 'mAP' entry.
            self.name = list(class_names) + ['mAP']
            self.num = num + 1
        self.reset()
        self.ovp_thresh = ovp_thresh
        self.use_difficult = use_difficult
        self.class_names = class_names
        self.pred_idx = int(pred_idx)

    def reset(self):
        """Clear the internal statistics to initial state."""
        # getattr: tolerate being called before __init__ has assigned self.num.
        if getattr(self, 'num', None) is None:
            self.num_inst = 0
            self.sum_metric = 0.0
        else:
            self.num_inst = [0] * self.num
            self.sum_metric = [0.0] * self.num
        # per-class buffers filled by update(): class id -> (score, tp/fp) rows
        # and class id -> number of ground-truth boxes
        self.records = dict()
        self.counts = dict()

    def get(self):
        """Get the current evaluation result.

        Returns
        -------
        name : str or list of str
            Name of the metric; a list (class names plus 'mAP') when class
            names were provided.
        value : float or list of float
            Value of the evaluation; NaN for entries without any instance.
        """
        self._update()  # update metric at this time
        if self.num is None:
            if self.num_inst == 0:
                return (self.name, float('nan'))
            else:
                return (self.name, self.sum_metric / self.num_inst)
        else:
            names = ['%s'%(self.name[i]) for i in range(self.num)]
            values = [x / y if y != 0 else float('nan') \
                for x, y in zip(self.sum_metric, self.num_inst)]
            return (names, values)

    def update(self, labels, preds):
        """
        Update internal records. This function now only update internal buffer,
        sum_metric and num_inst are updated in _update() function instead when
        get() is called to return results.

        Params:
        ----------
        labels: mx.nd.array (n * 6) or (n * 5), difficult column is optional
            2-d array of ground-truths, n objects(id-xmin-ymin-xmax-ymax-[difficult])
        preds: mx.nd.array (m * 6)
            2-d array of detections, m objects(id-score-xmin-ymin-xmax-ymax)
        """
        def iou(x, ys):
            """
            Calculate intersection-over-union overlap
            Params:
            ----------
            x : numpy.array
                single box [xmin, ymin ,xmax, ymax]
            ys : numpy.array
                multiple box [[xmin, ymin, xmax, ymax], [...], ]
            Returns:
            -----------
            numpy.array
                [iou1, iou2, ...], size == ys.shape[0]
            """
            ixmin = np.maximum(ys[:, 0], x[0])
            iymin = np.maximum(ys[:, 1], x[1])
            ixmax = np.minimum(ys[:, 2], x[2])
            iymax = np.minimum(ys[:, 3], x[3])
            iw = np.maximum(ixmax - ixmin, 0.)
            ih = np.maximum(iymax - iymin, 0.)
            inters = iw * ih
            uni = (x[2] - x[0]) * (x[3] - x[1]) + (ys[:, 2] - ys[:, 0]) * \
                (ys[:, 3] - ys[:, 1]) - inters
            ious = inters / uni
            ious[uni < 1e-12] = 0  # in case bad boxes
            return ious

        # independent execution for each image
        for i in range(labels[0].shape[0]):
            # get as numpy arrays
            label = labels[0][i].asnumpy()
            # skip images without any valid (id >= 0) ground-truth row
            if np.sum(label[:, 0] >= 0) < 1:
                continue
            pred = preds[self.pred_idx][i].asnumpy()
            # calculate for each class: peel off one class id per iteration
            while (pred.shape[0] > 0):
                cid = int(pred[0, 0])
                indices = np.where(pred[:, 0].astype(int) == cid)[0]
                if cid < 0:
                    # negative ids are discarded
                    pred = np.delete(pred, indices, axis=0)
                    continue
                dets = pred[indices]
                pred = np.delete(pred, indices, axis=0)
                # sort by score, descending
                dets = dets[dets[:,1].argsort()[::-1]]
                records = np.hstack((dets[:, 1][:, np.newaxis], np.zeros((dets.shape[0], 1))))
                # ground-truths of this class; removed from `label` so that what
                # remains afterwards are classes that were never predicted
                label_indices = np.where(label[:, 0].astype(int) == cid)[0]
                gts = label[label_indices, :]
                label = np.delete(label, label_indices, axis=0)
                if gts.size > 0:
                    found = [False] * gts.shape[0]
                    for j in range(dets.shape[0]):
                        # compute overlaps
                        ious = iou(dets[j, 2:], gts[:, 1:5])
                        ovargmax = np.argmax(ious)
                        ovmax = ious[ovargmax]
                        if ovmax > self.ovp_thresh:
                            if (not self.use_difficult and
                                gts.shape[1] >= 6 and
                                gts[ovargmax, 5] > 0):
                                # matched a difficult box: leave the record unset
                                pass
                            else:
                                if not found[ovargmax]:
                                    records[j, -1] = 1  # tp
                                    found[ovargmax] = True
                                else:
                                    # duplicate
                                    records[j, -1] = 2  # fp
                        else:
                            records[j, -1] = 2  # fp
                else:
                    # no gt, mark all fp
                    records[:, -1] = 2

                # ground truth count
                if (not self.use_difficult and gts.shape[1] >= 6):
                    gt_count = np.sum(gts[:, 5] < 1)
                else:
                    gt_count = gts.shape[0]

                # now we push records to buffer
                # first column: score, second column: tp/fp
                # 0: not set(matched to difficult or something), 1: tp, 2: fp
                records = records[np.where(records[:, -1] > 0)[0], :]
                if records.size > 0:
                    self._insert(cid, records, gt_count)

            # add missing class if not present in prediction
            while (label.shape[0] > 0):
                cid = int(label[0, 0])
                label_indices = np.where(label[:, 0].astype(int) == cid)[0]
                label = np.delete(label, label_indices, axis=0)
                if cid < 0:
                    continue
                gt_count = label_indices.size
                # placeholder row with flag 0: contributes to the count only
                self._insert(cid, np.array([[0, 0]]), gt_count)

    def _update(self):
        """ update num_inst and sum_metric """
        aps = []
        for k, v in self.records.items():
            recall, prec = self._recall_prec(v, self.counts[k])
            ap = self._average_precision(recall, prec)
            aps.append(ap)
            if self.num is not None and k < (self.num - 1):
                self.sum_metric[k] = ap
                self.num_inst[k] = 1
        # np.mean([]) is NaN too, but warns; produce the NaN directly when no
        # class has been recorded yet.
        mean_ap = np.mean(aps) if aps else float('nan')
        if self.num is None:
            self.num_inst = 1
            self.sum_metric = mean_ap
        else:
            self.num_inst[-1] = 1
            self.sum_metric[-1] = mean_ap

    def _recall_prec(self, record, count):
        """ get recall and precision from internal records

        record : 2-d array, column 0 score, column 1 flag (1: tp, 2: fp, 0: unset)
        count : number of ground-truth boxes for the class
        """
        # drop unset rows, then accumulate tp/fp in descending score order
        record = np.delete(record, np.where(record[:, 1].astype(int) == 0)[0], axis=0)
        sorted_records = record[record[:,0].argsort()[::-1]]
        tp = np.cumsum(sorted_records[:, 1].astype(int) == 1)
        fp = np.cumsum(sorted_records[:, 1].astype(int) == 2)
        if count <= 0:
            recall = tp * 0.0
        else:
            recall = tp / float(count)
        prec = tp.astype(float) / (tp + fp)
        return recall, prec

    def _average_precision(self, rec, prec):
        """
        calculate average precision

        Params:
        ----------
        rec : numpy.array
            cumulated recall
        prec : numpy.array
            cumulated precision
        Returns:
        ----------
        ap as float
        """
        # append sentinel values at both ends
        mrec = np.concatenate(([0.], rec, [1.]))
        mpre = np.concatenate(([0.], prec, [0.]))

        # compute precision integration ladder
        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

        # look for recall value changes
        i = np.where(mrec[1:] != mrec[:-1])[0]

        # sum (\delta recall) * prec
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
        return ap

    def _insert(self, key, records, count):
        """ Insert records according to key """
        if key not in self.records:
            assert key not in self.counts
            self.records[key] = records
            self.counts[key] = count
        else:
            self.records[key] = np.vstack((self.records[key], records))
            assert key in self.counts
            self.counts[key] += count


class VOC07MApMetric(MApMetric):
    """ Mean average precision metric for PASCAL V0C 07 dataset """
    def __init__(self, *args, **kwargs):
        super(VOC07MApMetric, self).__init__(*args, **kwargs)

    def _average_precision(self, rec, prec):
        """
        calculate average precision, override the default one,
        special 11-point metric

        Params:
        ----------
        rec : numpy.array
            cumulated recall
        prec : numpy.array
            cumulated precision
        Returns:
        ----------
        ap as float
        """
        ap = 0.
        # recall thresholds 0.0, 0.1, ..., 1.0
        for t in np.arange(0., 1.1, 0.1):
            if np.sum(rec >= t) == 0:
                p = 0
            else:
                p = np.max(prec[rec >= t])
            ap += p / 11.
        return ap
class MultiBoxMetric(mx.metric.EvalMetric):
    """Calculate metrics for Multibox training """
    def __init__(self, eps=1e-8):
        super(MultiBoxMetric, self).__init__('MultiBox')
        self.eps = eps
        # two tracked quantities, reported in this order
        self.num = 2
        self.name = ['CrossEntropy', 'SmoothL1']
        self.reset()

    def _zero_counters(self):
        """Set num_inst / sum_metric to zero (scalars until self.num exists)."""
        slots = getattr(self, 'num', None)
        if slots is None:
            self.num_inst, self.sum_metric = 0, 0.0
            return
        self.num_inst = [0] * slots
        self.sum_metric = [0.0] * slots

    def reset(self):
        """
        override reset behavior
        """
        self._zero_counters()

    def reset_local(self):
        """
        override reset behavior
        """
        self._zero_counters()

    def update(self, labels, preds):
        """
        Implementation of updating metrics

        Reads everything from ``preds`` (``labels`` is not used):
        preds[0] class probabilities, preds[1] localisation loss,
        preds[2] the class targets generated by the network.
        """
        cls_prob = preds[0].asnumpy()
        loc_loss = preds[1].asnumpy()
        cls_label = preds[2].asnumpy()
        # entries with a negative target are excluded everywhere below
        valid_count = np.sum(cls_label >= 0)

        # cross entropy over the valid entries
        flat_label = cls_label.flatten()
        keep = np.where(flat_label >= 0)[0]
        target = np.int64(flat_label[keep])
        # move the class axis last and flatten so each row lines up with flat_label
        flat_prob = cls_prob.transpose((0, 2, 1)).reshape((-1, cls_prob.shape[1]))
        picked = flat_prob[keep, target]
        self.sum_metric[0] += (-np.log(picked + self.eps)).sum()
        self.num_inst[0] += valid_count

        # smooth-L1 loss, normalised by the same count
        self.sum_metric[1] += np.sum(loc_loss)
        self.num_inst[1] += valid_count

    def get(self):
        """Get the current evaluation result.
        Override the default behavior

        Returns
        -------
        name : str or list of str
            Name of the metric(s).
        value : float or list of float
            Value of the evaluation; NaN where nothing was accumulated.
        """
        if self.num is None:
            if self.num_inst == 0:
                return (self.name, float('nan'))
            return (self.name, self.sum_metric / self.num_inst)

        names = ['%s'%(self.name[i]) for i in range(self.num)]
        values = []
        for total, count in zip(self.sum_metric, self.num_inst):
            values.append(total / count if count != 0 else float('nan'))
        return (names, values)
# The layer pattern below follows the article; feel free to play with the
# number of nodes and with the activation function.
data = mx.symbol.Variable('data')
fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=512)
act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 512)
act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10)

# Here we add the ultimate layer based on L2-SVM objective
mlp_svm_l2 = mx.symbol.SVMOutput(data=fc3, name='svm_l2')

# With L1-SVM objective
mlp_svm_l1 = mx.symbol.SVMOutput(data=fc3, name='svm_l1', use_linear=True)

# Compare with softmax cross entropy loss
mlp_softmax = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')

print("Preparing data...")
mnist_data = mx.test_utils.get_mnist()
X = np.concatenate([mnist_data['train_data'], mnist_data['test_data']])
Y = np.concatenate([mnist_data['train_label'], mnist_data['test_label']])
X = X.reshape((X.shape[0], -1)).astype(np.float32) * 255

# Now we fetch MNIST dataset, add some noise, as the article suggests,
# permutate and assign the examples to be used on our network
mnist_pca = PCA(n_components=70).fit_transform(X)
noise = np.random.normal(size=mnist_pca.shape)
mnist_pca += noise
p = np.random.permutation(mnist_pca.shape[0])
X = mnist_pca[p] / 255.
Y = Y[p]

# This is just to normalize the input and separate train set and test set
X_train = X[:60000]
X_test = X[60000:]
Y_train = Y[:60000]
Y_test = Y[60000:]
print("Data prepared.")
# Article's suggestion on batch size
batch_size = 200

ctx = mx.gpu() if mx.context.num_gpus() > 0 else mx.cpu()

results = {}
for output in [mlp_svm_l2, mlp_svm_l1, mlp_softmax]:

    print("\nTesting with %s \n" % output.name)

    # The label variable of each output symbol is named "<symbol name>_label".
    label = output.name + "_label"

    train_iter = mx.io.NDArrayIter(X_train, Y_train, batch_size=batch_size, label_name=label)
    test_iter = mx.io.NDArrayIter(X_test, Y_test, batch_size=batch_size, label_name=label)

    # Here we instantiate and fit the model for our data
    # The article actually suggests using 400 epochs,
    # But I reduced to 10, for convenience

    mod = mx.mod.Module(
        context = ctx,
        symbol = output,         # Use the network we just defined
        label_names = [label],
    )
    mod.fit(
        train_data=train_iter,
        eval_data=test_iter,  # Testing data set. MXNet computes scores on test set every epoch
        batch_end_callback = mx.callback.Speedometer(batch_size, 200),  # Logging module to print out progress
        num_epoch = 10,  # Train for 10 epochs
        optimizer_params = {
            'learning_rate': 0.1,  # Learning rate
            'momentum': 0.9,       # Momentum for SGD with momentum
            'wd': 0.00001,         # Weight decay for regularization
        })
    # Score the test set once and reuse the value for both the table and the log line.
    accuracy = mod.score(test_iter, mx.metric.Accuracy())[0][1]*100
    results[output.name] = accuracy
    print('Accuracy for %s:'%output.name, accuracy, '%\n')

for key, value in results.items():
    print(key, value, "%")

#svm_l2 97.85 %
#svm_l1 98.15 %
#softmax 97.69 %
def test_svrg_intermediate_level_api(args):
    """Demonstrates the intermediate level SVRGModule API, where the training
    loop is written out by hand.

    The module is bound and initialised manually, a "local" KVStore is created
    and handed to ``init_optimizer``, and each epoch runs forward/backward/
    update over every batch. Full gradients are refreshed on every epoch whose
    index is a multiple of ``mod.update_freq``. The epoch's MSE is logged
    through ``mod.logger``.

    Parameters
    ----------
    args: args
        Command line arguments (``epochs``, ``batch_size``, ``update_freq``)
    """
    n_epochs = args.epochs
    data_iter, mod = create_network(args.batch_size, args.update_freq)

    mod.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label)
    mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False,
                    force_init=False, allow_extra=False)
    store = mx.kv.create("local")
    mod.init_optimizer(kvstore=store, optimizer='sgd',
                       optimizer_params=(('learning_rate', 0.025),))

    mse = mx.metric.create("mse")
    for epoch_idx in range(n_epochs):
        mse.reset()
        # periodic full-gradient pass over the whole data set
        if epoch_idx % mod.update_freq == 0:
            mod.update_full_grads(data_iter)
        data_iter.reset()
        for batch in data_iter:
            mod.forward_backward(data_batch=batch)
            mod.update()
            mod.update_metric(mse, batch.label)
        mod.logger.info('Epoch[%d] Train cost=%f', epoch_idx, mse.get()[1])
+ Parameters + ---------- + batch_size: int + Size of data split + update_freq: int + Update Frequency for calculating full gradients + + Returns + ---------- + di: mx.io.NDArrayIter + Data iterator + update_freq: SVRGModule + An instance of SVRGModule for performing SVRG optimization + """ + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.INFO, format=head) + + train_data = np.random.randint(1, 5, [1000, 2]) + weights = np.array([1.0, 2.0]) + train_label = train_data.dot(weights) + + di = mx.io.NDArrayIter(train_data, train_label, batch_size=batch_size, shuffle=True, label_name='lin_reg_label') + X = mx.sym.Variable('data') + Y = mx.symbol.Variable('lin_reg_label') + fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) + lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") + + mod = SVRGModule( + symbol=lro, + data_names=['data'], + label_names=['lin_reg_label'], update_freq=update_freq, logger=logging + ) + + return di, mod + +# run as a script +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('-e', dest='epochs', default=100, type=int) + parser.add_argument('-bs', dest='batch_size', default=32, type=int) + parser.add_argument('-f', dest="update_freq", default=2, type=int) + args = parser.parse_args() + + print("========================== Intermediate Level API ==========================") + test_svrg_intermediate_level_api(args) + print("========================== High Level API ==========================") + test_svrg_high_level_api(args) diff --git a/example/svrg_module/api_usage_example/example_inference.py b/example/svrg_module/api_usage_example/example_inference.py new file mode 100644 index 000000000000..312f9796074d --- /dev/null +++ b/example/svrg_module/api_usage_example/example_inference.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor 
# license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.


import mxnet as mx
import numpy as np
import logging
from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule


def test_svrg_inference(args):
    """Fit an SVRGModule with ``fit``, evaluating on the validation iterator.

    ``args`` supplies ``epochs``, ``batch_size`` and ``update_freq``.
    """
    n_epochs, batch_size, update_freq = args.epochs, args.batch_size, args.update_freq

    train_iter, val_iter, module = create_network(batch_size, update_freq)
    module.fit(
        train_iter,
        eval_data=val_iter,
        eval_metric='mse',
        optimizer='sgd',
        optimizer_params=(('learning_rate', 0.025),),
        num_epoch=n_epochs,
    )


def get_validation_score(args):
    """Train with a hand-written loop, then predict and score on validation data.

    After training, the prediction shape is asserted to be ``(200, 1)`` and
    the first metric value returned by ``score`` is printed.
    ``args`` supplies ``epochs``, ``batch_size`` and ``update_freq``.
    """
    n_epochs, batch_size, update_freq = args.epochs, args.batch_size, args.update_freq

    train_iter, val_iter, module = create_network(batch_size, update_freq)
    module.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label)
    module.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, force_init=False,
                       allow_extra=False)
    module.init_optimizer(kvstore='local', optimizer='sgd',
                          optimizer_params=(('learning_rate', 0.025),))
    mse = mx.metric.create("mse")

    for epoch in range(n_epochs):
        mse.reset()
        # Refresh the full gradients on every ``update_freq``-th epoch.
        if not epoch % module.update_freq:
            module.update_full_grads(train_iter)
        train_iter.reset()
        for data_batch in train_iter:
            module.forward_backward(data_batch=data_batch)
            module.update()
            module.update_metric(mse, data_batch.label)

    predictions = module.predict(val_iter)

    # test-train data split, 20% test data out of 1000 data samples
    assert predictions.shape == (200, 1)
    validation_result = module.score(val_iter, ['mse'])
    print("Training Loss on Validation Set is {}".format(validation_result[0][1]))


def create_network(batch_size, update_freq):
    """Create a linear regression network for performing SVRG optimization.

    1000 random two-feature samples are generated; the first 80% feed the
    training iterator and the remainder the validation iterator.

    :return: ``(di, val_iter, mod)`` -- the shuffling training
        mx.io.NDArrayIter, the validation mx.io.NDArrayIter, and an
        SVRGModule instance for performing SVRG optimization
    """
    logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(message)s')
    samples = np.random.randint(1, 5, [1000, 2])

    # Test_Train data split
    split_at = int(samples.shape[0] * 0.8)
    targets = samples.dot(np.array([1.0, 2.0]))

    di = mx.io.NDArrayIter(samples[:split_at, :], targets[:split_at], batch_size=batch_size,
                           shuffle=True, label_name='lin_reg_label')
    val_iter = mx.io.NDArrayIter(samples[split_at:, :], targets[split_at:], batch_size=batch_size)

    data_sym = mx.sym.Variable('data')
    label_sym = mx.symbol.Variable('lin_reg_label')
    dense = mx.sym.FullyConnected(data=data_sym, name='fc1', num_hidden=1)
    regression = mx.sym.LinearRegressionOutput(data=dense, label=label_sym, name="lro")

    # The ``logging`` module itself is passed as the logger object.
    mod = SVRGModule(symbol=regression, data_names=['data'], label_names=['lin_reg_label'],
                     update_freq=update_freq, logger=logging)

    return di, val_iter, mod


# run as a script
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    for flag, dest, default in (('-e', 'epochs', 100),
                                ('-bs', 'batch_size', 32),
                                ('-f', 'update_freq', 2)):
        parser.add_argument(flag, dest=dest, default=default, type=int)
    args = parser.parse_args()

    print("========================== SVRG Module Inference ==========================")
    test_svrg_inference(args)
    print("========================SVRG Module Score ============================")
    get_validation_score(args)

# --- patch metadata for the next file in this diff, preserved as a comment ---
# diff --git a/example/svrg_module/benchmarks/svrg_benchmark.ipynb
b/example/svrg_module/benchmarks/svrg_benchmark.ipynb new file mode 100644 index 000000000000..54ae81281db3 --- /dev/null +++ b/example/svrg_module/benchmarks/svrg_benchmark.ipynb @@ -0,0 +1,360 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linear Regression Using SVRGModule on YearPredictionMSD Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, a linear regression model will be fit on YearPredictionMSD dataset, which contains predictions of the release year of a song based on its audio features. The dataset has 90 features and over 400,000 samples. The dataset is downsampled to 5,000 in this experiment." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import json\n", + "import sys\n", + "import tempfile\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.patches as mpatches\n", + "import mxnet as mx\n", + "from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "from sklearn.datasets import load_svmlight_file\n", + "\n", + "sys.path.insert(0, \"../linear_regression\")\n", + "from data_reader import get_year_prediction_data\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read Data\n", + "The first step is to get the training features and labels and normalize the data. In this example, we will use 5000 data samples. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting data...\n", + "Reading data from disk...\n" + ] + } + ], + "source": [ + "feature_dim, train_features, train_labels = get_year_prediction_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "train_features = train_features[-5000:]\n", + "train_labels = train_labels[-5000:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Linear Regression Network" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def create_lin_reg_network(batch_size=100):\n", + " train_iter = mx.io.NDArrayIter(train_features, train_labels, batch_size=batch_size, shuffle=True,\n", + " data_name='data', label_name='label')\n", + " data = mx.sym.Variable(\"data\")\n", + " label = mx.sym.Variable(\"label\")\n", + " weight = mx.sym.Variable(\"fc_weight\", shape=(1, 90))\n", + " net = mx.sym.dot(data, weight.transpose())\n", + " bias = mx.sym.Variable(\"fc_bias\", shape=(1,), wd_mult=0.0, lr_mult=10.0)\n", + " net = mx.sym.broadcast_plus(net, bias)\n", + " net = mx.sym.LinearRegressionOutput(data=net, label=label)\n", + " \n", + " return train_iter, net" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### SVRGModule with SVRG Optimization\n", + "In this example, we will use intermediate level API for SVRGModule and the dump mse per epoch to JSON file for plotting graphs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def train_svrg_lin_reg(num_epoch=100, batch_size=100, update_freq=2, output='svrg_lr.json', \n", + " optimizer_params=None):\n", + "\n", + " di, net = create_lin_reg_network(batch_size=batch_size)\n", + " \n", + " #Create a SVRGModule\n", + " mod = SVRGModule(symbol=net, context=mx.cpu(0), data_names=['data'], label_names=['label'], update_freq=update_freq)\n", + " mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label)\n", + " mod.init_params(initializer=mx.init.Zero(), allow_missing=False, force_init=False, allow_extra=False)\n", + " mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=optimizer_params)\n", + " metrics = mx.metric.create(\"mse\")\n", + " \n", + " results = {}\n", + " for e in range(num_epoch):\n", + " results[e] = {}\n", + " metrics.reset()\n", + " if e % mod.update_freq == 0:\n", + " mod.update_full_grads(di)\n", + " di.reset()\n", + " for batch in di:\n", + " mod.forward_backward(data_batch=batch)\n", + " mod.update()\n", + " mod.update_metric(metrics, batch.label)\n", + " results[e][\"mse\"] = metrics.get()[1]\n", + " \n", + " f = open(output, 'w+')\n", + " f.write(json.dumps(results, indent=4, sort_keys=True))\n", + " f.close()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Module with SGD Optimization " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def train_sgd_lin_reg(num_epoch=100, batch_size=100, update_freq=2, output='sgd_lr.json', \n", + " optimizer_params=None):\n", + " \n", + " di, net = create_lin_reg_network(batch_size=batch_size)\n", + " \n", + " #Create a standard module\n", + " mod = mx.mod.Module(symbol=net, context=mx.cpu(0), data_names=['data'], label_names=['label'])\n", + " mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label)\n", + " 
mod.init_params(initializer=mx.init.Zero(), allow_missing=False, force_init=False, allow_extra=False)\n", + " mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=optimizer_params)\n", + " metrics = mx.metric.create(\"mse\")\n", + " \n", + " results = {}\n", + " for e in range(num_epoch):\n", + " results[e] = {}\n", + " metrics.reset()\n", + " di.reset()\n", + " for batch in di:\n", + " mod.forward_backward(data_batch=batch)\n", + " mod.update()\n", + " mod.update_metric(metrics, batch.label)\n", + " results[e][\"mse\"] = metrics.get()[1]\n", + " f = open(output, 'w+')\n", + " f.write(json.dumps(results, indent=4, sort_keys=True))\n", + " f.close()\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training Loss over 100 Epochs Using lr_scheduler\n", + "When a large learning rate is used with SGD, training loss will drop fast but will oscillate above the minimum and never converge. With a small learning rate, it will eventually reach the minimum after many iterations. A common practice is to use learning rate scheduling by starting with a large learning rate and gradually decreasing it. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "train_svrg_lin_reg(optimizer_params={'lr_scheduler': mx.lr_scheduler.FactorScheduler(step=10, factor=0.99)})\n", + "train_sgd_lin_reg(optimizer_params={'lr_scheduler': mx.lr_scheduler.FactorScheduler(step=10, factor=0.99)})" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5,0,'Epochs')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot graph\n", + "#Plot training loss over Epochs:\n", + "color = sns.color_palette()\n", + "#Draw Weight Variance Ratio\n", + "dataplot3 = {\"svrg_mse\": [], \"sgd_mse\": []}\n", + "with open('sgd_lr.json') as sgd_data, open('svrg_lr.json') as svrg_data:\n", + " sgd = json.load(sgd_data)\n", + " svrg = json.load(svrg_data)\n", + " for epoch in range(100):\n", + " dataplot3[\"svrg_mse\"].append(svrg[str(epoch)][\"mse\"])\n", + " dataplot3[\"sgd_mse\"].append(sgd[str(epoch)][\"mse\"])\n", + "\n", + "x3 = list(range(100))\n", + "plt.figure(figsize=(20, 12))\n", + "plt.title(\"Training Loss Over Epochs\")\n", + "sns.pointplot(x3, dataplot3['svrg_mse'], color=color[9])\n", + "sns.pointplot(x3, dataplot3['sgd_mse'], color=color[8])\n", + "color_patch1 = mpatches.Patch(color=color[9], label=\"svrg_mse\")\n", + "color_patch2 = mpatches.Patch(color=color[8], label=\"sgd_mse\")\n", + "plt.legend(handles=[color_patch1, color_patch2])\n", + "plt.ylabel('Training Loss', fontsize=12)\n", + "plt.xlabel('Epochs', fontsize=12)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training Loss Comparison with SGD with fixed learning rates\n", + "Choosing learning rate (0.0025, 0.001, 0.005) for SGD and a relatively large learning rate 0.025 for SVRG, we can see SVRG smoothly goes down faster than SGD. Learning rate for SVRG does not need to decay to zero, which means we can start with a larger learning rate." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "train_svrg_lin_reg(output=\"svrg_0.025.json\", optimizer_params=(('learning_rate', 0.025),))\n", + "train_sgd_lin_reg(output=\"sgd_0.001.json\", optimizer_params=((\"learning_rate\", 0.001),))\n", + "train_sgd_lin_reg(output=\"sgd_0.0025.json\", optimizer_params=((\"learning_rate\", 0.0025),))\n", + "train_sgd_lin_reg(output=\"sgd_0.005.json\", optimizer_params=((\"learning_rate\", 0.005),))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5,0,'Epochs')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Plot training loss over Epochs:\n", + "color = sns.color_palette()\n", + "#Draw Weight Variance Ratio\n", + "dataplot3 = {\"svrg_mse\": [], \"sgd_mse_lr_0.001\": [], \"sgd_mse_lr_0.0025\": [], \"sgd_mse_lr_0.005\":[]}\n", + "with open('sgd_0.001.json') as sgd_data, open('svrg_0.025.json') as svrg_data, open('sgd_0.0025.json') as sgd_data_2, open('sgd_0.005.json') as sgd_data_3:\n", + " sgd = json.load(sgd_data)\n", + " svrg = json.load(svrg_data)\n", + " sgd_lr = json.load(sgd_data_2)\n", + " sgd_lr_2 = json.load(sgd_data_3)\n", + " for epoch in range(100):\n", + " dataplot3[\"svrg_mse\"].append(svrg[str(epoch)][\"mse\"])\n", + " dataplot3[\"sgd_mse_lr_0.001\"].append(sgd[str(epoch)][\"mse\"])\n", + " dataplot3[\"sgd_mse_lr_0.0025\"].append(sgd_lr[str(epoch)][\"mse\"])\n", + " dataplot3[\"sgd_mse_lr_0.005\"].append(sgd_lr_2[str(epoch)][\"mse\"])\n", + "\n", + "x3 = list(range(100))\n", + "plt.figure(figsize=(20, 12))\n", + "plt.title(\"Training Loss Over Epochs\")\n", + "sns.pointplot(x3, dataplot3['svrg_mse'], color=color[9])\n", + "sns.pointplot(x3, dataplot3['sgd_mse_lr_0.001'], color=color[8])\n", + "sns.pointplot(x3, dataplot3['sgd_mse_lr_0.0025'], color=color[3])\n", + "sns.pointplot(x3, dataplot3['sgd_mse_lr_0.005'], color=color[7])\n", + "color_patch1 = mpatches.Patch(color=color[9], label=\"svrg_mse_lr_0.025\")\n", + "color_patch2 = mpatches.Patch(color=color[8], label=\"sgd_mse_lr_0.001\")\n", + "color_patch3 = mpatches.Patch(color=color[3], label=\"sgd_mse_lr_0.0025\")\n", + "color_patch4 = mpatches.Patch(color=color[7], label=\"sgd_mse_lr_0.005\")\n", + "plt.legend(handles=[color_patch1, color_patch2, color_patch3, color_patch4])\n", + "plt.ylabel('Training Loss', fontsize=12)\n", + "plt.xlabel('Epochs', fontsize=12)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/example/svrg_module/linear_regression/common.py b/example/svrg_module/linear_regression/common.py new file mode 100644 index 000000000000..14a144f40ce2 --- /dev/null +++ b/example/svrg_module/linear_regression/common.py @@ -0,0 +1,117 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
import mxnet as mx
import logging
from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule


def create_lin_reg_network(train_features, train_labels, feature_dim, batch_size, update_freq, ctx, logger):
    """Build the training iterator and SVRGModule for a linear regression model.

    The symbol computes ``dot(data, fc_weight^T) + fc_bias`` and feeds it to a
    ``LinearRegressionOutput``.  The bias variable is declared with
    ``wd_mult=0.0`` and ``lr_mult=10.0``.

    Parameters
    ----------
    train_features, train_labels:
        Training data and labels handed to ``mx.io.NDArrayIter``
    feature_dim: int
        Second dimension of the ``fc_weight`` variable, declared as ``(1, feature_dim)``
    batch_size: int
        Batch size of the (shuffling) iterator
    update_freq: int
        Forwarded to ``SVRGModule`` as ``update_freq``
    ctx:
        Forwarded to ``SVRGModule`` as ``context``
    logger:
        Forwarded to ``SVRGModule`` as ``logger``

    Returns
    ----------
    train_iter: mx.io.NDArrayIter
        Iterator over the training data
    mod: SVRGModule
        Unbound module wrapping the regression symbol
    """
    # fit a linear regression model with mxnet SVRGModule
    print("Fitting linear regression with mxnet")
    train_iter = mx.io.NDArrayIter(train_features, train_labels, batch_size=batch_size, shuffle=True,
                                   data_name='data', label_name='label')
    data = mx.sym.Variable("data")
    label = mx.sym.Variable("label")
    weight = mx.sym.Variable("fc_weight", shape=(1, feature_dim))
    net = mx.sym.dot(data, weight.transpose())
    bias = mx.sym.Variable("fc_bias", shape=(1,), wd_mult=0.0, lr_mult=10.0)
    net = mx.sym.broadcast_plus(net, bias)
    net = mx.sym.LinearRegressionOutput(data=net, label=label)
    mod = SVRGModule(symbol=net, context=ctx, data_names=['data'], label_names=['label'], logger=logger,
                     update_freq=update_freq)
    return train_iter, mod


def create_metrics(metrics):
    """Return the evaluation metric built by ``mx.metric.create(metrics)``."""
    metric = mx.metric.create(metrics)
    return metric


def create_logger():
    """Return the 'sgd_svrg' logger, set to INFO and writing to 'experiments.log'.

    NOTE: a new ``FileHandler`` is attached on every call, so calling this
    more than once makes each record appear once per call in the log file.
    """
    logger = logging.getLogger('sgd_svrg')
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    fh = logging.FileHandler('experiments.log')
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger


################################################################################
# Functions below are for benchmark purpose to calculate expectation, variance of
# gradients per epoch for each parameter. These calculations will be helpful when
# benchmarking SVRG optimization with other optimization techniques, such as SGD.
# Currently it only calculates the expectation, variance for single context but
# can be extended to multi-context in later iterations.
################################################################################

def accumulate_grad(grad_dict, mod):
    """Append the module's current gradients to ``grad_dict`` in place.

    For every name in ``mod._exec_group.param_names`` the first array of the
    matching ``grad_arrays`` entry is used: it is copied on first sight of
    the name and concatenated along ``dim=0`` on later calls.

    Parameters
    ----------
    grad_dict: dict
        dictionary that maps parameter name to accumulated gradients
    mod:
        module whose ``_exec_group`` holds the gradients
    """
    param_names = mod._exec_group.param_names

    for index, name in enumerate(param_names):
        # Only element [0] of each entry is read (single context, see above).
        current_grad = mod._exec_group.grad_arrays[index][0]
        if name not in grad_dict:
            grad_dict[name] = current_grad.copy()
        else:
            grad_dict[name] = mx.ndarray.concat(grad_dict[name], current_grad, dim=0)


def calc_expectation(grad_dict, num_batches):
    """Calculates the expectation of the gradients per epoch for each parameter w.r.t number of batches

    Parameters
    ----------
    grad_dict: dict
        dictionary that maps parameter name to gradients in the mod executor group
    num_batches: int
        number of batches

    Returns
    ----------
    grad_dict: dict
        the same dictionary, with a new ``<key>_expectation`` entry for every
        key that was present on entry

    """
    # Snapshot the keys first: inserting the "_expectation" entries while
    # iterating the live dict raises "RuntimeError: dictionary changed size
    # during iteration" on Python 3.
    for key in list(grad_dict):
        grad_dict[key + "_expectation"] = mx.ndarray.sum(grad_dict[key], axis=0) / num_batches

    return grad_dict


def calc_variance(grad_dict, num_batches, param_names):
    """Calculates the variance of the gradients per epoch for each parameter w.r.t number of batches

    ``calc_expectation`` must have been applied to ``grad_dict`` beforehand,
    because the ``<name>_expectation`` entries are read here.

    Parameters
    ----------
    grad_dict: dict
        dictionary that maps parameter name to gradients in the mod executor group
    num_batches: int
        number of batches
    param_names: sequence of str
        parameter names in the module

    Returns
    ----------
    grad_dict: dict
        the same dictionary, with a new ``<name>_variance`` entry for every
        name in ``param_names``

    """
    for name in param_names:
        diff_sqr = mx.ndarray.square(mx.nd.subtract(grad_dict[name],
                                                    grad_dict[name + "_expectation"]))
        grad_dict[name + "_variance"] = mx.ndarray.sum(diff_sqr, axis=0) / num_batches

    # The docstring has always promised the dictionary back; previously the
    # function fell off the end and returned None.
    return grad_dict

# --- patch metadata for the next file in this diff, preserved as comments ---
# diff --git a/example/vae-gan/vaegan_mxnet.py b/example/vae-gan/vaegan_mxnet.py
# new file mode 100644
# index 000000000000..38e7e2ecc92f
# --- /dev/null
# +++ b/example/vae-gan/vaegan_mxnet.py
# @@ -0,0 +1,739 @@
# Licensed to the Apache Software Foundation (ASF)
under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +''' +Created on Jun 15, 2017 + +@author: shujon +''' + +from __future__ import print_function +import logging +from datetime import datetime +import os +import argparse +import errno +import mxnet as mx +import numpy as np +import cv2 +from scipy.io import savemat +#from layer import GaussianSampleLayer + +###################################################################### +#An adversarial variational autoencoder implementation in mxnet +# following the implementation at https://github.com/JeremyCCHsu/tf-vaegan +# of paper `Larsen, Anders Boesen Lindbo, et al. "Autoencoding beyond pixels using a +# learned similarity metric." 
arXiv preprint arXiv:1512.09300 (2015).` +###################################################################### + +@mx.init.register +class MyConstant(mx.init.Initializer): + '''constant operator in mxnet, no used in the code + ''' + def __init__(self, value): + super(MyConstant, self).__init__(value=value) + self.value = value + + def _init_weight(self, _, arr): + arr[:] = mx.nd.array(self.value) + +def encoder(nef, z_dim, batch_size, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12): + '''The encoder is a CNN which takes 32x32 image as input + generates the 100 dimensional shape embedding as a sample from normal distribution + using predicted meand and variance + ''' + BatchNorm = mx.sym.BatchNorm + + data = mx.sym.Variable('data') + + e1 = mx.sym.Convolution(data, name='enc1', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=nef, no_bias=no_bias) + ebn1 = BatchNorm(e1, name='encbn1', fix_gamma=fix_gamma, eps=eps) + eact1 = mx.sym.LeakyReLU(ebn1, name='encact1', act_type='leaky', slope=0.2) + + e2 = mx.sym.Convolution(eact1, name='enc2', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=nef*2, no_bias=no_bias) + ebn2 = BatchNorm(e2, name='encbn2', fix_gamma=fix_gamma, eps=eps) + eact2 = mx.sym.LeakyReLU(ebn2, name='encact2', act_type='leaky', slope=0.2) + + e3 = mx.sym.Convolution(eact2, name='enc3', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=nef*4, no_bias=no_bias) + ebn3 = BatchNorm(e3, name='encbn3', fix_gamma=fix_gamma, eps=eps) + eact3 = mx.sym.LeakyReLU(ebn3, name='encact3', act_type='leaky', slope=0.2) + + e4 = mx.sym.Convolution(eact3, name='enc4', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=nef*8, no_bias=no_bias) + ebn4 = BatchNorm(e4, name='encbn4', fix_gamma=fix_gamma, eps=eps) + eact4 = mx.sym.LeakyReLU(ebn4, name='encact4', act_type='leaky', slope=0.2) + + eact4 = mx.sym.Flatten(eact4) + + z_mu = mx.sym.FullyConnected(eact4, num_hidden=z_dim, name="enc_mu") + z_lv = mx.sym.FullyConnected(eact4, num_hidden=z_dim, name="enc_lv") + + z = 
z_mu + mx.symbol.broadcast_mul(mx.symbol.exp(0.5*z_lv),mx.symbol.random_normal(loc=0, scale=1,shape=(batch_size,z_dim))) + + return z_mu, z_lv, z + +def generator(ngf, nc, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12, z_dim=100, activation='sigmoid'): + '''The genrator is a CNN which takes 100 dimensional embedding as input + and reconstructs the input image given to the encoder + ''' + BatchNorm = mx.sym.BatchNorm + rand = mx.sym.Variable('rand') + + rand = mx.sym.Reshape(rand, shape=(-1, z_dim, 1, 1)) + + g1 = mx.sym.Deconvolution(rand, name='gen1', kernel=(5,5), stride=(2,2),target_shape=(2,2), num_filter=ngf*8, no_bias=no_bias) + gbn1 = BatchNorm(g1, name='genbn1', fix_gamma=fix_gamma, eps=eps) + gact1 = mx.sym.Activation(gbn1, name="genact1", act_type="relu") + + g2 = mx.sym.Deconvolution(gact1, name='gen2', kernel=(5,5), stride=(2,2),target_shape=(4,4), num_filter=ngf*4, no_bias=no_bias) + gbn2 = BatchNorm(g2, name='genbn2', fix_gamma=fix_gamma, eps=eps) + gact2 = mx.sym.Activation(gbn2, name='genact2', act_type='relu') + + g3 = mx.sym.Deconvolution(gact2, name='gen3', kernel=(5,5), stride=(2,2), target_shape=(8,8), num_filter=ngf*2, no_bias=no_bias) + gbn3 = BatchNorm(g3, name='genbn3', fix_gamma=fix_gamma, eps=eps) + gact3 = mx.sym.Activation(gbn3, name='genact3', act_type='relu') + + g4 = mx.sym.Deconvolution(gact3, name='gen4', kernel=(5,5), stride=(2,2), target_shape=(16,16), num_filter=ngf, no_bias=no_bias) + gbn4 = BatchNorm(g4, name='genbn4', fix_gamma=fix_gamma, eps=eps) + gact4 = mx.sym.Activation(gbn4, name='genact4', act_type='relu') + + g5 = mx.sym.Deconvolution(gact4, name='gen5', kernel=(5,5), stride=(2,2), target_shape=(32,32), num_filter=nc, no_bias=no_bias) + gout = mx.sym.Activation(g5, name='genact5', act_type=activation) + + return gout + +def discriminator1(ndf, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12): + '''First part of the discriminator which takes a 32x32 image as input + and output a convolutional feature map, this is 
required to calculate + the layer loss''' + BatchNorm = mx.sym.BatchNorm + + data = mx.sym.Variable('data') + + d1 = mx.sym.Convolution(data, name='d1', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=ndf, no_bias=no_bias) + dact1 = mx.sym.LeakyReLU(d1, name='dact1', act_type='leaky', slope=0.2) + + d2 = mx.sym.Convolution(dact1, name='d2', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=ndf*2, no_bias=no_bias) + dbn2 = BatchNorm(d2, name='dbn2', fix_gamma=fix_gamma, eps=eps) + dact2 = mx.sym.LeakyReLU(dbn2, name='dact2', act_type='leaky', slope=0.2) + + d3 = mx.sym.Convolution(dact2, name='d3', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=ndf*4, no_bias=no_bias) + dbn3 = BatchNorm(d3, name='dbn3', fix_gamma=fix_gamma, eps=eps) + dact3 = mx.sym.LeakyReLU(dbn3, name='dact3', act_type='leaky', slope=0.2) + + return dact3 + +def discriminator2(ndf, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12): + '''Second part of the discriminator which takes a 256x8x8 feature map as input + and generates the loss based on whether the input image was a real one or fake one''' + + BatchNorm = mx.sym.BatchNorm + + data = mx.sym.Variable('data') + + label = mx.sym.Variable('label') + + d4 = mx.sym.Convolution(data, name='d4', kernel=(5,5), stride=(2,2), pad=(2,2), num_filter=ndf*8, no_bias=no_bias) + dbn4 = BatchNorm(d4, name='dbn4', fix_gamma=fix_gamma, eps=eps) + dact4 = mx.sym.LeakyReLU(dbn4, name='dact4', act_type='leaky', slope=0.2) + + h = mx.sym.Flatten(dact4) + + d5 = mx.sym.FullyConnected(h, num_hidden=1, name="d5") + + dloss = mx.sym.LogisticRegressionOutput(data=d5, label=label, name='dloss') + + return dloss + +def GaussianLogDensity(x, mu, log_var, name='GaussianLogDensity', EPSILON = 1e-6): + '''GaussianLogDensity loss calculation for layer wise loss + ''' + c = mx.sym.ones_like(log_var)*2.0 * 3.1416 + c = mx.symbol.log(c) + var = mx.sym.exp(log_var) + x_mu2 = mx.symbol.square(x - mu) # [Issue] not sure the dim works or not? 
+ x_mu2_over_var = mx.symbol.broadcast_div(x_mu2, var + EPSILON) + log_prob = -0.5 * (c + log_var + x_mu2_over_var) + log_prob = mx.symbol.sum(log_prob, axis=1, name=name) # keep_dims=True, + return log_prob + +def DiscriminatorLayerLoss(): + '''Calculate the discriminator layer loss + ''' + + data = mx.sym.Variable('data') + + label = mx.sym.Variable('label') + + data = mx.sym.Flatten(data) + label = mx.sym.Flatten(label) + + label = mx.sym.BlockGrad(label) + + zeros = mx.sym.zeros_like(data) + + output = -GaussianLogDensity(label, data, zeros) + + dloss = mx.symbol.MakeLoss(mx.symbol.mean(output),name='lloss') + + return dloss + +def KLDivergenceLoss(): + '''KLDivergenceLoss loss + ''' + + data = mx.sym.Variable('data') + mu1, lv1 = mx.sym.split(data, num_outputs=2, axis=0) + mu2 = mx.sym.zeros_like(mu1) + lv2 = mx.sym.zeros_like(lv1) + + v1 = mx.sym.exp(lv1) + v2 = mx.sym.exp(lv2) + mu_diff_sq = mx.sym.square(mu1 - mu2) + dimwise_kld = .5 * ( + (lv2 - lv1) + mx.symbol.broadcast_div(v1, v2) + mx.symbol.broadcast_div(mu_diff_sq, v2) - 1.) 
+ KL = mx.symbol.sum(dimwise_kld, axis=1) + + KLloss = mx.symbol.MakeLoss(mx.symbol.mean(KL),name='KLloss') + return KLloss + +def get_data(path, activation): + '''Get the dataset + ''' + data = [] + image_names = [] + for filename in os.listdir(path): + img = cv2.imread(os.path.join(path,filename), cv2.IMREAD_GRAYSCALE) + image_names.append(filename) + if img is not None: + data.append(img) + + data = np.asarray(data) + + if activation == 'sigmoid': + data = data.astype(np.float32)/(255.0) + elif activation == 'tanh': + data = data.astype(np.float32)/(255.0/2) - 1.0 + + data = data.reshape((data.shape[0], 1, data.shape[1], data.shape[2])) + + np.random.seed(1234) + p = np.random.permutation(data.shape[0]) + X = data[p] + + return X, image_names + +class RandIter(mx.io.DataIter): + '''Create a random iterator for generator + ''' + def __init__(self, batch_size, ndim): + self.batch_size = batch_size + self.ndim = ndim + self.provide_data = [('rand', (batch_size, ndim, 1, 1))] + self.provide_label = [] + + def iter_next(self): + return True + + def getdata(self): + return [mx.random.normal(0, 1.0, shape=(self.batch_size, self.ndim, 1, 1))] + +def fill_buf(buf, i, img, shape): + '''fill the ith grid of the buffer matrix with the values from the img + buf : buffer matrix + i : serial of the image in the 2D grid + img : image data + shape : ( height width depth ) of image''' + + # grid height is a multiple of individual image height + m = buf.shape[0]/shape[0] + + sx = (i%m)*shape[1] + sy = (i//m)*shape[0] + sx = int(sx) + sy = int(sy) + buf[sy:sy+shape[0], sx:sx+shape[1], :] = img + +def visual(title, X, activation): + '''create a grid of images and save it as a final image + title : grid image name + X : array of images + ''' + assert len(X.shape) == 4 + + X = X.transpose((0, 2, 3, 1)) + if activation == 'sigmoid': + X = np.clip((X)*(255.0), 0, 255).astype(np.uint8) + elif activation == 'tanh': + X = np.clip((X+1.0)*(255.0/2.0), 0, 255).astype(np.uint8) + n = 
def train(dataset, nef, ndf, ngf, nc, batch_size, Z, lr, beta1, epsilon, ctx, check_point, g_dl_weight, output_path, checkpoint_path, data_path, activation,num_epoch, save_after_every, visualize_after_every, show_after_every):
    '''Adversarial training of the VAE.

    Builds five modules (encoder E, generator G, sequential discriminator D,
    discriminator-layer loss DL and KL-divergence loss KL) and, for every
    batch, updates the discriminator once, the generator twice and the
    encoder once.

    Parameters
    ----------
    dataset : str
        Dataset name, used as the prefix of the saved parameter files.
    nef, ndf, ngf : int
        Filter counts handed to ``encoder``, ``discriminator1``/``discriminator2``
        and ``generator``.
    nc : int
        Channel count handed to ``generator``.
    batch_size : int
        Number of examples per batch.
    Z : int
        Embedding size.
    lr, beta1, epsilon : float
        Adam optimizer settings shared by all modules.
    ctx : Context
        Device on which all modules are bound.
    check_point : bool
        Whether parameters are saved every ``save_after_every`` epochs.
    g_dl_weight : float
        Weight of the discriminator-layer gradient in the generator update.
    output_path, checkpoint_path, data_path : str
        Image output prefix, checkpoint directory and training data directory.
    activation : str
        ``'sigmoid'`` or ``'tanh'``; forwarded to ``get_data``, ``generator``
        and ``visual``.
    num_epoch : int
        Number of epochs to run.
    save_after_every, visualize_after_every : int
        Epoch intervals for checkpointing and for writing image grids.
    show_after_every : int
        Iteration interval for printing (and resetting) the metrics.
    '''

    def adam_params(wd, rescale=True):
        '''Optimizer settings; only weight decay and gradient rescaling vary per module.'''
        params = {
            'learning_rate': lr,
            'wd': wd,
            'beta1': beta1,
            'epsilon': epsilon,
        }
        if rescale:
            params['rescale_grad'] = (1.0/batch_size)
        return params

    def copy_grads(module):
        '''Snapshot the gradient buffers of ``module`` so a later backward pass cannot overwrite them.'''
        return [[grad.copyto(grad.context) for grad in grads]
                for grads in module._exec_group.grad_arrays]

    #encoder
    z_mu, z_lv, z = encoder(nef, Z, batch_size)
    symE = mx.sym.Group([z_mu, z_lv, z])

    #generator
    symG = generator(ngf, nc, no_bias=True, fix_gamma=True, eps=1e-5 + 1e-12, z_dim = Z, activation=activation )

    #discriminator
    symD1 = discriminator1(ndf)
    symD2 = discriminator2(ndf)

    # ==============data==============
    X_train, _ = get_data(data_path, activation)
    train_iter = mx.io.NDArrayIter(X_train, batch_size=batch_size, shuffle=True)
    rand_iter = RandIter(batch_size, Z)
    label = mx.nd.zeros((batch_size,), ctx=ctx)

    # =============module E=============
    modE = mx.mod.Module(symbol=symE, data_names=('data',), label_names=None, context=ctx)
    modE.bind(data_shapes=train_iter.provide_data)
    modE.init_params(initializer=mx.init.Normal(0.02))
    modE.init_optimizer(optimizer='adam', optimizer_params=adam_params(1e-6))

    # =============module G=============
    modG = mx.mod.Module(symbol=symG, data_names=('rand',), label_names=None, context=ctx)
    modG.bind(data_shapes=rand_iter.provide_data, inputs_need_grad=True)
    modG.init_params(initializer=mx.init.Normal(0.02))
    # The generator is the only module configured without 'rescale_grad'.
    modG.init_optimizer(optimizer='adam', optimizer_params=adam_params(1e-6, rescale=False))

    # =============module D=============
    modD1 = mx.mod.Module(symD1, label_names=[], context=ctx)
    modD2 = mx.mod.Module(symD2, label_names=('label',), context=ctx)
    modD = mx.mod.SequentialModule()
    modD.add(modD1).add(modD2, take_labels=True, auto_wiring=True)
    modD.bind(data_shapes=train_iter.provide_data,
              label_shapes=[('label', (batch_size,))],
              inputs_need_grad=True)
    modD.init_params(initializer=mx.init.Normal(0.02))
    modD.init_optimizer(optimizer='adam', optimizer_params=adam_params(1e-3))

    # =============module DL=============
    symDL = DiscriminatorLayerLoss()
    modDL = mx.mod.Module(symbol=symDL, data_names=('data',), label_names=('label',), context=ctx)
    # NOTE(review): the (nef * 4, 4, 4) shape presumably has to match the
    # output of discriminator1 -- confirm if the input size or ndf changes.
    modDL.bind(data_shapes=[('data', (batch_size,nef * 4,4,4))],
               label_shapes=[('label', (batch_size,nef * 4,4,4))],
               inputs_need_grad=True)
    modDL.init_params(initializer=mx.init.Normal(0.02))
    modDL.init_optimizer(optimizer='adam', optimizer_params=adam_params(0.))

    # =============module KL=============
    symKL = KLDivergenceLoss()
    modKL = mx.mod.Module(symbol=symKL, data_names=('data',), label_names=None, context=ctx)
    # mu and log-variance are concatenated along axis 0, hence batch_size*2.
    modKL.bind(data_shapes=[('data', (batch_size*2,Z))],
               inputs_need_grad=True)
    modKL.init_params(initializer=mx.init.Normal(0.02))
    modKL.init_optimizer(optimizer='adam', optimizer_params=adam_params(0.))

    def facc(label, pred):
        '''calculating prediction accuracy
        '''
        pred = pred.ravel()
        label = label.ravel()
        return ((pred > 0.5) == label).mean()

    def fentropy(label, pred):
        '''calculating binary cross-entropy loss
        '''
        pred = pred.ravel()
        label = label.ravel()
        return -(label*np.log(pred+1e-12) + (1.-label)*np.log(1.-pred+1e-12)).mean()

    def kldivergence(label, pred):
        '''calculating KL divergence loss
        '''
        mean, log_var = np.split(pred, 2, axis=0)
        var = np.exp(log_var)
        KLLoss = -0.5 * np.sum(1 + log_var - np.power(mean, 2) - var)
        KLLoss = KLLoss / batch_size
        return KLLoss

    mG = mx.metric.CustomMetric(fentropy)
    mD = mx.metric.CustomMetric(fentropy)
    mE = mx.metric.CustomMetric(kldivergence)
    mACC = mx.metric.CustomMetric(facc)

    def generator_step(rbatch, sample, lx):
        '''Run one generator update and return the generator output for ``rbatch``.'''
        # Adversarial gradient for images generated from random codes.
        modG.forward(rbatch, is_train=True)
        outG = modG.get_outputs()
        label[:] = 1
        modD.forward(mx.io.DataBatch(outG, [label]), is_train=True)
        modD.backward()
        modG.backward(modD1.get_input_grads())
        gradG1 = copy_grads(modG)
        mG.update([label], modD.get_outputs())

        # Adversarial gradient for images decoded from the encoder output.
        modG.forward(sample, is_train=True)
        xz = modG.get_outputs()
        label[:] = 1
        modD.forward(mx.io.DataBatch(xz, [label]), is_train=True)
        modD.backward()
        modG.backward(modD1.get_input_grads())
        gradG2 = copy_grads(modG)
        mG.update([label], modD.get_outputs())

        # Discriminator-layer loss gradient for the decoded images.
        modG.forward(sample, is_train=True)
        xz = modG.get_outputs()
        modD1.forward(mx.io.DataBatch(xz, []), is_train=True)
        outD1 = modD1.get_outputs()
        modDL.forward(mx.io.DataBatch(outD1, lx), is_train=True)
        modDL.backward()
        modD1.backward(modDL.get_input_grads())
        modG.backward(modD1.get_input_grads())

        for grads, gradsG1, gradsG2 in zip(modG._exec_group.grad_arrays, gradG1, gradG2):
            for grad, gradg1, gradg2 in zip(grads, gradsG1, gradsG2):
                # Write in place: rebinding the loop variable would leave
                # modG's gradient buffers untouched and the adversarial
                # gradients would never reach modG.update().
                grad[:] = g_dl_weight * grad + 0.5 * (gradg1 + gradg2)

        modG.update()
        mG.update([label], modD.get_outputs())
        return outG

    print('Training...')

    # =============train===============
    for epoch in range(num_epoch):
        train_iter.reset()
        # Iterations are reported 1-based.
        for t, batch in enumerate(train_iter, 1):

            rbatch = rand_iter.next()

            modG.forward(rbatch, is_train=True)
            outG = modG.get_outputs()

            # update discriminator on fake
            label[:] = 0
            modD.forward(mx.io.DataBatch(outG, [label]), is_train=True)
            modD.backward()
            gradD11 = copy_grads(modD1)
            gradD12 = copy_grads(modD2)

            modD.update_metric(mD, [label])
            modD.update_metric(mACC, [label])

            #update discriminator on decoded
            modE.forward(batch, is_train=True)
            mu, lv, z = modE.get_outputs()
            z = z.reshape((batch_size, Z, 1, 1))
            sample = mx.io.DataBatch([z], label=None, provide_data = [('rand', (batch_size, Z, 1, 1))])
            modG.forward(sample, is_train=True)
            xz = modG.get_outputs()
            label[:] = 0
            modD.forward(mx.io.DataBatch(xz, [label]), is_train=True)
            modD.backward()

            gradD21 = copy_grads(modD1)
            gradD22 = copy_grads(modD2)
            modD.update_metric(mD, [label])
            modD.update_metric(mACC, [label])

            # update discriminator on real
            label[:] = 1
            batch.label = [label]
            modD.forward(batch, is_train=True)
            # Keep the discriminator1 outputs for the real batch; they are the
            # target of the discriminator-layer loss below.
            lx = [out.copyto(out.context) for out in modD1.get_outputs()]
            modD.backward()
            # Add the averaged fake/decoded gradients to the real-batch
            # gradients (in place) before the single discriminator update.
            for gradsr, gradsf, gradsd in zip(modD1._exec_group.grad_arrays, gradD11, gradD21):
                for gradr, gradf, gradd in zip(gradsr, gradsf, gradsd):
                    gradr += 0.5 * (gradf + gradd)
            for gradsr, gradsf, gradsd in zip(modD2._exec_group.grad_arrays, gradD12, gradD22):
                for gradr, gradf, gradd in zip(gradsr, gradsf, gradsd):
                    gradr += 0.5 * (gradf + gradd)

            modD.update()
            modD.update_metric(mD, [label])
            modD.update_metric(mACC, [label])

            # The generator is updated twice per discriminator update.
            outG = generator_step(rbatch, sample, lx)
            outG = generator_step(rbatch, sample, lx)

            modG.forward(sample, is_train=True)
            xz = modG.get_outputs()

            #update generator
            modD1.forward(mx.io.DataBatch(xz, []), is_train=True)
            outD1 = modD1.get_outputs()
            modDL.forward(mx.io.DataBatch(outD1, lx), is_train=True)
            DLloss = modDL.get_outputs()
            modDL.backward()
            dlGrad = modDL.get_input_grads()
            modD1.backward(dlGrad)
            diffD = modD1.get_input_grads()
            modG.backward(diffD)

            #update encoder
            modKL.forward(mx.io.DataBatch([mx.ndarray.concat(mu,lv, dim=0)]), is_train=True)
            KLloss = modKL.get_outputs()
            modKL.backward()
            gradKLLoss = modKL.get_input_grads()
            diffG = modG.get_input_grads()
            diffG = diffG[0].reshape((batch_size, Z))
            # The encoder has three outputs (mu, lv, z): the KL gradient is
            # split back into its mu/lv halves and the generator input
            # gradient is used for z.
            modE.backward(mx.ndarray.split(gradKLLoss[0], num_outputs=2, axis=0) + [diffG])
            modE.update()
            pred = mx.ndarray.concat(mu,lv, dim=0)
            mE.update([pred], [pred])

            if t % show_after_every == 0:
                print('epoch:', epoch, 'iter:', t, 'metric:', mACC.get(), mG.get(), mD.get(), mE.get(), KLloss[0].asnumpy(), DLloss[0].asnumpy())
                mACC.reset()
                mG.reset()
                mD.reset()
                mE.reset()

        if epoch % visualize_after_every == 0:
            # The file names only depend on the epoch, so the grids written
            # here show the last batch of the epoch.
            visual(output_path +'gout'+str(epoch), outG[0].asnumpy(), activation)
            visual(output_path + 'data'+str(epoch), batch.data[0].asnumpy(), activation)

        if check_point and epoch % save_after_every == 0:
            print('Saving...')
            modG.save_params(checkpoint_path + '/%s_G-%04d.params'%(dataset, epoch))
            modD.save_params(checkpoint_path + '/%s_D-%04d.params'%(dataset, epoch))
            modE.save_params(checkpoint_path + '/%s_E-%04d.params'%(dataset, epoch))
def create_and_validate_dir(data_dir):
    '''Make sure ``data_dir`` exists, creating it (and parents) when needed.

    An empty string is treated as "no directory" and ignored.

    Raises
    ------
    OSError
        If the directory cannot be created for any reason other than it
        already existing.
    '''
    # Nothing to do for an empty path or a path that is already there.
    if data_dir == "" or os.path.exists(data_dir):
        return

    logging.info('create directory %s', data_dir)
    try:
        os.makedirs(data_dir)
    except OSError as exc:
        # Another process may have created it in the meantime; that is fine.
        if exc.errno == errno.EEXIST:
            return
        raise OSError('failed to create ' + data_dir)
argparse.ArgumentParser(description='Train and Test an Adversarial Variatiional Encoder') + + parser.add_argument('--train', help='train the network', action='store_true') + parser.add_argument('--test', help='test the network', action='store_true') + parser.add_argument('--save_embedding', help='saves the shape embedding of each input image', action='store_true') + parser.add_argument('--dataset', help='dataset name', default='caltech', type=str) + parser.add_argument('--activation', help='activation i.e. sigmoid or tanh', default='sigmoid', type=str) + parser.add_argument('--training_data_path', help='training data path', default='datasets/caltech101/data/images32x32', type=str) + parser.add_argument('--testing_data_path', help='testing data path', default='datasets/caltech101/test_data', type=str) + parser.add_argument('--pretrained_encoder_path', help='pretrained encoder model path', default='checkpoints32x32_sigmoid/caltech_E-0045.params', type=str) + parser.add_argument('--pretrained_generator_path', help='pretrained generator model path', default='checkpoints32x32_sigmoid/caltech_G-0045.params', type=str) + parser.add_argument('--output_path', help='output path for the generated images', default='outputs32x32_sigmoid', type=str) + parser.add_argument('--embedding_path', help='output path for the generated embeddings', default='outputs32x32_sigmoid', type=str) + parser.add_argument('--checkpoint_path', help='checkpoint saving path ', default='checkpoints32x32_sigmoid', type=str) + parser.add_argument('--nef', help='encoder filter count in the first layer', default=64, type=int) + parser.add_argument('--ndf', help='discriminator filter count in the first layer', default=64, type=int) + parser.add_argument('--ngf', help='generator filter count in the second last layer', default=64, type=int) + parser.add_argument('--nc', help='generator filter count in the last layer i.e. 
def main():
    '''Parse the command line, prepare output directories and run train/test.

    Raises
    ------
    OSError
        If ``--test`` is given and the testing data path does not exist, or
        if an output directory cannot be created.
    '''
    args = parse_args()

    # Fail early when testing is requested without any test data.
    if args.test and not os.path.exists(args.testing_data_path):
        raise OSError("Provided Testing Path: {} does not exist".format(args.testing_data_path))

    # A missing checkpoint or output directory is created rather than
    # treated as an error.
    create_and_validate_dir(args.checkpoint_path)
    create_and_validate_dir(args.output_path)
    if args.test and args.save_embedding:
        # Embeddings are written below this directory during testing.
        create_and_validate_dir(args.embedding_path)

    # gpu context
    ctx = mx.cpu() if args.use_cpu else mx.gpu(args.gpu)

    # checkpoint saving flags
    check_point = True

    if args.train:
        train(args.dataset, args.nef, args.ndf, args.ngf, args.nc, args.batch_size, args.Z, args.lr,
              args.beta1, args.epsilon, ctx, check_point, args.g_dl_weight, args.output_path,
              args.checkpoint_path, args.training_data_path, args.activation, args.num_epoch,
              args.save_after_every, args.visualize_after_every, args.show_after_every)

    if args.test:
        # Testing always runs with a batch size of 1.
        test(args.nef, args.ngf, args.nc, 1, args.Z, ctx, args.pretrained_encoder_path,
             args.pretrained_generator_path, args.output_path, args.testing_data_path,
             args.activation, args.save_embedding, args.embedding_path)
Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +"""A `SVRGModule` implements the `Module` API by wrapping an auxiliary module to perform +SVRG optimization logic. +""" + +import time +import logging +import mxnet as mx +from mxnet.module import Module +from .svrg_optimizer import _SVRGOptimizer + + +class SVRGModule(Module): + """SVRGModule is a module that encapsulates two Modules to accommodate the SVRG optimization technique. + It is functionally the same as Module API, except it is implemented using SVRG optimization logic. + + Parameters + ---------- + symbol : Symbol + data_names : list of str + Defaults to `('data')` for a typical model used in image classification. + label_names : list of str + Defaults to `('softmax_label')` for a typical model used in image classification. + logger : Logger + Defaults to `logging`. + context : Context or list of Context + Defaults to ``mx.cpu()``. + work_load_list : list of number + Default ``None``, indicating uniform workload. + fixed_param_names: list of str + Default ``None``, indicating no network parameters are fixed. + state_names : list of str + states are similar to data and label, but not provided by data iterator. \ + Instead they are initialized to 0 and can be set by `set_states()`. 
+ group2ctxs : dict of str to context or list of context, or list of dict of str to context + Default is `None`. Mapping the `ctx_group` attribute to the context assignment. + compression_params : dict + Specifies type of gradient compression and additional arguments depending \ + on the type of compression being used. For example, 2bit compression requires a threshold. \ + Arguments would then be {'type':'2bit', 'threshold':0.5} \ + See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. \ + update_freq: int + Specifies the number of times to update the full gradients to be used in the SVRG optimization. For instance, \ + update_freq = 2 will calculates the gradients over all data every two epochs + + Examples + -------- + >>> # An example of declaring and using SVRGModule. + >>> mod = SVRGModule(symbol=lro, data_names=['data'], label_names=['lin_reg_label'], update_freq=2) + >>> mod.fit(di, eval_metric='mse', optimizer='sgd', optimizer_params=(('learning_rate', 0.025),), + >>> num_epoch=num_epoch, kvstore='local') + """ + + def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), + logger=logging, context=mx.cpu(), work_load_list=None, + fixed_param_names=None, state_names=None, group2ctxs=None, + compression_params=None, update_freq=None): + super(SVRGModule, self).__init__(symbol, data_names=data_names, label_names=label_names, logger=logger, + context=context, work_load_list=work_load_list, + fixed_param_names=fixed_param_names, state_names=state_names, + group2ctxs=group2ctxs, compression_params=compression_params) + + # Type check update_frequency + if isinstance(update_freq, int): + if update_freq <= 0: + raise ValueError("update_freq in SVRGModule must be a positive integer to represent the frequency for " + "calculating full gradients") + self.update_freq = update_freq + else: + raise TypeError("update_freq in SVRGModule must be an integer to represent the frequency for " + "calculating full 
gradients") + + self._mod_aux = mx.mod.Module(symbol, data_names, label_names, logger, context, work_load_list, + fixed_param_names, state_names, group2ctxs, compression_params) + + self._param_dict = None + self._ctx_len = len(self._context) + + def _reset_bind(self): + """Internal function to reset binded state for both modules.""" + super(SVRGModule, self)._reset_bind() + self._mod_aux._reset_bind() + + def reshape(self, data_shapes, label_shapes=None): + """Reshapes both modules for new input shapes. + + Parameters + ---------- + data_shapes : list of (str, tuple) + Typically is ``data_iter.provide_data``. + label_shapes : list of (str, tuple) + Typically is ``data_iter.provide_label``. + """ + super(SVRGModule, self).reshape(data_shapes, label_shapes=label_shapes) + self._mod_aux.reshape(data_shapes, label_shapes=label_shapes) + + def init_optimizer(self, kvstore='local', optimizer='sgd', + optimizer_params=(('learning_rate', 0.01),), force_init=False): + """Installs and initializes SVRGOptimizer. The SVRGOptimizer is a wrapper class for a regular optimizer that is + passed in and a special AssignmentOptimizer to accumulate the full gradients. If KVStore is 'local' or None, + the full gradients will be accumulated locally without pushing to the KVStore. Otherwise, additional keys will + be pushed to accumulate the full gradients in the KVStore. + + Parameters + ---------- + kvstore : str or KVStore + Default `'local'`. + optimizer : str or Optimizer + Default `'sgd'` + optimizer_params : dict + Default `(('learning_rate', 0.01),)`. The default value is not a dictionary, + just to avoid pylint warning of dangerous default values. + force_init : bool + Default ``False``, indicating whether we should force re-initializing the + optimizer in the case an optimizer is already installed. 
+ """ + + # Init dict for storing average of full gradients for each device + self._param_dict = [{key: mx.nd.zeros(shape=value.shape, ctx=self._context[i]) + for key, value in self.get_params()[0].items()} for i in range(self._ctx_len)] + + svrg_optimizer = self._create_optimizer(_SVRGOptimizer.__name__, default_opt=optimizer, + kvstore=kvstore, optimizer_params=optimizer_params) + + super(SVRGModule, self).init_optimizer(kvstore=kvstore, optimizer=svrg_optimizer, + optimizer_params=optimizer_params, force_init=force_init) + + # Init additional keys for accumulating full grads in KVStore + if self._kvstore: + for idx, param_on_devs in enumerate(self._exec_group.param_arrays): + name = self._exec_group.param_names[idx] + self._kvstore.init(name + "_full", mx.nd.zeros(shape=self._arg_params[name].shape)) + if self._update_on_kvstore: + self._kvstore.pull(name + "_full", param_on_devs, priority=-idx) + + def _create_optimizer(self, optimizer, default_opt, kvstore, optimizer_params): + """Helper function to create a svrg optimizer. SVRG optimizer encapsulates two optimizers and + will redirect update() to the correct optimizer based on the key. + + Parameters + ---------- + kvstore : str or KVStore + Default `'local'`. + optimizer: str + Name for SVRGOptimizer + default_opt : str or Optimizer that was passed in. + optimizer_params : dict + optimizer params that was passed in. 
+ """ + + # code partially copied from mxnet module.init_optimizer() to accomodate svrg_optimizer + batch_size = self._exec_group.batch_size + + (kv_store, update_on_kvstore) = mx.model._create_kvstore(kvstore, self._ctx_len, self._arg_params) + if kv_store and 'dist' in kv_store.type and '_sync' in kv_store.type: + batch_size *= kv_store.num_workers + rescale_grad = 1.0 / batch_size + + idx2name = {} + if update_on_kvstore: + idx2name.update(enumerate(self._exec_group.param_names)) + else: + for k in range(self._ctx_len): + idx2name.update({i * self._ctx_len + k: n + for i, n in enumerate(self._exec_group.param_names)}) + + # update idx2name to include new keys + for key in self._param_dict[0].keys(): + max_key = max(list(idx2name.keys())) + 1 + idx2name[max_key] = key + "_full" + + optimizer_params = dict(optimizer_params) + if 'rescale_grad' not in optimizer_params: + optimizer_params['rescale_grad'] = rescale_grad + optimizer_params["default_optimizer"] = default_opt + optimizer_params["param_idx2name"] = idx2name + optimizer = mx.optimizer.create(optimizer, **optimizer_params) + + return optimizer + + def bind(self, data_shapes, label_shapes=None, for_training=True, + inputs_need_grad=False, force_rebind=False, shared_module=None, grad_req='write'): + """Binds the symbols to construct executors for both two modules. This is necessary before one + can perform computation with the SVRGModule. + + Parameters + ---------- + data_shapes : list of (str, tuple) + Typically is ``data_iter.provide_data``. + label_shapes : list of (str, tuple) + Typically is ``data_iter.provide_label``. + for_training : bool + Default is ``True``. Whether the executors should be bound for training. + inputs_need_grad : bool + Default is ``False``. Whether the gradients to the input data need to be computed. + Typically this is not needed. But this might be needed when implementing composition + of modules. + force_rebind : bool + Default is ``False``. 
This function does nothing if the executors are already + bound. But with this ``True``, the executors will be forced to rebind. + shared_module : Module + Default is ``None``. This is used in bucketing. When not ``None``, the shared module + essentially corresponds to a different bucket -- a module with different symbol + but with the same sets of parameters (e.g. unrolled RNNs with different lengths). + """ + # force rebinding is typically used when one want to switch from + # training to prediction phase. + super(SVRGModule, self).bind(data_shapes, label_shapes, for_training, inputs_need_grad, force_rebind, + shared_module, grad_req) + + if for_training: + self._mod_aux.bind(data_shapes, label_shapes, for_training, inputs_need_grad, force_rebind, shared_module, + grad_req) + + def forward(self, data_batch, is_train=None): + """Forward computation for both two modules. It supports data batches with different shapes, such as + different batch sizes or different image sizes. + If reshaping of data batch relates to modification of symbol or module, such as + changing image layout ordering or switching from training to predicting, module + rebinding is required. + + See Also + ---------- + :meth:`BaseModule.forward`. + + Parameters + ---------- + data_batch : DataBatch + Could be anything with similar API implemented. + is_train : bool + Default is ``None``, which means ``is_train`` takes the value of ``self.for_training``. + """ + super(SVRGModule, self).forward(data_batch, is_train) + + if is_train: + self._mod_aux.forward(data_batch, is_train) + + def backward(self, out_grads=None): + """Backward computation. + + See Also + ---------- + :meth:`BaseModule.backward`. + + Parameters + ---------- + out_grads : NDArray or list of NDArray, optional + Gradient on the outputs to be propagated back. + This parameter is only needed when bind is called + on outputs that are not a loss function. 
    def update_full_grads(self, train_data):
        """Computes the gradients over all data w.r.t weights of past
        m epochs. For distributed env, it will accumulate full grads in the kvstore.

        The current parameters are first copied into the auxiliary module, which
        then runs forward/backward over every batch of ``train_data``. The
        per-device sums are stored in ``self._param_dict`` and divided by the
        number of batches (adjusted for padding in the last batch).

        Parameters
        ----------
        train_data: DataIter
            Train data iterator. It is reset before use and fully consumed.
        """
        param_names = self._exec_group.param_names
        # Snapshot the current weights into the auxiliary module.
        arg, aux = self.get_params()
        self._mod_aux.set_params(arg_params=arg, aux_params=aux)
        train_data.reset()
        nbatch = 0
        padding = 0
        for batch in train_data:
            self._mod_aux.forward(batch, is_train=True)
            self._mod_aux.backward()
            nbatch += 1
            for ctx in range(self._ctx_len):
                for index, name in enumerate(param_names):
                    grads = self._mod_aux._exec_group.grad_arrays[index][ctx]
                    # NOTE(review): the sum starts from whatever _param_dict already
                    # holds; nothing in this method resets it to zero, so a second
                    # call appears to add onto the previous average -- confirm intent.
                    self._param_dict[ctx][name] = mx.nd.broadcast_add(self._param_dict[ctx][name], grads, axis=0)
            # After the loop this holds the pad count of the last batch.
            padding = batch.pad

        # Count the padded part of the last batch as a fraction of a batch.
        # NOTE(review): this is 0 when the iterator yields no batches, which makes
        # the division below fail -- confirm callers never pass an empty iterator.
        true_num_batch = nbatch - padding / train_data.batch_size
        for name in param_names:
            grad_list = []
            for i in range(self._ctx_len):
                self._param_dict[i][name] /= true_num_batch
                grad_list.append(self._param_dict[i][name])
            if self._kvstore:
                # If in distributed mode, push a list of gradients from each worker/device to the KVStore
                self._accumulate_kvstore(name, grad_list)
def _svrg_grads_update_rule(self, g_curr_batch_curr_weight, g_curr_batch_special_weight,
                            g_special_weight_all_batch):
    """Applies the SVRG correction to the current gradients.

    Each element of ``g_curr_batch_curr_weight`` is updated with augmented
    assignment (``-=`` then ``+=``), so elements that support in-place
    arithmetic are modified in place.

    Parameters
    ----------
    g_curr_batch_curr_weight : NDArray
        gradients of current weight of self.mod w.r.t current batch of data
    g_curr_batch_special_weight: NDArray
        gradients of the weight of past m epochs of self._mod_special w.r.t current batch of data
    g_special_weight_all_batch: NDArray
        average of full gradients over full pass of data

    Returns
    ----------
    The same ``g_curr_batch_curr_weight`` object that was passed in, after
    applying:
    grads = g_curr_batch_curr_weight - g_curr_batch_special_weight + g_special_weight_all_batch
    """
    position = 0
    for current in g_curr_batch_curr_weight:
        # Subtract first, then add: the order is kept so floating point
        # results match element for element.
        current -= g_curr_batch_special_weight[position]
        current += g_special_weight_all_batch[position]
        position += 1
    return g_curr_batch_curr_weight
def fit(self, train_data, eval_data=None, eval_metric='acc',
        epoch_end_callback=None, batch_end_callback=None, kvstore='local',
        optimizer='sgd', optimizer_params=(('learning_rate', 0.01),),
        eval_end_callback=None,
        eval_batch_end_callback=None, initializer=mx.init.Uniform(0.01),
        arg_params=None, aux_params=None, allow_missing=False,
        force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None,
        validation_metric=None, monitor=None, sparse_row_id_fn=None):
    """Trains the module parameters.

    Binds the module, initializes parameters and optimizer, then runs the
    training loop. Every ``self.update_freq`` epochs the full gradients are
    recomputed via ``update_full_grads`` before the epoch's batches are
    processed.

    Parameters
    ----------
    train_data : DataIter
        Train DataIter.
    eval_data : DataIter
        If not ``None``, will be used as validation set and the performance
        after each epoch will be evaluated.
    eval_metric : str or EvalMetric
        Defaults to 'acc'. The performance measure used to display during training.
        Other possible predefined metrics are:
        'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'.
    epoch_end_callback : function or list of functions
        Each callback will be called with the current `epoch`, `symbol`, `arg_params`
        and `aux_params`.
    batch_end_callback : function or list of function
        Each callback will be called with a `BatchEndParam`.
    kvstore : str or KVStore
        Defaults to 'local'.
    optimizer : str or Optimizer
        Defaults to 'sgd'.
    optimizer_params : dict
        Defaults to ``(('learning_rate', 0.01),)``. The parameters for
        the optimizer constructor.
        The default value is not a dict, just to avoid pylint warning on dangerous
        default values.
    eval_end_callback : function or list of function
        These will be called at the end of each full evaluation, with the metrics over
        the entire evaluation set.
    eval_batch_end_callback : function or list of function
        These will be called at the end of each mini-batch during evaluation.
    initializer : Initializer
        The initializer is called to initialize the module parameters when they are
        not already initialized.
    arg_params : dict
        Defaults to ``None``, if not ``None``, should be existing parameters from a trained
        model or loaded from a checkpoint (previously saved model). In this case,
        the value here will be used to initialize the module parameters, unless they
        are already initialized by the user via a call to `init_params` or `fit`.
        `arg_params` has a higher priority than `initializer`.
    aux_params : dict
        Defaults to ``None``. Similar to `arg_params`, except for auxiliary states.
    allow_missing : bool
        Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params`
        and `aux_params` are not ``None``. If this is ``True``, then the missing parameters
        will be initialized via the `initializer`.
    force_rebind : bool
        Defaults to ``False``. Whether to force rebinding the executors if already bound.
    force_init : bool
        Defaults to ``False``. Indicates whether to force initialization even if the
        parameters are already initialized.
    begin_epoch : int
        Defaults to 0. Indicates the starting epoch. Usually, if resumed from a
        checkpoint saved at a previous training phase at epoch N, then this value should be
        N+1.
    num_epoch : int
        Number of epochs for training. Must not be ``None``.
    validation_metric: str or EvalMetric
        The performance measure used to display during validation.
        Falls back to `eval_metric` when ``None``.
    monitor : object, optional
        If not ``None``, it is passed to `install_monitor`, and its `tic()` and
        `toc_print()` methods are called around every training batch.
    sparse_row_id_fn : A callback function
        The function takes `data_batch` as an input and returns a dict of
        str -> NDArray. The resulting dict is used for pulling row_sparse
        parameters from the kvstore, where the str key is the name of the param,
        and the value is the row id of the param to pull.
    """
    assert num_epoch is not None, 'please specify number of epochs'

    self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label,
              for_training=True, force_rebind=force_rebind)
    if monitor is not None:
        self.install_monitor(monitor)
    self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params,
                     allow_missing=allow_missing, force_init=force_init)
    self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params)

    # The validation metric is taken from eval_metric *before* the latter is
    # turned into an EvalMetric instance below.
    if validation_metric is None:
        validation_metric = eval_metric
    if not isinstance(eval_metric, mx.metric.EvalMetric):
        eval_metric = mx.metric.create(eval_metric)

    ################################################################################
    # training loop
    ################################################################################
    for epoch in range(begin_epoch, num_epoch):
        eval_metric.reset()
        tic = time.time()
        # Recompute the full gradients every `update_freq` epochs.
        if epoch % self.update_freq == 0:
            self.update_full_grads(train_data)

        train_data.reset()
        data_iter = iter(train_data)
        end_of_batch = False
        nbatch = 0
        # The first batch is fetched eagerly; later ones are prefetched at
        # the end of each loop iteration.
        next_data_batch = next(data_iter)

        while not end_of_batch:
            data_batch = next_data_batch
            if monitor is not None:
                monitor.tic()

            self.forward_backward(data_batch)
            self.update()

            if isinstance(data_batch, list):
                self.update_metric(eval_metric, [db.label for db in data_batch], pre_sliced=True)
            else:
                self.update_metric(eval_metric, data_batch.label)

            try:
                # pre fetch next batch
                next_data_batch = next(data_iter)
                self.prepare(next_data_batch, sparse_row_id_fn=sparse_row_id_fn)
            except StopIteration:
                end_of_batch = True

            if monitor is not None:
                monitor.toc_print()

            # Capture the metric values before the batch-end callbacks run.
            if end_of_batch:
                eval_name_vals = eval_metric.get_name_value()

            if batch_end_callback is not None:
                # Callbacks receive this frame's local variables via locals().
                batch_end_params = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch,
                                                          eval_metric=eval_metric, locals=locals())
                for callback in mx.base._as_list(batch_end_callback):
                    callback(batch_end_params)

            nbatch += 1
        for name, val in eval_name_vals:
            self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val)
        toc = time.time()
        self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic))

        # sync aux params across devices
        arg_params, aux_params = self.get_params()
        self.set_params(arg_params, aux_params)

        if epoch_end_callback is not None:
            for callback in mx.base._as_list(epoch_end_callback):
                callback(epoch, self.symbol, arg_params, aux_params)

        # ----------------------------------------
        # evaluation on validation set
        if eval_data:
            res = self.score(eval_data, validation_metric,
                             score_end_callback=eval_end_callback,
                             batch_end_callback=eval_batch_end_callback, epoch=epoch)
            for name, val in res:
                self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val)
+ + sparse_row_id_fn : A callback function + The function takes `data_batch` as an input and returns a dict of + str -> NDArray. The resulting dict is used for pulling row_sparse + parameters from the kvstore, where the str key is the name of the param, + and the value is the row id of the param to pull. + """ + super(SVRGModule, self).prepare(data_batch, sparse_row_id_fn=sparse_row_id_fn) + self._mod_aux.prepare(data_batch, sparse_row_id_fn=sparse_row_id_fn) diff --git a/python/mxnet/gluon/__init__.py b/python/mxnet/gluon/__init__.py index f43da1dae738..3de1f5f0ad0b 100644 --- a/python/mxnet/gluon/__init__.py +++ b/python/mxnet/gluon/__init__.py @@ -19,8 +19,6 @@ # pylint: disable=wildcard-import """Neural network module.""" -from . import metric - from .parameter import * from .block import * diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 921d54d71d4a..6e735316cae2 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -30,8 +30,7 @@ import numpy as np from ..base import mx_real_t, MXNetError, NDArrayHandle, py_str -from .. import symbol, ndarray, initializer, autograd, _deferred_compute as dc -from ..symbol.numpy import _symbol as np_symbol +from .. import symbol, ndarray, initializer, np_symbol, autograd, _deferred_compute as dc from ..symbol import Symbol from ..ndarray import NDArray from .. import name as _name diff --git a/python/mxnet/gluon/contrib/data/text.py b/python/mxnet/gluon/contrib/data/text.py index 916b41880d45..0536ac585484 100644 --- a/python/mxnet/gluon/contrib/data/text.py +++ b/python/mxnet/gluon/contrib/data/text.py @@ -29,7 +29,7 @@ from ...data import dataset from ...utils import download, check_sha1, _get_repo_file_url from ....contrib import text -from .... import ndarray as nd, base +from .... 
import nd, base class _LanguageModelDataset(dataset._DownloadedDataset): # pylint: disable=abstract-method def __init__(self, root, namespace, vocabulary): diff --git a/python/mxnet/gluon/contrib/data/vision/dataloader.py b/python/mxnet/gluon/contrib/data/vision/dataloader.py index 3213398b2214..0c71d90453d8 100644 --- a/python/mxnet/gluon/contrib/data/vision/dataloader.py +++ b/python/mxnet/gluon/contrib/data/vision/dataloader.py @@ -21,9 +21,9 @@ import logging import numpy as np -from ..... import ndarray as nd +from ..... import nd from .....util import is_np_array -from ..... import numpy as _mx_np # pylint: disable=reimported +from ..... import np as _mx_np # pylint: disable=reimported from ....nn import HybridSequential, Sequential, HybridBlock, Block from ....data.vision import transforms from ....data import DataLoader diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py index 65a18aaf80cd..1629c212957f 100644 --- a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py +++ b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py @@ -23,7 +23,7 @@ from .......base import numeric_types from ......block import Block from .......util import is_np_array -from ....... import ndarray as nd, numpy_extension as npx, numpy as np +from ....... 
import nd, npx, np from .utils import _check_bbox_shape, bbox_crop, bbox_translate from .utils import bbox_resize, bbox_random_crop_with_constraints diff --git a/python/mxnet/gluon/contrib/estimator/estimator.py b/python/mxnet/gluon/contrib/estimator/estimator.py index c47e02b7213f..ed8a53d7c3a6 100644 --- a/python/mxnet/gluon/contrib/estimator/estimator.py +++ b/python/mxnet/gluon/contrib/estimator/estimator.py @@ -33,7 +33,7 @@ from ...trainer import Trainer from ...utils import split_and_load from ....context import Context, cpu, gpu, num_gpus -from ...metric import Loss as metric_loss +from ....metric import Loss as metric_loss from .batch_processor import BatchProcessor __all__ = ['Estimator'] diff --git a/python/mxnet/gluon/contrib/estimator/event_handler.py b/python/mxnet/gluon/contrib/estimator/event_handler.py index 5709a803a610..338c7f00e05e 100644 --- a/python/mxnet/gluon/contrib/estimator/event_handler.py +++ b/python/mxnet/gluon/contrib/estimator/event_handler.py @@ -25,8 +25,8 @@ import numpy as np -from ...metric import CompositeEvalMetric, EvalMetric -from ...metric import Loss as metric_loss +from ....metric import CompositeEvalMetric, EvalMetric +from ....metric import Loss as metric_loss from .utils import _check_metrics __all__ = ['TrainBegin', 'TrainEnd', 'EpochBegin', 'EpochEnd', 'BatchBegin', 'BatchEnd', diff --git a/python/mxnet/gluon/contrib/estimator/utils.py b/python/mxnet/gluon/contrib/estimator/utils.py index dc0c4bf8f081..d9126a2f6763 100644 --- a/python/mxnet/gluon/contrib/estimator/utils.py +++ b/python/mxnet/gluon/contrib/estimator/utils.py @@ -20,7 +20,7 @@ """Gluon Estimator Utility Functions""" from ...loss import SoftmaxCrossEntropyLoss -from ...metric import Accuracy, EvalMetric, CompositeEvalMetric +from ....metric import Accuracy, EvalMetric, CompositeEvalMetric def _check_metrics(metrics): if isinstance(metrics, CompositeEvalMetric): @@ -31,7 +31,7 @@ def _check_metrics(metrics): metrics = metrics or [] if not 
all([isinstance(metric, EvalMetric) for metric in metrics]): raise ValueError("metrics must be a Metric or a list of Metric, " - "refer to mxnet.gluon.metric.EvalMetric: {}".format(metrics)) + "refer to mxnet.metric.EvalMetric: {}".format(metrics)) return metrics def _check_handler_metric_ref(handler, known_metrics): diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py index 4335c5cd3431..945867b909c8 100644 --- a/python/mxnet/gluon/contrib/nn/basic_layers.py +++ b/python/mxnet/gluon/contrib/nn/basic_layers.py @@ -24,8 +24,8 @@ 'PixelShuffle3D'] import warnings -from .... import ndarray as nd, context -from ...block import HybridBlock +from .... import nd, context +from ...block import HybridBlock, Block from ...nn import Sequential, HybridSequential, BatchNorm class Concurrent(Sequential): diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index c51981678367..d991bc769ac9 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -39,7 +39,7 @@ from . import sampler as _sampler from . import batchify as _batchify -from ... import ndarray as nd, context +from ... import nd, context from ...util import is_np_shape, is_np_array, set_np from ... import numpy as _mx_np # pylint: disable=reimported diff --git a/python/mxnet/gluon/data/vision/datasets.py b/python/mxnet/gluon/data/vision/datasets.py index 028d846c6bee..c88648cbb73e 100644 --- a/python/mxnet/gluon/data/vision/datasets.py +++ b/python/mxnet/gluon/data/vision/datasets.py @@ -30,7 +30,7 @@ from .. import dataset from ...utils import download, check_sha1, _get_repo_file_url -from .... import ndarray as nd, image, recordio, base +from .... import nd, image, recordio, base from .... 
import numpy as _mx_np # pylint: disable=reimported from ....util import is_np_array, default_array from ....base import numeric_types diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 8e364532a2f7..a424a32837c3 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -27,7 +27,7 @@ from .activations import Activation from ..block import Block, HybridBlock from ..utils import _indent -from ... import ndarray as nd, symbol as sym +from ... import nd, sym from ...util import is_np_array diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/metric.py similarity index 66% rename from python/mxnet/gluon/metric.py rename to python/mxnet/metric.py index 5b081ceac4d8..eb8f99a66d48 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/metric.py @@ -22,12 +22,11 @@ import math from collections import OrderedDict -from .. import numpy -from ..util import use_np +import numpy -from ..base import numeric_types, string_types -from .. import ndarray -from .. import registry +from .base import numeric_types, string_types +from . import ndarray +from . import registry def check_label_shapes(labels, preds, wrap=False, shape=False): @@ -90,6 +89,7 @@ def __init__(self, name, output_names=None, self.name = str(name) self.output_names = output_names self.label_names = label_names + self._has_global_stats = kwargs.pop("has_global_stats", False) self._kwargs = kwargs self.reset() @@ -148,6 +148,13 @@ def reset(self): """Resets the internal evaluation result to initial state.""" self.num_inst = 0 self.sum_metric = 0.0 + self.global_num_inst = 0 + self.global_sum_metric = 0.0 + + def reset_local(self): + """Resets the local portion of the internal evaluation results to initial state.""" + self.num_inst = 0 + self.sum_metric = 0.0 def get(self): """Gets the current evaluation result. 
def get_global_name_value(self):
    """Returns zipped name and value pairs for global results.

    When the metric does not track global statistics, the result of
    ``get_name_value()`` is returned instead.

    Returns
    -------
    list of tuples
        A (name, value) tuple list.
    """
    if not self._has_global_stats:
        return self.get_name_value()

    names, values = self.get_global()
    # A single metric reports scalars; wrap them so zip() pairs them up.
    names = names if isinstance(names, list) else [names]
    values = values if isinstance(values, list) else [values]
    return list(zip(names, values))
def get_global(self):
    """Returns the current global evaluation result of every child metric.

    Returns
    -------
    names : list of str
        Name of the metrics.
    values : list of float
        Value of the evaluations.
    """
    all_names, all_values = [], []
    for child in self.metrics:
        child_name, child_value = child.get_global()
        # A child may report a single name/value or a list of them;
        # scalars are wrapped so both cases can be extended uniformly.
        all_names.extend([child_name] if isinstance(child_name, string_types) else child_name)
        all_values.extend([child_value] if isinstance(child_value, numeric_types) else child_value)
    return (all_names, all_values)
+ pred_label = pred_label.flat check_label_shapes(label, pred_label) - num_correct = (pred_label == label).sum().astype('float64') + num_correct = (pred_label == label).sum() self.sum_metric += num_correct + self.global_sum_metric += num_correct self.num_inst += len(pred_label) + self.global_num_inst += len(pred_label) @register @alias('top_k_accuracy', 'top_k_acc') -@use_np class TopKAccuracy(EvalMetric): """Computes top k predictions accuracy. @@ -468,7 +535,7 @@ class TopKAccuracy(EvalMetric): >>> top_k = 3 >>> labels = [mx.nd.array([2, 6, 9, 2, 3, 4, 7, 8, 9, 6])] >>> predicts = [mx.nd.array(np.random.rand(10, 10))] - >>> acc = mx.gluon.metric.TopKAccuracy(top_k=top_k) + >>> acc = mx.metric.TopKAccuracy(top_k=top_k) >>> acc.update(labels, predicts) >>> print acc.get() ('top_k_accuracy', 0.3) @@ -478,7 +545,8 @@ def __init__(self, top_k=1, name='top_k_accuracy', output_names=None, label_names=None): super(TopKAccuracy, self).__init__( name, top_k=top_k, - output_names=output_names, label_names=label_names) + output_names=output_names, label_names=label_names, + has_global_stats=True) self.top_k = top_k assert(self.top_k > 1), 'Please use Accuracy if top_k is no more than 1' self.name += '_%d' % self.top_k @@ -502,89 +570,43 @@ def update(self, labels, preds): # we do not care about the order of top k elements. It is # much faster, which is important since that computation is # single-threaded due to Python GIL. 
- pred_label = pred_label.as_np_ndarray().as_in_ctx(label.ctx).astype('float32') - pred_label = numpy.argpartition(pred_label, -self.top_k) - label = label.as_np_ndarray().astype('int32') + pred_label = numpy.argpartition(pred_label.asnumpy().astype('float32'), -self.top_k) + label = label.asnumpy().astype('int32') check_label_shapes(label, pred_label) num_samples = pred_label.shape[0] num_dims = len(pred_label.shape) if num_dims == 1: - num_correct = (pred_label.reshape(-1) == label.reshape(-1)).sum() - self.sum_metric += num_correct.astype('float64') + self.sum_metric += (pred_label.flat == label.flat).sum() elif num_dims == 2: num_classes = pred_label.shape[1] top_k = min(num_classes, self.top_k) for j in range(top_k): - num_correct = (pred_label[:, num_classes - 1 - j].reshape(-1) == label.reshape(-1)).sum() - self.sum_metric += num_correct.astype('float64') + num_correct = (pred_label[:, num_classes - 1 - j].flat == label.flat).sum() + self.sum_metric += num_correct + self.global_sum_metric += num_correct self.num_inst += num_samples + self.global_num_inst += num_samples -def predict_with_threshold(pred, threshold=0.5): - """Do thresholding of predictions in binary and multilabel cases. - - Parameters - ---------- - preds : ndarray - predictions in shape of (batch_size, ...) or (batch_size, ..., num_categories) - - preds : float or ndarray - threshold(s) in shape of float or (num_categories) - """ - if isinstance(threshold, float): - return pred > threshold - elif isinstance(threshold, (numpy.ndarray, ndarray.ndarray.NDArray)): - num_classes = pred.shape[-1] - assert threshold.shape[-1] == num_classes, \ - "shape mismatch: %s vs. 
%s"%(pred.shape[-1], threshold.shape[-1]) - return pred > threshold - else: - raise ValueError("{} is a wrong type for threshold!".format(type(threshold))) - - -def one_hot(idx, num): - return (numpy.arange(num).astype(idx) == idx[:, None]).astype('int32') - - -@use_np -class _ClassificationMetrics(object): +class _BinaryClassificationMetrics(object): """Private container class for classification metric statistics. True/false positive and true/false negative counts are sufficient statistics for various classification metrics. This class provides the machinery to track those statistics across mini-batches of (label, prediction) pairs. - - Parameters - ---------- - class_type : str, default "binary" - "binary": f1 for binary classification. - "multiclass": f1 for multiclassification problem. - "multilabel": f1 for multilabel classification. - beta : float, default 1 - weight of precision in harmonic mean. - threshold : float, default 0.5 - threshold for deciding whether the predictions are positive or negative. 
- """ - def __init__(self, class_type="binary", threshold=0.5, beta=1): - self.class_type = class_type - self.threshold = threshold - self.beta = beta - self.reset_stats() - - def _set(self, num, ctx): - if self.num_classes is None: - self.num_classes = num - self.true_positives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) - self.false_negatives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) - self.false_positives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) - self.true_negatives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) - else: - assert self.num_classes == num, \ - "Input number of classes has changed from {} to {}".format(self.num_classes, num) - - def update_stats(self, label, pred): + def __init__(self): + self.true_positives = 0 + self.false_negatives = 0 + self.false_positives = 0 + self.true_negatives = 0 + self.global_true_positives = 0 + self.global_false_negatives = 0 + self.global_false_positives = 0 + self.global_true_negatives = 0 + + def update_binary_stats(self, label, pred): """Update various binary classification counts for a single (label, pred) pair. Parameters @@ -595,107 +617,92 @@ def update_stats(self, label, pred): pred : `NDArray` Predicted values. """ - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) - label = label.as_np_ndarray().astype('int32') - if self.class_type == "binary": - self._set(1, label.ctx) - if label.max() > 1: - raise ValueError("Wrong label for binary classification.") - if pred.shape == label.shape: - pass - elif pred.shape[-1] > 2: - raise ValueError("The shape of prediction {} is wrong for binary classification.".format(pred.shape)) - elif pred.shape[-1] == 2: - pred = pred.reshape(-1, 2)[:, 1] - pred_label = predict_with_threshold(pred, self.threshold).reshape(-1) - label = label.reshape(-1) - - elif self.class_type == "multiclass": - num = pred.shape[-1] - self._set(num, label.ctx) - assert label.max() < num, "pred contains fewer classes than label!" 
- pred_label = one_hot(pred.argmax(axis=-1).reshape(-1), num) - label = one_hot(label.reshape(-1), num) - - elif self.class_type == "multilabel": - num = pred.shape[-1] - self._set(num, label.ctx) - assert pred.shape == label.shape, \ - "The shape of label should be same as that of prediction for multilabel classification." - pred_label = predict_with_threshold(pred, self.threshold).reshape(-1, num) - label = label.reshape(-1, num) - else: - raise ValueError( - "Wrong class_type {}! Only supports ['binary', 'multiclass', 'multilabel']".format(self.class_type)) - - check_label_shapes(label, pred_label) - + pred = pred.asnumpy() + label = label.asnumpy().astype('int32') + pred_label = numpy.argmax(pred, axis=1) + + check_label_shapes(label, pred) + if len(numpy.unique(label)) > 2: + raise ValueError("%s currently only supports binary classification." + % self.__class__.__name__) pred_true = (pred_label == 1) - pred_false = (pred_label == 0) + pred_false = 1 - pred_true label_true = (label == 1) - label_false = (label == 0) + label_false = 1 - label_true - true_pos = (pred_true * label_true).sum(0) - false_pos = (pred_true * label_false).sum(0) - false_neg = (pred_false * label_true).sum(0) - true_neg = (pred_false * label_false).sum(0) + true_pos = (pred_true * label_true).sum() + false_pos = (pred_true * label_false).sum() + false_neg = (pred_false * label_true).sum() + true_neg = (pred_false * label_false).sum() self.true_positives += true_pos + self.global_true_positives += true_pos self.false_positives += false_pos + self.global_false_positives += false_pos self.false_negatives += false_neg + self.global_false_negatives += false_neg self.true_negatives += true_neg + self.global_true_negatives += true_neg @property def precision(self): - if self.num_classes is not None: - return self.true_positives / numpy.maximum(self.true_positives + self.false_positives, 1e-12) + if self.true_positives + self.false_positives > 0: + return float(self.true_positives) / 
@property
def fscore(self):
    """F1 score computed from ``self.precision`` and ``self.recall``.

    Returns ``0.`` when precision and recall do not sum to a positive value.
    """
    precision = self.precision
    recall = self.recall
    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall)
    return 0.
- def binary_matthewscc(self): + def matthewscc(self, use_global=False): """Calculate the Matthew's Correlation Coefficent""" - if not self.total_examples: - return 0. + if use_global: + if not self.global_total_examples: + return 0. + + true_pos = float(self.global_true_positives) + false_pos = float(self.global_false_positives) + false_neg = float(self.global_false_negatives) + true_neg = float(self.global_true_negatives) + else: + if not self.total_examples: + return 0. - true_pos = float(self.true_positives) - false_pos = float(self.false_positives) - false_neg = float(self.false_negatives) - true_neg = float(self.true_negatives) + true_pos = float(self.true_positives) + false_pos = float(self.false_positives) + false_neg = float(self.false_negatives) + true_neg = float(self.true_negatives) terms = [(true_pos + false_pos), (true_pos + false_neg), @@ -708,21 +715,32 @@ def binary_matthewscc(self): @property def total_examples(self): - if self.num_classes is None: - return 0 - return int(self.false_negatives[0] + self.false_positives[0] + \ - self.true_negatives[0] + self.true_positives[0]) + return self.false_negatives + self.false_positives + \ + self.true_negatives + self.true_positives + + @property + def global_total_examples(self): + return self.global_false_negatives + self.global_false_positives + \ + self.global_true_negatives + self.global_true_positives + + def local_reset_stats(self): + self.false_positives = 0 + self.false_negatives = 0 + self.true_positives = 0 + self.true_negatives = 0 def reset_stats(self): - self.num_classes = None - self.true_positives = None - self.false_negatives = None - self.false_positives = None - self.true_negatives = None + self.false_positives = 0 + self.false_negatives = 0 + self.true_positives = 0 + self.true_negatives = 0 + self.global_false_positives = 0 + self.global_false_negatives = 0 + self.global_true_positives = 0 + self.global_true_negatives = 0 @register -@use_np class F1(EvalMetric): """Computes the F1 
score of a binary classification problem. @@ -750,34 +768,28 @@ class F1(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - class_type : str, default "binary" - "binary": f1 for binary classification. - "multiclass": f1 for multiclassification problem. - "multilabel": f1 for multilabel classification. - threshold : float, default 0.5 - threshold for postive confidence value. - average : str, default 'micro' + average : str, default 'macro' Strategy to be used for aggregating across mini-batches. - "macro": Calculate metrics for each label and return unweighted mean of f1. - "micro": Calculate metrics globally by counting the total TP, FN and FP. - None: Return f1 scores for each class (numpy.ndarray) . + "macro": average the F1 scores for each batch. + "micro": compute a single F1 score across all batches. Examples -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0., 1., 1.])] - >>> f1 = mx.gluon.metric.F1() + >>> f1 = mx.metric.F1() >>> f1.update(preds = predicts, labels = labels) >>> print f1.get() ('f1', 0.8) """ def __init__(self, name='f1', - output_names=None, label_names=None, class_type="binary", threshold=0.5, average="micro"): + output_names=None, label_names=None, average="macro"): self.average = average - self.metrics = _ClassificationMetrics(class_type=class_type, threshold=threshold) + self.metrics = _BinaryClassificationMetrics() EvalMetric.__init__(self, name=name, - output_names=output_names, label_names=label_names) + output_names=output_names, label_names=label_names, + has_global_stats=True) def update(self, labels, preds): """Updates the internal evaluation result. 
@@ -793,149 +805,36 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - self.metrics.update_stats(label, pred) + self.metrics.update_binary_stats(label, pred) - if self.average == "micro": - self.sum_metric = self.metrics.micro_fscore * self.metrics.total_examples - elif self.average == "macro": - self.sum_metric = self.metrics.fscore.mean() * self.metrics.total_examples + if self.average == "macro": + self.sum_metric += self.metrics.fscore + self.global_sum_metric += self.metrics.global_fscore + self.num_inst += 1 + self.global_num_inst += 1 + self.metrics.reset_stats() else: self.sum_metric = self.metrics.fscore * self.metrics.total_examples - self.num_inst = self.metrics.total_examples + self.global_sum_metric = self.metrics.global_fscore * self.metrics.global_total_examples + self.num_inst = self.metrics.total_examples + self.global_num_inst = self.metrics.global_total_examples def reset(self): """Resets the internal evaluation result to initial state.""" self.sum_metric = 0. self.num_inst = 0 + self.global_num_inst = 0 + self.global_sum_metric = 0.0 self.metrics.reset_stats() - -@register -@use_np -class Fbeta(F1): - """Computes the Fbeta score of a binary classification problem. - - The Fbeta score is equivalent to harmonic mean of the precision and recall, - where the best value is 1.0 and the worst value is 0.0. The formula for Fbeta score is:: - - Fbeta = (1 + beta ** 2) * (precision * recall) / (beta ** 2 * precision + recall) - - The formula for precision and recall is:: - - precision = true_positives / (true_positives + false_positives) - recall = true_positives / (true_positives + false_negatives) - - .. note:: - - This Fbeta score only supports binary classification. - - Parameters - ---------- - name : str - Name of this metric instance for display. - output_names : list of str, or None - Name of predictions that should be used when updating with update_dict. 
- By default include all predictions. - label_names : list of str, or None - Name of labels that should be used when updating with update_dict. - By default include all labels. - class_type : str, default "binary" - "binary": f1 for binary classification. - "multiclass": f1 for multiclassification problem. - "multilabel": f1 for multilabel classification. - beta : float, default 1 - weight of precision in harmonic mean. - threshold : float, default 0.5 - threshold for postive confidence value. - average : str, default 'micro' - Strategy to be used for aggregating across mini-batches. - "macro": Calculate metrics for each label and return unweighted mean of f1. - "micro": Calculate metrics globally by counting the total TP, FN and FP. - None: Return f1 scores for each class. - - Examples - -------- - >>> predicts = [mx.nd.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] - >>> labels = [mx.nd.array([0., 1., 1.])] - >>> fbeta = mx.gluon.metric.Fbeta(beta=2) - >>> fbeta.update(preds = predicts, labels = labels) - >>> print fbeta.get() - ('fbeta', 0.9090909090909091) - """ - - def __init__(self, name='fbeta', - output_names=None, label_names=None, class_type="binary", beta=1, threshold=0.5, average="micro"): - super(Fbeta, self).__init__( - name=name, output_names=output_names, label_names=label_names, - class_type=class_type, threshold=threshold, average=average) - self.metrics = _ClassificationMetrics(class_type=class_type, threshold=threshold, beta=beta) - - -@register -@use_np -class BinaryAccuracy(EvalMetric): - """Computes the accuracy of a binary or multilabel classification problem. - - Parameters - ---------- - name : str - Name of this metric instance for display. - output_names : list of str, or None - Name of predictions that should be used when updating with update_dict. - By default include all predictions. - label_names : list of str, or None - Name of labels that should be used when updating with update_dict. - By default include all labels. 
- threshold : float or ndarray, default 0.5 - threshold for deciding whether the predictions are positive or negative. - - Examples - -------- - >>> predicts = [mx.nd.array([0.7, 1, 0.55])] - >>> labels = [mx.nd.array([0., 1., 0.])] - >>> bacc = mx.gluon.metric.BinaryAccuracy(threshold=0.6) - >>> bacc.update(preds = predicts, labels = labels) - >>> print bacc.get() - ('binary_accuracy', 0.6666666666666666) - """ - - def __init__(self, name='binary_accuracy', - output_names=None, label_names=None, threshold=0.5): - self.threshold = threshold - EvalMetric.__init__(self, name=name, - output_names=output_names, label_names=label_names) - - def update(self, labels, preds): - """Updates the internal evaluation result. - - Parameters - ---------- - labels : list of `NDArray` - Each label denotes positive/negative for each class. - - preds : list of `NDArray` - Each prediction value is a confidence value of being positive for each class. - """ - labels, preds = check_label_shapes(labels, preds, True) - - for label, pred_label in zip(labels, preds): - pred_label = predict_with_threshold(pred_label, self.threshold) - - pred_label = pred_label.as_np_ndarray().astype('int32').as_in_ctx(label.ctx) - label = label.as_np_ndarray().astype('int32') - # flatten before checking shapes to avoid shape miss match - label = label.reshape(-1) - pred_label = pred_label.reshape(-1) - - check_label_shapes(label, pred_label) - - num_correct = (pred_label == label).sum().astype('float64') - self.sum_metric += num_correct - self.num_inst += len(pred_label) + def reset_local(self): + """Resets the internal evaluation result to initial state.""" + self.sum_metric = 0. + self.num_inst = 0 + self.metrics.local_reset_stats() @register -@use_np class MCC(EvalMetric): """Computes the Matthews Correlation Coefficient of a binary classification problem. @@ -966,6 +865,10 @@ class MCC(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. 
By default include all labels. + average : str, default 'macro' + Strategy to be used for aggregating across mini-batches. + "macro": average the MCC for each batch. + "micro": compute a single MCC across all batches. Examples -------- @@ -984,9 +887,9 @@ class MCC(EvalMetric): [0.]*(false_positives + true_negatives) + [1.]*(false_negatives + true_positives) )] - >>> f1 = mx.gluon.metric.F1() + >>> f1 = mx.metric.F1() >>> f1.update(preds = predicts, labels = labels) - >>> mcc = mx.gluon.metric.MCC() + >>> mcc = mx.metric.MCC() >>> mcc.update(preds = predicts, labels = labels) >>> print f1.get() ('f1', 0.95233560306652054) @@ -995,10 +898,12 @@ class MCC(EvalMetric): """ def __init__(self, name='mcc', - output_names=None, label_names=None): - self._metrics = _ClassificationMetrics() + output_names=None, label_names=None, average="macro"): + self._average = average + self._metrics = _BinaryClassificationMetrics() EvalMetric.__init__(self, name=name, - output_names=output_names, label_names=label_names) + output_names=output_names, label_names=label_names, + has_global_stats=True) def update(self, labels, preds): """Updates the internal evaluation result. 
@@ -1014,35 +919,72 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - self._metrics.update_stats(label, pred) + self._metrics.update_binary_stats(label, pred) - self.sum_metric = self._metrics.binary_matthewscc() * self._metrics.total_examples - self.num_inst = self._metrics.total_examples + if self._average == "macro": + self.sum_metric += self._metrics.matthewscc() + self.global_sum_metric += self._metrics.matthewscc(use_global=True) + self.num_inst += 1 + self.global_num_inst += 1 + self._metrics.reset_stats() + else: + self.sum_metric = self._metrics.matthewscc() * self._metrics.total_examples + self.global_sum_metric = self._metrics.matthewscc(use_global=True) * \ + self._metrics.global_total_examples + self.num_inst = self._metrics.total_examples + self.global_num_inst = self._metrics.global_total_examples def reset(self): """Resets the internal evaluation result to initial state.""" self.sum_metric = 0. self.num_inst = 0. + self.global_sum_metric = 0. + self.global_num_inst = 0. self._metrics.reset_stats() - -#################### -# REGRESSION METRICS -#################### + def reset_local(self): + """Resets the internal evaluation result to initial state.""" + self.sum_metric = 0. + self.num_inst = 0. + self._metrics.local_reset_stats() @register -@use_np -class MAE(EvalMetric): - """Computes Mean Absolute Error (MAE) loss. +class Perplexity(EvalMetric): + """Computes perplexity. - The mean absolute error is given by + Perplexity is a measurement of how well a probability distribution + or model predicts a sample. A low perplexity indicates the model + is good at predicting the sample. + + The perplexity of a model q is defined as .. math:: - \\frac{\\sum_i^n |y_i - \\hat{y}_i|}{n} + b^{\\big(-\\frac{1}{N} \\sum_{i=1}^N \\log_b q(x_i) \\big)} + = \\exp \\big(-\\frac{1}{N} \\sum_{i=1}^N \\log q(x_i)\\big) + + where we let `b = e`. 
+ + :math:`q(x_i)` is the predicted value of its ground truth + label on sample :math:`x_i`. + + For example, we have three samples :math:`x_1, x_2, x_3` and their labels + are :math:`[0, 1, 1]`. + Suppose our model predicts :math:`q(x_1) = p(y_1 = 0 | x_1) = 0.3` + and :math:`q(x_2) = 1.0`, + :math:`q(x_3) = 0.6`. The perplexity of model q is + :math:`exp\\big(-(\\log 0.3 + \\log 1.0 + \\log 0.6) / 3\\big) = 1.77109762852`. Parameters ---------- + ignore_label : int or None + Index of invalid label to ignore when + counting. By default, sets to -1. + If set to `None`, it will include all entries. + axis : int (default -1) + The axis from prediction that was used to + compute softmax. By default use the last + axis. name : str Name of this metric instance for display. output_names : list of str, or None @@ -1054,18 +996,21 @@ class MAE(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([3, -0.5, 2, 7])] - >>> labels = [mx.nd.array([2.5, 0.0, 2, 8])] - >>> mean_absolute_error = mx.gluon.metric.MAE() - >>> mean_absolute_error.update(labels = labels, preds = predicts) - >>> print mean_absolute_error.get() - ('mae', 0.5) + >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] + >>> labels = [mx.nd.array([0, 1, 1])] + >>> perp = mx.metric.Perplexity(ignore_label=None) + >>> perp.update(labels, predicts) + >>> print perp.get() + ('Perplexity', 1.7710976285155853) """ - - def __init__(self, name='mae', + def __init__(self, ignore_label, axis=-1, name='perplexity', output_names=None, label_names=None): - super(MAE, self).__init__( - name, output_names=output_names, label_names=label_names) + super(Perplexity, self).__init__( + name, ignore_label=ignore_label, + output_names=output_names, label_names=label_names, + has_global_stats=True) + self.ignore_label = ignore_label + self.axis = axis def update(self, labels, preds): """Updates the internal evaluation result. 
@@ -1078,28 +1023,64 @@ def update(self, labels, preds): preds : list of `NDArray` Predicted values. """ - labels, preds = check_label_shapes(labels, preds, True) - + assert len(labels) == len(preds) + loss = 0. + num = 0 for label, pred in zip(labels, preds): - label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + assert label.size == pred.size/pred.shape[-1], \ + "shape mismatch: %s vs. %s"%(label.shape, pred.shape) + label = label.as_in_context(pred.context).reshape((label.size,)) + pred = ndarray.pick(pred, label.astype(dtype='int32'), axis=self.axis) + if self.ignore_label is not None: + ignore = (label == self.ignore_label).astype(pred.dtype) + num -= ndarray.sum(ignore).asscalar() + pred = pred*(1-ignore) + ignore + loss -= ndarray.sum(ndarray.log(ndarray.maximum(1e-10, pred))).asscalar() + num += pred.size + self.sum_metric += loss + self.global_sum_metric += loss + self.num_inst += num + self.global_num_inst += num - num_inst = label.shape[0] - mae = numpy.abs(label - pred).reshape(num_inst, -1).mean(axis=-1).sum() + def get(self): + """Returns the current evaluation result. - self.sum_metric += mae - self.num_inst += num_inst + Returns + ------- + Tuple of (str, float) + Representing name of the metric and evaluation result. + """ + if self.num_inst == 0: + return (self.name, float('nan')) + else: + return (self.name, math.exp(self.sum_metric/self.num_inst)) + + def get_global(self): + """Returns the current global evaluation result. + + Returns + ------- + Tuple of (str, float) + Representing name of the metric and evaluation result. + """ + if self.global_num_inst == 0: + return (self.name, float('nan')) + else: + return (self.name, math.exp(self.global_sum_metric/self.global_num_inst)) + +#################### +# REGRESSION METRICS +#################### @register -@use_np -class MSE(EvalMetric): - """Computes Mean Squared Error (MSE) loss. +class MAE(EvalMetric): + """Computes Mean Absolute Error (MAE) loss. 
- The mean squared error is given by + The mean absolute error is given by .. math:: - \\frac{\\sum_i^n (y_i - \\hat{y}_i)^2}{n} + \\frac{\\sum_i^n |y_i - \\hat{y}_i|}{n} Parameters ---------- @@ -1114,17 +1095,19 @@ class MSE(EvalMetric): Examples -------- - >>> predicts = [mx.nd.array([3, -0.5, 2, 7])] - >>> labels = [mx.nd.array([2.5, 0.0, 2, 8])] - >>> mean_squared_error = mx.gluon.metric.MSE() - >>> mean_squared_error.update(labels = labels, preds = predicts) - >>> print mean_squared_error.get() - ('mse', 0.375) + >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] + >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] + >>> mean_absolute_error = mx.metric.MAE() + >>> mean_absolute_error.update(labels = labels, preds = predicts) + >>> print mean_absolute_error.get() + ('mae', 0.5) """ - def __init__(self, name='mse', + + def __init__(self, name='mae', output_names=None, label_names=None): - super(MSE, self).__init__( - name, output_names=output_names, label_names=label_names) + super(MAE, self).__init__( + name, output_names=output_names, label_names=label_names, + has_global_stats=True) def update(self, labels, preds): """Updates the internal evaluation result. 
@@ -1140,25 +1123,29 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + label = label.asnumpy() + pred = pred.asnumpy() - num_inst = label.shape[0] - mse = ((label - pred)**2.0).reshape(num_inst, -1).mean(axis=-1).sum() + if len(label.shape) == 1: + label = label.reshape(label.shape[0], 1) + if len(pred.shape) == 1: + pred = pred.reshape(pred.shape[0], 1) - self.sum_metric += mse - self.num_inst += num_inst + mae = numpy.abs(label - pred).mean() + self.sum_metric += mae + self.global_sum_metric += mae + self.num_inst += 1 # numpy.prod(label.shape) + self.global_num_inst += 1 # numpy.prod(label.shape) @register -@use_np -class RMSE(MSE): - """Computes Root Mean Squred Error (RMSE) loss. +class MSE(EvalMetric): + """Computes Mean Squared Error (MSE) loss. - The root mean squared error is given by + The mean squared error is given by .. 
math:: - \\sqrt{\\frac{\\sum_i^n (y_i - \\hat{y}_i)^2}{n}} + \\frac{\\sum_i^n (y_i - \\hat{y}_i)^2}{n} Parameters ---------- @@ -1173,62 +1160,18 @@ class RMSE(MSE): Examples -------- - >>> predicts = [mx.nd.array([3, -0.5, 2, 7])] - >>> labels = [mx.nd.array([2.5, 0.0, 2, 8])] - >>> root_mean_squared_error = mx.gluon.metric.RMSE() - >>> root_mean_squared_error.update(labels = labels, preds = predicts) - >>> print root_mean_squared_error.get() - ('rmse', 0.612372457981) + >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] + >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] + >>> mean_squared_error = mx.metric.MSE() + >>> mean_squared_error.update(labels = labels, preds = predicts) + >>> print mean_squared_error.get() + ('mse', 0.375) """ - def __init__(self, name='rmse', + def __init__(self, name='mse', output_names=None, label_names=None): - super(RMSE, self).__init__( - name, output_names=output_names, label_names=label_names) - - def get(self): - if self.num_inst == 0: - return (self.name, float('nan')) - else: - return (self.name, math.sqrt(self.sum_metric / self.num_inst)) - - -@register -@use_np -class MeanPairwiseDistance(EvalMetric): - """Computes Mean Pairwise Distance. - - The mean pairwise distance is given by - - .. math:: - \\sqrt{\\frac{(\\sum_i^n (y_i - \\hat{y}_i)^p)^\\frac{1}{p}}{n}} - - Parameters - ---------- - name : str - Name of this metric instance for display. - output_names : list of str, or None - Name of predictions that should be used when updating with update_dict. - By default include all predictions. - label_names : list of str, or None - Name of labels that should be used when updating with update_dict. - By default include all labels. 
- p : float, default 2 - calculating distance using the p-norm - - Examples - -------- - >>> predicts = [mx.nd.array([[1., 2.], [3., 4.]])] - >>> labels = [mx.nd.array([[1., 0.], [4., 2.]])] - >>> mpd = mx.gluon.metric.MeanPairwiseDistance() - >>> mpd.update(labels = labels, preds = predicts) - >>> print mpd.get() - ('mpd', 2.1180338859558105) - """ - def __init__(self, name='mpd', - output_names=None, label_names=None, p=2): - super(MeanPairwiseDistance, self).__init__( - name, output_names=output_names, label_names=label_names) - self.p = p + super(MSE, self).__init__( + name, output_names=output_names, label_names=label_names, + has_global_stats=True) def update(self, labels, preds): """Updates the internal evaluation result. @@ -1244,30 +1187,29 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + label = label.asnumpy() + pred = pred.asnumpy() - label = label.reshape(label.shape[0], -1) - pred = pred.reshape(pred.shape[0], -1) - - dis = (((label - pred) ** self.p).sum(axis=-1)) ** (1./self.p) - dis = dis.sum() - num_inst = label.shape[0] + if len(label.shape) == 1: + label = label.reshape(label.shape[0], 1) + if len(pred.shape) == 1: + pred = pred.reshape(pred.shape[0], 1) - self.sum_metric += dis - self.num_inst += num_inst + mse = ((label - pred)**2.0).mean() + self.sum_metric += mse + self.global_sum_metric += mse + self.num_inst += 1 # numpy.prod(label.shape) + self.global_num_inst += 1 # numpy.prod(label.shape) @register -@use_np -class MeanCosineSimilarity(EvalMetric): - """Computes Mean Cosine Similarity. +class RMSE(EvalMetric): + """Computes Root Mean Squred Error (RMSE) loss. - The mean cosine similarity is given by + The root mean squared error is given by .. 
math:: - cos_sim(label, pred) = \frac{{label}.{pred}}{max(||label||.||pred||, eps)} - (calculating on the last dimension of label and pred.) + \\sqrt{\\frac{\\sum_i^n (y_i - \\hat{y}_i)^2}{n}} Parameters ---------- @@ -1279,23 +1221,21 @@ class MeanCosineSimilarity(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - eps : float, default 1e-8 - small vale to avoid division by zero. Examples -------- - >>> predicts = [mx.nd.array([[1., 0.], [1., 1.]])] - >>> labels = [mx.nd.array([[3., 4.], [2., 2.]])] - >>> mcs = mx.gluon.metric.MeanCosineSimilarity() - >>> mcs.update(labels = labels, preds = predicts) - >>> print mcs.get() - ('cos_sim', 0.8) + >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] + >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] + >>> root_mean_squared_error = mx.metric.RMSE() + >>> root_mean_squared_error.update(labels = labels, preds = predicts) + >>> print root_mean_squared_error.get() + ('rmse', 0.612372457981) """ - def __init__(self, name='cos_sim', - output_names=None, label_names=None, eps=1e-8): - super(MeanCosineSimilarity, self).__init__( - name, output_names=output_names, label_names=label_names) - self.eps = eps + def __init__(self, name='rmse', + output_names=None, label_names=None): + super(RMSE, self).__init__( + name, output_names=output_names, label_names=label_names, + has_global_stats=True) def update(self, labels, preds): """Updates the internal evaluation result. 
@@ -1311,27 +1251,23 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + label = label.asnumpy() + pred = pred.asnumpy() if len(label.shape) == 1: - label = label.reshape(1, label.shape[0]) + label = label.reshape(label.shape[0], 1) if len(pred.shape) == 1: - pred = pred.reshape(1, pred.shape[0]) + pred = pred.reshape(pred.shape[0], 1) - sim = (label * pred).sum(axis=-1) - n_p = numpy.linalg.norm(pred, axis=-1) - n_l = numpy.linalg.norm(label, axis=-1) - sim = sim / numpy.maximum(n_l * n_p, self.eps) - sim = sim.sum() - num_inst = len(label.reshape(-1, label.shape[-1])) # numpy.prod(label.shape[:-1]) is not supported - self.sum_metric += sim - self.num_inst += num_inst + rmse = numpy.sqrt(((label - pred)**2.0).mean()) + self.sum_metric += rmse + self.global_sum_metric += rmse + self.num_inst += 1 + self.global_num_inst += 1 @register @alias('ce') -@use_np class CrossEntropy(EvalMetric): """Computes Cross Entropy loss. @@ -1346,15 +1282,9 @@ class :math:`k`. Parameters ---------- - eps : float, default 1e-12 - Use small constant for the case that predicted value is 0. - ignore_label : int or None, default None - Index of invalid label to ignore when - counting. By default, sets to -1. - If set to `None`, it will include all entries. - axis : int (default -1) - The axis from prediction that was used to - compute softmax. By default use the last axis. + eps : float + Cross Entropy loss is undefined for predicted value is 0 or 1, + so predicted values are added with the small constant. name : str Name of this metric instance for display. output_names : list of str, or None @@ -1368,17 +1298,17 @@ class :math:`k`. 
-------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0, 1, 1])] - >>> ce = mx.gluon.metric.CrossEntropy() + >>> ce = mx.metric.CrossEntropy() >>> ce.update(labels, predicts) >>> print ce.get() ('cross-entropy', 0.57159948348999023) """ - def __init__(self, eps=1e-12, ignore_label=None, axis=-1, name='cross-entropy', + def __init__(self, eps=1e-12, name='cross-entropy', output_names=None, label_names=None): super(CrossEntropy, self).__init__( - name, output_names=output_names, label_names=label_names) - self.ignore_label = ignore_label - self.axis = axis + name, eps=eps, + output_names=output_names, label_names=label_names, + has_global_stats=True) self.eps = eps def update(self, labels, preds): @@ -1394,97 +1324,22 @@ def update(self, labels, preds): """ labels, preds = check_label_shapes(labels, preds, True) - loss = 0. - num = 0 for label, pred in zip(labels, preds): - assert label.size == pred.size/pred.shape[-1], \ - "shape mismatch: %s vs. %s"%(label.shape, pred.shape) - label = label.reshape((label.size,)) - pred = ndarray.pick(pred.as_in_context(label.ctx), label.astype(dtype='int32'), axis=self.axis) - label = label.as_np_ndarray() - pred = pred.as_np_ndarray() - if self.ignore_label is not None: - ignore = (label == self.ignore_label).astype(pred.dtype) - num -= ignore.sum() - pred = pred * (1 - ignore) + ignore - loss -= numpy.log(numpy.maximum(self.eps, pred)).sum() - num += pred.size - self.sum_metric += loss - self.num_inst += num + label = label.asnumpy() + pred = pred.asnumpy() + label = label.ravel() + assert label.shape[0] == pred.shape[0] -@register -@use_np -class Perplexity(CrossEntropy): - """Computes perplexity. - - Perplexity is a measurement of how well a probability distribution - or model predicts a sample. A low perplexity indicates the model - is good at predicting the sample. - - The perplexity of a model q is defined as - - .. 
math:: - b^{\\big(-\\frac{1}{N} \\sum_{i=1}^N \\log_b q(x_i) \\big)} - = \\exp \\big(-\\frac{1}{N} \\sum_{i=1}^N \\log q(x_i)\\big) - - where we let `b = e`. - - :math:`q(x_i)` is the predicted value of its ground truth - label on sample :math:`x_i`. - - For example, we have three samples :math:`x_1, x_2, x_3` and their labels - are :math:`[0, 1, 1]`. - Suppose our model predicts :math:`q(x_1) = p(y_1 = 0 | x_1) = 0.3` - and :math:`q(x_2) = 1.0`, - :math:`q(x_3) = 0.6`. The perplexity of model q is - :math:`exp\\big(-(\\log 0.3 + \\log 1.0 + \\log 0.6) / 3\\big) = 1.77109762852`. - - Parameters - ---------- - eps : float, default 1e-12 - Use small constant for the case that predicted value is 0. - ignore_label : int or None, default None - Index of invalid label to ignore when - counting. By default, sets to -1. - If set to `None`, it will include all entries. - axis : int (default -1) - The axis from prediction that was used to - compute softmax. By default use the last axis. - name : str - Name of this metric instance for display. - output_names : list of str, or None - Name of predictions that should be used when updating with update_dict. - By default include all predictions. - label_names : list of str, or None - Name of labels that should be used when updating with update_dict. - By default include all labels. 
- - Examples - -------- - >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - >>> labels = [mx.nd.array([0, 1, 1])] - >>> perp = mx.gluon.metric.Perplexity(ignore_label=None) - >>> perp.update(labels, predicts) - >>> print perp.get() - ('Perplexity', 1.7710976285155853) - """ - def __init__(self, eps=1e-12, ignore_label=None, axis=-1, name='perplexity', - output_names=None, label_names=None): - super(Perplexity, self).__init__( - name=name, eps=eps, ignore_label=ignore_label, axis=axis, - output_names=output_names, label_names=label_names) - - def get(self): - if self.num_inst == 0: - return (self.name, float('nan')) - else: - return (self.name, math.exp(self.sum_metric/self.num_inst)) - + prob = pred[numpy.arange(label.shape[0]), numpy.int64(label)] + cross_entropy = (-numpy.log(prob + self.eps)).sum() + self.sum_metric += cross_entropy + self.global_sum_metric += cross_entropy + self.num_inst += label.shape[0] + self.global_num_inst += label.shape[0] @register @alias('nll_loss') -@use_np class NegativeLogLikelihood(EvalMetric): """Computes the negative log-likelihood loss. 
@@ -1515,7 +1370,7 @@ class NegativeLogLikelihood(EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0, 1, 1])] - >>> nll_loss = mx.gluon.metric.NegativeLogLikelihood() + >>> nll_loss = mx.metric.NegativeLogLikelihood() >>> nll_loss.update(labels, predicts) >>> print nll_loss.get() ('nll-loss', 0.57159948348999023) @@ -1524,7 +1379,8 @@ def __init__(self, eps=1e-12, name='nll-loss', output_names=None, label_names=None): super(NegativeLogLikelihood, self).__init__( name, eps=eps, - output_names=output_names, label_names=label_names) + output_names=output_names, label_names=label_names, + has_global_stats=True) self.eps = eps def update(self, labels, preds): @@ -1541,21 +1397,21 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + label = label.asnumpy() + pred = pred.asnumpy() - label = label.reshape(-1) + label = label.ravel() num_examples = pred.shape[0] assert label.shape[0] == num_examples, (label.shape[0], num_examples) prob = pred[numpy.arange(num_examples, dtype=numpy.int64), numpy.int64(label)] nll = (-numpy.log(prob + self.eps)).sum() self.sum_metric += nll + self.global_sum_metric += nll self.num_inst += num_examples - + self.global_num_inst += num_examples @register @alias('pearsonr') -@use_np class PearsonCorrelation(EvalMetric): """Computes Pearson correlation. @@ -1574,23 +1430,30 @@ class PearsonCorrelation(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. + average : str, default 'macro' + Strategy to be used for aggregating across mini-batches. + "macro": average the pearsonr scores for each batch. + "micro": compute a single pearsonr score across all batches. 
Examples -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([[1, 0], [0, 1], [0, 1]])] - >>> pr = mx.gluon.metric.PearsonCorrelation() + >>> pr = mx.metric.PearsonCorrelation() >>> pr.update(labels, predicts) >>> print pr.get() ('pearsonr', 0.42163704544016178) """ def __init__(self, name='pearsonr', - output_names=None, label_names=None): + output_names=None, label_names=None, average='macro'): + self.average = average super(PearsonCorrelation, self).__init__( - name, output_names=output_names, label_names=label_names) - self.reset() + name, output_names=output_names, label_names=label_names, + has_global_stats=True) + if self.average == 'micro': + self.reset_micro() - def reset(self): + def reset_micro(self): self._sse_p = 0 self._mean_p = 0 self._sse_l = 0 @@ -1599,8 +1462,13 @@ def reset(self): self._label_nums = 0 self._conv = 0 + def reset(self): self.num_inst = 0 self.sum_metric = 0.0 + self.global_num_inst = 0 + self.global_sum_metric = 0.0 + if self.average == 'micro': + self.reset_micro() def update_variance(self, new_values, *aggregate): #Welford's online algorithm for variance update @@ -1628,26 +1496,34 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): check_label_shapes(label, pred, False, True) - label = label.as_np_ndarray().reshape(-1).astype(numpy.float64) - pred = pred.as_np_ndarray().as_in_ctx(label.ctx).reshape(-1).astype(numpy.float64) - - self.num_inst += 1 - self._label_nums, self._mean_l, self._sse_l = \ - self.update_variance(label, self._label_nums, self._mean_l, self._sse_l) - self.update_cov(label, pred) - self._pred_nums, self._mean_p, self._sse_p = \ - self.update_variance(pred, self._pred_nums, self._mean_p, self._sse_p) + label = label.asnumpy().ravel().astype(numpy.float64) + pred = pred.asnumpy().ravel().astype(numpy.float64) + if self.average == 'macro': + pearson_corr = numpy.corrcoef(pred, label)[0, 
1] + self.sum_metric += pearson_corr + self.global_sum_metric += pearson_corr + self.num_inst += 1 + self.global_num_inst += 1 + else: + self.global_num_inst += 1 + self.num_inst += 1 + self._label_nums, self._mean_l, self._sse_l = \ + self.update_variance(label, self._label_nums, self._mean_l, self._sse_l) + self.update_cov(label, pred) + self._pred_nums, self._mean_p, self._sse_p = \ + self.update_variance(pred, self._pred_nums, self._mean_p, self._sse_p) def get(self): if self.num_inst == 0: return (self.name, float('nan')) - - n = self._label_nums - pearsonr = self._conv / ((n-1) * numpy.sqrt(self._sse_p / (n - 1)) * numpy.sqrt(self._sse_l / (n - 1))) - return (self.name, float(pearsonr)) + if self.average == 'macro': + return (self.name, self.sum_metric / self.num_inst) + else: + n = self._label_nums + pearsonr = self._conv / ((n-1) * numpy.sqrt(self._sse_p / (n - 1)) * numpy.sqrt(self._sse_l / (n - 1))) + return (self.name, pearsonr) @register -@use_np class PCC(EvalMetric): """PCC is a multiclass equivalent for the Matthews correlation coefficient derived from a discrete solution to the Pearson correlation coefficient. 
@@ -1691,9 +1567,9 @@ class PCC(EvalMetric): [0]*(false_positives + true_negatives) + [1]*(false_negatives + true_positives) )] - >>> f1 = mx.gluon.metric.F1() + >>> f1 = mx.metric.F1() >>> f1.update(preds = predicts, labels = labels) - >>> pcc = mx.gluon.metric.PCC() + >>> pcc = mx.metric.PCC() >>> pcc.update(preds = predicts, labels = labels) >>> print f1.get() ('f1', 0.95233560306652054) @@ -1701,14 +1577,18 @@ class PCC(EvalMetric): ('pcc', 0.01917751877733392) """ def __init__(self, name='pcc', - output_names=None, label_names=None): + output_names=None, label_names=None, + has_global_stats=True): self.k = 2 super(PCC, self).__init__( - name=name, output_names=output_names, label_names=label_names) + name=name, output_names=output_names, label_names=label_names, + has_global_stats=has_global_stats) def _grow(self, inc): self.lcm = numpy.pad( self.lcm, ((0, inc), (0, inc)), 'constant', constant_values=(0)) + self.gcm = numpy.pad( + self.gcm, ((0, inc), (0, inc)), 'constant', constant_values=(0)) self.k += inc def _calc_mcc(self, cmat): @@ -1719,8 +1599,7 @@ def _calc_mcc(self, cmat): cov_yy = numpy.sum(y * (n - y)) if cov_xx == 0 or cov_yy == 0: return float('nan') - # i = cmat.diagonal() # mxnet.numpy.ndarray.diagonal() is currently not available. 
- i = cmat[numpy.arange(self.k), numpy.arange(self.k)] + i = cmat.diagonal() cov_xy = numpy.sum(i * n - x * y) return cov_xy / (cov_xx * cov_yy) ** 0.5 @@ -1739,29 +1618,42 @@ def update(self, labels, preds): # update the confusion matrix for label, pred in zip(labels, preds): - label = label.astype('int32', copy=False).as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + label = label.astype('int32', copy=False).asnumpy() + pred = pred.asnumpy() if pred.shape != label.shape: - pred = pred.argmax(axis=1).astype(label, copy=False) + pred = pred.argmax(axis=1) else: pred = pred.astype('int32', copy=False) - n = int(max(pred.max(), label.max())) + n = max(pred.max(), label.max()) if n >= self.k: self._grow(n + 1 - self.k) - bcm = numpy.zeros((self.k, self.k), dtype='float64') + bcm = numpy.zeros((self.k, self.k)) for i, j in zip(pred, label): bcm[i, j] += 1 self.lcm += bcm + self.gcm += bcm + self.num_inst += 1 + self.global_num_inst += 1 @property def sum_metric(self): return self._calc_mcc(self.lcm) * self.num_inst + @property + def global_sum_metric(self): + return self._calc_mcc(self.gcm) * self.global_num_inst + def reset(self): """Resets the internal evaluation result to initial state.""" + self.global_num_inst = 0. + self.gcm = numpy.zeros((self.k, self.k)) + self.reset_local() + + def reset_local(self): + """Resets the local portion of the internal evaluation results to initial state.""" self.num_inst = 0. 
- self.lcm = numpy.zeros((self.k, self.k), dtype='float64') + self.lcm = numpy.zeros((self.k, self.k)) @register @@ -1782,7 +1674,8 @@ class Loss(EvalMetric): def __init__(self, name='loss', output_names=None, label_names=None): super(Loss, self).__init__( - name, output_names=output_names, label_names=label_names) + name, output_names=output_names, label_names=label_names, + has_global_stats=True) def update(self, _, preds): @@ -1792,7 +1685,9 @@ def update(self, _, preds): for pred in preds: loss = ndarray.sum(pred).asscalar() self.sum_metric += loss + self.global_sum_metric += loss self.num_inst += pred.size + self.global_num_inst += pred.size @register @@ -1814,7 +1709,6 @@ def __init__(self, name='caffe', @register -@use_np class CustomMetric(EvalMetric): """Computes a customized evaluation metric. @@ -1845,7 +1739,7 @@ class CustomMetric(EvalMetric): >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] >>> feval = lambda x, y : (x + y).mean() - >>> eval_metrics = mx.gluon.metric.CustomMetric(feval=feval) + >>> eval_metrics = mx.metric.CustomMetric(feval=feval) >>> eval_metrics.update(labels, predicts) >>> print eval_metrics.get() ('custom()', 6.0) @@ -1859,7 +1753,8 @@ def __init__(self, feval, name=None, allow_extra_outputs=False, super(CustomMetric, self).__init__( name, feval=feval, allow_extra_outputs=allow_extra_outputs, - output_names=output_names, label_names=label_names) + output_names=output_names, label_names=label_names, + has_global_stats=True) self._feval = feval self._allow_extra_outputs = allow_extra_outputs @@ -1878,17 +1773,21 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for pred, label in zip(preds, labels): - label = label.as_np_ndarray() - pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + label = label.asnumpy() + pred = pred.asnumpy() reval = self._feval(label, pred) if isinstance(reval, tuple): (sum_metric, 
num_inst) = reval self.sum_metric += sum_metric + self.global_sum_metric += sum_metric self.num_inst += num_inst + self.global_num_inst += num_inst else: self.sum_metric += reval + self.global_sum_metric += reval self.num_inst += 1 + self.global_num_inst += 1 def get_config(self): raise NotImplementedError("CustomMetric cannot be serialized") @@ -1920,7 +1819,7 @@ def np(numpy_feval, name=None, allow_extra_outputs=False): >>> def custom_metric(label, pred): ... return np.mean(np.abs(label-pred)) ... - >>> metric = mx.gluon.metric.np(custom_metric) + >>> metric = mx.metric.np(custom_metric) """ def feval(label, pred): """Internal eval function.""" diff --git a/python/mxnet/model.py b/python/mxnet/model.py index 7aee2578f2a3..9acabeefcb2d 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -26,6 +26,8 @@ from . import ndarray as nd from . import symbol as sym +from . import optimizer as opt +from . import metric from . import kvstore as kvs from .context import cpu diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py new file mode 100644 index 000000000000..053a00b3abba --- /dev/null +++ b/python/mxnet/module/base_module.py @@ -0,0 +1,1067 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# pylint: disable=fixme, too-many-arguments, too-many-locals, no-else-raise +# pylint: disable=too-many-public-methods, too-many-branches, too-many-lines +"""`BaseModule` defines an API for modules.""" + +import time +import logging +import warnings +import numpy as np + +from .. import metric +from .. import ndarray + +from ..context import cpu +from ..model import BatchEndParam +from ..initializer import Uniform +from ..io import DataDesc, DataIter, DataBatch +from ..base import _as_list + + +def _check_input_names(symbol, names, typename, throw): + """Check that all input names are in symbol's arguments.""" + args = symbol.list_arguments() + for name in names: + if name in args: + continue + candidates = [arg for arg in args if + not arg.endswith('_weight') and + not arg.endswith('_bias') and + not arg.endswith('_gamma') and + not arg.endswith('_beta')] + msg = "\033[91mYou created Module with Module(..., %s_names=%s) but " \ + "input with name '%s' is not found in symbol.list_arguments(). " \ + "Did you mean one of:\n\t%s\033[0m"%( + typename, str(names), name, '\n\t'.join(candidates)) + if throw: + raise ValueError(msg) + else: + warnings.warn(msg) + + +def _check_names_match(data_names, data_shapes, name, throw): + """Check that input names matches input data descriptors.""" + actual = [x[0] for x in data_shapes] + if sorted(data_names) != sorted(actual): + msg = "Data provided by %s_shapes don't match names specified by %s_names (%s vs. 
%s)"%( + name, name, str(data_shapes), str(data_names)) + if throw: + raise ValueError(msg) + else: + warnings.warn(msg) + + +def _parse_data_desc(data_names, label_names, data_shapes, label_shapes): + """parse data_attrs into DataDesc format and check that names match""" + data_shapes = [x if isinstance(x, DataDesc) else DataDesc(*x) for x in data_shapes] + _check_names_match(data_names, data_shapes, 'data', True) + if label_shapes is not None: + label_shapes = [x if isinstance(x, DataDesc) else DataDesc(*x) for x in label_shapes] + _check_names_match(label_names, label_shapes, 'label', False) + else: + _check_names_match(label_names, [], 'label', False) + return data_shapes, label_shapes + + +class BaseModule(object): + """The base class of a module. + + A module represents a computation component. One can think of module as a computation machine. + A module can execute forward and backward passes and update parameters in a model. + We aim to make the APIs easy to use, especially in the case when we need to use the imperative + API to work with multiple modules (e.g. stochastic depth network). + + A module has several states: + + - Initial state: Memory is not allocated yet, so the module is not ready for computation yet. + - Binded: Shapes for inputs, outputs, and parameters are all known, memory has been allocated, + and the module is ready for computation. + - Parameters are initialized: For modules with parameters, doing computation before + initializing the parameters might result in undefined outputs. + - Optimizer is installed: An optimizer can be installed to a module. After this, the parameters + of the module can be updated according to the optimizer after gradients are computed + (forward-backward). + + In order for a module to interact with others, it must be able to report the + following information in its initial state (before binding): + + - `data_names`: list of type string indicating the names of the required input data. 
+ - `output_names`: list of type string indicating the names of the required outputs. + + After binding, a module should be able to report the following richer information: + + - state information + - `binded`: `bool`, indicates whether the memory buffers needed for computation + have been allocated. + - `for_training`: whether the module is bound for training. + - `params_initialized`: `bool`, indicates whether the parameters of this module + have been initialized. + - `optimizer_initialized`: `bool`, indicates whether an optimizer is defined + and initialized. + - `inputs_need_grad`: `bool`, indicates whether gradients with respect to the + input data are needed. Might be useful when implementing composition of modules. + + - input/output information + - `data_shapes`: a list of `(name, shape)`. In theory, since the memory is allocated, + we could directly provide the data arrays. But in the case of data parallelism, + the data arrays might not be of the same shape as viewed from the external world. + - `label_shapes`: a list of `(name, shape)`. This might be `[]` if the module does + not need labels (e.g. it does not contain a loss function at the top), or the module + is not bound for training. + - `output_shapes`: a list of `(name, shape)` for outputs of the module. + + - parameters (for modules with parameters) + - `get_params()`: return a tuple `(arg_params, aux_params)`. Each of those + is a dictionary of name to ``NDArray`` mapping. These arrays always live on + CPU. The actual parameters used for computing might live on other devices (GPUs); + this function will retrieve (a copy of) the latest parameters. + - ``set_params(arg_params, aux_params)``: assign parameters to the devices + doing the computation. + - ``init_params(...)``: a more flexible interface to assign or initialize the parameters. + + - setup + - `bind()`: prepare environment for computation. + - `init_optimizer()`: install optimizer for parameter updating. 
+ - `prepare()`: prepare the module based on the current data batch. + + - computation + - `forward(data_batch)`: forward operation. + - `backward(out_grads=None)`: backward operation. + - `update()`: update parameters according to the installed optimizer. + - `get_outputs()`: get outputs of the previous forward operation. + - `get_input_grads()`: get the gradients with respect to the inputs computed + in the previous backward operation. + - `update_metric(metric, labels, pre_sliced=False)`: update performance metric + for the previous forward + computed results. + + - other properties (mostly for backward compatibility) + - `symbol`: the underlying symbolic graph for this module (if any). + This property is not necessarily constant. For example, for `BucketingModule`, + this property is simply the *current* symbol being used. For other modules, + this value might not be well defined. + + When those intermediate-level APIs are implemented properly, the following + high-level APIs will be automatically available for a module: + + - `fit`: train the module parameters on a data set. + - `predict`: run prediction on a data set and collect outputs. + - `score`: run prediction on a data set and evaluate performance. + + Examples + -------- + >>> # An example of creating an MXNet module. 
+ >>> import mxnet as mx + >>> data = mx.symbol.Variable('data') + >>> fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128) + >>> act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") + >>> fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64) + >>> act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") + >>> fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) + >>> out = mx.symbol.SoftmaxOutput(fc3, name = 'softmax') + >>> mod = mx.mod.Module(out) + """ + def __init__(self, logger=logging): + self.logger = logger + self.binded = False + self.for_training = False + self.inputs_need_grad = False + self.params_initialized = False + self.optimizer_initialized = False + self._symbol = None + self._total_exec_bytes = 0 + + ################################################################################ + # High Level API + ################################################################################ + def forward_backward(self, data_batch): + """A convenient function that calls both ``forward`` and ``backward``.""" + self.forward(data_batch, is_train=True) + self.backward() + + def score(self, eval_data, eval_metric, num_batch=None, batch_end_callback=None, + score_end_callback=None, + reset=True, epoch=0, sparse_row_id_fn=None): + """Runs prediction on ``eval_data`` and evaluates the performance according to + the given ``eval_metric``. + + Checkout `Module Tutorial `_ + to see an end-to-end use-case. + + Parameters + ---------- + eval_data : DataIter + Evaluation data to run prediction on. + eval_metric : EvalMetric or list of EvalMetrics + Evaluation metric to use. + num_batch : int + Number of batches to run. Defaults to ``None``, indicating run until the `DataIter` + finishes. + batch_end_callback : function + Could also be a list of functions. + reset : bool + Defaults to ``True``. Indicates whether we should reset `eval_data` before starting + evaluating. + epoch : int + Defaults to 0. 
For compatibility, this will be passed to callbacks (if any). + During training, this will correspond to the training epoch number. + sparse_row_id_fn : A callback function + The function takes `data_batch` as an input and returns a dict of + str -> NDArray. The resulting dict is used for pulling row_sparse + parameters from the kvstore, where the str key is the name of the param, + and the value is the row id of the param to pull. + + Examples + -------- + >>> # An example of using score for prediction. + >>> # Evaluate accuracy on val_dataiter + >>> metric = mx.metric.Accuracy() + >>> mod.score(val_dataiter, metric) + >>> mod.score(val_dataiter, ['mse', 'acc']) + """ + assert self.binded and self.params_initialized + + if reset: + eval_data.reset() + + if not isinstance(eval_metric, metric.EvalMetric): + eval_metric = metric.create(eval_metric) + + eval_metric.reset() + actual_num_batch = 0 + + for nbatch, eval_batch in enumerate(eval_data): + if num_batch is not None and nbatch == num_batch: + break + self.prepare(eval_batch, sparse_row_id_fn=sparse_row_id_fn) + self.forward(eval_batch, is_train=False) + if isinstance(eval_batch, list): + self.update_metric(eval_metric, [eb.label for eb in eval_batch], pre_sliced=True) + else: + self.update_metric(eval_metric, eval_batch.label) + + if batch_end_callback is not None: + batch_end_params = BatchEndParam(epoch=epoch, + nbatch=nbatch, + eval_metric=eval_metric, + locals=locals()) + for callback in _as_list(batch_end_callback): + callback(batch_end_params) + actual_num_batch += 1 + + if score_end_callback: + params = BatchEndParam(epoch=epoch, + nbatch=actual_num_batch, + eval_metric=eval_metric, + locals=locals()) + for callback in _as_list(score_end_callback): + callback(params) + + return eval_metric.get_name_value() + + def iter_predict(self, eval_data, num_batch=None, reset=True, sparse_row_id_fn=None): + """Iterates over predictions. 
+ + Examples + -------- + >>> for pred, i_batch, batch in module.iter_predict(eval_data): + ... # pred is a list of outputs from the module + ... # i_batch is a integer + ... # batch is the data batch from the data iterator + + Parameters + ---------- + eval_data : DataIter + Evaluation data to run prediction on. + num_batch : int + Default is ``None``, indicating running all the batches in the data iterator. + reset : bool + Default is ``True``, indicating whether we should reset the data iter before start + doing prediction. + sparse_row_id_fn : A callback function + The function takes `data_batch` as an input and returns a dict of + str -> NDArray. The resulting dict is used for pulling row_sparse + parameters from the kvstore, where the str key is the name of the param, + and the value is the row id of the param to pull. + """ + assert self.binded and self.params_initialized + + if reset: + eval_data.reset() + + for nbatch, eval_batch in enumerate(eval_data): + if num_batch is not None and nbatch == num_batch: + break + self.prepare(eval_batch, sparse_row_id_fn=sparse_row_id_fn) + self.forward(eval_batch, is_train=False) + pad = eval_batch.pad + outputs = [out[0:out.shape[0]-pad] for out in self.get_outputs()] + + yield (outputs, nbatch, eval_batch) + + def predict(self, eval_data, num_batch=None, merge_batches=True, reset=True, + always_output_list=False, sparse_row_id_fn=None): + """Runs prediction and collects the outputs. + + When `merge_batches` is ``True`` (by default), the return value will be a list + ``[out1, out2, out3]``, where each element is formed by concatenating the outputs for + all the mini-batches. When `always_output_list` is ``False`` (as by default), + then in the case of a single output, `out1` is returned instead of ``[out1]``. + + When `merge_batches` is ``False``, the return value will be a nested list like + ``[[out1_batch1, out2_batch1], [out1_batch2], ...]``. This mode is useful because + in some cases (e.g. 
bucketing), the module does not necessarily produce the same + number of outputs. + + The objects in the results have type `NDArray`. If you need to work with a numpy array, + just call ``.asnumpy()`` on each `NDArray`. + + Parameters + ---------- + eval_data : DataIter or NDArray or numpy array + Evaluation data to run prediction on. + num_batch : int + Defaults to ``None``, indicates running all the batches in the data iterator. + merge_batches : bool + Defaults to ``True``, see above for return values. + reset : bool + Defaults to ``True``, indicates whether we should reset the data iter before + doing prediction. + always_output_list : bool + Defaults to ``False``, see above for return values. + sparse_row_id_fn : A callback function + The function takes `data_batch` as an input and returns a dict of + str -> NDArray. The resulting dict is used for pulling row_sparse + parameters from the kvstore, where the str key is the name of the param, + and the value is the row id of the param to pull. + + Returns + ------- + list of NDArray or list of list of NDArray + Prediction results. + + Examples + -------- + >>> # An example of using `predict` for prediction. 
+ >>> # Predict on the first 10 batches of val_dataiter + >>> mod.predict(eval_data=val_dataiter, num_batch=10) + """ + assert self.binded and self.params_initialized + + if isinstance(eval_data, (ndarray.NDArray, np.ndarray)): + if isinstance(eval_data, np.ndarray): + eval_data = ndarray.array(eval_data) + self.forward(DataBatch([eval_data])) + return self.get_outputs()[0] + + if not isinstance(eval_data, DataIter): + raise ValueError('eval_data must be of type NDArray or DataIter') + + if reset: + eval_data.reset() + + output_list = [] + + for nbatch, eval_batch in enumerate(eval_data): + if num_batch is not None and nbatch == num_batch: + break + self.prepare(eval_batch, sparse_row_id_fn=sparse_row_id_fn) + self.forward(eval_batch, is_train=False) + pad = eval_batch.pad + outputs = [out[0:out.shape[0]-pad].copy() for out in self.get_outputs()] + + output_list.append(outputs) + + if len(output_list) == 0: + return output_list + + if merge_batches: + num_outputs = len(output_list[0]) + for out in output_list: + assert len(out) == num_outputs, \ + 'Cannot merge batches, as num of outputs is not the same ' + \ + 'in mini-batches. Maybe bucketing is used?' + output_list2 = [ndarray.concatenate([out[i] for out in output_list]) + for i in range(num_outputs)] + + if num_outputs == 1 and not always_output_list: + return output_list2[0] + return output_list2 + + return output_list + + def fit(self, train_data, eval_data=None, eval_metric='acc', + epoch_end_callback=None, batch_end_callback=None, kvstore='local', + optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), + eval_end_callback=None, + eval_batch_end_callback=None, initializer=Uniform(0.01), + arg_params=None, aux_params=None, allow_missing=False, + force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None, + validation_metric=None, monitor=None, sparse_row_id_fn=None): + """Trains the module parameters. + + Checkout `Module Tutorial `_ + to see an end-to-end use-case. 
+ + Parameters + ---------- + train_data : DataIter + Train DataIter. + eval_data : DataIter + If not ``None``, will be used as the validation set and the performance + after each epoch will be evaluated. + eval_metric : str or EvalMetric + Defaults to 'acc' (accuracy). The performance measure displayed during training. + Other possible predefined metrics are: + 'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'. + epoch_end_callback : function or list of functions + Each callback will be called with the current `epoch`, `symbol`, `arg_params` + and `aux_params`. + batch_end_callback : function or list of functions + Each callback will be called with a `BatchEndParam`. + kvstore : str or KVStore + Defaults to 'local'. + optimizer : str or Optimizer + Defaults to 'sgd'. + optimizer_params : dict + Defaults to ``(('learning_rate', 0.01),)``. The parameters for + the optimizer constructor. + The default value is not a dict, just to avoid a pylint warning on dangerous + default values. + eval_end_callback : function or list of functions + These will be called at the end of each full evaluation, with the metrics over + the entire evaluation set. + eval_batch_end_callback : function or list of functions + These will be called at the end of each mini-batch during evaluation. + initializer : Initializer + The initializer is called to initialize the module parameters when they are + not already initialized. + arg_params : dict + Defaults to ``None``. If not ``None``, should be existing parameters from a trained + model or loaded from a checkpoint (previously saved model). In this case, + the value here will be used to initialize the module parameters, unless they + are already initialized by the user via a call to `init_params` or `fit`. + `arg_params` has a higher priority than `initializer`. + aux_params : dict + Defaults to ``None``. Similar to `arg_params`, except for auxiliary states. + allow_missing : bool + Defaults to ``False``. 
Indicates whether to allow missing parameters when `arg_params` + and `aux_params` are not ``None``. If this is ``True``, then the missing parameters + will be initialized via the `initializer`. + force_rebind : bool + Defaults to ``False``. Whether to force rebinding the executors if already bound. + force_init : bool + Defaults to ``False``. Indicates whether to force initialization even if the + parameters are already initialized. + begin_epoch : int + Defaults to 0. Indicates the starting epoch. Usually, if resumed from a + checkpoint saved at a previous training phase at epoch N, then this value should be + N+1. + num_epoch : int + Number of epochs for training. + sparse_row_id_fn : A callback function + The function takes `data_batch` as an input and returns a dict of + str -> NDArray. The resulting dict is used for pulling row_sparse + parameters from the kvstore, where the str key is the name of the param, + and the value is the row id of the param to pull. + + Examples + -------- + >>> # An example of using fit for training. + >>> # Assume training dataIter and validation dataIter are ready + >>> # Assume loading a previously checkpointed model + >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3) + >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer='sgd', + ... optimizer_params={'learning_rate':0.01, 'momentum': 0.9}, + ... arg_params=arg_params, aux_params=aux_params, + ... 
eval_metric='acc', num_epoch=10, begin_epoch=3) + """ + assert num_epoch is not None, 'please specify number of epochs' + + self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label, + for_training=True, force_rebind=force_rebind) + if monitor is not None: + self.install_monitor(monitor) + self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params, + allow_missing=allow_missing, force_init=force_init) + self.init_optimizer(kvstore=kvstore, optimizer=optimizer, + optimizer_params=optimizer_params) + + if validation_metric is None: + validation_metric = eval_metric + if not isinstance(eval_metric, metric.EvalMetric): + eval_metric = metric.create(eval_metric) + + ################################################################################ + # training loop + ################################################################################ + for epoch in range(begin_epoch, num_epoch): + tic = time.time() + eval_metric.reset() + nbatch = 0 + data_iter = iter(train_data) + end_of_batch = False + next_data_batch = next(data_iter) + while not end_of_batch: + data_batch = next_data_batch + if monitor is not None: + monitor.tic() + self.forward_backward(data_batch) + self.update() + + if isinstance(data_batch, list): + self.update_metric(eval_metric, + [db.label for db in data_batch], + pre_sliced=True) + else: + self.update_metric(eval_metric, data_batch.label) + + try: + # pre fetch next batch + next_data_batch = next(data_iter) + self.prepare(next_data_batch, sparse_row_id_fn=sparse_row_id_fn) + except StopIteration: + end_of_batch = True + + if monitor is not None: + monitor.toc_print() + + if end_of_batch: + eval_name_vals = eval_metric.get_global_name_value() + + if batch_end_callback is not None: + batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, + eval_metric=eval_metric, + locals=locals()) + for callback in _as_list(batch_end_callback): + callback(batch_end_params) + nbatch += 1 + + # one epoch 
of training is finished + for name, val in eval_name_vals: + self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) + toc = time.time() + self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc-tic)) + + # sync aux params across devices + arg_params, aux_params = self.get_params() + self.set_params(arg_params, aux_params) + + if epoch_end_callback is not None: + for callback in _as_list(epoch_end_callback): + callback(epoch, self.symbol, arg_params, aux_params) + + #---------------------------------------- + # evaluation on validation set + if eval_data: + res = self.score(eval_data, validation_metric, + score_end_callback=eval_end_callback, + batch_end_callback=eval_batch_end_callback, epoch=epoch) + #TODO: pull this into default + for name, val in res: + self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val) + + # end of 1 epoch, reset the data-iter for another epoch + train_data.reset() + + ################################################################################ + # Symbol information + ################################################################################ + @property + def data_names(self): + """A list of names for data required by this module.""" + raise NotImplementedError() + + @property + def output_names(self): + """A list of names for the outputs of this module.""" + raise NotImplementedError() + + ################################################################################ + # Input/Output information + ################################################################################ + @property + def data_shapes(self): + """A list of (name, shape) pairs specifying the data inputs to this module.""" + raise NotImplementedError() + + @property + def label_shapes(self): + """A list of (name, shape) pairs specifying the label inputs to this module. + If this module does not accept labels -- either it is a module without loss + function, or it is not bound for training, then this should return an empty + list ``[]``. 
+ """ + raise NotImplementedError() + + @property + def output_shapes(self): + """A list of (name, shape) pairs specifying the outputs of this module.""" + raise NotImplementedError() + + ################################################################################ + # Parameters of a module + ################################################################################ + def get_params(self): + """Gets parameters, those are potentially copies of the actual parameters used + to do computation on the device. + + Returns + ------- + ``(arg_params, aux_params)`` + A pair of dictionaries each mapping parameter names to NDArray values. + + Examples + -------- + >>> # An example of getting module parameters. + >>> print mod.get_params() + ({'fc2_weight': , 'fc1_weight': , + 'fc3_bias': , 'fc3_weight': , + 'fc2_bias': , 'fc1_bias': }, {}) + """ + raise NotImplementedError() + + def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None, + allow_missing=False, force_init=False, allow_extra=False): + """Initializes the parameters and auxiliary states. + + Parameters + ---------- + initializer : Initializer + Called to initialize parameters if needed. + arg_params : dict + If not ``None``, should be a dictionary of existing `arg_params`. Initialization + will be copied from that. + aux_params : dict + If not ``None``, should be a dictionary of existing `aux_params`. Initialization + will be copied from that. + allow_missing : bool + If ``True``, params could contain missing values, and the initializer will be + called to fill those missing params. + force_init : bool + If ``True``, `force_init` will force re-initialize even if already initialized. + allow_extra : boolean, optional + Whether allow extra parameters that are not needed by symbol. + If this is True, no error will be thrown when arg_params or aux_params + contain extra parameters that is not needed by the executor. 
+ + Examples + -------- + >>> # An example of initializing module parameters. + >>> mod.init_params() + """ + raise NotImplementedError() + + def set_params(self, arg_params, aux_params, allow_missing=False, force_init=True, + allow_extra=False): + """Assigns parameter and aux state values. + + Parameters + ---------- + arg_params : dict + Dictionary of name to value (`NDArray`) mapping. + aux_params : dict + Dictionary of name to value (`NDArray`) mapping. + allow_missing : bool + If ``True``, params could contain missing values, and the initializer will be + called to fill those missing params. + force_init : bool + If ``True``, will force re-initialize even if already initialized. + allow_extra : boolean, optional + Whether allow extra parameters that are not needed by symbol. + If this is True, no error will be thrown when arg_params or aux_params + contain extra parameters that is not needed by the executor. + + Examples + -------- + >>> # An example of setting module parameters. + >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, n_epoch_load) + >>> mod.set_params(arg_params=arg_params, aux_params=aux_params) + """ + self.init_params(initializer=None, arg_params=arg_params, aux_params=aux_params, + allow_missing=allow_missing, force_init=force_init, + allow_extra=allow_extra) + + def save_params(self, fname): + """Saves model parameters to file. + + Parameters + ---------- + fname : str + Path to output param file. + + Examples + -------- + >>> # An example of saving module parameters. + >>> mod.save_params('myfile') + """ + arg_params, aux_params = self.get_params() + save_dict = {('arg:%s' % k) : v.as_in_context(cpu()) for k, v in arg_params.items()} + save_dict.update({('aux:%s' % k) : v.as_in_context(cpu()) for k, v in aux_params.items()}) + ndarray.save(fname, save_dict) + + def load_params(self, fname): + """Loads model parameters from file. + + Parameters + ---------- + fname : str + Path to input param file. 
+ + Examples + -------- + >>> # An example of loading module parameters. + >>> mod.load_params('myfile') + """ + save_dict = ndarray.load(fname) + arg_params = {} + aux_params = {} + for k, value in save_dict.items(): + arg_type, name = k.split(':', 1) + if arg_type == 'arg': + arg_params[name] = value + elif arg_type == 'aux': + aux_params[name] = value + else: + raise ValueError("Invalid param file " + fname) + self.set_params(arg_params, aux_params) + + def get_states(self, merge_multi_context=True): + """Gets states from all devices + + If `merge_multi_context` is ``True``, returns output of form ``[out1, out2]``. + Otherwise, it returns output of the form + ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. + All output elements are `NDArray`. + + Parameters + ---------- + merge_multi_context : bool + Defaults to ``True``. In the case when data-parallelism is used, the states + will be collected from multiple devices. A ``True`` value indicates that we + should merge the collected results so that they look like from a single + executor. + + Returns + ------- + A list of ``NDArray`` or a list of list of ``NDArray``. + """ + assert self.binded and self.params_initialized + assert not merge_multi_context + return [] + + def set_states(self, states=None, value=None): + """Sets value for states. Only one of states & value can be specified. + + Parameters + ---------- + states : list of list of NDArray + Source states arrays formatted like + ``[[state1_dev1, state1_dev2], [state2_dev1, state2_dev2]]``. + value : number + A single scalar value for all state arrays. 
+ """ + assert self.binded and self.params_initialized + assert not states and not value + + def install_monitor(self, mon): + """Installs monitor on all executors.""" + raise NotImplementedError() + + ################################################################################ + # Computations + ################################################################################ + # pylint: disable=unused-argument + def prepare(self, data_batch, sparse_row_id_fn=None): + '''Prepares the module for processing a data batch. + + Usually involves switching bucket and reshaping. + For modules that contain `row_sparse` parameters in KVStore, + it prepares the `row_sparse` parameters based on the sparse_row_id_fn. + + When KVStore is used to update parameters for multi-device or multi-machine training, + a copy of the parameters are stored in KVStore. Note that for `row_sparse` parameters, + the `update()` updates the copy of parameters in KVStore, but doesn't broadcast + the updated parameters to all devices / machines. The `prepare` function is used to + broadcast `row_sparse` parameters with the next batch of data. + + Parameters + ---------- + data_batch : DataBatch + The current batch of data for forward computation. + + sparse_row_id_fn : A callback function + The function takes `data_batch` as an input and returns a dict of + str -> NDArray. The resulting dict is used for pulling row_sparse + parameters from the kvstore, where the str key is the name of the param, + and the value is the row id of the param to pull. + ''' + if sparse_row_id_fn is not None: + warnings.warn(UserWarning("sparse_row_id_fn is not invoked for BaseModule.")) + # pylint: enable=unused-argument + + def forward(self, data_batch, is_train=None): + """Forward computation. It supports data batches with different shapes, such as + different batch sizes or different image sizes. 
+ If reshaping of data batch relates to modification of symbol or module, such as + changing image layout ordering or switching from training to predicting, module + rebinding is required. + + Parameters + ---------- + data_batch : DataBatch + Could be anything with similar API implemented. + is_train : bool + Default is ``None``, which means `is_train` takes the value of ``self.for_training``. + + Examples + -------- + >>> import mxnet as mx + >>> from collections import namedtuple + >>> Batch = namedtuple('Batch', ['data']) + >>> data = mx.sym.Variable('data') + >>> out = data * 2 + >>> mod = mx.mod.Module(symbol=out, label_names=None) + >>> mod.bind(data_shapes=[('data', (1, 10))]) + >>> mod.init_params() + >>> data1 = [mx.nd.ones((1, 10))] + >>> mod.forward(Batch(data1)) + >>> print mod.get_outputs()[0].asnumpy() + [[ 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]] + >>> # Forward with data batch of different shape + >>> data2 = [mx.nd.ones((3, 5))] + >>> mod.forward(Batch(data2)) + >>> print mod.get_outputs()[0].asnumpy() + [[ 2. 2. 2. 2. 2.] + [ 2. 2. 2. 2. 2.] + [ 2. 2. 2. 2. 2.]] + """ + raise NotImplementedError() + + def backward(self, out_grads=None): + """Backward computation. + + Parameters + ---------- + out_grads : NDArray or list of NDArray, optional + Gradient on the outputs to be propagated back. + This parameter is only needed when bind is called + on outputs that are not a loss function. + + Examples + -------- + >>> # An example of backward computation. + >>> mod.backward() + >>> print mod.get_input_grads()[0].asnumpy() + [[[ 1.10182791e-05 5.12257748e-06 4.01927764e-06 8.32566820e-06 + -1.59775993e-06 7.24269375e-06 7.28067835e-06 -1.65902311e-05 + 5.46342608e-06 8.44196393e-07] + ...]] + """ + raise NotImplementedError() + + def get_outputs(self, merge_multi_context=True): + """Gets outputs of the previous forward computation. + + If `merge_multi_context` is ``True``, it is like ``[out1, out2]``. 
Otherwise, + it returns out put of form ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. + All the output elements have type `NDArray`. When `merge_multi_context` is ``False``, + those `NDArray` instances might live on different devices. + + Parameters + ---------- + merge_multi_context : bool + Defaults to ``True``. In the case when data-parallelism is used, the outputs + will be collected from multiple devices. A ``True`` value indicates that we + should merge the collected results so that they look like from a single + executor. + + Returns + ------- + list of `NDArray` or list of list of `NDArray`. + Output + + Examples + -------- + >>> # An example of getting forward output. + >>> print mod.get_outputs()[0].asnumpy() + [[ 0.09999977 0.10000153 0.10000716 0.10000195 0.09999853 0.09999743 + 0.10000272 0.10000113 0.09999088 0.09999888]] + """ + raise NotImplementedError() + + def get_input_grads(self, merge_multi_context=True): + """Gets the gradients to the inputs, computed in the previous backward computation. + + If `merge_multi_context` is ``True``, it is like ``[grad1, grad2]``. Otherwise, it + is like ``[[grad1_dev1, grad1_dev2], [grad2_dev1, grad2_dev2]]``. All the output + elements have type `NDArray`. When `merge_multi_context` is ``False``, those `NDArray` + instances might live on different devices. + + Parameters + ---------- + merge_multi_context : bool + Defaults to ``True``. In the case when data-parallelism is used, the gradients + will be collected from multiple devices. A ``True`` value indicates that we + should merge the collected results so that they look like from a single + executor. + + Returns + ------- + list of NDArray or list of list of NDArray + Input gradients. + + Examples + -------- + >>> # An example of getting input gradients. 
+ >>> print mod.get_input_grads()[0].asnumpy() + [[[ 1.10182791e-05 5.12257748e-06 4.01927764e-06 8.32566820e-06 + -1.59775993e-06 7.24269375e-06 7.28067835e-06 -1.65902311e-05 + 5.46342608e-06 8.44196393e-07] + ...]] + """ + raise NotImplementedError() + + def update(self): + """Updates parameters according to the installed optimizer and the gradients computed + in the previous forward-backward batch. + + When KVStore is used to update parameters for multi-device or multi-machine training, + a copy of the parameters are stored in KVStore. Note that for `row_sparse` parameters, + this function does update the copy of parameters in KVStore, but doesn't broadcast the + updated parameters to all devices / machines. Please call `prepare` to broadcast + `row_sparse` parameters with the next batch of data. + + Examples + -------- + >>> # An example of updating module parameters. + >>> mod.init_optimizer(kvstore='local', optimizer='sgd', + ... optimizer_params=(('learning_rate', 0.01), )) + >>> mod.backward() + >>> mod.update() + >>> print mod.get_params()[0]['fc3_weight'].asnumpy() + [[ 5.86930104e-03 5.28078526e-03 -8.88729654e-03 -1.08308345e-03 + 6.13054074e-03 4.27560415e-03 1.53817423e-03 4.62131854e-03 + 4.69872449e-03 -2.42400169e-03 9.94111411e-04 1.12386420e-03 + ...]] + """ + raise NotImplementedError() + + def update_metric(self, eval_metric, labels, pre_sliced=False): + """Evaluates and accumulates evaluation metric on outputs of the last forward + computation. + + Parameters + ---------- + eval_metric : EvalMetric + Evaluation metric to use. + labels : list of NDArray if `pre_sliced` parameter is set to `False`, + list of lists of NDArray otherwise. Typically `data_batch.label`. + pre_sliced: bool + Whether the labels are already sliced per device (default: False). + + Examples + -------- + >>> # An example of updating evaluation metric. 
+ >>> mod.forward(data_batch) + >>> mod.update_metric(metric, data_batch.label) + """ + raise NotImplementedError() + + ################################################################################ + # module setup + ################################################################################ + def bind(self, data_shapes, label_shapes=None, for_training=True, + inputs_need_grad=False, force_rebind=False, shared_module=None, + grad_req='write'): + """Binds the symbols to construct executors. This is necessary before one + can perform computation with the module. + + Parameters + ---------- + data_shapes : list of (str, tuple) or DataDesc objects + Typically is ``data_iter.provide_data``. Can also be a list of + (data name, data shape). + label_shapes : list of (str, tuple) or DataDesc objects + Typically is ``data_iter.provide_label``. Can also be a list of + (label name, label shape). + for_training : bool + Default is ``True``. Whether the executors should be bind for training. + inputs_need_grad : bool + Default is ``False``. Whether the gradients to the input data need to be computed. + Typically this is not needed. But this might be needed when implementing composition + of modules. + force_rebind : bool + Default is ``False``. This function does nothing if the executors are already + bound. But with this ``True``, the executors will be forced to rebind. + shared_module : Module + Default is ``None``. This is used in bucketing. When not ``None``, the shared module + essentially corresponds to a different bucket -- a module with different symbol + but with the same sets of parameters (e.g. unrolled RNNs with different lengths). + grad_req : str, list of str, dict of str to str + Requirement for gradient accumulation. Can be 'write', 'add', or 'null' + (default to 'write'). + Can be specified globally (str) or for each argument (list, dict). + + Examples + -------- + >>> # An example of binding symbols. 
+ >>> mod.bind(data_shapes=[('data', (1, 10, 10))]) + >>> # Assume train_iter is already created. + >>> mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) + """ + raise NotImplementedError() + + def init_optimizer(self, kvstore='local', optimizer='sgd', + optimizer_params=(('learning_rate', 0.01),), force_init=False): + """Installs and initializes optimizers, as well as initialize kvstore for + distributed training + + Parameters + ---------- + kvstore : str or KVStore + Defaults to `'local'`. + optimizer : str or Optimizer + Defaults to `'sgd'`. + optimizer_params : dict + Defaults to ``(('learning_rate', 0.01),)``. The default value is not a dictionary, + just to avoid pylint warning of dangerous default values. + force_init : bool + Defaults to ``False``, indicates whether to force re-initializing an optimizer + if it is already installed. + + Examples + -------- + >>> # An example of initializing optimizer. + >>> mod.init_optimizer(optimizer='sgd', optimizer_params=(('learning_rate', 0.005),)) + """ + raise NotImplementedError() + + ################################################################################ + # misc + ################################################################################ + @property + def symbol(self): + """Gets the symbol associated with this module. + + Except for `Module`, for other types of modules (e.g. `BucketingModule`), this + property might not be a constant throughout its life time. Some modules might + not even be associated with any symbols. 
+ """ + return self._symbol diff --git a/tests/nightly/estimator/test_estimator_cnn.py b/tests/nightly/estimator/test_estimator_cnn.py index b3b0d536af24..e6d8b846f614 100644 --- a/tests/nightly/estimator/test_estimator_cnn.py +++ b/tests/nightly/estimator/test_estimator_cnn.py @@ -116,7 +116,7 @@ def test_estimator_cpu(): # Define estimator est = estimator.Estimator(net=net, loss=loss, - train_metrics=mx.gluon.metric.Accuracy(), + train_metrics=mx.metric.Accuracy(), trainer=trainer, context=context) # Call fit() @@ -140,7 +140,7 @@ def test_estimator_gpu(): train_data, test_data = load_data_mnist(batch_size, resize=224) loss = gluon.loss.SoftmaxCrossEntropyLoss() net.hybridize() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) # Define estimator est = estimator.Estimator(net=net, diff --git a/tests/nightly/estimator/test_sentiment_rnn.py b/tests/nightly/estimator/test_sentiment_rnn.py index 69380389d48e..0e4f39c5687f 100644 --- a/tests/nightly/estimator/test_sentiment_rnn.py +++ b/tests/nightly/estimator/test_sentiment_rnn.py @@ -190,11 +190,11 @@ def run(net, train_dataloader, test_dataloader, num_epochs, ctx, lr): trainer = mx.gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr}) # Define loss and evaluation metrics loss = gluon.loss.SoftmaxCrossEntropyLoss() - metrics = mx.gluon.metric.CompositeEvalMetric() - acc = mx.gluon.metric.Accuracy() - nested_metrics = mx.gluon.metric.CompositeEvalMetric() - metrics.add([acc, mx.gluon.metric.Loss()]) - nested_metrics.add([metrics, mx.gluon.metric.Accuracy()]) + metrics = mx.metric.CompositeEvalMetric() + acc = mx.metric.Accuracy() + nested_metrics = mx.metric.CompositeEvalMetric() + metrics.add([acc, mx.metric.Loss()]) + nested_metrics.add([metrics, mx.metric.Accuracy()]) # Define estimator est = estimator.Estimator(net=net, loss=loss, train_metrics=nested_metrics, diff --git a/tests/nightly/test_optimizer.py 
b/tests/nightly/test_optimizer.py new file mode 100644 index 000000000000..0a87368d991e --- /dev/null +++ b/tests/nightly/test_optimizer.py @@ -0,0 +1,90 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx + +import sys +import os +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.insert(0, os.path.join(curr_path, '../unittest')) +from common import setup_module, with_seed + +# This script is testing the efficiency of LARS +# We are training LeNet-5 at batch-size 8000 in 10 epochs above 98% accuracy +# Which is not doable with simple SGD + momentum (from what have been tested so far) + +def lenet5(): + """LeNet-5 Symbol""" + #pylint: disable=no-member + data = mx.sym.Variable('data') + conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20) + tanh1 = mx.sym.Activation(data=conv1, act_type="tanh") + pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", + kernel=(2, 2), stride=(2, 2)) + # second conv + conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50) + tanh2 = mx.sym.Activation(data=conv2, act_type="tanh") + pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", + kernel=(2, 2), stride=(2, 2)) + # first fullc + flatten = mx.sym.Flatten(data=pool2) + fc1 
= mx.sym.FullyConnected(data=flatten, num_hidden=500) + tanh3 = mx.sym.Activation(data=fc1, act_type="tanh") + # second fullc + fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10) + # loss + lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax') + #pylint: enable=no-member + return lenet + +@with_seed() +def test_lars(): + num_epochs = 10 + batch_size = 8000 + mnist = mx.test_utils.get_mnist() + train_iter = mx.io.NDArrayIter(mnist['train_data'], + mnist['train_label'], + batch_size, + shuffle=True) + test_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) + ctx = mx.gpu(0) + lenet_model = mx.mod.Module(lenet5(), context=ctx) + warmup_epochs = 1 + epoch_it = int(train_iter.num_data / batch_size) + # LARS works best with Polynomial scheduler and warmup + base_lr = 0.01 + optimizer_params={ + 'learning_rate': base_lr, + 'lr_scheduler': mx.lr_scheduler.PolyScheduler(base_lr=base_lr, + max_update=epoch_it * num_epochs, + warmup_steps=epoch_it * warmup_epochs), + 'momentum': 0.9, + 'eta': 14., + } + lenet_model.fit(train_iter, + eval_data=test_iter, + optimizer='lars', + optimizer_params=optimizer_params, + eval_metric='acc', + num_epoch=num_epochs) + + # predict accuracy for lenet + acc = mx.metric.Accuracy() + lenet_model.score(test_iter, acc) + accuracy = acc.get()[1] + assert accuracy > 0.98, "LeNet-5 training accuracy on MNIST was too low" + diff --git a/tests/nightly/test_tlocal_racecondition.py b/tests/nightly/test_tlocal_racecondition.py new file mode 100644 index 000000000000..d43c45937c05 --- /dev/null +++ b/tests/nightly/test_tlocal_racecondition.py @@ -0,0 +1,110 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from mxnet import gluon +from mxnet import image +from mxnet import nd +import numpy as np +import logging + +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +root_url = ('https://apache-mxnet.s3-accelerate.amazonaws.com/' + 'gluon/dataset/pikachu/') +data_dir = './data/pikachu/' +dataset = {'train.rec': 'e6bcb6ffba1ac04ff8a9b1115e650af56ee969c8', + 'train.idx': 'dcf7318b2602c06428b9988470c731621716c393', + 'val.rec': 'd6c33f799b4d058e82f2cb5bd9a976f69d72d520'} +for k, v in dataset.items(): + gluon.utils.download(root_url+k, data_dir+k, sha1_hash=v) + +T = 1 +devs = [mx.gpu(i) for i in range(4)] +data_shape = 224 * T +batch_size = 20 * len(devs) +rgb_mean = np.array([1,2,3]) + +class_names = ['pikachu'] +num_class = len(class_names) + +def get_iterators(data_shape, batch_size): + train_iter = image.ImageDetIter( + batch_size=batch_size, + data_shape=(3, data_shape, data_shape), + path_imgrec=data_dir+'train.rec', + path_imgidx=data_dir+'train.idx', + shuffle=True, + mean=True, + rand_crop=1, + min_object_covered=0.95, + max_attempts=200) + val_iter = image.ImageDetIter( + batch_size=batch_size, + data_shape=(3, data_shape, data_shape), + path_imgrec=data_dir+'val.rec', + shuffle=False, + mean=True) + return train_iter, val_iter, class_names, num_class + +train_data, test_data, class_names, num_class = get_iterators( + data_shape, batch_size) + + +class 
MyCustom(mx.operator.CustomOp): + def __init__(self): + super(MyCustom, self).__init__() + def forward(self, is_train, req, in_data, out_data, aux): + self.assign(out_data[0], req[0], 0) + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + self.assign(in_grad[0], req[0], 0) + self.assign(in_grad[1], req[1], 0) + +@mx.operator.register("MyCustom") +class MyCustomProp(mx.operator.CustomOpProp): + def __init__(self): + super(MyCustomProp, self).__init__(need_top_grad = False) + def list_arguments(self): + return ["data", "label"] + def list_outputs(self): + return ["loss"] + def infer_shape(self, in_shape): + return [in_shape[0], in_shape[1]], [(1, )], [] + def infer_type(self, in_type): + dtype = in_type[0] + return [dtype, dtype], [dtype], [] + def create_operator(self, ctx, shapes, dtypes): + return MyCustom() + +class MyMetric(mx.metric.EvalMetric): + def __init__(self): + super(MyMetric, self).__init__("MyMetric") + self.name = ['empty'] + def update(self, labels, preds): + pass + def get(self): + return self.name, [0] + +if __name__ == '__main__': + x = mx.sym.Variable("data") + label = mx.sym.Variable("label") + x = mx.sym.FullyConnected(data = x, num_hidden = 100) + label = mx.sym.Reshape(data = label, shape = (0, -1)) + sym = mx.sym.Custom(data = x, label = label, op_type = "MyCustom") + model = mx.module.Module(context = devs, symbol = sym, data_names = ('data',), label_names = ('label',)) + model.fit(train_data = train_data, begin_epoch = 0, num_epoch = 20, allow_missing = True, batch_end_callback = mx.callback.Speedometer(batch_size, 5), eval_metric = MyMetric()) diff --git a/tests/python/gpu/test_contrib_amp.py b/tests/python/gpu/test_contrib_amp.py index d7a6e80b8982..86126744a127 100644 --- a/tests/python/gpu/test_contrib_amp.py +++ b/tests/python/gpu/test_contrib_amp.py @@ -103,6 +103,255 @@ def test_amp_coverage(amp_tests): - If you are not sure which list to choose, FP32_FUNCS is the safest option""") 
+@pytest.mark.skip(reason='Error during waitall(). Tracked in #18099') +@with_seed() +def test_amp_conversion(amp_tests): + def check_amp_convert_symbol(): + x = mx.sym.var("x") + y = mx.sym.var("y") + z = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True) + siny = mx.sym.sin(y) + res = z + siny + # Compare symbols with similar computation graphs created using convert_symbol and manually. + res_converted = amp.convert_symbol(res, target_dtype="float16", + target_dtype_ops=["FullyConnected"], + fp32_ops=["sin"]) + + x_fp16 = mx.sym.amp_cast(x, dtype="float16") + y_fp16 = mx.sym.amp_cast(y, dtype="float16") + siny = mx.sym.sin(y) + z = mx.sym.FullyConnected(x_fp16, y_fp16, num_hidden=10, no_bias=True) + amp_casted_z = mx.sym.amp_cast(z, dtype="float32") + res_expected = amp_casted_z + siny + assert same_symbol_structure(res_converted, res_expected), \ + "convert_symbol generating wrong computation graph" + + # convert_symbol called with incorrect inputs + pytest.raises(AssertionError, amp.convert_symbol, res, + target_dtype="float16", target_dtype_ops=["FullyConnected"], + fp32_ops=["elemwise_add"]) + pytest.raises(AssertionError, amp.convert_symbol, res, + target_dtype="float16", target_dtype_ops=["FullyConnected"], + fp32_ops=["Activation"], + conditional_fp32_ops=[('Activation', 'act_type', ['selu'])]) + pytest.raises(AssertionError, amp.convert_symbol, res, + target_dtype="float16", target_dtype_ops=["Activation"], + fp32_ops=["Activation"], + conditional_fp32_ops=[('Activation', 'act_type', ['selu'])]) + pytest.raises(AssertionError, amp.convert_symbol, res, + target_dtype="float16", target_dtype_ops=["FullyConnected"], + fp32_ops=["FullyConnected"]) + + # Test for op in conditional ops with condition not satisfied + x = mx.sym.var("x") + y = mx.sym.var("y") + fc_cond = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True) + res_converted = amp.convert_symbol(fc_cond, target_dtype="float16", + target_dtype_ops=[], + fp32_ops=["sin"], + 
conditional_fp32_ops=[("FullyConnected", "no_bias", ["False"])]) + + res_expected = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True) + assert same_symbol_structure(res_converted, res_expected), \ + "convert_symbol generating wrong computation graph when conditional ops is used" + + # Test for op in conditional ops with condition satisfied + res_converted = amp.convert_symbol(fc_cond, target_dtype="float16", target_dtype_ops=[], + fp32_ops=["sin"], + conditional_fp32_ops=[("FullyConnected", "no_bias", ["True"])]) + x_fp32 = mx.sym.amp_cast(x, dtype="float32") + y_fp32 = mx.sym.amp_cast(y, dtype="float32") + res_expected = mx.sym.FullyConnected(x_fp32, y_fp32, num_hidden=10, no_bias=True) + assert same_symbol_structure(res_converted, res_expected), \ + "convert_symbol generating wrong computation graph when conditional ops used with satisfying condition" + + # Test with a real world model, default inputs for convert_symbol + dir_path = os.path.dirname(os.path.realpath(__file__)) + model_path = os.path.join(dir_path, 'model') + if not os.path.isdir(model_path): + os.mkdir(model_path) + + prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + inputs = {} + inputs['data'] = mx.nd.ones((1, 3, 224, 224)) + inputs.update(arg_params) + converted_sym = amp.convert_symbol(sym) + exe = converted_sym.simple_bind(mx.gpu(0), data=(1, 3, 224, 224), grad_req='null') + exe.forward(is_train=False, **inputs) + exe.outputs[0].asnumpy() + + inputs2 = {} + inputs2['data'] = mx.nd.ones((1, 3, 224, 224)) + inputs2['fc1_weight'] = inputs['fc1_weight'].astype(np.float16) + inputs2['fc1_bias'] = inputs['fc1_bias'].astype(np.float16) + + # Test with a real world model, tweak inputs for convert_symbol + converted_sym = amp.convert_symbol(sym, target_dtype="float16", + target_dtype_ops=["Convolution"], data_names=["data"], + cast_optional_params=True) + converted_sym2 = amp.convert_symbol(sym, 
target_dtype="float16", + target_dtype_ops=["Convolution"], data_names=["data"], + cast_optional_params=False) + + exe = converted_sym.simple_bind(mx.gpu(0), data=(1, 3, 224, 224), grad_req='null') + exe2 = converted_sym2.simple_bind(mx.gpu(), data=(1, 3, 224, 224), grad_req='null') + + converted_args = converted_sym.list_arguments() + converted_auxs = converted_sym.list_auxiliary_states() + for i, key in enumerate(exe.arg_arrays): + if converted_args[i] in arg_params: + arg_params[converted_args[i]] = arg_params[converted_args[i]].astype(exe.arg_arrays[i].dtype) + for i, key in enumerate(exe.aux_arrays): + if converted_auxs[i] in aux_params: + aux_params[converted_auxs[i]] = aux_params[converted_auxs[i]].astype(exe.aux_arrays[i].dtype) + + inputs2.update(arg_params) + exe.forward(is_train=False, **inputs2) + exe.outputs[0].wait_to_read() + + inputs['fc1_weight'] = inputs['fc1_weight'].astype(np.float16) + inputs['fc1_bias'] = inputs['fc1_bias'].astype(np.float16) + exe2.forward(is_train=False, **inputs) + exe2.outputs[0].wait_to_read() + + + def check_amp_convert_model(): + # Test with real world model, default inputs for convert_model + dir_path = os.path.dirname(os.path.realpath(__file__)) + model_path = os.path.join(dir_path, 'model') + if not os.path.isdir(model_path): + os.mkdir(model_path) + prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) + + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + + # Test with real world model, tweak inputs for convert_model + result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, + arg_params, + aux_params, + target_dtype="float16", + target_dtype_ops=["Convolution"]) + mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.gpu()) + mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]]) + + mod.set_params(result_arg_params, result_aux_params) + 
mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], + label=[mx.nd.ones((1,))])) + mod.get_outputs()[0].asnumpy() + assert mod._arg_params["stage2_unit1_conv2_weight"].dtype == np.float32 + + # Call convert_model with cast_optional_params set to True + result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, + arg_params, + aux_params, + target_dtype="float16", + target_dtype_ops=["Convolution"], cast_optional_params=True) + mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.gpu()) + mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]]) + mod.set_params(result_arg_params, result_aux_params) + mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], + label=[mx.nd.ones((1,))])) + mod.get_outputs()[0].asnumpy() + assert mod._arg_params["stage2_unit1_conv2_weight"].dtype == np.float16 + + + def check_amp_convert_hybrid_block(): + # Test conversion for hybrid block on CPU + model_cpu = get_model("resnet50_v1") + model_cpu.collect_params().initialize(ctx=mx.cpu()) + model_cpu.hybridize() + model_cpu(mx.nd.random.uniform(0, 1, shape=(1, 3, 224, 224), ctx=mx.cpu())) + converted_model_cpu = amp.convert_hybrid_block(model_cpu) + + # Test with real world model, default inputs for convert_hybrid_block + model = get_model("resnet50_v1") + model.collect_params().initialize(ctx=mx.gpu()) + model.hybridize() + model(mx.nd.zeros((1, 3, 224, 224))) + converted_model = amp.convert_hybrid_block(model) + result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224), + dtype=np.float32)) + result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224), + dtype=np.float32)) + + # Test with real world model, tweak inputs for convert_hybrid_block + converted_model = amp.convert_hybrid_block(model, target_dtype="float16", + target_dtype_ops=["Convolution"]) + result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224), + dtype=np.float32)) + result = 
converted_model.forward(mx.nd.zeros((1, 3, 224, 224), + dtype=np.float32)) + + # Check symbolic block + dir_path = os.path.dirname(os.path.realpath(__file__)) + model_path = os.path.join(dir_path, 'model') + if not os.path.isdir(model_path): + os.mkdir(model_path) + prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) + net = SymbolBlock.imports(os.path.join(model_path, "imagenet1k-resnet-18-symbol.json"), + input_names=["data", "softmax_label"], + param_file=os.path.join(model_path, "imagenet1k-resnet-18-0000.params")) + net.collect_params().reset_ctx(ctx=mx.gpu()) + net.hybridize() + net(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,))) + converted_model = amp.convert_hybrid_block(net) + result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,))) + result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,))) + + # Check symbolic block, tweaked inputs + converted_model = amp.convert_hybrid_block(net, target_dtype="float16", target_dtype_ops=["Convolution"]) + result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, ))) + result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, ))) + params = converted_model.collect_params() + assert params["stage2_unit1_conv2_weight"].dtype == np.float32 + + # Pass cast_optional_params as True to convert_hybrid_block + converted_model = amp.convert_hybrid_block(net, target_dtype="float16", target_dtype_ops=["Convolution"], + cast_optional_params=True) + params = converted_model.collect_params() + assert params["stage2_unit1_conv2_weight"].dtype == np.float16 + + + def check_amp_convert_bucketing_module(): + model = train_model(context=mx.current_context()) + result_model = amp.convert_bucketing_module(model) + val_sent = [] + batch_size = 128 + invalid_label = -1 + num_sentence = 1000 + buckets = [5, 10, 20, 30, 40] + len_vocab = 50 + + for _ in range(num_sentence): + len_sentence = randint(6, max(buckets)-1) # leave 
out the two last buckets empty + val_sentence = [] + for _ in range(len_sentence): + val_sentence.append(randint(1, len_vocab)) + val_sent.append(val_sentence) + + data_val = mx.rnn.BucketSentenceIter(val_sent, batch_size, buckets=buckets, + invalid_label=invalid_label) + result_model.bind(data_val.provide_data, data_val.provide_label, for_training=False) + result_model.score(data_val, mx.metric.Perplexity(invalid_label), + batch_end_callback=mx.callback.Speedometer(batch_size, 1)) + + # AMP conversion with cast_optional_params set to true + # Flaky test when cast_optional_params set to True : https://github.com/apache/incubator-mxnet/issues/16030 + ''' + result_model = amp.convert_bucketing_module(model, cast_optional_params=True) + result_model.bind(data_val.provide_data, data_val.provide_label, for_training=False) + result_model.score(data_val, mx.metric.Perplexity(invalid_label), + batch_end_callback=mx.callback.Speedometer(batch_size, 1)) + ''' + + + with mx.Context(mx.gpu(0)): + check_amp_convert_symbol() + check_amp_convert_model() + check_amp_convert_hybrid_block() + check_amp_convert_bucketing_module() + @with_seed() @pytest.mark.skip(reason='Error during waitall(). Tracked in #18099') @assert_raises_cudnn_not_satisfied(min_version='5.1.10') diff --git a/tests/python/tensorrt/lenet5_train.py b/tests/python/tensorrt/lenet5_train.py new file mode 100644 index 000000000000..441729fe0d56 --- /dev/null +++ b/tests/python/tensorrt/lenet5_train.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import mxnet as mx +import numpy as np +from tempfile import TemporaryDirectory + +def get_iters(mnist, batch_size): + """Get MNIST iterators.""" + train_iter = mx.io.NDArrayIter(mnist['train_data'], + mnist['train_label'], + batch_size, + shuffle=True) + val_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) + test_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) + all_test_labels = np.array(mnist['test_label']) + return train_iter, val_iter, test_iter, all_test_labels + +def lenet5(): + """LeNet-5 Symbol""" + #pylint: disable=no-member + data = mx.sym.Variable('data') + data = mx.sym.Cast(data, 'float16') + conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20) + tanh1 = mx.sym.Activation(data=conv1, act_type="tanh") + pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", + kernel=(2, 2), stride=(2, 2)) + # second conv + conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50) + tanh2 = mx.sym.Activation(data=conv2, act_type="tanh") + pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", + kernel=(2, 2), stride=(2, 2)) + # first fullc + flatten = mx.sym.Flatten(data=pool2) + fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=500) + tanh3 = mx.sym.Activation(data=fc1, act_type="tanh") + # second fullc + fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10) + fc2 = mx.sym.Cast(fc2, 'float32') + # loss + lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax') + #pylint: enable=no-member + return lenet + + +def train_lenet5(num_epochs, batch_size, train_iter, 
val_iter, test_iter): + """train LeNet-5 model on MNIST data""" + ctx = mx.gpu(0) + lenet_model = mx.mod.Module(lenet5(), context=ctx) + + lenet_model.fit(train_iter, + eval_data=val_iter, + optimizer='sgd', + optimizer_params={'learning_rate': 0.1, 'momentum': 0.9}, + eval_metric='acc', + batch_end_callback=mx.callback.Speedometer(batch_size, 1), + num_epoch=num_epochs) + + # predict accuracy for lenet + acc = mx.metric.Accuracy() + lenet_model.score(test_iter, acc) + accuracy = acc.get()[1] + assert accuracy > 0.95, "LeNet-5 training accuracy on MNIST was too low" + return lenet_model + + +if __name__ == '__main__': + num_epochs = 10 + batch_size = 128 + model_name = 'lenet5' + model_dir = os.getenv("LENET_MODEL_DIR", "/tmp") + model_file = '%s/%s-symbol.json' % (model_dir, model_name) + params_file = '%s/%s-%04d.params' % (model_dir, model_name, num_epochs) + + if not (os.path.exists(model_file) and os.path.exists(params_file)): + with TemporaryDirectory() as path: + mnist = mx.test_utils.get_mnist(path) + + _, _, _, all_test_labels = get_iters(mnist, batch_size) + + trained_lenet = train_lenet5(num_epochs, batch_size, + *get_iters(mnist, batch_size)[:-1]) + trained_lenet.save_checkpoint(model_name, num_epochs) diff --git a/tests/python/tensorrt/test_cvnets.py b/tests/python/tensorrt/test_cvnets.py new file mode 100644 index 000000000000..99312d76dc7a --- /dev/null +++ b/tests/python/tensorrt/test_cvnets.py @@ -0,0 +1,169 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import gc +import gluoncv +import mxnet as mx +import numpy as np + +from mxnet import gluon +from time import time + +from mxnet.gluon.data.vision import transforms + + +def get_classif_model(model_name, use_tensorrt, ctx=mx.gpu(0), batch_size=128): + mx.contrib.tensorrt.set_use_fp16(False) + h, w = 32, 32 + net = gluoncv.model_zoo.get_model(model_name, pretrained=True) + net.hybridize() + net.forward(mx.nd.zeros((batch_size, 3, h, w))) + net.export(model_name) + _sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 0) + if use_tensorrt: + sym = _sym.get_backend_symbol('TensorRT') + arg_params, aux_params = mx.contrib.tensorrt.init_tensorrt_params(sym, arg_params, + aux_params) + else: + sym = _sym + executor = sym.simple_bind(ctx=ctx, data=(batch_size, 3, h, w), + softmax_label=(batch_size,), + grad_req='null', force_rebind=True) + executor.copy_params_from(arg_params, aux_params) + return executor + + +def cifar10_infer(model_name, use_tensorrt, num_workers, ctx=mx.gpu(0), batch_size=128): + executor = get_classif_model(model_name, use_tensorrt, ctx, batch_size) + + num_ex = 10000 + all_preds = np.zeros([num_ex, 10]) + + all_label_test = np.zeros(num_ex) + + transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]) + ]) + + data_loader = lambda: gluon.data.DataLoader( + gluon.data.vision.CIFAR10(train=False).transform_first(transform_test), + batch_size=batch_size, shuffle=False, num_workers=num_workers) + + val_data = data_loader() + + for idx, (data, label) 
in enumerate(val_data): + # Skip last batch if it's undersized. + if data.shape[0] < batch_size: + continue + offset = idx * batch_size + all_label_test[offset:offset + batch_size] = label.asnumpy() + + # warm-up, but don't use result + executor.forward(is_train=False, data=data) + executor.outputs[0].wait_to_read() + + gc.collect() + val_data = data_loader() + example_ct = 0 + start = time() + + # if use_tensorrt: + for idx, (data, label) in enumerate(val_data): + # Skip last batch if it's undersized. + if data.shape[0] < batch_size: + continue + executor.forward(is_train=False, data=data) + preds = executor.outputs[0].asnumpy() + offset = idx * batch_size + all_preds[offset:offset + batch_size, :] = preds[:batch_size] + example_ct += batch_size + + all_preds = np.argmax(all_preds, axis=1) + matches = (all_preds[:example_ct] == all_label_test[:example_ct]).sum() + duration = time() - start + + return duration, 100.0 * matches / example_ct + + +def run_experiment_for(model_name, batch_size, num_workers): + print("\n===========================================") + print("Model: %s" % model_name) + print("===========================================") + print("*** Running inference using pure MXNet ***\n") + mx_duration, mx_pct = cifar10_infer(model_name=model_name, batch_size=batch_size, + num_workers=num_workers, use_tensorrt=False) + print("\nMXNet: time elapsed: %.3fs, accuracy: %.2f%%" % (mx_duration, mx_pct)) + print("\n*** Running inference using MXNet + TensorRT ***\n") + trt_duration, trt_pct = cifar10_infer(model_name=model_name, batch_size=batch_size, + num_workers=num_workers, use_tensorrt=True) + print("TensorRT: time elapsed: %.3fs, accuracy: %.2f%%" % (trt_duration, trt_pct)) + speedup = mx_duration / trt_duration + print("TensorRT speed-up (not counting compilation): %.2fx" % speedup) + + acc_diff = abs(mx_pct - trt_pct) + print("Absolute accuracy difference: %f" % acc_diff) + return speedup, acc_diff + + +def 
test_tensorrt_on_cifar_resnets(batch_size=32, tolerance=0.1, num_workers=1): + original_use_fp16 = mx.contrib.tensorrt.get_use_fp16() + try: + models = [ + 'cifar_resnet20_v1', + 'cifar_resnet56_v1', + 'cifar_resnet110_v1', + 'cifar_resnet20_v2', + 'cifar_resnet56_v2', + 'cifar_resnet110_v2', + 'cifar_wideresnet16_10', + 'cifar_wideresnet28_10', + 'cifar_wideresnet40_8', + 'cifar_resnext29_16x64d' + ] + + num_models = len(models) + + speedups = np.zeros(num_models, dtype=np.float32) + acc_diffs = np.zeros(num_models, dtype=np.float32) + + test_start = time() + + for idx, model in enumerate(models): + speedup, acc_diff = run_experiment_for(model, batch_size, num_workers) + speedups[idx] = speedup + acc_diffs[idx] = acc_diff + assert acc_diff < tolerance, "Accuracy difference between MXNet and TensorRT > %.2f%% for model %s" % ( + tolerance, model) + + print("Perf and correctness checks run on the following models:") + print(models) + mean_speedup = np.mean(speedups) + std_speedup = np.std(speedups) + print("\nSpeedups:") + print(speedups) + print("Speedup range: [%.2f, %.2f]" % (np.min(speedups), np.max(speedups))) + print("Mean speedup: %.2f" % mean_speedup) + print("St. dev. of speedups: %.2f" % std_speedup) + print("\nAcc. 
differences: %s" % str(acc_diffs)) + + test_duration = time() - test_start + + print("Test duration: %.2f seconds" % test_duration) + finally: + mx.contrib.tensorrt.set_use_fp16(original_use_fp16) + diff --git a/tests/python/train/test_autograd.py b/tests/python/train/test_autograd.py index f0fdc5ea2576..02a3601eb362 100644 --- a/tests/python/train/test_autograd.py +++ b/tests/python/train/test_autograd.py @@ -55,7 +55,7 @@ def get_net(): batch_size=batch_size, shuffle=True, flat=True, silent=False) def score(net, ctx_list): - metric = gluon.metric.Accuracy() + metric = mx.metric.Accuracy() val_data.reset() for batch in val_data: datas = gluon.utils.split_and_load(batch.data[0], ctx_list, batch_axis=0) @@ -69,7 +69,7 @@ def score(net, ctx_list): def train(net, epoch, ctx_list): net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx_list) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.5}) - metric = gluon.metric.Accuracy() + metric = mx.metric.Accuracy() loss = gluon.loss.SoftmaxCrossEntropyLoss() for i in range(epoch): diff --git a/tests/python/train/test_bucketing.py b/tests/python/train/test_bucketing.py new file mode 100644 index 000000000000..a233e46e0992 --- /dev/null +++ b/tests/python/train/test_bucketing.py @@ -0,0 +1,122 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file +import numpy as np +import mxnet as mx +import random +from random import randint +from mxnet.contrib.amp import amp + + +def prepare_bucketing_data(buckets, len_vocab, batch_size, invalid_label, num_sentence): + train_sent = [] + val_sent = [] + + for _ in range(num_sentence): + len_sentence = randint(6, max(buckets)-1) # leave out the two last buckets empty + train_sentence = [] + val_sentence = [] + for _ in range(len_sentence): + train_sentence.append(randint(1, len_vocab)) + val_sentence.append(randint(1, len_vocab)) + train_sent.append(train_sentence) + val_sent.append(val_sentence) + + data_train = mx.rnn.BucketSentenceIter(train_sent, batch_size, buckets=buckets, + invalid_label=invalid_label) + data_val = mx.rnn.BucketSentenceIter(val_sent, batch_size, buckets=buckets, + invalid_label=invalid_label) + + return (data_train, data_val) + + +def train_model(context=mx.cpu()): + import logging + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=logging.DEBUG, format=head) + console = logging.StreamHandler() + console.setLevel(logging.DEBUG) + logging.getLogger('').addHandler(console) + + batch_size = 128 + num_epochs = 5 + num_hidden = 25 + num_embed = 25 + num_layers = 2 + len_vocab = 50 + buckets = [5, 10, 20, 30, 40] + + invalid_label = -1 + num_sentence = 1000 + + data_train, data_val = prepare_bucketing_data(buckets, len_vocab, batch_size, invalid_label, num_sentence) + + stack = mx.rnn.SequentialRNNCell() + for i in range(num_layers): + stack.add(mx.rnn.LSTMCell(num_hidden=num_hidden, prefix='lstm_l%d_' % i)) + + def sym_gen(seq_len): + data = mx.sym.Variable('data') + label = mx.sym.Variable('softmax_label') + embed = mx.sym.Embedding(data=data, input_dim=len_vocab, + output_dim=num_embed, name='embed') + + stack.reset() + outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True) + + pred = 
mx.sym.Reshape(outputs, shape=(-1, num_hidden)) + pred = mx.sym.FullyConnected(data=pred, num_hidden=len_vocab, name='pred') + + label = mx.sym.Reshape(label, shape=(-1,)) + loss = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax') + + return loss, ('data',), ('softmax_label',) + + contexts = context + + model = mx.mod.BucketingModule( + sym_gen=sym_gen, + default_bucket_key=data_train.default_bucket_key, + context=contexts) + + logging.info('Begin fit...') + model.fit( + train_data=data_train, + eval_data=data_val, + eval_metric=mx.metric.Perplexity(invalid_label), # Use Perplexity for multiclass classification. + kvstore='device', + optimizer='sgd', + optimizer_params={'learning_rate': 0.01, + 'momentum': 0, + 'wd': 0.00001}, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), + num_epoch=num_epochs, + batch_end_callback=mx.callback.Speedometer(batch_size, 50)) + logging.info('Finished fit...') + return model + + +def test_bucket_module(): + # This test forecasts random sequence of words to check bucketing. + # We cannot guarantee the accuracy of such an impossible task, and comments out the following line. + # assert model.score(data_val, mx.metric.MSE())[0][1] < 350, "High mean square error." + model = train_model() + + +if __name__ == "__main__": + test_bucket_module() diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py new file mode 100644 index 000000000000..80885b33f955 --- /dev/null +++ b/tests/python/train/test_mlp.py @@ -0,0 +1,114 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file +import mxnet as mx +import numpy as np +import os, sys +import pickle as pickle +import logging +from mxnet.test_utils import get_mnist_ubyte + + +def test_mlp(tmpdir): + # symbol net + batch_size = 100 + data = mx.symbol.Variable('data') + fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128) + act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") + fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64) + act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") + fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) + softmax = mx.symbol.SoftmaxOutput(fc3, name = 'sm') + + def accuracy(label, pred): + py = np.argmax(pred, axis=1) + return np.sum(py == label) / float(label.size) + + num_epoch = 4 + prefix = './mlp' + + #check data + path = str(tmpdir) + get_mnist_ubyte(path) + + train_dataiter = mx.io.MNISTIter( + image=os.path.join(path, 'train-images-idx3-ubyte'), + label=os.path.join(path, 'train-labels-idx1-ubyte'), + data_shape=(784,), + label_name='sm_label', + batch_size=batch_size, shuffle=True, flat=True, silent=False, seed=10) + val_dataiter = mx.io.MNISTIter( + image=os.path.join(path, 't10k-images-idx3-ubyte'), + label=os.path.join(path, 't10k-labels-idx1-ubyte'), + data_shape=(784,), + label_name='sm_label', + batch_size=batch_size, shuffle=True, flat=True, silent=False) + # print logging by default + logging.basicConfig(level=logging.DEBUG) + + model = mx.model.FeedForward.create( + softmax, + X=train_dataiter, + eval_data=val_dataiter, + 
eval_metric=mx.metric.np(accuracy), + epoch_end_callback=mx.callback.do_checkpoint(prefix), + ctx=[mx.cpu(i) for i in range(2)], + num_epoch=num_epoch, + learning_rate=0.1, wd=0.0004, + momentum=0.9) + + logging.info('Finish traning...') + prob = model.predict(val_dataiter) + logging.info('Finish predict...') + val_dataiter.reset() + y = np.concatenate([batch.label[0].asnumpy() for batch in val_dataiter]).astype('int') + py = np.argmax(prob, axis=1) + acc1 = float(np.sum(py == y)) / len(y) + logging.info('final accuracy = %f', acc1) + assert(acc1 > 0.94) + + # predict internal featuremaps + internals = softmax.get_internals() + fc2 = internals['fc2_output'] + mfeat = mx.model.FeedForward(symbol=fc2, + arg_params=model.arg_params, + aux_params=model.aux_params, + allow_extra_params=True) + feat = mfeat.predict(val_dataiter) + assert feat.shape == (10000, 64) + # pickle the model + smodel = pickle.dumps(model) + model2 = pickle.loads(smodel) + prob2 = model2.predict(val_dataiter) + assert np.sum(np.abs(prob - prob2)) == 0 + + # load model from checkpoint + model3 = mx.model.FeedForward.load(prefix, num_epoch) + prob3 = model3.predict(val_dataiter) + assert np.sum(np.abs(prob - prob3)) == 0 + + # save model explicitly + model.save(prefix, 128) + model4 = mx.model.FeedForward.load(prefix, 128) + prob4 = model4.predict(val_dataiter) + assert np.sum(np.abs(prob - prob4)) == 0 + + for i in range(num_epoch): + os.remove('%s-%04d.params' % (prefix, i + 1)) + os.remove('%s-symbol.json' % prefix) + os.remove('%s-0128.params' % prefix) diff --git a/tests/python/train/test_sparse_fm.py b/tests/python/train/test_sparse_fm.py new file mode 100644 index 000000000000..76a2705fe4e5 --- /dev/null +++ b/tests/python/train/test_sparse_fm.py @@ -0,0 +1,144 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +import mxnet.ndarray as nd +from mxnet.test_utils import * +import numpy as np +import os +import sys +CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.insert(0, os.path.join(CURR_PATH, '../unittest')) +from common import retry + +@retry(5) +def test_factorization_machine_module(verbose=False): + """ Test factorization machine model with sparse operators """ + def check_factorization_machine_module(optimizer=None, num_epochs=None): + print("check_factorization_machine_module( {} )".format(optimizer)) + + def fm(factor_size, feature_dim, init): + x = mx.symbol.Variable("data", stype='csr') + v = mx.symbol.Variable("v", shape=(feature_dim, factor_size), + init=init, stype='row_sparse') + + w1_weight = mx.symbol.var('w1_weight', shape=(feature_dim, 1), + init=init, stype='row_sparse') + w1_bias = mx.symbol.var('w1_bias', shape=(1)) + w1 = mx.symbol.broadcast_add(mx.symbol.dot(x, w1_weight), w1_bias) + + v_s = mx.symbol._internal._square_sum(data=v, axis=1, keepdims=True) + x_s = mx.symbol.square(data=x) + bd_sum = mx.sym.dot(x_s, v_s) + + w2 = mx.symbol.dot(x, v) + w2_squared = 0.5 * mx.symbol.square(data=w2) + + w_all = mx.symbol.Concat(w1, w2_squared, dim=1) + sum1 = mx.symbol.sum(data=w_all, axis=1, keepdims=True) + sum2 = 0.5 * mx.symbol.negative(bd_sum) + model = mx.sym.elemwise_add(sum1, sum2) + + y = mx.symbol.Variable("label") + model 
= mx.symbol.LinearRegressionOutput(data=model, label=y) + return model + + # model + init = mx.initializer.Normal(sigma=0.01) + factor_size = 4 + feature_dim = 10000 + model = fm(factor_size, feature_dim, init) + + # data iter + num_batches = 5 + batch_size = 64 + num_samples = batch_size * num_batches + # generate some random csr data + csr_nd = rand_ndarray((num_samples, feature_dim), 'csr', 0.1) + label = mx.nd.ones((num_samples,1)) + # the alternative is to use LibSVMIter + train_iter = mx.io.NDArrayIter(data=csr_nd, + label={'label':label}, + batch_size=batch_size, + last_batch_handle='discard') + # create module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) + # allocate memory by given the input data and lable shapes + mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) + # initialize parameters by uniform random numbers + mod.init_params(initializer=init) + if optimizer == 'sgd': + # use Sparse SGD with learning rate 0.1 to train + sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=sgd) + if num_epochs is None: + num_epochs = 10 + expected_accuracy = 0.02 + elif optimizer == 'adam': + # use Sparse Adam to train + adam = mx.optimizer.Adam(clip_gradient=5.0, learning_rate=0.0005, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=adam) + if num_epochs is None: + num_epochs = 10 + expected_accuracy = 0.05 + elif optimizer == 'adagrad': + # use Sparse AdaGrad with learning rate 0.1 to train + adagrad = mx.optimizer.AdaGrad(clip_gradient=5.0, learning_rate=0.01, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=adagrad) + if num_epochs is None: + num_epochs = 20 + expected_accuracy = 0.09 + else: + raise AssertionError("Unsupported optimizer type '" + optimizer + "' specified") + # use accuracy as the metric + metric = mx.metric.create('MSE') + # train 'num_epochs' epoch + for epoch in 
range(num_epochs): + train_iter.reset() + metric.reset() + for batch in train_iter: + mod.forward(batch, is_train=True) # compute predictions + mod.update_metric(metric, batch.label) # accumulate prediction accuracy + mod.backward() # compute gradients + mod.update() # update parameters + print('Epoch %d, Training %s' % (epoch, metric.get())) + if num_epochs > 1: + assert(metric.get()[1] < expected_accuracy) + + if verbose is True: + print("============ SGD ==========================") + start = time.clock() + check_factorization_machine_module('sgd') + if verbose is True: + print("Duration: {}".format(time.clock() - start)) + print("============ ADAM ==========================") + start = time.clock() + check_factorization_machine_module('adam') + if verbose is True: + print("Duration: {}".format(time.clock() - start)) + print("============ ADAGRAD ==========================") + start = time.clock() + check_factorization_machine_module('adagrad') + if verbose is True: + print("Duration: {}".format(time.clock() - start)) + +# run as a script +if __name__ == "__main__": + test_factorization_machine_module() diff --git a/tests/python/unittest/test_contrib_svrg_module.py b/tests/python/unittest/test_contrib_svrg_module.py new file mode 100644 index 000000000000..e9509f743f73 --- /dev/null +++ b/tests/python/unittest/test_contrib_svrg_module.py @@ -0,0 +1,307 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +import numpy as np +from common import with_seed, assertRaises +from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule +from mxnet.test_utils import * +import unittest + +def setup(): + train_data = np.random.randint(1, 5, [1000, 2]) + weights = np.array([1.0, 2.0]) + train_label = train_data.dot(weights) + + di = mx.io.NDArrayIter(train_data, train_label, batch_size=32, shuffle=True, label_name='lin_reg_label') + X = mx.sym.Variable('data') + Y = mx.symbol.Variable('lin_reg_label') + fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) + lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") + + mod = SVRGModule( + symbol=lro, + data_names=['data'], + label_names=['lin_reg_label'], update_freq=2) + mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) + mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, force_init=False, allow_extra=False) + + return di, mod + + +def test_bind_module(): + _, mod = setup() + assert mod.binded == True + assert mod._mod_aux.binded == True + + +def test_module_init(): + _, mod = setup() + assert mod._mod_aux is not None + + +def test_module_initializer(): + def regression_model(m): + x = mx.symbol.var("data", stype='csr') + v = mx.symbol.var("v", shape=(m, 1), init=mx.init.Uniform(scale=.1), + stype='row_sparse') + model = mx.symbol.dot(lhs=x, rhs=v) + y = mx.symbol.Variable("label") + model = mx.symbol.LinearRegressionOutput(data=model, label=y, name="out") + return model + + #shape of the data + n, 
m = 128, 100 + model = regression_model(m) + + data = mx.nd.zeros(shape=(n, m), stype='csr') + label = mx.nd.zeros((n, 1)) + iterator = mx.io.NDArrayIter(data=data, label={'label': label}, + batch_size=n, last_batch_handle='discard') + + # create module + mod = SVRGModule(symbol=model, data_names=['data'], label_names=['label'], update_freq=2) + mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label) + mod.init_params() + v = mod._arg_params['v'] + assert v.stype == 'row_sparse' + assert np.sum(v.asnumpy()) != 0 + + +def test_module_bind(): + x = mx.sym.Variable("data") + net = mx.sym.FullyConnected(x, num_hidden=1) + + mod = SVRGModule(symbol=net, data_names=['data'], label_names=None, update_freq=2) + assertRaises(TypeError, mod.bind, data_shapes=['data', mx.nd.zeros(shape=(2, 1))]) + + mod.bind(data_shapes=[('data', (2, 1))]) + assert mod.binded == True + assert mod._mod_aux.binded == True + + +@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@with_seed() +def test_module_save_load(tmpdir): + import os + + x = mx.sym.Variable("data") + y = mx.sym.Variable("softmax_label") + net = mx.sym.FullyConnected(x, y, num_hidden=1) + + mod = SVRGModule(symbol=net, data_names=['data'], label_names=['softmax_label'], update_freq=2) + mod.bind(data_shapes=[('data', (1, 1))]) + mod.init_params() + mod.init_optimizer(optimizer='sgd', optimizer_params={'learning_rate': 0.1}) + mod.update() + + tmp = str(tmpdir) + tmp_file = os.path.join(tmp, 'svrg_test_output') + mod.save_checkpoint(tmp_file, 0, save_optimizer_states=True) + + mod2 = SVRGModule.load(tmp_file, 0, load_optimizer_states=True, data_names=('data', )) + mod2.bind(data_shapes=[('data', (1, 1))]) + mod2.init_optimizer(optimizer_params={'learning_rate': 0.1}) + assert mod._symbol.tojson() == mod2._symbol.tojson() + + # Multi-device + mod3 = SVRGModule(symbol=net, data_names=['data'], label_names=['softmax_label'], update_freq=3, + context=[mx.cpu(0),
mx.cpu(1)]) + mod3.bind(data_shapes=[('data', (10, 10))]) + mod3.init_params() + mod3.init_optimizer(optimizer_params={'learning_rate': 1.0}) + mod3.update() + mod3.save_checkpoint(tmp_file, 0, save_optimizer_states=True) + + mod4 = SVRGModule.load(tmp_file, 0, load_optimizer_states=True, data_names=('data', )) + mod4.bind(data_shapes=[('data', (10, 10))]) + mod4.init_optimizer(optimizer_params={'learning_rate': 1.0}) + assert mod3._symbol.tojson() == mod4._symbol.tojson() + + +@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@with_seed() +def test_svrgmodule_reshape(): + data = mx.sym.Variable("data") + sym = mx.sym.FullyConnected(data=data, num_hidden=4, name='fc') + + dshape=(3, 4) + mod = SVRGModule(sym, data_names=["data"], label_names=None, context=[mx.cpu(0), mx.cpu(1)], update_freq=2) + mod.bind(data_shapes=[('data', dshape)]) + mod.init_params() + mod._mod_aux.init_params() + mod.init_optimizer(optimizer_params={"learning_rate": 1.0}) + + data_batch = mx.io.DataBatch(data=[mx.nd.ones(dshape)], label=None) + mod.forward(data_batch) + mod.backward([mx.nd.ones(dshape)]) + mod.update() + assert mod.get_outputs()[0].shape == dshape + + dshape = (2, 4) + mod.reshape(data_shapes=[('data', dshape)]) + mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape)], + label=None)) + mod.backward([mx.nd.ones(dshape)]) + mod.update() + assert mod.get_outputs()[0].shape == dshape + + +@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@with_seed() +def test_update_full_grad(): + def create_network(): + train_data = np.random.randint(1, 5, [10, 2]) + weights = np.array([1.0, 2.0]) + train_label = train_data.dot(weights) + + di = mx.io.NDArrayIter(train_data, train_label, batch_size=5, shuffle=True, label_name='lin_reg_label') + X = mx.sym.Variable('data') + Y = mx.symbol.Variable('lin_reg_label') + fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) + lro = 
mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") + + mod = SVRGModule( + symbol=lro, + data_names=['data'], + label_names=['lin_reg_label'], update_freq=2) + mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) + mod.init_params(initializer=mx.init.One(), allow_missing=False, force_init=False, allow_extra=False) + mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), + force_init=False) + return di, mod + + di, svrg_mod = create_network() + + # Calculates the average of full gradients over number batches + full_grads_weights = mx.nd.zeros(shape=svrg_mod.get_params()[0]['fc1_weight'].shape) + arg, aux = svrg_mod.get_params() + svrg_mod._mod_aux.set_params(arg_params=arg, aux_params=aux) + num_batch = 2 + + for batch in di: + svrg_mod.forward(batch) + svrg_mod.backward() + full_grads_weights = mx.nd.broadcast_add(svrg_mod._exec_group.grad_arrays[0][0], full_grads_weights, axis=0) + full_grads_weights /= num_batch + + di.reset() + svrg_mod.update_full_grads(di) + assert same(full_grads_weights, svrg_mod._param_dict[0]['fc1_weight']) + + +@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@with_seed() +def test_svrg_with_sgd(): + def create_module_with_sgd(): + train_data = np.random.randint(1, 5, [100, 2]) + weights = np.array([1.0, 2.0]) + train_label = train_data.dot(weights) + + di = mx.io.NDArrayIter(train_data, train_label, batch_size=10, shuffle=True, label_name='lin_reg_label') + X = mx.sym.Variable('data') + Y = mx.symbol.Variable('lin_reg_label') + fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) + lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") + + reg_mod = mx.mod.Module( + symbol=lro, + data_names=['data'], + label_names=['lin_reg_label']) + reg_mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) + reg_mod.init_params(initializer=mx.init.One(), 
allow_missing=False, force_init=False, allow_extra=False) + reg_mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01),)) + + svrg_mod = SVRGModule(symbol=lro, + data_names=['data'], + label_names=['lin_reg_label'], + update_freq=2) + svrg_mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) + svrg_mod.init_params(initializer=mx.init.One(), allow_missing=False, force_init=False, allow_extra=False) + svrg_mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01),)) + + return di,reg_mod, svrg_mod + + di, reg_mod, svrg_mod = create_module_with_sgd() + num_epoch = 10 + + # Use metric MSE + metrics = mx.metric.create("mse") + + # Train with SVRGModule + for e in range(num_epoch): + metrics.reset() + if e % svrg_mod.update_freq == 0: + svrg_mod.update_full_grads(di) + di.reset() + for batch in di: + svrg_mod.forward_backward(data_batch=batch) + svrg_mod.update() + svrg_mod.update_metric(metrics, batch.label) + svrg_mse = metrics.get()[1] + + # Train with SGD standard Module + di.reset() + for e in range(num_epoch): + metrics.reset() + di.reset() + for batch in di: + reg_mod.forward_backward(data_batch=batch) + reg_mod.update() + reg_mod.update_metric(metrics, batch.label) + sgd_mse = metrics.get()[1] + + assert svrg_mse < sgd_mse + + +@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@with_seed() +def test_accumulate_kvstore(): + # Test KVStore behavior when push a list of values + kv = mx.kv.create('local') + kv.init("fc1_weight", mx.nd.zeros(shape=(1, 2))) + kv.init("fc1_weight_full", mx.nd.zeros(shape=(1, 2))) + b = [mx.nd.ones(shape=(1, 2)) for i in range(4)] + a = mx.nd.zeros(shape=(1, 2)) + kv.push("fc1_weight_full", b) + kv.pull("fc1_weight_full", out=a) + assert same(a, [mx.nd.array([4, 4])]) + assert kv.num_workers == 1 + + # Test accumulate in KVStore and allocate gradients + kv_test = mx.kv.create('local') + _, svrg_mod = setup() + 
svrg_mod.init_optimizer(kvstore=kv_test, optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), + force_init=False) + svrg_mod._accumulate_kvstore("fc1_weight", b) + assert len(svrg_mod._param_dict) == svrg_mod._ctx_len + assert same(svrg_mod._param_dict[0]["fc1_weight"], b[0]) + + +@unittest.skip("Flaky test https://github.com/apache/incubator-mxnet/issues/12510") +@with_seed() +def test_fit(): + di, mod = setup() + num_epoch = 100 + metric = mx.metric.create("mse") + mod.fit(di, eval_metric=metric, optimizer='sgd', optimizer_params=(('learning_rate', 0.025),), num_epoch=num_epoch, + kvstore='local') + + # Estimated MSE for using SGD optimizer of lr = 0.025, SVRG MSE should be smaller + estimated_mse = 1e-5 + assert metric.get()[1] < estimated_mse + diff --git a/tests/python/unittest/test_gluon_batch_processor.py b/tests/python/unittest/test_gluon_batch_processor.py index bff80813bb12..952ed1c4a0da 100644 --- a/tests/python/unittest/test_gluon_batch_processor.py +++ b/tests/python/unittest/test_gluon_batch_processor.py @@ -52,7 +52,7 @@ def test_batch_processor_fit(): num_epochs = 1 ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() net.initialize(ctx=ctx) processor = BatchProcessor() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) @@ -83,7 +83,7 @@ def test_batch_processor_validation(): num_epochs = 1 ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() val_loss = gluon.loss.L1Loss() net.initialize(ctx=ctx) processor = BatchProcessor() diff --git a/tests/python/unittest/test_gluon_estimator.py b/tests/python/unittest/test_gluon_estimator.py index 844c8b2b857f..93fceab3ed9e 100644 --- a/tests/python/unittest/test_gluon_estimator.py +++ b/tests/python/unittest/test_gluon_estimator.py @@ -58,7 +58,7 @@ def test_fit(): num_epochs = 1 ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.gluon.metric.Accuracy() + acc = 
mx.metric.Accuracy() net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, @@ -87,7 +87,7 @@ def test_validation(): num_epochs = 1 ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() val_loss = gluon.loss.L1Loss() net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) @@ -126,7 +126,7 @@ def test_initializer(): ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() # no initializer est = Estimator(net=net, loss=loss, @@ -166,7 +166,7 @@ def test_trainer(): ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() net.initialize(ctx=ctx) # input no trainer with warnings.catch_warnings(record=True) as w: @@ -206,7 +206,7 @@ def test_metric(): est.fit(train_data=train_data, epochs=num_epochs) # input list of metrics - metrics = [mx.gluon.metric.Accuracy(), mx.gluon.metric.Accuracy()] + metrics = [mx.metric.Accuracy(), mx.metric.Accuracy()] est = Estimator(net=net, loss=loss, train_metrics=metrics, @@ -227,14 +227,14 @@ def test_metric(): loss=loss, trainer=trainer, context=ctx) - assert isinstance(est.train_metrics[0], mx.gluon.metric.Accuracy) + assert isinstance(est.train_metrics[0], mx.metric.Accuracy) def test_loss(): ''' test with invalid loss ''' net = _get_test_network() ctx = mx.cpu() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) # input invalid loss @@ -250,7 +250,7 @@ def test_context(): ''' test with no context, list of context, invalid context ''' net = _get_test_network() loss = gluon.loss.L2Loss() - metrics = mx.gluon.metric.Accuracy() + metrics = mx.metric.Accuracy() # input no context est = Estimator(net=net, loss=loss, @@ -332,7 +332,7 @@ def 
test_default_handlers(): net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) - train_acc = mx.gluon.metric.RMSE() + train_acc = mx.metric.RMSE() loss = gluon.loss.L2Loss() est = Estimator(net=net, @@ -359,7 +359,7 @@ def test_default_handlers(): # handler with mixed metrics, some handler use metrics prepared by estimator # some handler use metrics user prepared - logging = LoggingHandler(metrics=[mx.gluon.metric.RMSE("val acc")]) + logging = LoggingHandler(metrics=[mx.metric.RMSE("val acc")]) with pytest.raises(ValueError): est.fit(train_data=train_data, epochs=num_epochs, event_handlers=[logging]) @@ -383,7 +383,7 @@ def test_val_net(): ctx = mx.cpu() loss = gluon.loss.L2Loss() val_loss = gluon.loss.L2Loss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, @@ -448,7 +448,7 @@ def test_val_handlers(): net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) - train_acc = mx.gluon.metric.RMSE() + train_acc = mx.metric.RMSE() loss = gluon.loss.L2Loss() est = Estimator(net=net, diff --git a/tests/python/unittest/test_gluon_event_handler.py b/tests/python/unittest/test_gluon_event_handler.py index 4cadc9466ed1..a07282cd46dd 100644 --- a/tests/python/unittest/test_gluon_event_handler.py +++ b/tests/python/unittest/test_gluon_event_handler.py @@ -84,7 +84,7 @@ def test_checkpoint_handler(): net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) checkpoint_handler = event_handler.CheckpointHandler(model_dir=tmpdir, model_prefix=model_prefix, @@ -130,7 +130,7 @@ def test_resume_checkpoint(): net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.gluon.metric.Accuracy() + acc = 
mx.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) checkpoint_handler = event_handler.CheckpointHandler(model_dir=tmpdir, model_prefix=model_prefix, @@ -155,7 +155,7 @@ def test_early_stopping(): net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) early_stopping = event_handler.EarlyStoppingHandler(monitor=acc, patience=0, @@ -179,7 +179,7 @@ def test_logging(): net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) est.logger.addHandler(logging.FileHandler(output_dir)) @@ -226,7 +226,7 @@ def epoch_end(self, estimator, *args, **kwargs): test_data = _get_test_data() net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) custom_handler = CustomStopHandler(3, 2) est.fit(test_data, event_handlers=[custom_handler], epochs=3) @@ -249,7 +249,7 @@ def test_logging_interval(): dataloader = _get_test_data(in_size=data_size) num_epochs = 1 ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() logging = LoggingHandler(metrics=[acc], log_interval=log_interval) est = estimator.Estimator(net=net, loss=ce_loss, @@ -273,7 +273,7 @@ def test_logging_interval(): ''' test case #2: log interval is 5 ''' old_stdout = sys.stdout sys.stdout = mystdout = StringIO() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() log_interval = 5 logging = LoggingHandler(metrics=[acc], log_interval=log_interval) est = estimator.Estimator(net=net, @@ -299,7 +299,7 @@ def test_validation_handler_batch_axis(): test_data = _get_test_data() net = _get_test_network() ce_loss = 
loss.SoftmaxCrossEntropyLoss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) est.fit(test_data, epochs=3) @@ -315,7 +315,7 @@ def test_validation_handler(): net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.gluon.metric.Accuracy() + acc = mx.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) val_handler = ValidationHandler(val_data=test_data, eval_fn=est.evaluate, diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py index 5e6c0f798d9e..a07ed440ea69 100644 --- a/tests/python/unittest/test_loss.py +++ b/tests/python/unittest/test_loss.py @@ -65,6 +65,50 @@ def get_net(num_hidden, flatten=True): fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=num_hidden, flatten=flatten) return fc3 +# tracked at: https://github.com/apache/incubator-mxnet/issues/11692 +@with_seed() +def test_ce_loss(): + nclass = 10 + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, nclass)) + label = mx.nd.array(np.random.randint(0, nclass, size=(N,)), dtype='int32') + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label') + output = get_net(nclass) + l = mx.symbol.Variable('label') + Loss = gluon.loss.SoftmaxCrossEntropyLoss() + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + eval_metric=mx.metric.Loss(), optimizer='adam', + initializer=mx.init.Xavier(magnitude=2)) + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + +# tracked at: https://github.com/apache/incubator-mxnet/issues/11691 +@with_seed() +def test_bce_loss(): + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, 20)) + label = mx.nd.array(np.random.randint(2, size=(N,)), dtype='float32') + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, 
label_name='label') + output = get_net(1) + l = mx.symbol.Variable('label') + Loss = gluon.loss.SigmoidBinaryCrossEntropyLoss() + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + eval_metric=mx.metric.Loss(), optimizer='adam', + initializer=mx.init.Xavier(magnitude=2)) + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.01 + # Test against npy + data = mx.random.uniform(-5, 5, shape=(10,)) + label = mx.random.uniform(0, 1, shape=(10,)) + mx_bce_loss = Loss(data, label).asnumpy() + prob_npy = 1.0 / (1.0 + np.exp(-data.asnumpy())) + label_npy = label.asnumpy() + npy_bce_loss = - label_npy * np.log(prob_npy) - (1 - label_npy) * np.log(1 - prob_npy) + assert_almost_equal(mx_bce_loss, npy_bce_loss, rtol=1e-4, atol=1e-5) @with_seed() def test_bce_equal_ce2(): @@ -86,6 +130,58 @@ def test_logistic_loss_equal_bce(): assert_almost_equal(loss_binary(data, label), loss_bce(data, label), atol=1e-6) assert_almost_equal(loss_signed(data, 2 * label - 1), loss_bce(data, label), atol=1e-6) +@with_seed() +def test_kl_loss(): + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, 10)) + label = mx.nd.softmax(mx.random.uniform(0, 1, shape=(N, 2))) + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label') + output = mx.sym.log_softmax(get_net(2)) + l = mx.symbol.Variable('label') + Loss = gluon.loss.KLDivLoss() + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + eval_metric=mx.metric.Loss(), optimizer='adam') + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + + +@with_seed() +def test_l2_loss(): + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, 10)) + label = mx.random.uniform(-1, 1, shape=(N, 1)) + data_iter 
= mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) + output = get_net(1) + l = mx.symbol.Variable('label') + Loss = gluon.loss.L2Loss() + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + optimizer='adam') + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + + +@with_seed() +def test_l1_loss(): + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, 10)) + label = mx.random.uniform(-1, 1, shape=(N, 1)) + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) + output = get_net(1) + l = mx.symbol.Variable('label') + Loss = gluon.loss.L1Loss() + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + optimizer='adam') + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.1 + @with_seed() def test_ctc_loss(): @@ -114,6 +210,145 @@ def test_ctc_loss(): assert_almost_equal(l, np.array([18.82820702, 16.50581741])) +@with_seed() +def test_ctc_loss_train(): + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, 20, 10)) + label = mx.nd.arange(4, repeat=N).reshape((N, 4)) + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) + output = get_net(5, False) + l = mx.symbol.Variable('label') + Loss = gluon.loss.CTCLoss(layout='NTC', label_layout='NT') + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + 
initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + optimizer='adam') + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 10 + + +@with_seed() +def test_sample_weight_loss(): + nclass = 10 + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, nclass)) + label = mx.nd.array(np.random.randint(0, nclass, size=(N,)), dtype='int32') + weight = mx.nd.array([1 for i in range(10)] + [0 for i in range(10)]) + data_iter = mx.io.NDArrayIter(data, {'label': label, 'w': weight}, batch_size=10) + output = get_net(nclass) + l = mx.symbol.Variable('label') + w = mx.symbol.Variable('w') + Loss = gluon.loss.SoftmaxCrossEntropyLoss() + loss = Loss(output, l, w) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label', 'w')) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + eval_metric=mx.metric.Loss(), optimizer='adam') + data_iter = mx.io.NDArrayIter(data[10:], {'label': label, 'w': weight}, batch_size=10) + score = mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] + assert score > 1 + data_iter = mx.io.NDArrayIter(data[:10], {'label': label, 'w': weight}, batch_size=10) + score = mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] + assert score < 0.05 + + +@with_seed(1234) +def test_saveload(): + nclass = 10 + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, nclass)) + label = mx.nd.array(np.random.randint(0, nclass, size=(N,)), dtype='int32') + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label') + output = get_net(nclass) + l = mx.symbol.Variable('label') + Loss = gluon.loss.SoftmaxCrossEntropyLoss() + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=100, optimizer_params={'learning_rate': 1.}, + eval_metric=mx.metric.Loss()) + mod.save_checkpoint('test', 100, save_optimizer_states=True) + mod = mx.mod.Module.load('test', 
100, load_optimizer_states=True, + data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=100, optimizer_params={'learning_rate': 1.}, + eval_metric=mx.metric.Loss()) + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + +@with_seed() +def test_huber_loss(): + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, 10)) + label = mx.random.uniform(-1, 1, shape=(N, 1)) + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) + output = get_net(1) + l = mx.symbol.Variable('label') + Loss = gluon.loss.HuberLoss() + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + optimizer='adam') + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + + +@with_seed() +def test_hinge_loss(): + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, 10)) + label = mx.nd.sign(mx.random.uniform(-1, 1, shape=(N, 1))) + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) + output = get_net(1) + l = mx.symbol.Variable('label') + Loss = gluon.loss.HingeLoss() + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + optimizer='adam') + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.06 + + +@with_seed() +def test_squared_hinge_loss(): + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, 10)) + label = mx.nd.sign(mx.random.uniform(-1, 1, shape=(N, 1))) + data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) + output = get_net(1) + l = 
mx.symbol.Variable('label') + Loss = gluon.loss.SquaredHingeLoss() + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + optimizer='adam') + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + + +@with_seed() +def test_triplet_loss(): + N = 20 + data = mx.random.uniform(-1, 1, shape=(N, 10)) + pos = mx.random.uniform(-1, 1, shape=(N, 10)) + neg = mx.random.uniform(-1, 1, shape=(N, 10)) + data_iter = mx.io.NDArrayIter(data, {'pos': pos, 'neg': neg}, batch_size=10, + label_name='label', shuffle=True) + output = get_net(10) + pos = mx.symbol.Variable('pos') + neg = mx.symbol.Variable('neg') + Loss = gluon.loss.TripletLoss() + loss = Loss(output, pos, neg) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('pos','neg')) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + optimizer='adam') + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + @xfail_when_nonstandard_decimal_separator @with_seed() def test_sdml_loss(): @@ -208,3 +443,51 @@ def test_poisson_nllloss(): loss_compute_full = Loss_compute_full(mx.nd.array(np_pred), mx.nd.array(np_target)) assert_almost_equal(np_compute_full, loss_compute_full.asscalar()) +@with_seed() +def test_poisson_nllloss_mod(): + N = 1000 + data = mx.random.poisson(shape=(N, 2)) + label = mx.random.poisson(lam=4, shape=(N, 1)) + data_iter = mx.io.NDArrayIter(data, label, batch_size=20, label_name='label', shuffle=True) + output = mx.sym.exp(get_net(1)) + l = mx.symbol.Variable('label') + Loss = gluon.loss.PoissonNLLLoss(from_logits=False) + loss = Loss(output, l) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, 
data_names=('data',), label_names=('label',)) + mod.fit(data_iter, num_epoch=20, optimizer_params={'learning_rate': 0.01}, + initializer=mx.init.Normal(sigma=0.1), eval_metric=mx.metric.Loss(), + optimizer='adam') + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + +@with_seed() +def test_bce_loss_with_pos_weight(): + # Suppose it's a multi-label classification + N = np.random.randint(5, 30) + data = mx.nd.random.uniform(-1, 1, shape=(N, 20)) + label = mx.nd.array(np.random.randint(2, size=(N, 5)), dtype='float32') + pos_weight = mx.nd.random.uniform(0, 10, shape=(1, 5)) + pos_weight = mx.nd.repeat(pos_weight, repeats=N, axis=0) + data_iter = mx.io.NDArrayIter(data, {'label': label, 'pos_w': pos_weight}, batch_size=10, label_name='label') + output = get_net(5) + l = mx.symbol.Variable('label') + pos_w = mx.symbol.Variable('pos_w') + Loss = gluon.loss.SigmoidBinaryCrossEntropyLoss() + loss = Loss(output, l, None, pos_w) + loss = mx.sym.make_loss(loss) + mod = mx.mod.Module(loss, data_names=('data',), label_names=('label', 'pos_w')) + mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, + eval_metric=mx.metric.Loss(), optimizer='adam', + initializer=mx.init.Xavier(magnitude=2)) + assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.01 + # Test against npy + data = mx.nd.random.uniform(-5, 5, shape=(N, 5)) + label = mx.nd.array(np.random.randint(2, size=(N, 5)), dtype='float32') + pos_weight = mx.nd.random.uniform(0, 10, shape=(1, 5)) + mx_bce_loss = Loss(data, label, None, pos_weight).asnumpy() + prob_npy = 1.0 / (1.0 + np.exp(-data.asnumpy())) + label_npy = label.asnumpy() + pos_weight_npy = pos_weight.asnumpy() + npy_bce_loss = (- label_npy * np.log(prob_npy)*pos_weight_npy - (1 - label_npy) * np.log(1 - prob_npy)).mean(axis=1) + assert_almost_equal(mx_bce_loss, npy_bce_loss, rtol=1e-4, atol=1e-5) + diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 
88b9d9cedce2..cc92a59f9a95 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -16,7 +16,6 @@ # under the License. import mxnet as mx -from mxnet.test_utils import use_np import numpy as np import scipy from scipy.stats import pearsonr @@ -26,9 +25,9 @@ from copy import deepcopy def check_metric(metric, *args, **kwargs): - metric = mx.gluon.metric.create(metric, *args, **kwargs) + metric = mx.metric.create(metric, *args, **kwargs) str_metric = json.dumps(metric.get_config()) - metric2 = mx.gluon.metric.create(str_metric) + metric2 = mx.metric.create(str_metric) assert metric.get_config() == metric2.get_config() @@ -36,16 +35,93 @@ def test_metrics(): check_metric('acc', axis=0) check_metric('f1') check_metric('mcc') - check_metric('perplexity', axis=-1) + check_metric('perplexity', -1) check_metric('pearsonr') check_metric('pcc') check_metric('nll_loss') check_metric('loss') - composite = mx.gluon.metric.create(['acc', 'f1']) + composite = mx.metric.create(['acc', 'f1']) check_metric(composite) +def _check_global_metric(metric, *args, **kwargs): + def _create_pred_label(): + if use_same_shape: + pred = mx.nd.random.uniform(0, 1, shape=shape) + label = mx.nd.random.uniform(0, 1, shape=shape) + else: + # Make a random prediction + idx = np.random.rand(*shape).argsort(1) + pred = mx.nd.array(1 - 0.1 * idx) + # Label is half 1 and half 0 + # Setting all 0s or all 1s would make either + # MCC or F1 metrics always produce 0 + label = mx.nd.ones(shape[0]) + label[:shape[0] // 2] = 0 + return pred, label + + def _compare_metric_result(m1, m2): + # Compare names + assert m1[0] == m2[0] + # Compare values + if isinstance(m1[1], (list, tuple)): + assert len(m1[1]) == len(m2[1]) + for r1, r2 in zip(m1[1], m2[1]): + assert r1 == r2 or \ + (math.isnan(r1) and + math.isnan(r2)) + else: + assert m1[1] == m2[1] or \ + (math.isnan(m1[1]) and + math.isnan(m2[1])) + + shape = kwargs.pop('shape', (10,10)) + use_same_shape = 
kwargs.pop('use_same_shape', False) + m1 = mx.metric.create(metric, *args, **kwargs) + m2 = deepcopy(m1) + # check that global stats are not reset when calling + # reset_local() + for i in range(10): + pred, label = _create_pred_label() + m1.update([label], [pred]) + m1.reset_local() + m2.update([label], [pred]) + assert m1.get_global() == m2.get() + + # check that reset_local() properly resets the local state + m1.reset_local() + m2.reset() + pred, label = _create_pred_label() + m1.update([label], [pred]) + m1.reset_local() + pred, label = _create_pred_label() + m1.update([label], [pred]) + m2.update([label], [pred]) + _compare_metric_result(m1.get(), m2.get()) + +@with_seed() +def test_global_metric(): + _check_global_metric('acc') + _check_global_metric('TopKAccuracy', top_k=3) + _check_global_metric('f1', shape=(10,2)) + _check_global_metric('f1', shape=(10,2), average='micro') + _check_global_metric('mcc', shape=(10,2)) + _check_global_metric('mcc', shape=(10,2), average='micro') + _check_global_metric('perplexity', -1) + _check_global_metric('pearsonr', use_same_shape=True) + _check_global_metric('pcc', shape=(10,2)) + _check_global_metric('nll_loss') + _check_global_metric('loss') + _check_global_metric('ce') + _check_global_metric('mae', use_same_shape=True) + _check_global_metric('mse', use_same_shape=True) + _check_global_metric('rmse', use_same_shape=True) + def custom_metric(label, pred): + return np.mean(np.abs(label-pred)) + _check_global_metric(custom_metric, use_same_shape=True) + _check_global_metric(['acc', 'f1'], shape=(10,2)) + def test_nll_loss(): - metric = mx.gluon.metric.create('nll_loss') + metric = mx.metric.create('nll_loss') pred = mx.nd.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]]) label = mx.nd.array([2, 1]) metric.update([label], [pred]) @@ -56,37 +132,36 @@ def test_nll_loss(): def test_acc(): pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) label = mx.nd.array([0, 1, 1]) - metric = mx.gluon.metric.create('acc') + metric = 
mx.metric.create('acc') metric.update([label], [pred]) _, acc = metric.get() expected_acc = (np.argmax(pred, axis=1) == label).sum().asscalar() / label.size - np.testing.assert_almost_equal(acc, expected_acc) + assert acc == expected_acc def test_acc_2d_label(): # label maybe provided in 2d arrays in custom data iterator pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6], [0.8, 0.2], [0.3, 0.5], [0.6, 0.4]]) label = mx.nd.array([[0, 1, 1], [1, 0, 1]]) - metric = mx.gluon.metric.create('acc') + metric = mx.metric.create('acc') metric.update([label], [pred]) _, acc = metric.get() expected_acc = (np.argmax(pred, axis=1).asnumpy() == label.asnumpy().ravel()).sum() / \ float(label.asnumpy().ravel().size) - np.testing.assert_almost_equal(acc, expected_acc) + assert acc == expected_acc def test_loss_update(): pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) - metric1 = mx.gluon.metric.create('loss') - metric2 = mx.gluon.metric.create('loss') + metric1 = mx.metric.create('loss') + metric2 = mx.metric.create('loss') metric1.update(None, [pred]) metric2.update(None, pred) _, acc1 = metric1.get() _, acc2 = metric2.get() assert acc1 == acc2 -@xfail_when_nonstandard_decimal_separator -def test_binary_f1(): - microF1 = mx.gluon.metric.create("f1", average="micro") - macroF1 = mx.gluon.metric.F1(average="macro") +def test_f1(): + microF1 = mx.metric.create("f1", average="micro") + macroF1 = mx.metric.F1(average="macro") assert np.isnan(macroF1.get()[1]) assert np.isnan(microF1.get()[1]) @@ -116,7 +191,7 @@ def test_binary_f1(): microF1.update([label11, label12], [pred11, pred12]) macroF1.update([label11, label12], [pred11, pred12]) assert microF1.num_inst == 4 - assert macroF1.num_inst == 4 + assert macroF1.num_inst == 1 # f1 = 2 * tp / (2 * tp + fp + fn) fscore1 = 2. 
* (1) / (2 * 1 + 1 + 0) np.testing.assert_almost_equal(microF1.get()[1], fscore1) @@ -125,98 +200,29 @@ def test_binary_f1(): microF1.update([label21, label22], [pred21, pred22]) macroF1.update([label21, label22], [pred21, pred22]) assert microF1.num_inst == 6 - assert macroF1.num_inst == 6 + assert macroF1.num_inst == 2 fscore2 = 2. * (1) / (2 * 1 + 0 + 0) fscore_total = 2. * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) np.testing.assert_almost_equal(microF1.get()[1], fscore_total) - np.testing.assert_almost_equal(macroF1.get()[1], fscore_total) - -def test_multiclass_f1(): - microF1 = mx.gluon.metric.create("f1", class_type="multiclass", average="micro") - macroF1 = mx.gluon.metric.F1(class_type="multiclass", average="macro") - - assert np.isnan(macroF1.get()[1]) - assert np.isnan(microF1.get()[1]) + np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2.) +<<<<<<< HEAD # check one class is zero pred = mx.nd.array([[0.9, 0.1], [0.8, 0.2]]) label = mx.nd.array([0, 0]) macroF1.update([label], [pred]) microF1.update([label], [pred]) - assert macroF1.get()[1] == 0.5 # one class is 1.0, the other is 0. 
(divided by 0) - assert microF1.get()[1] == 1.0 # globally f1 is 1.0 - macroF1.reset() - microF1.reset() - - # test case from sklearn, here pred is probabilistic distributions instead of predicted labels - pred11 = mx.nd.array([[1, 0, 0], [0, 1, 0]]) - label11 = mx.nd.array([0, 2]) - pred12 = mx.nd.array([[0, 0, 1], [1, 0, 0], [0, 1, 0], [0, 0, 1]]) - label12 = mx.nd.array([1, 0, 0, 1]) - - microF1.update([label11, label12], [pred11, pred12]) - macroF1.update([label11, label12], [pred11, pred12]) - assert microF1.num_inst == 6 - assert macroF1.num_inst == 6 - - # from sklearn.metrics import f1_score - # overall_pred = [0, 1, 2, 0, 1, 2] - # overall_label = [0, 2, 1, 0, 0, 1] - fmacro = 0.26666666666666666 #f1_score(overall_label, overall_pred, average="macro") - fmicro = 0.3333333333333333 #f1_score(overall_label, overall_pred, average="micro") - np.testing.assert_almost_equal(microF1.get()[1], fmicro) - np.testing.assert_almost_equal(macroF1.get()[1], fmacro) - -@xfail_when_nonstandard_decimal_separator -def test_multilabel_f1(): - microF1 = mx.gluon.metric.create("f1", class_type="multilabel", average="micro") - macroF1 = mx.gluon.metric.F1(class_type="multilabel", average="macro") - - assert np.isnan(macroF1.get()[1]) - assert np.isnan(microF1.get()[1]) - - # check one class is zero - pred = mx.nd.array([[0.9, 0.1], - [0.8, 0.2]]) - label = mx.nd.array([[1, 1], [1, 1]]) - macroF1.update([label], [pred]) - microF1.update([label], [pred]) - assert macroF1.get()[1] == 0.5 # one class is 1.0, the other is 0. 
(divided by 0) - np.testing.assert_almost_equal(microF1.get()[1], 2.0 / 3) - macroF1.reset() - microF1.reset() - - pred11 = mx.nd.array([[0.9, 0.4, 0.3], [0.2, 0.7, 0.8]]) - label11 = mx.nd.array([[1, 0, 1], [0, 0, 1]]) - pred12 = mx.nd.array([[0.6, 0.6, 0.7]]) - label12 = mx.nd.array([[0, 1, 1]]) - - microF1.update([label11, label12], [pred11, pred12]) - macroF1.update([label11, label12], [pred11, pred12]) - assert microF1.num_inst == 3 - assert macroF1.num_inst == 3 - #from sklearn.metrics import f1_score - #overall_pred = [[1, 0, 0], [0, 1, 1], [1, 1, 1]] - #overall_label = [[1, 0, 1], [0, 0, 1], [0, 1, 1]] - fmacro = 0.7111111111111111 #f1_score(overall_label, overall_pred, average="macro") - fmicro = 0.7272727272727272 #f1_score(overall_label, overall_pred, average="micro") - np.testing.assert_almost_equal(microF1.get()[1], fmicro) - np.testing.assert_almost_equal(macroF1.get()[1], fmacro) - -@xfail_when_nonstandard_decimal_separator -def test_mcc(): - microMCC = mx.gluon.metric.create("mcc") - - assert np.isnan(microMCC.get()[1]) - # check divide by zero pred = mx.nd.array([[0.9, 0.1], [0.8, 0.2]]) label = mx.nd.array([0, 0]) microMCC.update([label], [pred]) + macroMCC.update([label], [pred]) assert microMCC.get()[1] == 0.0 + assert macroMCC.get()[1] == 0.0 microMCC.reset() + macroMCC.reset() pred11 = mx.nd.array([[0.1, 0.9], [0.5, 0.5]]) @@ -229,40 +235,51 @@ def test_mcc(): pred22 = mx.nd.array([[0.2, 0.8]]) label22 = mx.nd.array([1]) microMCC.update([label11, label12], [pred11, pred12]) + macroMCC.update([label11, label12], [pred11, pred12]) assert microMCC.num_inst == 4 + assert macroMCC.num_inst == 1 tp1 = 1; fp1 = 0; fn1 = 1; tn1=2 mcc1 = (tp1*tn1 - fp1*fn1) / np.sqrt((tp1+fp1)*(tp1+fn1)*(tn1+fp1)*(tn1+fn1)) np.testing.assert_almost_equal(microMCC.get()[1], mcc1) + np.testing.assert_almost_equal(macroMCC.get()[1], mcc1) microMCC.update([label21, label22], [pred21, pred22]) + macroMCC.update([label21, label22], [pred21, pred22]) assert microMCC.num_inst 
== 6 + assert macroMCC.num_inst == 2 tp2 = 1; fp2 = 0; fn2 = 0; tn2=1 mcc2 = (tp2*tn2 - fp2*fn2) / np.sqrt((tp2+fp2)*(tp2+fn2)*(tn2+fp2)*(tn2+fn2)) tpT = tp1+tp2; fpT = fp1+fp2; fnT = fn1+fn2; tnT = tn1+tn2; mccT = (tpT*tnT - fpT*fnT) / np.sqrt((tpT+fpT)*(tpT+fnT)*(tnT+fpT)*(tnT+fnT)) np.testing.assert_almost_equal(microMCC.get()[1], mccT) + np.testing.assert_almost_equal(macroMCC.get()[1], .5*(mcc1+mcc2)) def test_perplexity(): pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) label = mx.nd.array([0, 1, 1]) p = pred.asnumpy()[np.arange(label.size), label.asnumpy().astype('int32')] perplexity_expected = np.exp(-np.log(p).sum()/label.size) - metric = mx.gluon.metric.create('perplexity', axis=-1) + metric = mx.metric.create('perplexity', -1) metric.update([label], [pred]) _, perplexity = metric.get() - np.testing.assert_almost_equal(perplexity, perplexity_expected) + assert perplexity == perplexity_expected def test_pearsonr(): pred1 = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) label1 = mx.nd.array([[1, 0], [0, 1], [0, 1]]) pearsonr_expected_np = np.corrcoef(pred1.asnumpy().ravel(), label1.asnumpy().ravel())[0, 1] pearsonr_expected_scipy, _ = pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel()) - micro_pr = mx.gluon.metric.create('pearsonr') + macro_pr = mx.metric.create('pearsonr', average='macro') + micro_pr = mx.metric.create('pearsonr', average='micro') + assert np.isnan(macro_pr.get()[1]) assert np.isnan(micro_pr.get()[1]) + macro_pr.update([label1], [pred1]) micro_pr.update([label1], [pred1]) + np.testing.assert_almost_equal(macro_pr.get()[1], pearsonr_expected_np) + np.testing.assert_almost_equal(macro_pr.get()[1], pearsonr_expected_scipy) np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_np) np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_scipy) @@ -275,7 +292,11 @@ def test_pearsonr(): pearsonr_expected_np = np.corrcoef(pred12.asnumpy().ravel(), label12.asnumpy().ravel())[0, 1] pearsonr_expected_scipy, _ = 
pearsonr(pred12.asnumpy().ravel(), label12.asnumpy().ravel()) + macro_pr.reset() micro_pr.update([label2], [pred2]) + macro_pr.update([label12], [pred12]) + np.testing.assert_almost_equal(macro_pr.get()[1], pearsonr_expected_np) + np.testing.assert_almost_equal(macro_pr.get()[1], pearsonr_expected_scipy) np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_np) np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_scipy) @@ -296,18 +317,18 @@ def test_pcc(): [ 7, 3 ], [ 2, 5 ], ]) - met_pcc = mx.gluon.metric.create('pcc') + met_pcc = mx.metric.create('pcc') met_pcc.update(labels, preds) _, pcc = met_pcc.get() # pcc should agree with mcc for binary classification - met_mcc = mx.gluon.metric.create('mcc') + met_mcc = mx.metric.create('mcc') met_mcc.update(labels, preds) _, mcc = met_mcc.get() np.testing.assert_almost_equal(pcc, mcc) # pcc should agree with Pearson for binary classification - met_pear = mx.gluon.metric.create('pearsonr') + met_pear = mx.metric.create('pearsonr') met_pear.update(labels, [p.argmax(axis=1) for p in preds]) _, pear = met_pear.get() np.testing.assert_almost_equal(pcc, pear) @@ -356,7 +377,7 @@ def test_pcc(): # * order # * batch size # * update frequency - labels = [ [ i.reshape(-1) ] for i in labels[0] ] + labels = [ [ i ] for i in labels[0] ] labels.reverse() preds = [ [ i.reshape((1, -1)) ] for i in preds[0] ] preds.reverse() @@ -371,20 +392,19 @@ def test_single_array_input(): pred = mx.nd.array([[1,2,3,4]]) label = pred + 0.1 - mse = mx.gluon.metric.create('mse') + mse = mx.metric.create('mse') mse.update(label, pred) _, mse_res = mse.get() np.testing.assert_almost_equal(mse_res, 0.01) - mae = mx.gluon.metric.create('mae') + mae = mx.metric.create('mae') mae.update(label, pred) mae.get() _, mae_res = mae.get() np.testing.assert_almost_equal(mae_res, 0.1) - rmse = mx.gluon.metric.create('rmse') + rmse = mx.metric.create('rmse') rmse.update(label, pred) rmse.get() _, rmse_res = rmse.get() 
np.testing.assert_almost_equal(rmse_res, 0.1) - diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py new file mode 100644 index 000000000000..65d86f62baf4 --- /dev/null +++ b/tests/python/unittest/test_module.py @@ -0,0 +1,1031 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import os +import mxnet as mx +import mxnet.ndarray as nd +from mxnet.test_utils import * +import numpy as np +from functools import reduce +from mxnet.module.executor_group import DataParallelExecutorGroup +from common import setup_module, with_seed, assertRaises, teardown_module +from collections import namedtuple +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +sys.path.insert(0, os.path.join(curr_path, "../train")) +from test_bucketing import train_model, prepare_bucketing_data + + +@with_seed() +def test_module_dtype(): + dtype = np.float16 + dshape = (3, 8, 7) + + sym = mx.sym.Variable('data') + sym = mx.sym.Activation(data=sym, act_type='relu', __layout__='TNC') + + mod = mx.mod.Module(sym, ('data',), None, context=[mx.cpu(0), mx.cpu(1)]) + mod.bind(data_shapes=[mx.io.DataDesc('data', dshape, dtype, layout='TNC')]) + mod.init_params() + mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape, dtype=dtype)], + label=None)) + mod.backward([mx.nd.ones(dshape, dtype=dtype)]) + + for x in mod.get_outputs(): + assert x.dtype == dtype + + +def test_module_bind(): + sym = mx.sym.Variable('data') + sym = mx.sym.Activation(data=sym, act_type='relu', __layout__='TNC') + + mod = mx.mod.Module(sym, ('data',), None, context=[mx.cpu(0), mx.cpu(1)]) + assertRaises(TypeError, mod.bind, data_shapes=[('data', mx.nd.array([10,10]))]) + assert mod.binded == False + + mod.bind(data_shapes=[('data', (10,10))]) + assert mod.binded == True + + +@with_seed() +def test_module_input_grads(): + a = mx.sym.Variable('a', __layout__='NC') + b = mx.sym.Variable('b', __layout__='NC') + c = mx.sym.Variable('c', __layout__='NC') + + c = a + 2 * b + 3 * c + net = mx.mod.Module(c, data_names=['b', 'c', 'a'], label_names=None, + context=[mx.cpu(0), mx.cpu(1)]) + net.bind(data_shapes=[['b', (5, 5)], ['c', (5, 5)], ['a', (5, 5)]], + label_shapes=None, inputs_need_grad=True) + net.init_params() + + net.forward(data_batch=mx.io.DataBatch(data=[nd.ones((5, 5)), + nd.ones((5, 
5)), + nd.ones((5, 5))])) + net.backward(out_grads=[nd.ones((5, 5))]) + input_grads = net.get_input_grads() + b_grad = input_grads[0].asnumpy() + c_grad = input_grads[1].asnumpy() + a_grad = input_grads[2].asnumpy() + assert np.all(a_grad == 1), a_grad + assert np.all(b_grad == 2), b_grad + assert np.all(c_grad == 3), c_grad + + +@with_seed() +def test_module_ctx_group(): + def check_module_ctx_group(ctxs, group2ctxs, grad_ctxs=None): + with mx.AttrScope(ctx_group='dev1'): + a = mx.symbol.Variable('a') + a = a * 2 + with mx.AttrScope(ctx_group='dev2'): + b = mx.symbol.Variable('b') + c = a + b + shape = (2, 5) + mod1 = mx.mod.Module(c, context=ctxs, data_names=['a', 'b'], label_names=None, + group2ctxs=group2ctxs) + mod1.bind(data_shapes=[['a', shape], ['b', shape]], inputs_need_grad=True) + mod1.init_params() + mod1.forward(data_batch=mx.io.DataBatch(data=[mx.nd.ones(shape), mx.nd.ones(shape)]), is_train=True) + mod1.backward([mx.nd.ones(shape)]) + mod1_input_grads = mod1.get_input_grads() + + mod2 = mx.mod.Module(c, context=ctxs, data_names=['a', 'b'], label_names=None) + mod2.bind(data_shapes=[['a', shape], ['b', shape]], inputs_need_grad=True) + mod2.init_params() + mod2.forward(data_batch=mx.io.DataBatch(data=[mx.nd.ones(shape), mx.nd.ones(shape)]), is_train=True) + mod2.backward([mx.nd.ones(shape)]) + mod2_input_grads = mod2.get_input_grads() + + if grad_ctxs is not None: + assert(mod1_input_grads[0].context == grad_ctxs[0]) + assert(mod1_input_grads[1].context == grad_ctxs[1]) + assert(np.all(mod1_input_grads[0].asnumpy() == mod2_input_grads[0].asnumpy())) + assert(np.all(mod1_input_grads[1].asnumpy() == mod2_input_grads[1].asnumpy())) + + check_module_ctx_group([mx.cpu(0)], {'dev1': mx.cpu(1), 'dev2': mx.cpu(2)}, grad_ctxs=[mx.cpu(1), mx.cpu(2)]) + check_module_ctx_group([mx.cpu(0), mx.cpu(1)], + [{'dev1': mx.cpu(2), 'dev2': mx.cpu(3)}, {'dev1': mx.cpu(4), 'dev2': mx.cpu(5)}]) + check_module_ctx_group([mx.cpu(0), mx.cpu(1)], {'dev1': mx.cpu(2), 'dev2': 
mx.cpu(3)}) + check_module_ctx_group([mx.cpu(0), mx.cpu(1)], {'dev1': mx.cpu(2), 'dev2': [mx.cpu(3)]}) + check_module_ctx_group([mx.cpu(0), mx.cpu(1)], {'dev1':mx.cpu(2), 'dev2':[mx.cpu(3), mx.cpu(3)]}) + check_module_ctx_group([mx.cpu(0), mx.cpu(1)], + {'dev1':[mx.cpu(2), mx.cpu(2)], 'dev2':[mx.cpu(3), mx.cpu(3)]}) + +@with_seed() +def test_bucket_module_ctx_group(): + num_hidden = 10 + batch_size = 5 + def sym_gen(seq_len): + with mx.AttrScope(ctx_group='dev1'): + data = mx.symbol.Variable('data') + weight = mx.symbol.Variable('dev1_weight') + bias = mx.symbol.Variable('dev1_bias') + fc = data + for i in range(seq_len): + fc = mx.symbol.FullyConnected(data=fc, weight=weight, bias=bias, + name='dev1_fc_%d' % i, num_hidden=num_hidden) + with mx.AttrScope(ctx_group='dev2'): + label = mx.symbol.Variable('label') + weight = mx.symbol.Variable('dev2_weight') + bias = mx.symbol.Variable('dev2_bias') + for i in range(seq_len): + fc = mx.symbol.FullyConnected(data=fc, weight=weight, bias=bias, + name='dev2_fc_%d' % i, num_hidden=num_hidden) + sym = mx.symbol.SoftmaxOutput(fc, label, name='softmax') + + return sym, ('data',), ('label',) + + mod = mx.mod.BucketingModule(sym_gen=sym_gen, default_bucket_key=10, context=[mx.cpu(0)], + group2ctxs=[{'dev1': mx.cpu(1), 'dev2': mx.cpu(2)}]) + mod.bind(data_shapes=[['data', (batch_size, num_hidden)]], + label_shapes=[['label', (batch_size,)]], + for_training=True, inputs_need_grad=True) + assert(mod.binded) + +@with_seed() +def test_module_layout(): + sym = mx.sym.Variable('data') + sym = mx.sym.Activation(data=sym, act_type='relu', __layout__='TNC') + + dshape = (3, 8, 7) + mod = mx.mod.Module(sym, ('data',), None, context=[mx.cpu(0), mx.cpu(1)]) + mod.bind(data_shapes=[mx.io.DataDesc('data', dshape, layout='TNC')]) + mod.init_params() + mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape)], + label=None)) + mod.backward([mx.nd.ones(dshape)]) + assert mod.get_outputs()[0].shape == dshape + + hdshape = (3, 4, 7) + for x in 
mod.get_outputs(merge_multi_context=False)[0]: + assert x.shape == hdshape + + +@with_seed() +def test_save_load(): + previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") + os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') + def dict_equ(a, b): + assert set(a) == set(b) + for k in a: + assert (a[k].asnumpy() == b[k].asnumpy()).all() + + sym = mx.sym.Variable('data') + sym = mx.sym.FullyConnected(sym, num_hidden=100) + + # single device + mod = mx.mod.Module(sym, ('data',)) + mod.bind(data_shapes=[('data', (10, 10))]) + mod.init_params() + mod.init_optimizer(optimizer_params={'learning_rate':0.1, 'momentum':0.9}) + mod.update() + mod.save_checkpoint('test', 0, save_optimizer_states=True) + + mod2 = mx.mod.Module.load('test', 0, load_optimizer_states=True, data_names=('data',)) + mod2.bind(data_shapes=[('data', (10, 10))]) + mod2.init_optimizer(optimizer_params={'learning_rate':0.1, 'momentum':0.9}) + assert mod._symbol.tojson() == mod2._symbol.tojson() + dict_equ(mod.get_params()[0], mod2.get_params()[0]) + dict_equ(mod._updater.states, mod2._updater.states) + + # multi device + mod = mx.mod.Module(sym, ('data',), context=[mx.cpu(0), mx.cpu(1)]) + mod.bind(data_shapes=[('data', (10, 10))]) + mod.init_params() + mod.init_optimizer(optimizer_params={'learning_rate':0.1, 'momentum':0.9}) + mod.update() + mod.save_checkpoint('test', 0, save_optimizer_states=True) + + mod2 = mx.mod.Module.load('test', 0, load_optimizer_states=True, data_names=('data',)) + mod2.bind(data_shapes=[('data', (10, 10))]) + mod2.init_optimizer(optimizer_params={'learning_rate':0.1, 'momentum':0.9}) + assert mod._symbol.tojson() == mod2._symbol.tojson() + dict_equ(mod.get_params()[0], mod2.get_params()[0]) + dict_equ(mod._kvstore._updater.states, mod2._updater.states) + os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) + + +@with_seed() +def test_bucketing_save_load(): + previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") + 
os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') + def dict_equ(a, b): + assert set(a) == set(b) + for k in a: + assert (a[k].asnumpy() == b[k].asnumpy()).all() + + + len_vocab = 50 + num_embed = 25 + num_epochs = 5 + batch_size = 128 + num_layers = 2 + num_hidden = 25 + buckets = [5, 10, 20, 30, 40] + invalid_label = -1 + num_sentence=1000 + + stack = mx.rnn.SequentialRNNCell() + for i in range(num_layers): + stack.add(mx.rnn.LSTMCell(num_hidden=num_hidden, prefix='lstm_l%d_' % i)) + + def sym_gen(seq_len): + data = mx.sym.Variable('data') + label = mx.sym.Variable('softmax_label') + embed = mx.sym.Embedding(data=data, input_dim=len_vocab, + output_dim=num_embed, name='embed') + stack.reset() + outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True) + + pred = mx.sym.Reshape(outputs, shape=(-1, num_hidden)) + pred = mx.sym.FullyConnected(data=pred, num_hidden=len_vocab, name='pred') + + label = mx.sym.Reshape(label, shape=(-1,)) + loss = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax') + + return loss, ('data',), ('softmax_label',) + + model = train_model(context=mx.current_context()) + model.save_checkpoint("test", 0) + data_train, data_val = prepare_bucketing_data(buckets, len_vocab, batch_size, invalid_label, num_sentence) + mod2 = mx.mod.BucketingModule.load('test', 0, sym_gen=sym_gen, + default_bucket_key=data_train.default_bucket_key) + + mod2.bind(data_shapes=data_train.provide_data, + label_shapes=data_train.provide_label) + + for bucket_key in model._buckets.keys(): + dict_equ(model._buckets[model._default_bucket_key].get_params()[0], + mod2._buckets[mod2._default_bucket_key].get_params()[0]) + mod2.fit( + train_data=data_train, + eval_data=data_val, + eval_metric=mx.metric.Perplexity(invalid_label), # Use Perplexity for multiclass classification. 
+ kvstore='device', + optimizer='sgd', + optimizer_params={'learning_rate': 0.01, + 'momentum': 0, + 'wd': 0.00001}, + initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), + num_epoch=num_epochs, + batch_end_callback=mx.callback.Speedometer(batch_size, 50)) + os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) + + +@with_seed() +def test_module_reshape(): + data = mx.sym.Variable('data') + sym = mx.sym.FullyConnected(data, num_hidden=20, name='fc') + + dshape = (7, 20) + mod = mx.mod.Module(sym, ('data',), None, context=[mx.cpu(0), mx.cpu(1)]) + mod.bind(data_shapes=[('data', dshape)]) + mod.init_params() + mod.init_optimizer(optimizer_params={'learning_rate': 1}) + + mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape)], + label=None)) + mod.backward([mx.nd.ones(dshape)]) + mod.update() + assert mod.get_outputs()[0].shape == dshape + assert (mod.get_params()[0]['fc_bias'].asnumpy() == -1).all() + + dshape = (14, 20) + mod.reshape(data_shapes=[('data', dshape)]) + mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape)], + label=None)) + mod.backward([mx.nd.ones(dshape)]) + mod.update() + assert mod.get_outputs()[0].shape == dshape + assert (mod.get_params()[0]['fc_bias'].asnumpy() == -3).all() + + +@with_seed() +def test_module_states(): + stack = mx.rnn.SequentialRNNCell() + for i in range(2): + stack.add(mx.rnn.LSTMCell(num_hidden=20, prefix='lstm_l%d_'%i)) + begin_state = stack.begin_state(func=mx.sym.Variable) + _, states = stack.unroll(10, begin_state=begin_state, inputs=mx.sym.Variable('data')) + + state_names = [i.name for i in begin_state] + mod = mx.mod.Module(mx.sym.Group(states), context=[mx.cpu(0), mx.cpu(1)], + label_names=None, state_names=state_names) + mod.bind(data_shapes=[('data', (5, 10))], label_shapes=None, for_training=False) + mod.init_params() + batch = mx.io.DataBatch(data=[mx.nd.zeros((5, 10))], label=[]) + + mod.set_states(value=1) + mod.forward(batch) + out = mod.get_outputs(merge_multi_context=False) + out1 = 
mod.get_outputs(merge_multi_context=True) + + mod.set_states(states=out) + mod.forward(batch) + out2 = mod.get_outputs(merge_multi_context=True) + + for x1, x2 in zip(out1, out2): + assert not mx.test_utils.almost_equal(x1.asnumpy(), x2.asnumpy(), rtol=1e-3) + + +@with_seed() +def test_module_switch_bucket(): + vocab_dim = 5000 + num_hidden = 100 + num_embedding = 100 + num_layer = 2 + default_key = 10 + test_key = 5 + batch_size = 32 + contexts = [mx.cpu(0)] + initializer = mx.init.Xavier(factor_type="in", magnitude=2.34) + + #generate symbols for an LSTM network + def sym_gen(seq_len): + data = mx.sym.Variable('data') + label = mx.sym.Variable('softmax_label') + embed = mx.sym.Embedding(data=data, input_dim=vocab_dim, + output_dim=num_embedding) + stack = mx.rnn.SequentialRNNCell() + for i in range(num_layer): + stack.add(mx.rnn.LSTMCell(num_hidden=num_hidden, prefix='lstm_l%d_'%i)) + outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True) + + pred = mx.sym.Reshape(outputs, shape=(-1, num_hidden)) + pred = mx.sym.FullyConnected(data=pred, num_hidden=vocab_dim, name='pred') + + label = mx.sym.Reshape(label, shape=(-1,)) + pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax') + + return pred, ('data',), ('softmax_label',) + + def create_bucketing_module(key): + model = mx.mod.BucketingModule( + sym_gen = sym_gen, + default_bucket_key = key, + context = contexts) + model.bind([('data', (batch_size, key))], + [('softmax_label', (batch_size, key))], True, False) + model.init_params(initializer=initializer) + return model + #initialize the bucketing module with the default bucket key + bucketing_model = create_bucketing_module(default_key) + #check name + assert bucketing_model.symbol.list_arguments()[1] == "embedding0_weight",\ + "Error in assigning names for args in BucketingModule" + + #switch to test_key + bucketing_model.switch_bucket(test_key, [('data', (batch_size, test_key))], + [('softmax_label', (batch_size, test_key))]) + 
total_bytes_before = bucketing_model._buckets[default_key]._total_exec_bytes + + #remove test_key and switch again + del bucketing_model._buckets[test_key] + bucketing_model.switch_bucket(test_key, [('data', (batch_size, test_key))], + [('softmax_label', (batch_size, test_key))]) + total_bytes_after = bucketing_model._buckets[default_key]._total_exec_bytes + #the default bucket is expected to reuse the bytes allocated + assert total_bytes_after == total_bytes_before + + +# roywei: Getting rid of fixed seed as flakiness could not be reproduced, +# tracked at: https://github.com/apache/incubator-mxnet/issues/11705 +@with_seed() +def test_module_set_params(): + # data iter + data = mx.nd.array([[0.05, .10]]); + label = mx.nd.array([[.01, 0.99]]); + train_data = mx.io.NDArrayIter(data, label, batch_size=1) + + # symbols + x = mx.symbol.Variable('data') + x = mx.symbol.FullyConnected(name='fc_0', data=x, num_hidden=2) + x = mx.symbol.Activation(name="act_0", data=x, act_type='sigmoid') + x = mx.symbol.FullyConnected(name='fc_1', data=x, num_hidden=2) + x = mx.symbol.Activation(name="act_1", data=x, act_type='sigmoid') + x = mx.symbol.LinearRegressionOutput(data=x, name='softmax', grad_scale=2) + + # create module + mod = mx.mod.Module(x, context=[mx.cpu()]); + mod.bind(train_data.provide_data, label_shapes=train_data.provide_label, + for_training=True) + + arg_params_correct = {'fc_0_weight': mx.nd.array([[.15, .20], [.25, .30]]), + 'fc_0_bias' : mx.nd.array([.35, .35]), + 'fc_1_weight': mx.nd.array([[.40, .45], [.50, .55]]), + 'fc_1_bias' : mx.nd.array([.60, .60])} + + arg_params_missing = {'fc_0_weight': mx.nd.array([[.15, .20], [.25, .30]]), + 'fc_0_bias' : mx.nd.array([.35, .35]), + 'fc_1_weight': mx.nd.array([[.40, .45], [.50, .55]])} + + arg_params_extra = {'fc_0_weight': mx.nd.array([[.15, .20], [.25, .30]]), + 'fc_0_bias' : mx.nd.array([.35, .35]), + 'fc_1_weight': mx.nd.array([[.40, .45], [.50, .55]]), + 'fc_1_bias' : mx.nd.array([.60, .60]), + 'fc_2_weight': 
mx.nd.array([.60, .60])} + + arg_params_missing_extra = {'fc_2_weight': mx.nd.array([.60, .60])} + + # test regular set_params + mod.set_params(force_init=True, arg_params=arg_params_correct, aux_params={}) + + # test allow missing + mod.set_params(force_init=True, arg_params=arg_params_missing, aux_params={}, allow_missing=True) + assertRaises(RuntimeError, mod.set_params, + force_init=True, arg_params=arg_params_missing, + aux_params={}, allow_missing=False) + + # test allow extra + mod.set_params(force_init=True, arg_params=arg_params_extra, aux_params={}, allow_missing=True, allow_extra=True) + assertRaises(ValueError, mod.set_params, + force_init=True, arg_params=arg_params_extra, + aux_params={}, allow_missing=True, allow_extra=False) + + # test allow missing + extra, + assertRaises(RuntimeError, mod.set_params, + force_init=True, arg_params=arg_params_missing_extra, + aux_params={}, allow_missing=False, allow_extra=False) + + # test allow missing + extra, this will throw a runtime error + assertRaises(ValueError, mod.set_params, + force_init=True, arg_params=arg_params_missing_extra, + aux_params={}, allow_missing=True, allow_extra=False) + + +@with_seed() +def test_monitor(): + # data iter + data = mx.nd.array([[0.05, .10]]); + label = mx.nd.array([[.01, 0.99]]); + train_data = mx.io.NDArrayIter(data, label, batch_size=1) + + # symbols + x = mx.symbol.Variable('data') + x = mx.symbol.FullyConnected(name='fc_0', data=x, num_hidden=2) + x = mx.symbol.Activation(name="act_0", data=x, act_type='sigmoid') + x = mx.symbol.FullyConnected(name='fc_1', data=x, num_hidden=2) + x = mx.symbol.Activation(name="act_1", data=x, act_type='sigmoid') + x = mx.symbol.LinearRegressionOutput(data=x, name='softmax', grad_scale=2) + + # create monitor + def mean_abs(x): + sum_abs = mx.ndarray.sum(mx.ndarray.abs(x)) + return mx.ndarray.divide(sum_abs, reduce(lambda x, y: x * y, x.shape)) + mon = mx.mon.Monitor(1, stat_func=mean_abs, pattern='.*', sort=True) + + # create module + 
mod = mx.mod.Module(x, context=[mx.cpu()]); + mod.bind(train_data.provide_data, label_shapes=train_data.provide_label, + for_training=True) + mod.install_monitor(mon) + arg_params = {'fc_0_weight': mx.nd.array([[.15, .20], [.25, .30]]), + 'fc_0_bias' : mx.nd.array([.35, .35]), + 'fc_1_weight': mx.nd.array([[.40, .45], [.50, .55]]), + 'fc_1_bias' : mx.nd.array([.60, .60])} + mod.init_params(arg_params=arg_params) + + data_iter = iter(train_data) + data_batch = next(data_iter) + mon.tic() + mod.forward_backward(data_batch) + res = mon.toc() + keys = ['act_0', 'act_1', 'data', 'fc_0', 'fc_1', 'softmax'] + mon_result_counts = [0, 0, 0, 0, 0, 0] + assert(len(res) == 21) + for n, k, v in res: + for idx, key in enumerate(keys): + if k.startswith(key): + mon_result_counts[idx] += 1 + break + assert(mon_result_counts == [2, 2, 1, 6, 6, 4]) + +@with_seed() +def test_executor_group(): + def get_rnn_sym(num_layers, num_words, num_hidden, num_embed, seq_len, sparse_embedding): + stack = mx.rnn.SequentialRNNCell() + for i in range(num_layers): + stack.add(mx.rnn.LSTMCell(num_hidden=num_hidden, prefix='lstm_l%d_' % i)) + data = mx.sym.Variable('data') + label = mx.sym.Variable('softmax_label') + if sparse_embedding: + embed_weight = mx.sym.Variable('embed_weight', stype='row_sparse') + embed = mx.sym.contrib.SparseEmbedding(data=data, input_dim=num_words, + weight=embed_weight, output_dim=num_embed, + name='embed') + else: + embed = mx.sym.Embedding(data=data, input_dim=num_words, + output_dim=num_embed, name='embed') + + stack.reset() + outputs, states = stack.unroll(seq_len, inputs=embed, merge_outputs=True) + + pred = mx.sym.Reshape(outputs, shape=(-1, num_hidden)) + pred = mx.sym.FullyConnected(data=pred, num_hidden=num_words, name='pred') + + label = mx.sym.Reshape(label, shape=(-1,)) + pred = mx.sym.SoftmaxOutput(data=pred, label=label, name='softmax') + return pred + + def test_shared_exec_group(exec_grp_shared, exec_grp_created, shared_arg_names=None, + extra_args=None, 
check_shared_grad=True): + # Test shared data arrays + for i in range(len(exec_grp_shared.execs)): + # test same shared_data_arrays for two exec groups + shared_data_array1 = exec_grp_shared.shared_data_arrays[i] + shared_data_array2 = exec_grp_created.shared_data_arrays[i] + if extra_args is not None: + assert len(shared_data_array1) == len(extra_args),\ + "exec_grp_shared.shared_data_arrays[%d] should have same number of args as extra_args" + assert len(shared_data_array1) == len(shared_data_array2),\ + "length of shared_data_array of the shared executor group not equal to the created executor group" + for k, v in shared_data_array1.items(): + if extra_args is not None: + assert k in extra_args, "arg %s is not in extra_args" % k + assert k in shared_data_array2,\ + "arg %s of the shared executor group not in the shared_data_array of the created executor group" % k + assert mx.test_utils.same_array(v, shared_data_array2[k]) + + for data_name, array in exec_grp_shared.shared_data_arrays[i].items(): + assert data_name in exec_grp_created.shared_data_arrays[i], \ + "Shared input data '%s' is not in " \ + "shared_data_arrays of created executor group." % (data_name) + assert mx.test_utils.same_array(array, exec_grp_created.shared_data_arrays[i][data_name]), \ + "Shared input data '%s' does not share memory." % (data_name) + + # Test shared argument arrays and gradient arrays + exec_shared = exec_grp_shared.execs[i] + exec_created = exec_grp_created.execs[i] + if shared_arg_names is not None: + # test shared arguments + for arg_name in shared_arg_names: + assert arg_name in exec_created.arg_dict, \ + "Shared argument '%s' is not in arg_dict of created executor group." % (arg_name) + assert mx.test_utils.same_array(exec_shared.arg_dict[arg_name], exec_created.arg_dict[arg_name]), \ + "Shared argument '%s' does not share memory." 
% (arg_name) + # test shared argument gradients + if check_shared_grad: + for arg_name in shared_arg_names: + assert arg_name in exec_created.grad_dict, \ + "Shared argument gradient '%s' is not in " \ + "grad_dict of created executor group." % (arg_name) + assert mx.test_utils.same_array(exec_shared.grad_dict[arg_name], \ + exec_created.grad_dict[arg_name]), \ + "Shared argument gradient '%s' does not share memory." % (arg_name) + + for arg_name, grad in exec_grp_shared.grad_req.items(): + assert grad == exec_grp_created.grad_req[arg_name], \ + "Gradient requirements for shared argument '%s' are inconsistent. " \ + "Shared executor group requires '%s' while created executor group requires '%s'" \ + %(arg_name, grad, exec_grp_created.grad_req[arg_name]) + + def check_shared_exec_group(sparse_embedding): + # generate an rnn sym with #layers=5 + sym = get_rnn_sym(num_layers=3, num_words=num_words, num_hidden=num_hidden, + num_embed=num_embed, seq_len=max_bucket_size, + sparse_embedding=sparse_embedding) + arg_names1 = sym.list_arguments() + input_names = [name[0] for name in data_shapes] + [name[0] for name in label_shapes] + shared_arg_names = [name for name in arg_names1 if name not in input_names] + exec_group1 = DataParallelExecutorGroup(symbol=sym, contexts=contexts, + workload=workload, data_shapes=data_shapes, + label_shapes=label_shapes, param_names=shared_arg_names, + for_training=True, inputs_need_grad=False) + + # shared_data_arrays should only have input "data" and "softmax_label" arrays + for i in range(len(contexts)): + assert len(exec_group1.shared_data_arrays[i]) == len(input_names),\ + "exec_group1.shared_data_arrays[%d] should have the same number of names as in input_names" % i + for name in input_names: + assert name in exec_group1.shared_data_arrays[i],\ + "arg %s should be in exec_group1.shared_data_arrays[%d]" % (name, i) + + # generate an rnn sym with #layers=5 + sym = get_rnn_sym(num_layers=5, num_words=num_words, num_hidden=num_hidden, + 
num_embed=num_embed, seq_len=max_bucket_size, + sparse_embedding=sparse_embedding) + arg_names2 = sym.list_arguments() + exec_group2 = DataParallelExecutorGroup(symbol=sym, contexts=contexts, + workload=workload, data_shapes=data_shapes, + label_shapes=label_shapes, param_names=shared_arg_names, + for_training=True, inputs_need_grad=False, + shared_group=exec_group1) + extra_args = [name for name in arg_names2 if name not in shared_arg_names] + check_shared_grad = not sparse_embedding + test_shared_exec_group(exec_grp_shared=exec_group1, exec_grp_created=exec_group2, + shared_arg_names=shared_arg_names, extra_args=extra_args, + check_shared_grad=check_shared_grad) + + contexts = [mx.cpu(0), mx.cpu(1)] + workload = [1] * len(contexts) + batch_size = 32 + max_bucket_size = 80 + num_words = 1000 + num_hidden = 100 + num_embed = 200 + data_shapes = [('data', (batch_size, max_bucket_size))] + label_shapes = [('softmax_label', (batch_size, max_bucket_size))] + sparse_embedding_opt = [True, False] + for opt in sparse_embedding_opt: + check_shared_exec_group(opt) + +@with_seed() +def test_factorization_machine_module(): + """ Test factorization machine model with sparse operators """ + # this unit test is to test the flow, training accuracy is tested in another test + def check_factorization_machine_module(num_epochs=None): + print("check_factorization_machine_module") + + def fm(factor_size, feature_dim, init): + x = mx.symbol.Variable("data", stype='csr') + v = mx.symbol.Variable("v", shape=(feature_dim, factor_size), + init=init, stype='row_sparse') + + w1_weight = mx.symbol.var('w1_weight', shape=(feature_dim, 1), + init=init, stype='row_sparse') + w1_bias = mx.symbol.var('w1_bias', shape=(1)) + w1 = mx.symbol.broadcast_add(mx.symbol.dot(x, w1_weight), w1_bias) + + v_s = mx.symbol._internal._square_sum(data=v, axis=1, keepdims=True) + x_s = mx.symbol.square(data=x) + bd_sum = mx.sym.dot(x_s, v_s) + + w2 = mx.symbol.dot(x, v) + w2_squared = 0.5 * 
mx.symbol.square(data=w2) + + w_all = mx.symbol.Concat(w1, w2_squared, dim=1) + sum1 = mx.symbol.sum(data=w_all, axis=1, keepdims=True) + sum2 = 0.5 * mx.symbol.negative(bd_sum) + model = mx.sym.elemwise_add(sum1, sum2) + + y = mx.symbol.Variable("label") + model = mx.symbol.LinearRegressionOutput(data=model, label=y) + return model + + # model + init = mx.initializer.Normal(sigma=0.01) + factor_size = 4 + feature_dim = 10000 + model = fm(factor_size, feature_dim, init) + + # data iter + num_batches = 5 + batch_size = 64 + num_samples = batch_size * num_batches + # generate some random csr data + csr_nd = rand_ndarray((num_samples, feature_dim), 'csr', 0.1) + label = mx.nd.ones((num_samples,1)) + # the alternative is to use LibSVMIter + train_iter = mx.io.NDArrayIter(data=csr_nd, + label={'label':label}, + batch_size=batch_size, + last_batch_handle='discard') + # create module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) + # allocate memory by given the input data and lable shapes + mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) + # initialize parameters by uniform random numbers + mod.init_params(initializer=init) + + # use Sparse SGD with learning rate 0.1 to train + sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, + rescale_grad=1.0/batch_size) + mod.init_optimizer(optimizer=sgd) + if num_epochs is None: + num_epochs = 50 + expected_accuracy = 0.02 + + # use accuracy as the metric + metric = mx.metric.create('MSE') + # train 'num_epochs' epoch + for epoch in range(num_epochs): + train_iter.reset() + metric.reset() + for batch in train_iter: + mod.forward(batch, is_train=True) # compute predictions + mod.update_metric(metric, batch.label) # accumulate prediction accuracy + mod.backward() # compute gradients + mod.update() # update parameters + print('Epoch %d, Training %s' % (epoch, metric.get())) + if num_epochs > 1: + assert(metric.get()[1] < expected_accuracy) + 
+ check_factorization_machine_module() + +@with_seed() +def test_module_initializer(): + def regression_model(m): + x = mx.symbol.var("data", stype='csr') + v = mx.symbol.var("v", shape=(m, 1), init=mx.init.Uniform(scale=.1), + stype='row_sparse') + model = mx.symbol.dot(lhs=x, rhs=v) + y = mx.symbol.Variable("label") + model = mx.symbol.LinearRegressionOutput(data=model, label=y, name="out") + return model + + n, m = 128, 100 + model = regression_model(m) + + data = mx.nd.zeros(shape=(n, m), stype='csr') + label = mx.nd.zeros((n, 1)) + iterator = mx.io.NDArrayIter(data=data, label={'label':label}, + batch_size=n, last_batch_handle='discard') + + # create module + mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) + mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label) + mod.init_params() + v = mod._arg_params['v'] + assert(v.stype == 'row_sparse') + assert(np.sum(v.asnumpy()) != 0) + +@with_seed() +def test_forward_reshape(): + num_class=10 + data1 = mx.sym.Variable('data1') + data2 = mx.sym.Variable('data2') + conv1 = mx.sym.Convolution(data=data1, kernel=(2, 2), num_filter=2, stride=(2, 2)) + conv2 = mx.sym.Convolution(data=data2, kernel=(3, 3), num_filter=3, stride=(1, 1)) + pooling1 = mx.sym.Pooling(data=conv1, kernel=(2, 2), stride=(1, 1), pool_type="avg") + pooling2 = mx.sym.Pooling(data=conv2, kernel=(2, 2), stride=(1, 1), pool_type="max") + flatten1 = mx.sym.flatten(data=pooling1) + flatten2 = mx.sym.flatten(data=pooling2) + sum = mx.sym.sum(data=flatten1, axis=1) + mx.sym.sum(data=flatten2, axis=1) + fc = mx.sym.FullyConnected(data=sum, num_hidden=num_class) + sym = mx.sym.SoftmaxOutput(data=fc, name='softmax') + + dshape1 = (10, 3, 64, 64) + dshape2 = (10, 3, 32, 32) + lshape = (10,) + + mod = mx.mod.Module(symbol=sym, data_names=['data1', 'data2'], + label_names=['softmax_label']) + mod.bind(data_shapes=[('data1', dshape1), ('data2', dshape2)], + label_shapes=[('softmax_label', lshape)]) + 
mod.init_params() + mod.init_optimizer(optimizer_params={'learning_rate': 0.01}) + + # Train with original data shapes + data_batch = mx.io.DataBatch(data=[mx.nd.random.uniform(0, 9, dshape1), + mx.nd.random.uniform(5, 15, dshape2)], + label=[mx.nd.ones(lshape)]) + mod.forward(data_batch) + assert mod.get_outputs()[0].shape == tuple([lshape[0], num_class]) + mod.backward() + mod.update() + + # Train with different batch size + dshape1 = (3, 3, 64, 64) + dshape2 = (3, 3, 32, 32) + lshape = (3,) + data_batch = mx.io.DataBatch(data=[mx.nd.random.uniform(0, 9, dshape1), + mx.nd.random.uniform(5, 15, dshape2)], + label=[mx.nd.ones(lshape)]) + mod.forward(data_batch) + assert mod.get_outputs()[0].shape == tuple([lshape[0], num_class]) + mod.backward() + mod.update() + + dshape1 = (20, 3, 64, 64) + dshape2 = (20, 3, 32, 32) + lshape = (20,) + data_batch = mx.io.DataBatch(data=[mx.nd.random.uniform(3, 5, dshape1), + mx.nd.random.uniform(10, 25, dshape2)], + label=[mx.nd.ones(lshape)]) + mod.forward(data_batch) + assert mod.get_outputs()[0].shape == tuple([lshape[0], num_class]) + mod.backward() + mod.update() + + #Train with both different batch size and data shapes + dshape1 = (20, 3, 120, 120) + dshape2 = (20, 3, 32, 64) + lshape = (20,) + data_batch = mx.io.DataBatch(data=[mx.nd.random.uniform(0, 9, dshape1), + mx.nd.random.uniform(5, 15, dshape2)], + label=[mx.nd.ones(lshape)]) + mod.forward(data_batch) + assert mod.get_outputs()[0].shape == tuple([lshape[0], num_class]) + mod.backward() + mod.update() + + dshape1 = (5, 3, 28, 40) + dshape2 = (5, 3, 24, 16) + lshape = (5,) + data_batch = mx.io.DataBatch(data=[mx.nd.random.uniform(0, 9, dshape1), + mx.nd.random.uniform(15, 25, dshape2)], + label=[mx.nd.ones(lshape)]) + mod.forward(data_batch) + assert mod.get_outputs()[0].shape == tuple([lshape[0], num_class]) + mod.backward() + mod.update() + + #Test score + dataset_shape1 = (30, 3, 30, 30) + dataset_shape2 = (30, 3, 20, 40) + labelset_shape = (30,) + + eval_dataiter = 
mx.io.NDArrayIter(data=[mx.nd.random.uniform(0, 9, dataset_shape1), + mx.nd.random.uniform(15, 25, dataset_shape2)], + label=[mx.nd.ones(labelset_shape)], + batch_size=5) + assert len(mod.score(eval_data=eval_dataiter, eval_metric='acc')) == 1 + + #Test prediction + dshape1 = (1, 3, 30, 30) + dshape2 = (1, 3, 20, 40) + dataset_shape1 = (10, 3, 30, 30) + dataset_shape2 = (10, 3, 20, 40) + + pred_dataiter = mx.io.NDArrayIter(data=[mx.nd.random.uniform(0, 9, dataset_shape1), + mx.nd.random.uniform(15, 25, dataset_shape2)]) + mod.bind(data_shapes=[('data1', dshape1), ('data2', dshape2)], + for_training=False, force_rebind=True) + assert mod.predict(pred_dataiter).shape == tuple([10, num_class]) + +@with_seed() +def test_forward_types(): + #Test forward with other data batch API + Batch = namedtuple('Batch', ['data']) + data = mx.sym.Variable('data') + out = data * 2 + mod = mx.mod.Module(symbol=out, label_names=None) + mod.bind(data_shapes=[('data', (1, 10))]) + mod.init_params() + data1 = [mx.nd.ones((1, 10))] + mod.forward(Batch(data1)) + assert mod.get_outputs()[0].shape == (1, 10) + data2 = [mx.nd.ones((3, 5))] + mod.forward(Batch(data2)) + assert mod.get_outputs()[0].shape == (3, 5) + + #Test forward with other NDArray and np.ndarray inputs + data = mx.sym.Variable('data') + out = data * 2 + mod = mx.mod.Module(symbol=out, label_names=None) + mod.bind(data_shapes=[('data', (1, 10))]) + mod.init_params() + data1 = mx.nd.ones((1, 10)) + assert mod.predict(data1).shape == (1, 10) + data2 = np.ones((1, 10)) + assert mod.predict(data1).shape == (1, 10) + + +def test_reference_single_batch_during_fit(): + """ + When using C++-based iterators, it's important that only a single batch is referenced at a time. Because C++ + iterators are exposed to the Python code through a C API, there is no concept of reference counting. Hence, + typically C++ iterators will deallocate a batch when next() is called on them. 
So, we need to make sure the Python + code only references a single batch at a time, otherwise the Python code will attempt to access freed memory, + resulting in either (a) garbage accuracy or (b) a segmentation fault. + """ + current_batch_i = None + + class MockBatch(object): + def __init__(self, i): + self.i = i + + @property + def label(self): + global current_batch_i + assert self.i == current_batch_i + + class MockTrainData(object): + def __init__(self, batches): + self._i = 0 + self._batches = batches + self.provide_data = None + self.provide_label = None + self.reset = lambda: None + + def __iter__(self): + self._i = 0 + return self + + def __next__(self): + global current_batch_i + + if self._i < self._batches: + current_batch_i = self._i + self._i += 1 + return MockBatch(current_batch_i) + raise StopIteration + + def next(self): + return self.__next__() + + mod = mx.mod.BaseModule() + + def empty_fn(*args, **kwargs): + pass + mod.bind = empty_fn + mod.init_params = empty_fn + mod.init_optimizer = empty_fn + mod.forward = empty_fn + mod.backward = empty_fn + mod.update = empty_fn + mod.update_metric = empty_fn + mod.get_params = lambda: (None, None) + + train_data = MockTrainData(batches=2) + mod.fit(train_data, num_epoch=1) + +@with_seed() +def test_bucket_module_grad_req(): + batch_size = 2 + def sym_gen(_): + data = mx.symbol.Variable('data') + weight = mx.symbol.Variable('a', shape=(1,), init=mx.init.One()) + sym = mx.sym.make_loss(mx.sym.broadcast_mul(data, weight)) + return sym, ('data',), None + + mod = mx.mod.BucketingModule(sym_gen=sym_gen, default_bucket_key=10) + mod.bind(data_shapes=[['data', (batch_size, )]], for_training=True, grad_req='write') + mod.init_params() + + mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))], + label=None, + provide_data=[mx.io.DataDesc(name='data', shape=(batch_size, ), layout='N')], + bucket_key=10)) + assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == batch_size) + + 
mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))], + label=None, + provide_data=[mx.io.DataDesc(name='data', shape=(batch_size, ), layout='N')], + bucket_key=5)) + assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == batch_size) + + mod = mx.mod.BucketingModule(sym_gen=sym_gen, default_bucket_key=10) + mod.bind(data_shapes=[['data', (batch_size, )]], for_training=True, grad_req='add') + mod.init_params() + + mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))], + label=None, + provide_data=[mx.io.DataDesc(name='data', shape=(batch_size,), layout='N')], + bucket_key=10)) + assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == batch_size) + + mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))], + label=None, + provide_data=[mx.io.DataDesc(name='data', shape=(batch_size,), layout='N')], + bucket_key=5)) + assert mod._curr_module._grad_req == 'add' + assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == 2 * batch_size) + + +def test_module_update_no_pragram(): + # test module to do update on layers without params + data_shape = (10, 10) + data = mx.sym.Variable('data') + out = mx.sym.Dropout(data, 0.5) + mod = mx.mod.Module(out) + mod.bind(data_shapes=[('data', data_shape)]) + mod.init_params() + mod.init_optimizer() + data_batch = mx.io.DataBatch([nd.ones(data_shape)]) + mod.forward_backward(data_batch) + mod.update() + assert(mod.get_outputs()[0].shape == data_shape) + + +def test_module_init_optimizer(): + def get_module_idx2name(mod): + idx2name = {} + idx2name.update(enumerate(mod._exec_group.param_names)) + return idx2name + + data = mx.sym.Variable('data') + sym = mx.sym.FullyConnected(data, num_hidden=20, name='fc') + batch_size = 8 + opt_params = {'learning_rate': 1, 'rescale_grad': 1.0 / batch_size} + + # Pass an optimizer str + mod1 = mx.mod.Module(sym, ('data',), None, context=mx.cpu(0)) + mod1.bind(data_shapes=[('data', (batch_size, 20))]) + 
mod1.init_params() + mod1.init_optimizer(optimizer='sgd', optimizer_params=opt_params) + assert mod1._optimizer.idx2name == get_module_idx2name(mod1) + + # Pass an Optimizer object + mod2 = mx.mod.Module(sym, ('data',), None, context=mx.cpu(0)) + mod2.bind(data_shapes=[('data', (batch_size, 20))]) + mod2.init_params() + opt = mx.optimizer.SGD(**opt_params) + mod2.init_optimizer(optimizer=opt) + assert mod2._optimizer.idx2name == get_module_idx2name(mod2) + diff --git a/tools/caffe_converter/test_converter.py b/tools/caffe_converter/test_converter.py new file mode 100644 index 000000000000..49f8bdb167c2 --- /dev/null +++ b/tools/caffe_converter/test_converter.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
"""Test converted models

Converts a fixed set of Caffe model-zoo networks to MXNet and checks them in
two ways: a layer-by-layer comparison against Caffe on a single image, and a
top-1/top-5 accuracy check on validation data.
"""
import os
import argparse
import sys
import logging
import mxnet as mx
from convert_caffe_modelzoo import convert_caffe_model, get_model_meta_info, download_caffe_model
from compare_layers import convert_and_compare_caffe_to_mxnet

curr_path = os.path.abspath(os.path.dirname(__file__))
# The scoring helpers live in the image-classification example directory, so it
# has to be on sys.path before they can be imported.
sys.path.append(os.path.join(curr_path, "../../example/image-classification"))
from test_score import download_data  # pylint: disable=wrong-import-position
from score import score  # pylint: disable=wrong-import-position
logging.basicConfig(level=logging.DEBUG)


def test_imagenet_model_performance(model_name, val_data, gpus, batch_size):
    """Check that a converted model reaches its reference accuracy.

    The model is converted with ``convert_caffe_model``, reloaded from the
    checkpoint at epoch 0 and scored on at most 500 examples of ``val_data``.

    :param model_name: name understood by ``get_model_meta_info``
    :param val_data: validation data handle, passed to ``score`` as ``data_val``
    :param gpus: list of device ids; ``[-1]`` (first element -1) selects cpu
    :param batch_size: batch size passed to ``score``
    :raises AssertionError: if top-1 or top-5 accuracy falls more than 0.03
        below the ``top-1-acc`` / ``top-5-acc`` values in the model meta info
    """
    logging.info('test performance of model: %s', model_name)
    meta_info = get_model_meta_info(model_name)
    [model_name, mean] = convert_caffe_model(model_name, meta_info)
    sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 0)
    acc = [mx.metric.create('acc'), mx.metric.create('top_k_accuracy', top_k=5)]
    # A string mean is forwarded as `mean_img`; anything else is treated as a
    # sequence of per-channel values and joined into `rgb_mean`.
    if isinstance(mean, str):
        mean_args = {'mean_img': mean}
    else:
        mean_args = {'rgb_mean': ','.join([str(i) for i in mean])}

    print(val_data)
    # An empty device string is what `score` receives for the cpu case.
    gpus_string = '' if gpus[0] == -1 else ','.join([str(i) for i in gpus])
    (speed,) = score(model=(sym, arg_params, aux_params),
                     data_val=val_data,
                     label_name='prob_label',
                     metrics=acc,
                     gpus=gpus_string,
                     batch_size=batch_size,
                     max_num_examples=500,
                     **mean_args)
    logging.info('speed : %f image/sec', speed)
    for a in acc:
        logging.info(a.get())
    max_performance_diff_allowed = 0.03
    assert acc[0].get()[1] > meta_info['top-1-acc'] - max_performance_diff_allowed
    assert acc[1].get()[1] > meta_info['top-5-acc'] - max_performance_diff_allowed


def test_model_weights_and_outputs(model_name, image_url, gpu):
    """
    Run the layer comparison on one of the known caffe models.
    :param model_name: available models are listed in convert_caffe_modelzoo.py
    :param image_url: image file or url to run inference on
    :param gpu: gpu to use, -1 for cpu
    """

    logging.info('test weights and outputs of model: %s', model_name)
    meta_info = get_model_meta_info(model_name)

    (prototxt, caffemodel, mean) = download_caffe_model(model_name, meta_info, dst_dir='./model')
    convert_and_compare_caffe_to_mxnet(image_url, gpu, prototxt, caffemodel, mean,
                                       mean_diff_allowed=1e-03, max_diff_allowed=1e-01)


def main():
    """Entrypoint for test_converter"""
    parser = argparse.ArgumentParser(description='Test Caffe converter')
    parser.add_argument('--cpu', action='store_true', help='use cpu?')
    parser.add_argument('--image_url', type=str,
                        default='https://github.com/dmlc/web-data/raw/master/mxnet/doc/'\
                                'tutorials/python/predict_image/cat.jpg',
                        help='input image to test inference, can be either file path or url')
    args = parser.parse_args()
    if args.cpu:
        gpus = [-1]
        default_batch_size = 32
    else:
        num_gpus = mx.context.num_gpus()
        assert num_gpus, 'At least one GPU is needed to run test_converter in GPU mode'
        # `gpus` must be bound in this branch too: it is indexed and passed on
        # below, and leaving it unset made every GPU run die with NameError.
        gpus = list(range(num_gpus))
        default_batch_size = 32 * num_gpus

    models = ['bvlc_googlenet', 'vgg-16', 'resnet-50']

    val = download_data()
    for m in models:
        test_model_weights_and_outputs(m, args.image_url, gpus[0])
        # Build/testing machines tend to be short on GPU memory.
        # Floor division keeps the batch size an int under Python 3.
        this_batch_size = default_batch_size // 4 if m == 'vgg-16' else default_batch_size
        test_imagenet_model_performance(m, val, gpus, this_batch_size)


if __name__ == '__main__':
    main()