From 0df09859578bffa7f1b3c42ec8168b678f8719cf Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich
Date: Thu, 30 May 2019 17:57:13 -0700
Subject: [PATCH 1/2] Add training container in v1alpha2

---
 .../RL-cifar10/Dockerfile                     |  32 ++++
 .../RL-cifar10/ModelConstructor.py            |  67 +++++++
 .../RL-cifar10/RunTrial.py                    |  77 ++++++++++
 .../RL-cifar10/op_library.py                  | 141 ++++++++++++++++++
 .../RL-cifar10/requirements.txt               |   2 +
 5 files changed, 319 insertions(+)
 create mode 100644 examples/v1alpha2/NAS-training-containers/RL-cifar10/Dockerfile
 create mode 100644 examples/v1alpha2/NAS-training-containers/RL-cifar10/ModelConstructor.py
 create mode 100644 examples/v1alpha2/NAS-training-containers/RL-cifar10/RunTrial.py
 create mode 100644 examples/v1alpha2/NAS-training-containers/RL-cifar10/op_library.py
 create mode 100644 examples/v1alpha2/NAS-training-containers/RL-cifar10/requirements.txt

diff --git a/examples/v1alpha2/NAS-training-containers/RL-cifar10/Dockerfile b/examples/v1alpha2/NAS-training-containers/RL-cifar10/Dockerfile
new file mode 100644
index 00000000000..212eb23bc02
--- /dev/null
+++ b/examples/v1alpha2/NAS-training-containers/RL-cifar10/Dockerfile
@@ -0,0 +1,32 @@
+ARG cuda_version=9.0
+ARG cudnn_version=7
+FROM nvidia/cuda:${cuda_version}-cudnn${cudnn_version}-devel
+
+# Install system packages
+RUN apt-get update && apt-get install -y software-properties-common && \
+    add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+      bzip2 \
+      g++ \
+      git \
+      graphviz \
+      libgl1-mesa-glx \
+      libhdf5-dev \
+      openmpi-bin \
+      python3.5 \
+      python3-pip \
+      python3-setuptools \
+      python3-dev \
+      wget && \
+    rm -rf /var/lib/apt/lists/*
+
+
+COPY . /app
+WORKDIR /app
+
+RUN pip3 install --upgrade pip
+RUN pip3 install --no-cache-dir -r requirements.txt
+ENV PYTHONPATH /app
+
+ENTRYPOINT ["python3.5", "-u", "RunTrial.py"]
diff --git a/examples/v1alpha2/NAS-training-containers/RL-cifar10/ModelConstructor.py b/examples/v1alpha2/NAS-training-containers/RL-cifar10/ModelConstructor.py
new file mode 100644
index 00000000000..6ef93cc3301
--- /dev/null
+++ b/examples/v1alpha2/NAS-training-containers/RL-cifar10/ModelConstructor.py
@@ -0,0 +1,67 @@
+import numpy as np
+from keras.models import Model
+from keras import backend as K
+import json
+from keras.layers import Input, Conv2D, ZeroPadding2D, concatenate, MaxPooling2D, \
+    AveragePooling2D, Dense, Activation, BatchNormalization, GlobalAveragePooling2D, Dropout
+from op_library import concat, conv, sp_conv, dw_conv, reduction
+
+
+class ModelConstructor(object):
+    def __init__(self, arc_json, nn_json):
+        self.arch = json.loads(arc_json)
+        nn_config = json.loads(nn_json)
+        self.num_layers = nn_config['num_layers']
+        self.input_sizes = nn_config['input_sizes']
+        self.output_size = nn_config['output_sizes'][-1]
+        self.embedding = nn_config['embedding']
+
+    def build_model(self):
+        # a list of the output tensors of all layers
+        all_layers = [0 for _ in range(self.num_layers + 1)]
+        # a list of the output dimensions of all layers (currently unused)
+        all_dims = [0 for _ in range(self.num_layers + 1)]
+
+        # ================= Stacking layers =================
+        # Input Layer. Layer 0
+        input_layer = Input(shape=self.input_sizes)
+        all_layers[0] = input_layer
+
+        # Intermediate Layers. Starting from layer 1.
+        for l in range(1, self.num_layers + 1):
+            input_layers = list()
+            opt = self.arch[l - 1][0]
+            opt_config = self.embedding[str(opt)]
+            skip = self.arch[l - 1][1:l + 1]
+
+            # set up the connection to the previous layer first
+            input_layers.append(all_layers[l - 1])
+
+            # then add skip connections
+            for i in range(l - 1):
+                if l > 1 and skip[i] == 1:
+                    input_layers.append(all_layers[i])
+
+            layer_input = concat(input_layers)
+            if opt_config['opt_type'] == 'convolution':
+                layer_output = conv(layer_input, opt_config)
+            elif opt_config['opt_type'] == 'separable_convolution':
+                layer_output = sp_conv(layer_input, opt_config)
+            elif opt_config['opt_type'] == 'depthwise_convolution':
+                layer_output = dw_conv(layer_input, opt_config)
+            elif opt_config['opt_type'] == 'reduction':
+                layer_output = reduction(layer_input, opt_config)
+
+            all_layers[l] = layer_output
+
+        # Final Layer
+        # Global Average Pooling, then a fully connected layer with softmax.
+        avgpooled = GlobalAveragePooling2D()(all_layers[self.num_layers])
+        dropped = Dropout(0.4)(avgpooled)
+        logits = Dense(units=self.output_size,
+                       activation='softmax')(dropped)
+
+        # Encapsulate the model
+        self.model = Model(inputs=input_layer, outputs=logits)
+
+        return self.model
diff --git a/examples/v1alpha2/NAS-training-containers/RL-cifar10/RunTrial.py b/examples/v1alpha2/NAS-training-containers/RL-cifar10/RunTrial.py
new file mode 100644
index 00000000000..f6ff7c933e7
--- /dev/null
+++ b/examples/v1alpha2/NAS-training-containers/RL-cifar10/RunTrial.py
@@ -0,0 +1,77 @@
+import keras
+import numpy as np
+from keras.datasets import cifar10
+from ModelConstructor import ModelConstructor
+from keras.utils import to_categorical
+from keras.utils import multi_gpu_model
+from keras.preprocessing.image import ImageDataGenerator
+import argparse
+import time
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='TrainingContainer')
+    parser.add_argument('--architecture', type=str, default="", metavar='N',
+                        help='architecture of the neural network')
+    parser.add_argument('--nn_config', type=str, default="", metavar='N',
+                        help='configurations and search space embeddings')
+    parser.add_argument('--num_epochs', type=int, default=10, metavar='N',
+                        help='number of epochs that each child will be trained')
+    parser.add_argument('--num_gpus', type=int, default=1, metavar='N',
+                        help='number of epochs that each child will be trained')
+    args = parser.parse_args()
+
+    arch = args.architecture.replace("\'", "\"")
+    print(">>> arch received by trial")
+    print(arch)
+
+    nn_config = args.nn_config.replace("\'", "\"")
+    print(">>> nn_config received by trial")
+    print(nn_config)
+
+    num_epochs = args.num_epochs
+    print(">>> num_epochs received by trial")
+    print(num_epochs)
+
+    num_gpus = args.num_gpus
+    print(">>> num_gpus received by trial")
+    print(num_gpus)
+
+    print("\n>>> Constructing Model...")
+    constructor = ModelConstructor(arch, nn_config)
+    test_model = constructor.build_model()
+    print(">>> Model Constructed Successfully\n")
+
+    if num_gpus > 1:
+        test_model = multi_gpu_model(test_model, gpus=num_gpus)
+
+    test_model.summary()
+    test_model.compile(loss=keras.losses.categorical_crossentropy,
+                       optimizer=keras.optimizers.Adam(lr=1e-3, decay=1e-4),
+                       metrics=['accuracy'])
+
+    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
+    x_train = x_train.astype('float32')
+    x_test = x_test.astype('float32')
+    x_train /= 255
+    x_test /= 255
+    y_train = to_categorical(y_train)
+    y_test = to_categorical(y_test)
+
+    augmentation = ImageDataGenerator(
+        width_shift_range=0.1,
+        height_shift_range=0.1,
+        horizontal_flip=True)
+
+    aug_data_flow = augmentation.flow(x_train, y_train, batch_size=128)
+
+    print(">>> Data Loaded. Training starts.")
+    for e in range(num_epochs):
+        print("\nTotal Epoch {}/{}".format(e+1, num_epochs))
+        history = test_model.fit_generator(generator=aug_data_flow,
+                                           steps_per_epoch=int(len(x_train)/128)+1,
+                                           epochs=1, verbose=1,
+                                           validation_data=(x_test, y_test))
+        print("Training-Accuracy={}".format(history.history['acc'][-1]))
+        print("Training-Loss={}".format(history.history['loss'][-1]))
+        print("Validation-Accuracy={}".format(history.history['val_acc'][-1]))
+        print("Validation-Loss={}".format(history.history['val_loss'][-1]))
diff --git a/examples/v1alpha2/NAS-training-containers/RL-cifar10/op_library.py b/examples/v1alpha2/NAS-training-containers/RL-cifar10/op_library.py
new file mode 100644
index 00000000000..cd95429d97b
--- /dev/null
+++ b/examples/v1alpha2/NAS-training-containers/RL-cifar10/op_library.py
@@ -0,0 +1,141 @@
+import numpy as np
+from keras import backend as K
+from keras.layers import Input, Conv2D, ZeroPadding2D, concatenate, MaxPooling2D, \
+    AveragePooling2D, Dense, Activation, BatchNormalization, GlobalAveragePooling2D, \
+    SeparableConv2D, DepthwiseConv2D
+
+
+def concat(inputs):
+    n = len(inputs)
+    if n == 1:
+        return inputs[0]
+
+    total_dim = list()
+    for x in inputs:
+        total_dim.append(K.int_shape(x))
+    total_dim = np.asarray(total_dim)
+    max_dim = max(total_dim[:, 1])
+
+    padded_input = [0 for _ in range(n)]
+
+    for i in range(n):
+        if total_dim[i][1] < max_dim:
+            diff = max_dim - total_dim[i][1]
+            half_diff = int(diff / 2)
+            if diff % 2 == 0:
+                padded_input[i] = ZeroPadding2D(padding=(half_diff, half_diff))(inputs[i])
+            else:
+                padded_input[i] = ZeroPadding2D(padding=((half_diff, half_diff + 1),
+                                                         (half_diff, half_diff + 1)))(inputs[i])
+        else:
+            padded_input[i] = inputs[i]
+
+    result = concatenate(inputs=padded_input, axis=-1)
+    return result
+
+
+def conv(x, config):
+    parameters = {
+        "num_filter": 64,
+        "filter_size": 3,
+        "stride": 1,
+    }
+    for k in parameters.keys():
+        if k in config:
+            parameters[k] = int(config[k])
+
+    activated = Activation('relu')(x)
+
+    conved = Conv2D(
+        filters=parameters['num_filter'],
+        kernel_size=parameters['filter_size'],
+        strides=parameters['stride'],
+        padding='same')(activated)
+
+    result = BatchNormalization()(conved)
+
+    return result
+
+
+def sp_conv(x, config):
+    parameters = {
+        "num_filter": 64,
+        "filter_size": 3,
+        "stride": 1,
+        "depth_multiplier": 1,
+    }
+
+    for k in parameters.keys():
+        if k in config:
+            parameters[k] = int(config[k])
+
+    activated = Activation('relu')(x)
+
+    conved = SeparableConv2D(
+        filters=parameters['num_filter'],
+        kernel_size=parameters['filter_size'],
+        strides=parameters['stride'],
+        depth_multiplier=parameters['depth_multiplier'],
+        padding='same')(activated)
+
+    result = BatchNormalization()(conved)
+
+    return result
+
+def dw_conv(x, config):
+    parameters = {
+        "filter_size": 3,
+        "stride": 1,
+        "depth_multiplier": 1,
+    }
+    for k in parameters.keys():
+        if k in config:
+            parameters[k] = int(config[k])
+
+    activated = Activation('relu')(x)
+
+    conved = DepthwiseConv2D(
+        kernel_size=parameters['filter_size'],
+        strides=parameters['stride'],
+        depth_multiplier=parameters['depth_multiplier'],
+        padding='same')(activated)
+
+    result = BatchNormalization()(conved)
+
+    return result
+
+
+def reduction(x, config):
+    # Handle the extreme case where the input has dimensions 1 by 1 and is not reducible;
+    # in that case we just replace the reduction layer with an identity layer.
+    # Despite being an edge case, such a situation is quite likely to appear.
+    dim = K.int_shape(x)
+    if dim[1] == 1 or dim[2] == 1:
+        print("WARNING: One or more dimensions of the input of the reduction layer is 1. It cannot be reduced further. An identity layer will be used instead.")
+        return x
+
+    parameters = {
+        'reduction_type': "max_pooling",
+        'pool_size': 2,
+        'stride': None,
+    }
+
+    if 'reduction_type' in config:
+        parameters['reduction_type'] = config['reduction_type']
+    if 'pool_size' in config:
+        parameters['pool_size'] = int(config['pool_size'])
+    if 'stride' in config:
+        parameters['stride'] = int(config['stride'])
+
+    if parameters['reduction_type'] == 'max_pooling':
+        result = MaxPooling2D(
+            pool_size=parameters['pool_size'],
+            strides=parameters['stride']
+        )(x)
+    elif parameters['reduction_type'] == 'avg_pooling':
+        result = AveragePooling2D(
+            pool_size=parameters['pool_size'],
+            strides=parameters['stride']
+        )(x)
+
+    return result
diff --git a/examples/v1alpha2/NAS-training-containers/RL-cifar10/requirements.txt b/examples/v1alpha2/NAS-training-containers/RL-cifar10/requirements.txt
new file mode 100644
index 00000000000..ce7881a31e3
--- /dev/null
+++ b/examples/v1alpha2/NAS-training-containers/RL-cifar10/requirements.txt
@@ -0,0 +1,2 @@
+tensorflow-gpu==1.12.0
+keras==2.2.4

From 50e58a5908eaa6aa14ea29a2fa9f8eee0c727fb1 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich
Date: Thu, 30 May 2019 18:11:22 -0700
Subject: [PATCH 2/2] Modify RunTrial

---
 .../v1alpha2/NAS-training-containers/RL-cifar10/RunTrial.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/v1alpha2/NAS-training-containers/RL-cifar10/RunTrial.py b/examples/v1alpha2/NAS-training-containers/RL-cifar10/RunTrial.py
index f6ff7c933e7..82e5b0e1cef 100644
--- a/examples/v1alpha2/NAS-training-containers/RL-cifar10/RunTrial.py
+++ b/examples/v1alpha2/NAS-training-containers/RL-cifar10/RunTrial.py
@@ -17,7 +17,7 @@
     parser.add_argument('--num_epochs', type=int, default=10, metavar='N',
                         help='number of epochs that each child will be trained')
     parser.add_argument('--num_gpus', type=int, default=1, metavar='N',
-                        help='number of epochs that each child will be trained')
+                        help='number of GPUs used for training')
     args = parser.parse_args()
 
     arch = args.architecture.replace("\'", "\"")
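-- 
Reviewer note (not part of the patches above): a minimal local sketch of the JSON
payloads that RunTrial.py and ModelConstructor expect. The operation indices and
embedding values below are illustrative assumptions for a two-layer search space;
in a real Experiment they are generated by Katib's suggestion service.

import json
from ModelConstructor import ModelConstructor

# Layer 1 applies op 0; layer 2 applies op 1 and adds a skip connection
# to the input (layer 0).
arch = [[0], [1, 1]]

nn_config = {
    "num_layers": 2,
    "input_sizes": [32, 32, 3],   # CIFAR-10 image shape
    "output_sizes": [10],         # 10 CIFAR-10 classes
    "embedding": {
        # keys are the stringified op indices used in `arch`
        "0": {"opt_type": "convolution",
              "filter_size": 3, "num_filter": 64, "stride": 1},
        "1": {"opt_type": "reduction",
              "reduction_type": "max_pooling", "pool_size": 2},
    },
}

model = ModelConstructor(json.dumps(arch), json.dumps(nn_config)).build_model()
model.summary()

Since RunTrial.py replaces single quotes with double quotes before json.loads,
the same values can be passed on the command line with Python-style quoting,
e.g. --architecture "[[0], [1, 1]]" together with a matching --nn_config string.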