Training Container for NAS RL Suggestion in v1alpha2 (#614)
* Add training container in v1alpha2

* Modify runTrial
andreyvelich authored and k8s-ci-robot committed Jun 4, 2019
1 parent cb25807 commit 32d3401
Showing 5 changed files with 319 additions and 0 deletions.
32 changes: 32 additions & 0 deletions examples/v1alpha2/NAS-training-containers/RL-cifar10/Dockerfile
@@ -0,0 +1,32 @@
ARG cuda_version=9.0
ARG cudnn_version=7
FROM nvidia/cuda:${cuda_version}-cudnn${cudnn_version}-devel

# Install system packages
RUN apt-get update && apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
      bzip2 \
      g++ \
      git \
      graphviz \
      libgl1-mesa-glx \
      libhdf5-dev \
      openmpi-bin \
      python3.5 \
      python3-pip \
      python3-setuptools \
      python3-dev \
      wget && \
    rm -rf /var/lib/apt/lists/*


ADD . /app
WORKDIR /app

RUN pip3 install --upgrade pip
RUN pip3 install --no-cache-dir -r requirements.txt
ENV PYTHONPATH /app

ENTRYPOINT ["python3.5", "-u", "RunTrial.py"]
67 changes: 67 additions & 0 deletions examples/v1alpha2/NAS-training-containers/RL-cifar10/ModelConstructor.py
@@ -0,0 +1,67 @@
import numpy as np
from keras.models import Model
from keras import backend as K
import json
from keras.layers import Input, Conv2D, ZeroPadding2D, concatenate, MaxPooling2D, \
    AveragePooling2D, Dense, Activation, BatchNormalization, GlobalAveragePooling2D, Dropout
from op_library import concat, conv, sp_conv, dw_conv, reduction


class ModelConstructor(object):
    def __init__(self, arc_json, nn_json):
        self.arch = json.loads(arc_json)
        nn_config = json.loads(nn_json)
        self.num_layers = nn_config['num_layers']
        self.input_sizes = nn_config['input_sizes']
        self.output_size = nn_config['output_sizes'][-1]
        self.embedding = nn_config['embedding']

    def build_model(self):
        # a list of the output tensors of all layers
        all_layers = [0 for _ in range(self.num_layers + 1)]
        # a list of the dimensions of all layers
        all_dims = [0 for _ in range(self.num_layers + 1)]

        # ================= Stacking layers =================
        # Input Layer. Layer 0
        input_layer = Input(shape=self.input_sizes)
        all_layers[0] = input_layer

        # Intermediate Layers. Starting from layer 1.
        for l in range(1, self.num_layers + 1):
            input_layers = list()
            # The first element of each per-layer list selects the operation;
            # the remaining elements are skip-connection bits.
            opt = self.arch[l - 1][0]
            opt_config = self.embedding[str(opt)]
            skip = self.arch[l - 1][1:l + 1]

            # set up the connection to the previous layer first
            input_layers.append(all_layers[l - 1])

            # then add skip connections
            for i in range(l - 1):
                if l > 1 and skip[i] == 1:
                    input_layers.append(all_layers[i])

            layer_input = concat(input_layers)
            if opt_config['opt_type'] == 'convolution':
                layer_output = conv(layer_input, opt_config)
            elif opt_config['opt_type'] == 'separable_convolution':
                layer_output = sp_conv(layer_input, opt_config)
            elif opt_config['opt_type'] == 'depthwise_convolution':
                layer_output = dw_conv(layer_input, opt_config)
            elif opt_config['opt_type'] == 'reduction':
                layer_output = reduction(layer_input, opt_config)

            all_layers[l] = layer_output

        # Final Layer
        # Global Average Pooling, then Fully connected with softmax.
        avgpooled = GlobalAveragePooling2D()(all_layers[self.num_layers])
        dropped = Dropout(0.4)(avgpooled)
        logits = Dense(units=self.output_size,
                       activation='softmax')(dropped)

        # Encapsulate the model
        self.model = Model(inputs=input_layer, outputs=logits)

        return self.model
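
To make the expected inputs concrete, here is a minimal driver (illustrative only, not part of the commit; the architecture and embedding values are invented, and it assumes it is run from the RL-cifar10 directory). In each per-layer list, the first element selects an operation from the embedding and the remaining bits mark skip connections from earlier layers:

import json

from ModelConstructor import ModelConstructor

# Invented 3-layer architecture: [op_index, skip_bit_0, skip_bit_1, ...]
arch = [
    [0],           # layer 1: op 0, no skip slots yet
    [1, 1],        # layer 2: op 1, skip connection from the input (layer 0)
    [2, 0, 1],     # layer 3: op 2, skip connection from layer 1 only
]

# Invented search-space embedding matching the opt_types in op_library.py.
nn_config = {
    "num_layers": 3,
    "input_sizes": [32, 32, 3],
    "output_sizes": [10],
    "embedding": {
        "0": {"opt_type": "convolution", "num_filter": 64,
              "filter_size": 3, "stride": 1},
        "1": {"opt_type": "separable_convolution", "num_filter": 64,
              "filter_size": 3, "stride": 1, "depth_multiplier": 1},
        "2": {"opt_type": "reduction", "reduction_type": "max_pooling",
              "pool_size": 2},
    },
}

# ModelConstructor takes both arguments as JSON strings.
model = ModelConstructor(json.dumps(arch), json.dumps(nn_config)).build_model()
model.summary()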
77 changes: 77 additions & 0 deletions examples/v1alpha2/NAS-training-containers/RL-cifar10/RunTrial.py
@@ -0,0 +1,77 @@
import keras
import numpy as np
from keras.datasets import cifar10
from ModelConstructor import ModelConstructor
from keras.utils import to_categorical
from keras.utils import multi_gpu_model
from keras.preprocessing.image import ImageDataGenerator
import argparse
import time

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='TrainingContainer')
    parser.add_argument('--architecture', type=str, default="", metavar='N',
                        help='architecture of the neural network')
    parser.add_argument('--nn_config', type=str, default="", metavar='N',
                        help='configurations and search space embeddings')
    parser.add_argument('--num_epochs', type=int, default=10, metavar='N',
                        help='number of epochs for which each child network is trained')
    parser.add_argument('--num_gpus', type=int, default=1, metavar='N',
                        help='number of GPUs used for training')
    args = parser.parse_args()

    # The flags arrive as single-quoted strings; rewrite them into valid JSON.
    arch = args.architecture.replace("\'", "\"")
    print(">>> arch received by trial")
    print(arch)

    nn_config = args.nn_config.replace("\'", "\"")
    print(">>> nn_config received by trial")
    print(nn_config)

    num_epochs = args.num_epochs
    print(">>> num_epochs received by trial")
    print(num_epochs)

    num_gpus = args.num_gpus
    print(">>> num_gpus received by trial:")
    print(num_gpus)

    print("\n>>> Constructing Model...")
    constructor = ModelConstructor(arch, nn_config)
    test_model = constructor.build_model()
    print(">>> Model Constructed Successfully\n")

    if num_gpus > 1:
        test_model = multi_gpu_model(test_model, gpus=num_gpus)

    test_model.summary()
    test_model.compile(loss=keras.losses.categorical_crossentropy,
                       optimizer=keras.optimizers.Adam(lr=1e-3, decay=1e-4),
                       metrics=['accuracy'])

    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255
    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    augmentation = ImageDataGenerator(
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True)

    aug_data_flow = augmentation.flow(x_train, y_train, batch_size=128)

    print(">>> Data Loaded. Training starts.")
    for e in range(num_epochs):
        print("\nTotal Epoch {}/{}".format(e + 1, num_epochs))
        history = test_model.fit_generator(generator=aug_data_flow,
                                           steps_per_epoch=int(len(x_train) / 128) + 1,
                                           epochs=1, verbose=1,
                                           validation_data=(x_test, y_test))
        print("Training-Accuracy={}".format(history.history['acc'][-1]))
        print("Training-Loss={}".format(history.history['loss'][-1]))
        print("Validation-Accuracy={}".format(history.history['val_acc'][-1]))
        print("Validation-Loss={}".format(history.history['val_loss'][-1]))
141 changes: 141 additions & 0 deletions examples/v1alpha2/NAS-training-containers/RL-cifar10/op_library.py
@@ -0,0 +1,141 @@
import numpy as np
from keras import backend as K
from keras.layers import Input, Conv2D, ZeroPadding2D, concatenate, MaxPooling2D, \
    AveragePooling2D, Dense, Activation, BatchNormalization, GlobalAveragePooling2D, \
    SeparableConv2D, DepthwiseConv2D


def concat(inputs):
    n = len(inputs)
    if n == 1:
        return inputs[0]

    total_dim = list()
    for x in inputs:
        total_dim.append(K.int_shape(x))
    total_dim = np.asarray(total_dim)
    max_dim = max(total_dim[:, 1])

    padded_input = [0 for _ in range(n)]

    # Zero-pad every smaller input up to the largest spatial dimension so
    # that tensors of different sizes can be concatenated channel-wise.
    for i in range(n):
        if total_dim[i][1] < max_dim:
            diff = max_dim - total_dim[i][1]
            half_diff = int(diff / 2)
            if diff % 2 == 0:
                padded_input[i] = ZeroPadding2D(padding=(half_diff, half_diff))(inputs[i])
            else:
                padded_input[i] = ZeroPadding2D(padding=((half_diff, half_diff + 1),
                                                         (half_diff, half_diff + 1)))(inputs[i])
        else:
            padded_input[i] = inputs[i]

    result = concatenate(inputs=padded_input, axis=-1)
    return result


def conv(x, config):
    parameters = {
        "num_filter": 64,
        "filter_size": 3,
        "stride": 1,
    }
    for k in parameters.keys():
        if k in config:
            parameters[k] = int(config[k])

    activated = Activation('relu')(x)

    conved = Conv2D(
        filters=parameters['num_filter'],
        kernel_size=parameters['filter_size'],
        strides=parameters['stride'],
        padding='same')(activated)

    result = BatchNormalization()(conved)

    return result


def sp_conv(x, config):
    parameters = {
        "num_filter": 64,
        "filter_size": 3,
        "stride": 1,
        "depth_multiplier": 1,
    }
    for k in parameters.keys():
        if k in config:
            parameters[k] = int(config[k])

    activated = Activation('relu')(x)

    conved = SeparableConv2D(
        filters=parameters['num_filter'],
        kernel_size=parameters['filter_size'],
        strides=parameters['stride'],
        depth_multiplier=parameters['depth_multiplier'],
        padding='same')(activated)

    result = BatchNormalization()(conved)

    return result


def dw_conv(x, config):
    parameters = {
        "filter_size": 3,
        "stride": 1,
        "depth_multiplier": 1,
    }
    for k in parameters.keys():
        if k in config:
            parameters[k] = int(config[k])

    activated = Activation('relu')(x)

    conved = DepthwiseConv2D(
        kernel_size=parameters['filter_size'],
        strides=parameters['stride'],
        depth_multiplier=parameters['depth_multiplier'],
        padding='same')(activated)

    result = BatchNormalization()(conved)

    return result


def reduction(x, config):
    # Handle the extreme case where the input has spatial dimension 1x1 and
    # cannot be reduced further; replace the reduction layer with identity.
    # Such a situation is quite likely to appear, though.
    dim = K.int_shape(x)
    if dim[1] == 1 or dim[2] == 1:
        print("WARNING: One or more dimensions of the input of the reduction layer is 1. "
              "It cannot be further reduced. An identity layer will be used instead.")
        return x

    parameters = {
        'reduction_type': "max_pooling",
        'pool_size': 2,
        'stride': None,
    }

    if 'reduction_type' in config:
        parameters['reduction_type'] = config['reduction_type']
    if 'pool_size' in config:
        parameters['pool_size'] = int(config['pool_size'])
    if 'stride' in config:
        parameters['stride'] = int(config['stride'])

    if parameters['reduction_type'] == 'max_pooling':
        result = MaxPooling2D(
            pool_size=parameters['pool_size'],
            strides=parameters['stride']
        )(x)
    elif parameters['reduction_type'] == 'avg_pooling':
        result = AveragePooling2D(
            pool_size=parameters['pool_size'],
            strides=parameters['stride']
        )(x)

    return result
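
A quick illustration of concat's shape alignment (assuming the pinned Keras/TensorFlow from requirements.txt are installed and this runs next to op_library.py; the shapes are arbitrary):

from keras import backend as K
from keras.layers import Input

from op_library import concat

a = Input(shape=(32, 32, 8))
b = Input(shape=(16, 16, 16))

# b is zero-padded from 16x16 up to 32x32, then the two tensors are
# joined along the channel axis: 8 + 16 = 24 channels.
merged = concat([a, b])
print(K.int_shape(merged))  # (None, 32, 32, 24)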
2 changes: 2 additions & 0 deletions examples/v1alpha2/NAS-training-containers/RL-cifar10/requirements.txt
@@ -0,0 +1,2 @@
tensorflow-gpu==1.12.0
keras==2.2.4
