diff --git a/.dockerignore b/.dockerignore index 320ecdadf4f..b82996d6553 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,7 +1,6 @@ .git .gitignore docs -examples manifests pkg/ui/*/frontend/node_modules pkg/ui/*/frontend/build diff --git a/examples/v1beta1/README.md b/examples/v1beta1/README.md index 05cd07aabe6..c8c2f7e1143 100644 --- a/examples/v1beta1/README.md +++ b/examples/v1beta1/README.md @@ -348,7 +348,8 @@ It will stop port-forward process and delete minikube cluster. docker.io/kubeflowkatib/mxnet-mnist ``` -- Pytorch mnist example with saving metrics to the file, [source](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/file-metrics-collector/mnist.py). +- Pytorch mnist example that saves metrics to a file or prints them to StdOut, + [source](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/pytorch-mnist/mnist.py). ``` docker.io/kubeflowkatib/pytorch-mnist @@ -372,13 +373,8 @@ docker.io/kubeflowkatib/enas-cnn-cifar10-cpu docker.io/kubeflowkatib/darts-cnn-cifar10 ``` -- Pytorch operator mnist example, [source](https://github.com/kubeflow/pytorch-operator/blob/master/examples/mnist/mnist.py). - -``` -gcr.io/kubeflow-ci/pytorch-dist-mnist-test -``` - -- Tf operator mnist example, [source](https://github.com/kubeflow/tf-operator/blob/master/examples/v1/mnist_with_summaries/mnist_with_summaries.py). +- TF operator mnist example that writes summary data, + [source](https://github.com/kubeflow/tf-operator/blob/master/examples/v1/mnist_with_summaries/mnist_with_summaries.py). 
``` gcr.io/kubeflow-ci/tf-mnist-with-summaries diff --git a/examples/v1beta1/custom-metricscollector-example.yaml b/examples/v1beta1/custom-metricscollector-example.yaml index 3671706b962..a0d9baf74c0 100644 --- a/examples/v1beta1/custom-metricscollector-example.yaml +++ b/examples/v1beta1/custom-metricscollector-example.yaml @@ -66,12 +66,14 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-e294a90 + # TODO (andreyvelich): Add tag to the image. + image: docker.io/kubeflowkatib/pytorch-mnist:latest imagePullPolicy: Always command: - - "python" - - "/opt/mnist/src/mnist.py" + - "python3" + - "/opt/pytorch-mnist/mnist.py" - "--epochs=1" + - "--log-path=/katib/mnist.log" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/file-metrics-collector/Dockerfile b/examples/v1beta1/file-metrics-collector/Dockerfile deleted file mode 100644 index 1038c48cedc..00000000000 --- a/examples/v1beta1/file-metrics-collector/Dockerfile +++ /dev/null @@ -1,13 +0,0 @@ -FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime - -RUN mkdir -p /katib /opt/mnist/src - -WORKDIR /opt/mnist/src -ADD mnist.py /opt/mnist/src - -RUN chgrp -R 0 /opt/mnist \ - && chmod -R g+rwX /opt/mnist \ - && chgrp -R 0 /katib \ - && chmod -R g+rwX /katib - -ENTRYPOINT ["python", "/opt/mnist/src/mnist.py"] diff --git a/examples/v1beta1/file-metricscollector-example.yaml b/examples/v1beta1/file-metricscollector-example.yaml index 1973217c3ac..51b3745d1be 100644 --- a/examples/v1beta1/file-metricscollector-example.yaml +++ b/examples/v1beta1/file-metricscollector-example.yaml @@ -53,12 +53,14 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/pytorch-mnist:v1beta1-e294a90 + # TODO (andreyvelich): Add tag to the image. 
+ image: docker.io/kubeflowkatib/pytorch-mnist:latest imagePullPolicy: Always command: - - "python" - - "/opt/mnist/src/mnist.py" - - "--epochs=1" + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=2" + - "--log-path=/katib/mnist.log" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" restartPolicy: Never diff --git a/examples/v1beta1/mxnet-mnist/Dockerfile b/examples/v1beta1/mxnet-mnist/Dockerfile index 5448f327be7..61ef150f7d0 100644 --- a/examples/v1beta1/mxnet-mnist/Dockerfile +++ b/examples/v1beta1/mxnet-mnist/Dockerfile @@ -1,13 +1,6 @@ -FROM ubuntu:16.04 +FROM mxnet/python:latest_cpu_native_py3 -RUN apt-get update && \ - apt-get install -y wget python3-dev gcc && \ - wget https://bootstrap.pypa.io/get-pip.py && \ - python3 get-pip.py - -RUN pip3 install mxnet - -ADD . /opt/mxnet-mnist +ADD examples/v1beta1/mxnet-mnist /opt/mxnet-mnist WORKDIR /opt/mxnet-mnist RUN chgrp -R 0 /opt/mxnet-mnist \ diff --git a/examples/v1beta1/nas/darts-cnn-cifar10/Dockerfile b/examples/v1beta1/nas/darts-cnn-cifar10/Dockerfile index c8a79d9cd5d..8ff56d8702e 100644 --- a/examples/v1beta1/nas/darts-cnn-cifar10/Dockerfile +++ b/examples/v1beta1/nas/darts-cnn-cifar10/Dockerfile @@ -4,8 +4,7 @@ FROM pytorch/pytorch:1.0-cuda${cuda_version}-cudnn${cudnn_version}-runtime ENV TARGET_DIR /opt/nas/darts-cnn-cifar10 - -ADD . 
${TARGET_DIR} +ADD examples/v1beta1/nas/darts-cnn-cifar10 ${TARGET_DIR} WORKDIR ${TARGET_DIR} RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/examples/v1beta1/nas/enas-cnn-cifar10/Dockerfile.cpu b/examples/v1beta1/nas/enas-cnn-cifar10/Dockerfile.cpu index a7cd04b0798..88bb4ab2a79 100644 --- a/examples/v1beta1/nas/enas-cnn-cifar10/Dockerfile.cpu +++ b/examples/v1beta1/nas/enas-cnn-cifar10/Dockerfile.cpu @@ -1,24 +1,12 @@ -FROM tensorflow/tensorflow:1.12.0 +FROM tensorflow/tensorflow:1.15.4-py3 ENV TARGET_DIR /opt/nas/enas-cnn-cifar10 -# Install system packages -RUN apt-get update && apt-get install -y software-properties-common && \ - add-apt-repository ppa:deadsnakes/ppa && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - python3-setuptools \ - python3-dev \ - python3-pip \ - git \ - graphviz \ - wget - -ADD . ${TARGET_DIR} +ADD examples/v1beta1/nas/enas-cnn-cifar10 ${TARGET_DIR} WORKDIR ${TARGET_DIR} RUN pip3 install --upgrade pip -RUN pip3 install --upgrade --no-cache-dir -r requirements-cpu.txt +RUN pip3 install --upgrade -r requirements.txt ENV PYTHONPATH ${TARGET_DIR} RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/examples/v1beta1/nas/enas-cnn-cifar10/Dockerfile.gpu b/examples/v1beta1/nas/enas-cnn-cifar10/Dockerfile.gpu index 755fef40f34..d95e3a25e6b 100644 --- a/examples/v1beta1/nas/enas-cnn-cifar10/Dockerfile.gpu +++ b/examples/v1beta1/nas/enas-cnn-cifar10/Dockerfile.gpu @@ -1,33 +1,12 @@ -ARG cuda_version=10.0 -ARG cudnn_version=7 -FROM nvidia/cuda:${cuda_version}-cudnn${cudnn_version}-devel +FROM tensorflow/tensorflow:1.15.4-gpu-py3 ENV TARGET_DIR /opt/nas/enas-cnn-cifar10 -# Install system packages -RUN apt-get update && apt-get install -y software-properties-common && \ - add-apt-repository ppa:deadsnakes/ppa && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - bzip2 \ - g++ \ - git \ - graphviz \ - libgl1-mesa-glx \ - libhdf5-dev \ - openmpi-bin \ - python3 \ - python3-pip \ - python3-setuptools \ - python3-dev \ 
- wget && \ - rm -rf /var/lib/apt/lists/* - -ADD . ${TARGET_DIR} +ADD examples/v1beta1/nas/enas-cnn-cifar10 ${TARGET_DIR} WORKDIR ${TARGET_DIR} RUN pip3 install --upgrade pip -RUN pip3 install --upgrade --no-cache-dir -r requirements-gpu.txt +RUN pip3 install --upgrade -r requirements.txt ENV PYTHONPATH ${TARGET_DIR} RUN chgrp -R 0 ${TARGET_DIR} \ diff --git a/examples/v1beta1/nas/enas-cnn-cifar10/requirements-cpu.txt b/examples/v1beta1/nas/enas-cnn-cifar10/requirements-cpu.txt deleted file mode 100644 index 88baeffd80d..00000000000 --- a/examples/v1beta1/nas/enas-cnn-cifar10/requirements-cpu.txt +++ /dev/null @@ -1,2 +0,0 @@ -tensorflow==1.15.4 -keras==2.2.4 diff --git a/examples/v1beta1/nas/enas-cnn-cifar10/requirements-gpu.txt b/examples/v1beta1/nas/enas-cnn-cifar10/requirements-gpu.txt deleted file mode 100644 index 48e7f6865e4..00000000000 --- a/examples/v1beta1/nas/enas-cnn-cifar10/requirements-gpu.txt +++ /dev/null @@ -1,2 +0,0 @@ -tensorflow-gpu==1.15.4 -keras==2.2.4 diff --git a/examples/v1beta1/nas/enas-cnn-cifar10/requirements.txt b/examples/v1beta1/nas/enas-cnn-cifar10/requirements.txt new file mode 100644 index 00000000000..1a23c027782 --- /dev/null +++ b/examples/v1beta1/nas/enas-cnn-cifar10/requirements.txt @@ -0,0 +1 @@ +keras==2.2.4 diff --git a/examples/v1beta1/pytorch-mnist/Dockerfile b/examples/v1beta1/pytorch-mnist/Dockerfile new file mode 100644 index 00000000000..abcff8d7b58 --- /dev/null +++ b/examples/v1beta1/pytorch-mnist/Dockerfile @@ -0,0 +1,14 @@ +FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime + +ADD examples/v1beta1/pytorch-mnist /opt/pytorch-mnist +WORKDIR /opt/pytorch-mnist + +# Add folder for the logs. 
+RUN mkdir /katib + +RUN chgrp -R 0 /opt/pytorch-mnist \ + && chmod -R g+rwX /opt/pytorch-mnist \ + && chgrp -R 0 /katib \ + && chmod -R g+rwX /katib + +ENTRYPOINT ["python3", "/opt/pytorch-mnist/mnist.py"] diff --git a/examples/v1beta1/file-metrics-collector/mnist.py b/examples/v1beta1/pytorch-mnist/mnist.py similarity index 55% rename from examples/v1beta1/file-metrics-collector/mnist.py rename to examples/v1beta1/pytorch-mnist/mnist.py index b436f2ca4b0..738fa2c8bf3 100644 --- a/examples/v1beta1/file-metrics-collector/mnist.py +++ b/examples/v1beta1/pytorch-mnist/mnist.py @@ -11,9 +11,13 @@ import torch.nn.functional as F import torch.optim as optim -WORLD_SIZE = int(os.environ.get('WORLD_SIZE', 1)) +# To fix this issue: https://github.com/pytorch/vision/issues/1938. +from six.moves import urllib +opener = urllib.request.build_opener() +opener.addheaders = [("User-agent", "Mozilla/5.0")] +urllib.request.install_opener(opener) -logging.basicConfig(filename='/katib/mnist.log', level=logging.DEBUG) +WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1)) class Net(nn.Module): @@ -45,11 +49,10 @@ def train(args, model, device, train_loader, optimizer, epoch): loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: - msg = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}'.format( + msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format( epoch, batch_idx * len(data), len(train_loader.dataset), 100. 
* batch_idx / len(train_loader), loss.item()) - print(msg) - logging.debug(msg) + logging.info(msg) niter = epoch * len(train_loader) + batch_idx @@ -61,12 +64,12 @@ def test(args, model, device, test_loader, epoch): for data, target in test_loader: data, target = data.to(device), target.to(device) output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss + test_loss += F.nll_loss(output, target, reduction="sum").item() # sum up batch loss pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability correct += pred.eq(target.view_as(pred)).sum().item() test_loss /= len(test_loader.dataset) - logging.info('\n{{metricName: accuracy, metricValue: {:.4f}}};{{metricName: loss, metricValue: {:.4f}}}\n'.format( + logging.info("{{metricName: accuracy, metricValue: {:.4f}}};{{metricName: loss, metricValue: {:.4f}}}\n".format( float(correct) / len(test_loader.dataset), test_loss)) @@ -80,52 +83,70 @@ def is_distributed(): def main(): # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many 
batches to wait before logging training status') - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') + parser = argparse.ArgumentParser(description="PyTorch MNIST Example") + parser.add_argument("--batch-size", type=int, default=64, metavar="N", + help="input batch size for training (default: 64)") + parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N", + help="input batch size for testing (default: 1000)") + parser.add_argument("--epochs", type=int, default=10, metavar="N", + help="number of epochs to train (default: 10)") + parser.add_argument("--lr", type=float, default=0.01, metavar="LR", + help="learning rate (default: 0.01)") + parser.add_argument("--momentum", type=float, default=0.5, metavar="M", + help="SGD momentum (default: 0.5)") + parser.add_argument("--no-cuda", action="store_true", default=False, + help="disables CUDA training") + parser.add_argument("--seed", type=int, default=1, metavar="S", + help="random seed (default: 1)") + parser.add_argument("--log-interval", type=int, default=10, metavar="N", + help="how many batches to wait before logging training status") + parser.add_argument("--log-path", type=str, default="", + help="Path to save logs. Print to StdOut if log-path is not set") + parser.add_argument("--save-model", action="store_true", default=False, + help="For Saving the current Model") + if dist.is_available(): - parser.add_argument('--backend', type=str, help='Distributed backend', + parser.add_argument("--backend", type=str, help="Distributed backend", choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI], default=dist.Backend.GLOO) args = parser.parse_args() + + # Use this format (%Y-%m-%dT%H:%M:%SZ) to record timestamp of the metrics. + # If log_path is empty print log to StdOut, otherwise print log to the file. 
+ if args.log_path == "": + logging.basicConfig( + format="%(asctime)s %(levelname)-8s %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.DEBUG) + else: + logging.basicConfig( + format="%(asctime)s %(levelname)-8s %(message)s", + datefmt="%Y-%m-%dT%H:%M:%SZ", + level=logging.DEBUG, + filename=args.log_path) + use_cuda = not args.no_cuda and torch.cuda.is_available() if use_cuda: - print('Using CUDA') + print("Using CUDA") torch.manual_seed(args.seed) device = torch.device("cuda" if use_cuda else "cpu") if should_distribute(): - print('Using distributed PyTorch with {} backend'.format(args.backend)) + print("Using distributed PyTorch with {} backend".format(args.backend)) dist.init_process_group(backend=args.backend) - kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {} train_loader = torch.utils.data.DataLoader( - datasets.MNIST('../data', train=True, download=True, + datasets.MNIST("../data", train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), batch_size=args.batch_size, shuffle=True, **kwargs) test_loader = torch.utils.data.DataLoader( - datasets.MNIST('../data', train=False, transform=transforms.Compose([ + datasets.MNIST("../data", train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])), @@ -148,5 +169,5 @@ def main(): torch.save(model.state_dict(), "mnist_cnn.pt") -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/v1beta1/pytorchjob-example.yaml b/examples/v1beta1/pytorchjob-example.yaml index 3376d861b73..6c139d6354d 100644 --- a/examples/v1beta1/pytorchjob-example.yaml +++ b/examples/v1beta1/pytorchjob-example.yaml @@ -8,9 +8,9 @@ spec: maxTrialCount: 12 maxFailedTrialCount: 3 objective: - type: maximize - goal: 0.99 - objectiveMetricName: accuracy + type: minimize + goal: 0.001 + 
objectiveMetricName: loss algorithm: algorithmName: random parameters: @@ -45,11 +45,13 @@ spec: spec: containers: - name: pytorch - image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + # TODO (andreyvelich): Add tag to the image. + image: docker.io/kubeflowkatib/pytorch-mnist:latest imagePullPolicy: Always command: - - "python" - - "/var/mnist.py" + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" Worker: @@ -59,10 +61,12 @@ spec: spec: containers: - name: pytorch - image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + # TODO (andreyvelich): Add tag to the image. + image: docker.io/kubeflowkatib/pytorch-mnist:latest imagePullPolicy: Always command: - - "python" - - "/var/mnist.py" + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" diff --git a/manifests/v1beta1/components/katib-controller/trial-template-configmap.yaml b/manifests/v1beta1/components/katib-controller/trial-template-configmap.yaml index 64a1ea0036d..f094efbdf24 100644 --- a/manifests/v1beta1/components/katib-controller/trial-template-configmap.yaml +++ b/manifests/v1beta1/components/katib-controller/trial-template-configmap.yaml @@ -52,11 +52,13 @@ data: spec: containers: - name: pytorch - image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + # TODO (andreyvelich): Add tag to the image. + image: docker.io/kubeflowkatib/pytorch-mnist:latest imagePullPolicy: Always command: - - "python" - - "/var/mnist.py" + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" Worker: @@ -66,10 +68,12 @@ data: spec: containers: - name: pytorch - image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + # TODO (andreyvelich): Add tag to the image. 
+ image: docker.io/kubeflowkatib/pytorch-mnist:latest imagePullPolicy: Always command: - - "python" - - "/var/mnist.py" + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" diff --git a/manifests/v1beta1/katib-controller/trial-template-configmap.yaml b/manifests/v1beta1/katib-controller/trial-template-configmap.yaml index 25de70c326e..a82e562a9d6 100644 --- a/manifests/v1beta1/katib-controller/trial-template-configmap.yaml +++ b/manifests/v1beta1/katib-controller/trial-template-configmap.yaml @@ -53,11 +53,13 @@ data: spec: containers: - name: pytorch - image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + # TODO (andreyvelich): Add tag to the image. + image: docker.io/kubeflowkatib/pytorch-mnist:latest imagePullPolicy: Always command: - - "python" - - "/var/mnist.py" + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" Worker: @@ -67,10 +69,12 @@ data: spec: containers: - name: pytorch - image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + # TODO (andreyvelich): Add tag to the image. + image: docker.io/kubeflowkatib/pytorch-mnist:latest imagePullPolicy: Always command: - - "python" - - "/var/mnist.py" + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" diff --git a/operators/katib-controller/files/pytorchJobTemplate.yaml b/operators/katib-controller/files/pytorchJobTemplate.yaml index dda8b1d1ab6..7362f61616a 100644 --- a/operators/katib-controller/files/pytorchJobTemplate.yaml +++ b/operators/katib-controller/files/pytorchJobTemplate.yaml @@ -9,11 +9,13 @@ spec: spec: containers: - name: pytorch - image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + # TODO (andreyvelich): Add tag to the image. 
+ image: docker.io/kubeflowkatib/pytorch-mnist:latest imagePullPolicy: Always command: - - "python" - - "/var/mnist.py" + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" Worker: @@ -23,10 +25,12 @@ spec: spec: containers: - name: pytorch - image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + # TODO (andreyvelich): Add tag to the image. + image: docker.io/kubeflowkatib/pytorch-mnist:latest imagePullPolicy: Always command: - - "python" - - "/var/mnist.py" + - "python3" + - "/opt/pytorch-mnist/mnist.py" + - "--epochs=1" - "--lr=${trialParameters.learningRate}" - "--momentum=${trialParameters.momentum}" diff --git a/scripts/v1beta1/build.sh b/scripts/v1beta1/build.sh index db35ed677be..714f6da2b65 100755 --- a/scripts/v1beta1/build.sh +++ b/scripts/v1beta1/build.sh @@ -119,16 +119,16 @@ docker build -t ${REGISTRY}/earlystopping-medianstop:${TAG} -t ${REGISTRY}/early echo -e "\nBuilding training container images..." echo -e "\nBuilding mxnet mnist training container example...\n" -(cd examples/${VERSION}/mxnet-mnist && docker build -t ${REGISTRY}/mxnet-mnist:${TAG} -t ${REGISTRY}/mxnet-mnist:latest -f Dockerfile .) +docker build -t ${REGISTRY}/mxnet-mnist:${TAG} -t ${REGISTRY}/mxnet-mnist:latest -f examples/${VERSION}/mxnet-mnist/Dockerfile . echo -e "\nBuilding PyTorch mnist training container example...\n" -(cd examples/${VERSION}/file-metrics-collector && docker build -t ${REGISTRY}/pytorch-mnist:${TAG} -t ${REGISTRY}/pytorch-mnist:latest -f Dockerfile .) +docker build -t ${REGISTRY}/pytorch-mnist:${TAG} -t ${REGISTRY}/pytorch-mnist:latest -f examples/${VERSION}/pytorch-mnist/Dockerfile . echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with GPU support...\n" -(cd examples/${VERSION}/nas/enas-cnn-cifar10 && docker build -t ${REGISTRY}/enas-cnn-cifar10-gpu:${TAG} -t ${REGISTRY}/enas-cnn-cifar10-gpu:latest -f Dockerfile.gpu .) 
+docker build -t ${REGISTRY}/enas-cnn-cifar10-gpu:${TAG} -t ${REGISTRY}/enas-cnn-cifar10-gpu:latest -f examples/${VERSION}/nas/enas-cnn-cifar10/Dockerfile.gpu . echo -e "\nBuilding Keras CIFAR-10 CNN training container example for ENAS with CPU support...\n" -(cd examples/${VERSION}/nas/enas-cnn-cifar10 && docker build -t ${REGISTRY}/enas-cnn-cifar10-cpu:${TAG} -t ${REGISTRY}/enas-cnn-cifar10-cpu:latest -f Dockerfile.cpu .) +docker build -t ${REGISTRY}/enas-cnn-cifar10-cpu:${TAG} -t ${REGISTRY}/enas-cnn-cifar10-cpu:latest -f examples/${VERSION}/nas/enas-cnn-cifar10/Dockerfile.cpu . echo -e "\nBuilding PyTorch CIFAR-10 CNN training container example for DARTS...\n" -(cd examples/${VERSION}/nas/darts-cnn-cifar10 && docker build -t ${REGISTRY}/darts-cnn-cifar10:${TAG} -t ${REGISTRY}/darts-cnn-cifar10:latest -f Dockerfile .) +docker build -t ${REGISTRY}/darts-cnn-cifar10:${TAG} -t ${REGISTRY}/darts-cnn-cifar10:latest -f examples/${VERSION}/nas/darts-cnn-cifar10/Dockerfile . diff --git a/scripts/v1beta1/update-trial-images.sh b/scripts/v1beta1/update-trial-images.sh new file mode 100755 index 00000000000..b66fc823c0c --- /dev/null +++ b/scripts/v1beta1/update-trial-images.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# Copyright 2021 The Kubeflow Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script is used to update images and tags in the Trial templates for all Katib examples and manifests. 
+# Run ./scripts/v1beta1/update-trial-images.sh -p IMAGE_PREFIX -t TAG to execute it. +# For example, to update image tags to the latest: +# ./scripts/v1beta1/update-trial-images.sh -p docker.io/kubeflowkatib/ -t latest + +usage() { + echo "Usage: $0 [-p -t ]" 1>&2 + exit 1 +} + +while getopts ":p:t:" opt; do + case $opt in + p) + IMAGE_PREFIX=${OPTARG} + ;; + t) + TAG=${OPTARG} + ;; + *) + usage + ;; + esac +done + +if [[ -z "$IMAGE_PREFIX" || -z "$TAG" ]]; then + echo "Image prefix and tag must be set" + echo "Usage: $0 [-p -t ]" 1>&2 + exit 1 +fi + +echo "Updating Trial template images..." +echo "Image prefix: ${IMAGE_PREFIX}" +echo "Image tag: ${TAG}" + +# Base prefix for the Trial template images. +BASE_IMAGE_PREFIX="docker.io/kubeflowkatib/" + +# Name suffix of each Trial template image. +MXNET_MNIST="mxnet-mnist" +PYTORCH_MNIST="pytorch-mnist" +ENAS_GPU="enas-cnn-cifar10-gpu" +ENAS_CPU="enas-cnn-cifar10-cpu" +DARTS="darts-cnn-cifar10" + +# Update every Trial template image listed above. +# On macOS, sed needs -i '' to avoid leaving temp files. 
+if [[ $(uname) == "Darwin" ]]; then + find ./ -regex ".*\.yaml" -exec sed -i '' -e "s@${BASE_IMAGE_PREFIX}${MXNET_MNIST}:.*@${IMAGE_PREFIX}${MXNET_MNIST}:${TAG}@" {} \; + find ./ -regex ".*\.yaml" -exec sed -i '' -e "s@${BASE_IMAGE_PREFIX}${PYTORCH_MNIST}:.*@${IMAGE_PREFIX}${PYTORCH_MNIST}:${TAG}@" {} \; + find ./ -regex ".*\.yaml" -exec sed -i '' -e "s@${BASE_IMAGE_PREFIX}${ENAS_GPU}:.*@${IMAGE_PREFIX}${ENAS_GPU}:${TAG}@" {} \; + find ./ -regex ".*\.yaml" -exec sed -i '' -e "s@${BASE_IMAGE_PREFIX}${ENAS_CPU}:.*@${IMAGE_PREFIX}${ENAS_CPU}:${TAG}@" {} \; + find ./ -regex ".*\.yaml" -exec sed -i '' -e "s@${BASE_IMAGE_PREFIX}${DARTS}:.*@${IMAGE_PREFIX}${DARTS}:${TAG}@" {} \; +else + find ./ -regex ".*\.yaml" -exec sed -i -e "s@${BASE_IMAGE_PREFIX}${MXNET_MNIST}:.*@${IMAGE_PREFIX}${MXNET_MNIST}:${TAG}@" {} \; + find ./ -regex ".*\.yaml" -exec sed -i -e "s@${BASE_IMAGE_PREFIX}${PYTORCH_MNIST}:.*@${IMAGE_PREFIX}${PYTORCH_MNIST}:${TAG}@" {} \; + find ./ -regex ".*\.yaml" -exec sed -i -e "s@${BASE_IMAGE_PREFIX}${ENAS_GPU}:.*@${IMAGE_PREFIX}${ENAS_GPU}:${TAG}@" {} \; + find ./ -regex ".*\.yaml" -exec sed -i -e "s@${BASE_IMAGE_PREFIX}${ENAS_CPU}:.*@${IMAGE_PREFIX}${ENAS_CPU}:${TAG}@" {} \; + find ./ -regex ".*\.yaml" -exec sed -i -e "s@${BASE_IMAGE_PREFIX}${DARTS}:.*@${IMAGE_PREFIX}${DARTS}:${TAG}@" {} \; +fi + +echo "Trial template images has been updated" diff --git a/test/scripts/v1beta1/setup-katib.sh b/test/scripts/v1beta1/setup-katib.sh index 68ef262677b..b394005a5c6 100755 --- a/test/scripts/v1beta1/setup-katib.sh +++ b/test/scripts/v1beta1/setup-katib.sh @@ -65,6 +65,9 @@ sed -i -e "s@docker.io/kubeflowkatib\/earlystopping-medianstop@${ECR_REGISTRY}\/ cat manifests/v1beta1/katib-controller/katib-config.yaml +# Update Trial template images in the examples. 
+./scripts/v1beta1/update-trial-images.sh -p "${ECR_REGISTRY}/${REPO_NAME}/v1beta1/trial-" -t ${VERSION} + echo "Creating Kubeflow namespace" kubectl create namespace kubeflow diff --git a/test/workflows/components/workflows-v1beta1.libsonnet b/test/workflows/components/workflows-v1beta1.libsonnet index 350b0c48d2f..b55a51c53b5 100644 --- a/test/workflows/components/workflows-v1beta1.libsonnet +++ b/test/workflows/components/workflows-v1beta1.libsonnet @@ -273,6 +273,26 @@ name: "build-earlystopping-medianstop", template: "build-earlystopping-medianstop", }, + { + name: "build-trial-mxnet-mnist", + template: "build-trial-mxnet-mnist", + }, + { + name: "build-trial-pytorch-mnist", + template: "build-trial-pytorch-mnist", + }, + { + name: "build-trial-enas-cnn-cifar10-gpu", + template: "build-trial-enas-cnn-cifar10-gpu", + }, + { + name: "build-trial-enas-cnn-cifar10-cpu", + template: "build-trial-enas-cnn-cifar10-cpu", + }, + { + name: "build-trial-darts-cnn-cifar10", + template: "build-trial-darts-cnn-cifar10", + }, { name: "create-cluster", template: "create-cluster", @@ -321,17 +341,14 @@ name: "run-tfjob-e2e-tests", template: "run-tfjob-e2e-tests", }, - // TODO (andreyvelich): PyTorch training container doesn't work - // because of this: https://github.com/pytorch/vision/issues/1938 - // TorchVision can't download MNIST. 
- // { - // name: "run-pytorchjob-e2e-tests", - // template: "run-pytorchjob-e2e-tests", - // }, - // { - // name: "run-file-metricscollector-e2e-tests", - // template: "run-file-metricscollector-e2e-tests", - // }, + { + name: "run-pytorchjob-e2e-tests", + template: "run-pytorchjob-e2e-tests", + }, + { + name: "run-file-metricscollector-e2e-tests", + template: "run-file-metricscollector-e2e-tests", + }, { name: "run-never-resume-e2e-tests", template: "run-never-resume-e2e-tests", @@ -464,6 +481,36 @@ "--context=dir://" + katibDir, "--destination=" + registry + "/katib/v1beta1/earlystopping-medianstop:$(PULL_BASE_SHA)", ]), // build early stopping median stop + $.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("build-trial-mxnet-mnist", kanikoExecutorImage, [ + "/kaniko/executor", + "--dockerfile=" + katibDir + "/examples/v1beta1/mxnet-mnist/Dockerfile", + "--context=dir://" + katibDir, + "--destination=" + registry + "/katib/v1beta1/trial-mxnet-mnist:$(PULL_BASE_SHA)", + ]), // build Trial mxnet mnist + $.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("build-trial-pytorch-mnist", kanikoExecutorImage, [ + "/kaniko/executor", + "--dockerfile=" + katibDir + "/examples/v1beta1/pytorch-mnist/Dockerfile", + "--context=dir://" + katibDir, + "--destination=" + registry + "/katib/v1beta1/trial-pytorch-mnist:$(PULL_BASE_SHA)", + ]), // build Trial PyTorch mnist + $.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("build-trial-enas-cnn-cifar10-gpu", kanikoExecutorImage, [ + "/kaniko/executor", + "--dockerfile=" + katibDir + "/examples/v1beta1/nas/enas-cnn-cifar10/Dockerfile.gpu", + "--context=dir://" + katibDir, + "--destination=" + registry + "/katib/v1beta1/trial-enas-cnn-cifar10-gpu:$(PULL_BASE_SHA)", + ]), // build Trial enas cnn cifar10 with GPU support + $.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("build-trial-enas-cnn-cifar10-cpu", kanikoExecutorImage, [ + "/kaniko/executor", 
+ "--dockerfile=" + katibDir + "/examples/v1beta1/nas/enas-cnn-cifar10/Dockerfile.cpu", + "--context=dir://" + katibDir, + "--destination=" + registry + "/katib/v1beta1/trial-enas-cnn-cifar10-cpu:$(PULL_BASE_SHA)", + ]), // build Trial enas cnn cifar10 with CPU support + $.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("build-trial-darts-cnn-cifar10", kanikoExecutorImage, [ + "/kaniko/executor", + "--dockerfile=" + katibDir + "/examples/v1beta1/nas/darts-cnn-cifar10/Dockerfile", + "--context=dir://" + katibDir, + "--destination=" + registry + "/katib/v1beta1/trial-darts-cnn-cifar10:$(PULL_BASE_SHA)", + ]), // build Trial darts cnn cifar10 $.parts(namespace, name, overrides).e2e(prow_env, bucket).buildTemplate("setup-katib", testWorkerImage, [ "test/scripts/v1beta1/setup-katib.sh", ]), // check Katib readiness and deploy it