diff --git a/.github/workflows/publish-example-images.yaml b/.github/workflows/publish-example-images.yaml index 7bbf3e2170..8e61ef9c4f 100644 --- a/.github/workflows/publish-example-images.yaml +++ b/.github/workflows/publish-example-images.yaml @@ -55,8 +55,9 @@ jobs: - component-name: mxnet-auto-tuning dockerfile: examples/mxnet/tune/Dockerfile context: examples/mxnet/tune -# TODO (tenzen-y): Fix the below broken Dockerfiles -# - component-name: pytorch-dist-mnist-mpi -# dockerfile: examples/pytorch/mnist/Dockerfile-mpi -# - component-name: pytorch-dist-mnist -# dockerfile: examples/pytorch/mnist/Dockerfile + - component-name: pytorch-dist-mnist + dockerfile: examples/pytorch/mnist/Dockerfile + context: examples/pytorch/mnist + - component-name: pytorch-dist-mnist-mpi + dockerfile: examples/pytorch/mnist/Dockerfile-mpi + context: examples/pytorch/mnist diff --git a/examples/pytorch/elastic/imagenet/Dockerfile b/examples/pytorch/elastic/imagenet/Dockerfile index c272176c2a..25ecc49577 100644 --- a/examples/pytorch/elastic/imagenet/Dockerfile +++ b/examples/pytorch/elastic/imagenet/Dockerfile @@ -1,4 +1,7 @@ -ARG BASE_IMAGE=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime +# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux_arm64 platforms. +# PyTorch=2.2.0, cuda=12.3.2 +# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01 +ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3 FROM $BASE_IMAGE WORKDIR /workspace diff --git a/examples/pytorch/elastic/imagenet/imagenet.py b/examples/pytorch/elastic/imagenet/imagenet.py index e3b943bb90..dcab82c8ce 100644 --- a/examples/pytorch/elastic/imagenet/imagenet.py +++ b/examples/pytorch/elastic/imagenet/imagenet.py @@ -63,13 +63,12 @@ import torchvision.datasets as datasets import torchvision.models as models import torchvision.transforms as transforms -from torch.distributed.elastic.utils.data import ElasticDistributedSampler from torch.distributed.elastic.multiprocessing.errors import record +from torch.distributed.elastic.utils.data import ElasticDistributedSampler from torch.nn.parallel import DistributedDataParallel from torch.optim import SGD from torch.utils.data import DataLoader - model_names = sorted( name for name in models.__dict__ @@ -146,6 +145,7 @@ help="checkpoint file path, to load and save to", ) + @record def main(): args = parser.parse_args() @@ -164,9 +164,7 @@ def main(): ) # resume from checkpoint if one exists; - state = load_checkpoint( - args.checkpoint_file, args.arch, model, optimizer - ) + state = load_checkpoint(args.checkpoint_file, args.arch, model, optimizer) start_epoch = state.epoch + 1 print(f"=> start_epoch: {start_epoch}, best_acc1: {state.best_acc1}") diff --git a/examples/pytorch/mnist/Dockerfile b/examples/pytorch/mnist/Dockerfile index bed8fb15a9..92e6c5e220 100644 --- a/examples/pytorch/mnist/Dockerfile +++ b/examples/pytorch/mnist/Dockerfile @@ -1,12 +1,15 @@ -FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime +# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux_arm64 platforms. +# PyTorch=2.2.0, cuda=12.3.2 +# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01 +FROM nvcr.io/nvidia/pytorch:24.01-py3 -RUN pip install tensorboardX==1.6.0 +RUN pip install tensorboardX==2.6.2 RUN mkdir -p /opt/mnist WORKDIR /opt/mnist/src ADD mnist.py /opt/mnist/src/mnist.py -RUN chgrp -R 0 /opt/mnist \ - && chmod -R g+rwX /opt/mnist +RUN chgrp -R 0 /opt/mnist \ + && chmod -R g+rwX /opt/mnist ENTRYPOINT ["python", "/opt/mnist/src/mnist.py"] diff --git a/examples/pytorch/mnist/Dockerfile-mpi b/examples/pytorch/mnist/Dockerfile-mpi index a0eb5d41e6..57b69ef692 100644 --- a/examples/pytorch/mnist/Dockerfile-mpi +++ b/examples/pytorch/mnist/Dockerfile-mpi @@ -1,52 +1,15 @@ -FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04 -ARG PYTHON_VERSION=3.6 +# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux_arm64 platforms. +# PyTorch=2.2.0, cuda=12.3.2 +# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01 +FROM nvcr.io/nvidia/pytorch:24.01-py3 -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ - curl \ - vim \ - wget \ - ca-certificates \ - openssh-client \ - libjpeg-dev \ - libpng-dev &&\ - rm -rf /var/lib/apt/lists/* +RUN pip install tensorboardX==2.6.2 +RUN mkdir -p /opt/mnist -RUN wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \ - gunzip -c openmpi-3.0.0.tar.gz | tar xf - && \ - cd openmpi-3.0.0 && \ - ./configure --prefix=/home/.openmpi --with-cuda && \ - make all install +WORKDIR /opt/mnist/src +ADD mnist.py /opt/mnist/src/mnist.py -ENV PATH="$PATH:/home/.openmpi/bin" -ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/" +RUN chgrp -R 0 /opt/mnist \ + && chmod -R g+rwX /opt/mnist -RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value -RUN wget -O ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - /opt/conda/bin/conda update conda && \ - /opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include cython typing && \ - /opt/conda/bin/conda clean -ya -ENV PATH /opt/conda/bin:$PATH -# This must be done before pip so that requirements.txt is available -WORKDIR /opt/pytorch - -RUN git clone --recursive https://github.com/pytorch/pytorch - -RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ - CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ - cd pytorch/ && \ - pip install -v . - -RUN /opt/conda/bin/conda config --set ssl_verify False -RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org -RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org torchvision tensorboardX==1.6.0 - -WORKDIR /var -ADD mnist.py /var - -ENTRYPOINT ["mpirun", "-n", "1", "--allow-run-as-root", "python", "/var/mnist.py"] +ENTRYPOINT ["mpirun", "-n", "1", "--allow-run-as-root", "python", "/opt/mnist/src/mnist.py"] diff --git a/examples/pytorch/mnist/mnist.py b/examples/pytorch/mnist/mnist.py index 372ac953fc..4ccd051999 100644 --- a/examples/pytorch/mnist/mnist.py +++ b/examples/pytorch/mnist/mnist.py @@ -3,14 +3,14 @@ import argparse import os -from tensorboardX import SummaryWriter -from torchvision import datasets, transforms import torch import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F import torch.optim as optim +from tensorboardX import SummaryWriter from torch.utils.data import DistributedSampler +from torchvision import datasets, transforms class Net(nn.Module): @@ -185,8 +185,7 @@ def main(): print(f"World Size: {os.environ['WORLD_SIZE']}. Rank: {os.environ['RANK']}") dist.init_process_group(backend=args.backend) - Distributor = nn.parallel.DistributedDataParallel - model = Distributor(model) + model = nn.parallel.DistributedDataParallel(model) # Get FashionMNIST train and test dataset. train_ds = datasets.FashionMNIST( diff --git a/examples/pytorch/pytorch_cuda_docker/Dockerfile b/examples/pytorch/pytorch_cuda_docker/Dockerfile deleted file mode 100644 index 596a9907f4..0000000000 --- a/examples/pytorch/pytorch_cuda_docker/Dockerfile +++ /dev/null @@ -1,54 +0,0 @@ -FROM nvidia/cuda:9.2-base-ubuntu16.04 - -# Install some basic utilities -RUN apt-get update && apt-get install -y \ - curl \ - ca-certificates \ - sudo \ - git \ - bzip2 \ - libx11-6 \ - && rm -rf /var/lib/apt/lists/* - -# Create a working directory -RUN mkdir /app -WORKDIR /var - -# Create a non-root user and switch to it - - -# All users can use /home/user as their home directory -ENV HOME=/var -RUN chmod 777 /var - -# Install Miniconda -RUN curl -so ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-4.5.11-Linux-x86_64.sh \ - && chmod +x ~/miniconda.sh \ - && ~/miniconda.sh -b -p ~/miniconda \ - && rm ~/miniconda.sh -ENV PATH=/var/miniconda/bin:$PATH -ENV CONDA_AUTO_UPDATE_CONDA=false - -# Create a Python 3.6 environment -RUN /var/miniconda/bin/conda create -y --name py36 python=3.6.9 \ - && /var/miniconda/bin/conda clean -ya -ENV CONDA_DEFAULT_ENV=py36 -ENV CONDA_PREFIX=/var/miniconda/envs/$CONDA_DEFAULT_ENV -ENV PATH=$CONDA_PREFIX/bin:$PATH -RUN /var/miniconda/bin/conda install conda-build=3.18.9=py36_3 \ - && /var/miniconda/bin/conda clean -ya - -# CUDA 9.2-specific steps -RUN conda install -y -c pytorch \ - cudatoolkit=9.2 \ - "pytorch=1.2.0=py3.6_cuda9.2.148_cudnn7.6.2_0" \ - "torchvision=0.4.0=py36_cu92" \ - && conda clean -ya - -# Install HDF5 Python bindings -RUN conda install -y h5py=2.8.0 \ - && conda clean -ya -RUN pip install h5py-cache==1.0 - -# Install Torchnet, a high-level framework for PyTorch -RUN pip install torchnet==0.0.4 diff --git a/examples/pytorch/simple.yaml b/examples/pytorch/simple.yaml index 232d0ed69f..e1a4f3ba2b 100644 --- a/examples/pytorch/simple.yaml +++ b/examples/pytorch/simple.yaml @@ -16,7 +16,7 @@ spec: imagePullPolicy: Always command: - "python3" - - "/opt/pytorch-mnist/mnist.py" + - "/opt/mnist/src/mnist.py" - "--epochs=1" Worker: replicas: 1 @@ -29,5 +29,5 @@ spec: imagePullPolicy: Always command: - "python3" - - "/opt/pytorch-mnist/mnist.py" + - "/opt/mnist/src/mnist.py" - "--epochs=1" diff --git a/examples/pytorch/smoke-dist/Dockerfile b/examples/pytorch/smoke-dist/Dockerfile index 0b8178fee1..2760ee2e6f 100644 --- a/examples/pytorch/smoke-dist/Dockerfile +++ b/examples/pytorch/smoke-dist/Dockerfile @@ -1,4 +1,7 @@ -FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime +# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux_arm64 platforms. +# PyTorch=2.2.0, cuda=12.3.2 +# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01 +FROM nvcr.io/nvidia/pytorch:24.01-py3 RUN mkdir -p /opt/mlkube COPY examples/pytorch/smoke-dist/dist_sendrecv.py /opt/mlkube/ diff --git a/examples/pytorch/smoke-dist/dist_sendrecv.py b/examples/pytorch/smoke-dist/dist_sendrecv.py index 5161ae1c89..e4c7e35eae 100644 --- a/examples/pytorch/smoke-dist/dist_sendrecv.py +++ b/examples/pytorch/smoke-dist/dist_sendrecv.py @@ -1,16 +1,9 @@ import logging import os -import json + import torch import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim - -from math import ceil -from random import Random -from torch.autograd import Variable -from torchvision import datasets, transforms + def run(): """ Simple Send/Recv for testing Master <--> Workers communication """