Upgrade PyTorchJob examples to PyTorch v2 (kubeflow#2024)

* refactor: upgrade pytorch job examples to pytorch v2 Signed-off-by: champon1020 <nagatelu1020@gmail.com> * fix: remove torch.compile and update base image of Dockerfile Signed-off-by: champon1020 <nagatelu1020@gmail.com> * fix: comment out pytorch mnist Dockerfiles in the config of CI Signed-off-by: champon1020 <nagatelu1020@gmail.com> * fix: minor changes * add Dockerfile context to github workflow yaml * add comments to the head of Dockerfile Signed-off-by: champon1020 <nagatelu1020@gmail.com> --------- Signed-off-by: champon1020 <nagatelu1020@gmail.com> Signed-off-by: deepanker13 <deepanker.gupta@nutanix.com>
deepanker13 · Apr 8, 2024 · b5f8803 · b5f8803
1 parent 32934c3
commit b5f8803
Show file tree

Hide file tree

Showing 10 changed files with 42 additions and 133 deletions.
diff --git a/.github/workflows/publish-example-images.yaml b/.github/workflows/publish-example-images.yaml
@@ -55,8 +55,9 @@ jobs:
           - component-name: mxnet-auto-tuning
             dockerfile: examples/mxnet/tune/Dockerfile
             context: examples/mxnet/tune
-# TODO (tenzen-y): Fix the below broken Dockerfiles
-#          - component-name: pytorch-dist-mnist-mpi
-#            dockerfile: examples/pytorch/mnist/Dockerfile-mpi
-#          - component-name: pytorch-dist-mnist
-#            dockerfile: examples/pytorch/mnist/Dockerfile
+          - component-name: pytorch-dist-mnist
+            dockerfile: examples/pytorch/mnist/Dockerfile
+            context: examples/pytorch/mnist
+          - component-name: pytorch-dist-mnist-mpi
+            dockerfile: examples/pytorch/mnist/Dockerfile-mpi
+            context: examples/pytorch/mnist
diff --git a/examples/pytorch/elastic/imagenet/Dockerfile b/examples/pytorch/elastic/imagenet/Dockerfile
@@ -1,4 +1,7 @@
-ARG BASE_IMAGE=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime
+# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
+# PyTorch=2.2.0, cuda=12.3.2
+# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
 FROM $BASE_IMAGE
 
 WORKDIR /workspace

diff --git a/examples/pytorch/elastic/imagenet/imagenet.py b/examples/pytorch/elastic/imagenet/imagenet.py
@@ -63,13 +63,12 @@
 import torchvision.datasets as datasets
 import torchvision.models as models
 import torchvision.transforms as transforms
-from torch.distributed.elastic.utils.data import ElasticDistributedSampler
 from torch.distributed.elastic.multiprocessing.errors import record
+from torch.distributed.elastic.utils.data import ElasticDistributedSampler
 from torch.nn.parallel import DistributedDataParallel
 from torch.optim import SGD
 from torch.utils.data import DataLoader
 
-
 model_names = sorted(
     name
     for name in models.__dict__
@@ -146,6 +145,7 @@
     help="checkpoint file path, to load and save to",
 )
 
+
 @record
 def main():
     args = parser.parse_args()
@@ -164,9 +164,7 @@ def main():
     )
 
     # resume from checkpoint if one exists;
-    state = load_checkpoint(
-        args.checkpoint_file, args.arch, model, optimizer
-    )
+    state = load_checkpoint(args.checkpoint_file, args.arch, model, optimizer)
 
     start_epoch = state.epoch + 1
     print(f"=> start_epoch: {start_epoch}, best_acc1: {state.best_acc1}")

diff --git a/examples/pytorch/mnist/Dockerfile b/examples/pytorch/mnist/Dockerfile
@@ -1,12 +1,15 @@
-FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime
+# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
+# PyTorch=2.2.0, cuda=12.3.2
+# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01
+FROM nvcr.io/nvidia/pytorch:24.01-py3
 
-RUN pip install tensorboardX==1.6.0
+RUN pip install tensorboardX==2.6.2
 RUN mkdir -p /opt/mnist
 
 WORKDIR /opt/mnist/src
 ADD mnist.py /opt/mnist/src/mnist.py
 
-RUN  chgrp -R 0 /opt/mnist \
-  && chmod -R g+rwX /opt/mnist
+RUN chgrp -R 0 /opt/mnist \
+    && chmod -R g+rwX /opt/mnist
 
 ENTRYPOINT ["python", "/opt/mnist/src/mnist.py"]
diff --git a/examples/pytorch/mnist/Dockerfile-mpi b/examples/pytorch/mnist/Dockerfile-mpi
@@ -1,52 +1,15 @@
-FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
-ARG PYTHON_VERSION=3.6
+# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
+# PyTorch=2.2.0, cuda=12.3.2
+# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01
+FROM nvcr.io/nvidia/pytorch:24.01-py3
 
-RUN apt-get update && apt-get install -y --no-install-recommends \
-         build-essential \
-         cmake \
-         git \
-         curl \
-         vim \
-         wget \
-         ca-certificates \
-         openssh-client \
-         libjpeg-dev \
-         libpng-dev &&\
-     rm -rf /var/lib/apt/lists/*
+RUN pip install tensorboardX==2.6.2
+RUN mkdir -p /opt/mnist
 
-RUN wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
-    gunzip -c openmpi-3.0.0.tar.gz | tar xf - && \
-    cd openmpi-3.0.0 && \
-    ./configure --prefix=/home/.openmpi --with-cuda && \
-    make all install
+WORKDIR /opt/mnist/src
+ADD mnist.py /opt/mnist/src/mnist.py
 
-ENV PATH="$PATH:/home/.openmpi/bin"
-ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
+RUN chgrp -R 0 /opt/mnist \
+    && chmod -R g+rwX /opt/mnist
 
-RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
-RUN wget -O ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh  && \
-     chmod +x ~/miniconda.sh && \
-     ~/miniconda.sh -b -p /opt/conda && \
-     rm ~/miniconda.sh && \
-     /opt/conda/bin/conda update conda && \
-     /opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include cython typing && \
-     /opt/conda/bin/conda clean -ya
-ENV PATH /opt/conda/bin:$PATH
-# This must be done before pip so that requirements.txt is available
-WORKDIR /opt/pytorch
-
-RUN git clone --recursive https://github.com/pytorch/pytorch
-
-RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
-    CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
-    cd pytorch/ && \
-    pip install -v .
-
-RUN /opt/conda/bin/conda config --set ssl_verify False
-RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org
-RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org torchvision tensorboardX==1.6.0
-
-WORKDIR /var
-ADD mnist.py /var
-
-ENTRYPOINT ["mpirun", "-n", "1", "--allow-run-as-root", "python", "/var/mnist.py"]
+ENTRYPOINT ["mpirun", "-n", "1", "--allow-run-as-root", "python", "/opt/mnist/src/mnist.py"]
diff --git a/examples/pytorch/mnist/mnist.py b/examples/pytorch/mnist/mnist.py
@@ -3,14 +3,14 @@
 import argparse
 import os
 
-from tensorboardX import SummaryWriter
-from torchvision import datasets, transforms
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
+from tensorboardX import SummaryWriter
 from torch.utils.data import DistributedSampler
+from torchvision import datasets, transforms
 
 
 class Net(nn.Module):
@@ -185,8 +185,7 @@ def main():
     print(f"World Size: {os.environ['WORLD_SIZE']}. Rank: {os.environ['RANK']}")
 
     dist.init_process_group(backend=args.backend)
-    Distributor = nn.parallel.DistributedDataParallel
-    model = Distributor(model)
+    model = nn.parallel.DistributedDataParallel(model)
 
     # Get FashionMNIST train and test dataset.
     train_ds = datasets.FashionMNIST(

diff --git a/examples/pytorch/pytorch_cuda_docker/Dockerfile b/examples/pytorch/pytorch_cuda_docker/Dockerfile
diff --git a/examples/pytorch/simple.yaml b/examples/pytorch/simple.yaml
@@ -16,7 +16,7 @@ spec:
               imagePullPolicy: Always
               command:
                 - "python3"
-                - "/opt/pytorch-mnist/mnist.py"
+                - "/opt/mnist/src/mnist.py"
                 - "--epochs=1"
     Worker:
       replicas: 1
@@ -29,5 +29,5 @@ spec:
               imagePullPolicy: Always
               command:
                 - "python3"
-                - "/opt/pytorch-mnist/mnist.py"
+                - "/opt/mnist/src/mnist.py"
                 - "--epochs=1"
diff --git a/examples/pytorch/smoke-dist/Dockerfile b/examples/pytorch/smoke-dist/Dockerfile
@@ -1,4 +1,7 @@
-FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime
+# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
+# PyTorch=2.2.0, cuda=12.3.2
+# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01
+FROM nvcr.io/nvidia/pytorch:24.01-py3
 
 RUN mkdir -p /opt/mlkube
 COPY examples/pytorch/smoke-dist/dist_sendrecv.py /opt/mlkube/

diff --git a/examples/pytorch/smoke-dist/dist_sendrecv.py b/examples/pytorch/smoke-dist/dist_sendrecv.py
@@ -1,16 +1,9 @@
 import logging
 import os
-import json
+
 import torch
 import torch.distributed as dist
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-
-from math import ceil
-from random import Random
-from torch.autograd import Variable
-from torchvision import datasets, transforms
+
 
 def run():
     """ Simple Send/Recv for testing Master <--> Workers communication """