Skip to content

Commit

Permalink
Upgrade PyTorchJob examples to PyTorch v2 (kubeflow#2024)
Browse files Browse the repository at this point in the history
* refactor: upgrade pytorch job examples to pytorch v2

Signed-off-by: champon1020 <nagatelu1020@gmail.com>

* fix: remove torch.compile and update base image of Dockerfile

Signed-off-by: champon1020 <nagatelu1020@gmail.com>

* fix: comment out pytorch mnist Dockerfiles in the config of CI

Signed-off-by: champon1020 <nagatelu1020@gmail.com>

* fix: minor changes

* add Dockerfile context to github workflow yaml

* add comments to the head of Dockerfile

Signed-off-by: champon1020 <nagatelu1020@gmail.com>

---------

Signed-off-by: champon1020 <nagatelu1020@gmail.com>
Signed-off-by: deepanker13 <deepanker.gupta@nutanix.com>
  • Loading branch information
champon1020 authored and deepanker13 committed Apr 8, 2024
1 parent 32934c3 commit b5f8803
Show file tree
Hide file tree
Showing 10 changed files with 42 additions and 133 deletions.
11 changes: 6 additions & 5 deletions .github/workflows/publish-example-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,9 @@ jobs:
- component-name: mxnet-auto-tuning
dockerfile: examples/mxnet/tune/Dockerfile
context: examples/mxnet/tune
# TODO (tenzen-y): Fix the below broken Dockerfiles
# - component-name: pytorch-dist-mnist-mpi
# dockerfile: examples/pytorch/mnist/Dockerfile-mpi
# - component-name: pytorch-dist-mnist
# dockerfile: examples/pytorch/mnist/Dockerfile
- component-name: pytorch-dist-mnist
dockerfile: examples/pytorch/mnist/Dockerfile
context: examples/pytorch/mnist
- component-name: pytorch-dist-mnist-mpi
dockerfile: examples/pytorch/mnist/Dockerfile-mpi
context: examples/pytorch/mnist
5 changes: 4 additions & 1 deletion examples/pytorch/elastic/imagenet/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
ARG BASE_IMAGE=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime
# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
# PyTorch=2.2.0, cuda=12.3.2
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
FROM $BASE_IMAGE

WORKDIR /workspace
Expand Down
8 changes: 3 additions & 5 deletions examples/pytorch/elastic/imagenet/imagenet.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,12 @@
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms
from torch.distributed.elastic.utils.data import ElasticDistributedSampler
from torch.distributed.elastic.multiprocessing.errors import record
from torch.distributed.elastic.utils.data import ElasticDistributedSampler
from torch.nn.parallel import DistributedDataParallel
from torch.optim import SGD
from torch.utils.data import DataLoader


model_names = sorted(
name
for name in models.__dict__
Expand Down Expand Up @@ -146,6 +145,7 @@
help="checkpoint file path, to load and save to",
)


@record
def main():
args = parser.parse_args()
Expand All @@ -164,9 +164,7 @@ def main():
)

# resume from checkpoint if one exists;
state = load_checkpoint(
args.checkpoint_file, args.arch, model, optimizer
)
state = load_checkpoint(args.checkpoint_file, args.arch, model, optimizer)

start_epoch = state.epoch + 1
print(f"=> start_epoch: {start_epoch}, best_acc1: {state.best_acc1}")
Expand Down
11 changes: 7 additions & 4 deletions examples/pytorch/mnist/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime
# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
# PyTorch=2.2.0, cuda=12.3.2
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01
FROM nvcr.io/nvidia/pytorch:24.01-py3

RUN pip install tensorboardX==1.6.0
RUN pip install tensorboardX==2.6.2
RUN mkdir -p /opt/mnist

WORKDIR /opt/mnist/src
ADD mnist.py /opt/mnist/src/mnist.py

RUN chgrp -R 0 /opt/mnist \
&& chmod -R g+rwX /opt/mnist
RUN chgrp -R 0 /opt/mnist \
&& chmod -R g+rwX /opt/mnist

ENTRYPOINT ["python", "/opt/mnist/src/mnist.py"]
59 changes: 11 additions & 48 deletions examples/pytorch/mnist/Dockerfile-mpi
Original file line number Diff line number Diff line change
@@ -1,52 +1,15 @@
FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
ARG PYTHON_VERSION=3.6
# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
# PyTorch=2.2.0, cuda=12.3.2
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01
FROM nvcr.io/nvidia/pytorch:24.01-py3

RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
vim \
wget \
ca-certificates \
openssh-client \
libjpeg-dev \
libpng-dev &&\
rm -rf /var/lib/apt/lists/*
RUN pip install tensorboardX==2.6.2
RUN mkdir -p /opt/mnist

RUN wget https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-3.0.0.tar.gz && \
gunzip -c openmpi-3.0.0.tar.gz | tar xf - && \
cd openmpi-3.0.0 && \
./configure --prefix=/home/.openmpi --with-cuda && \
make all install
WORKDIR /opt/mnist/src
ADD mnist.py /opt/mnist/src/mnist.py

ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
RUN chgrp -R 0 /opt/mnist \
&& chmod -R g+rwX /opt/mnist

RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
RUN wget -O ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda update conda && \
/opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include cython typing && \
/opt/conda/bin/conda clean -ya
ENV PATH /opt/conda/bin:$PATH
# This must be done before pip so that requirements.txt is available
WORKDIR /opt/pytorch

RUN git clone --recursive https://github.com/pytorch/pytorch

RUN TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
cd pytorch/ && \
pip install -v .

RUN /opt/conda/bin/conda config --set ssl_verify False
RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pythonhosted.org
RUN pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org torchvision tensorboardX==1.6.0

WORKDIR /var
ADD mnist.py /var

ENTRYPOINT ["mpirun", "-n", "1", "--allow-run-as-root", "python", "/var/mnist.py"]
ENTRYPOINT ["mpirun", "-n", "1", "--allow-run-as-root", "python", "/opt/mnist/src/mnist.py"]
7 changes: 3 additions & 4 deletions examples/pytorch/mnist/mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
import argparse
import os

from tensorboardX import SummaryWriter
from torchvision import datasets, transforms
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tensorboardX import SummaryWriter
from torch.utils.data import DistributedSampler
from torchvision import datasets, transforms


class Net(nn.Module):
Expand Down Expand Up @@ -185,8 +185,7 @@ def main():
print(f"World Size: {os.environ['WORLD_SIZE']}. Rank: {os.environ['RANK']}")

dist.init_process_group(backend=args.backend)
Distributor = nn.parallel.DistributedDataParallel
model = Distributor(model)
model = nn.parallel.DistributedDataParallel(model)

# Get FashionMNIST train and test dataset.
train_ds = datasets.FashionMNIST(
Expand Down
54 changes: 0 additions & 54 deletions examples/pytorch/pytorch_cuda_docker/Dockerfile

This file was deleted.

4 changes: 2 additions & 2 deletions examples/pytorch/simple.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ spec:
imagePullPolicy: Always
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
- "/opt/mnist/src/mnist.py"
- "--epochs=1"
Worker:
replicas: 1
Expand All @@ -29,5 +29,5 @@ spec:
imagePullPolicy: Always
command:
- "python3"
- "/opt/pytorch-mnist/mnist.py"
- "/opt/mnist/src/mnist.py"
- "--epochs=1"
5 changes: 4 additions & 1 deletion examples/pytorch/smoke-dist/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
FROM pytorch/pytorch:1.0-cuda10.0-cudnn7-runtime
# We need to use the nvcr.io/nvidia/pytorch image as a base image to support both linux/amd64 and linux/arm64 platforms.
# PyTorch=2.2.0, cuda=12.3.2
# Ref: https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-01.html#rel-24-01
FROM nvcr.io/nvidia/pytorch:24.01-py3

RUN mkdir -p /opt/mlkube
COPY examples/pytorch/smoke-dist/dist_sendrecv.py /opt/mlkube/
Expand Down
11 changes: 2 additions & 9 deletions examples/pytorch/smoke-dist/dist_sendrecv.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,9 @@
import logging
import os
import json

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from math import ceil
from random import Random
from torch.autograd import Variable
from torchvision import datasets, transforms


def run():
""" Simple Send/Recv for testing Master <--> Workers communication """
Expand Down

0 comments on commit b5f8803

Please sign in to comment.