Add TransformerEngine to PT 2.0 training images (#3315)
arjkesh authored Sep 26, 2023
1 parent 70a089d commit 5241309
Showing 6 changed files with 142 additions and 32 deletions.
60 changes: 30 additions & 30 deletions pytorch/training/buildspec.yml
@@ -41,21 +41,21 @@ images:
# target: ec2
# context:
# <<: *TRAINING_CONTEXT
-# BuildEC2GPUPTTrainPy3cu121DockerImage:
-# <<: *TRAINING_REPOSITORY
-# build: &PYTORCH_GPU_TRAINING_PY3 false
-# image_size_baseline: 19700
-# device_type: &DEVICE_TYPE gpu
-# python_version: &DOCKER_PYTHON_VERSION py3
-# tag_python_version: &TAG_PYTHON_VERSION py310
-# cuda_version: &CUDA_VERSION cu121
-# os_version: &OS_VERSION ubuntu20.04
-# tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
-# docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
-# *DEVICE_TYPE ]
-# target: ec2
-# context:
-# <<: *TRAINING_CONTEXT
+  BuildEC2GPUPTTrainPy3cu121DockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &PYTORCH_GPU_TRAINING_PY3 false
+    image_size_baseline: 19700
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py310
+    cuda_version: &CUDA_VERSION cu121
+    os_version: &OS_VERSION ubuntu20.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+      *DEVICE_TYPE ]
+    target: ec2
+    context:
+      <<: *TRAINING_CONTEXT
# BuildEC2GPUPTTrainPy3cu118DockerImage:
# <<: *TRAINING_REPOSITORY
# build: &PYTORCH_GPU_TRAINING_PY3 false
@@ -84,21 +84,21 @@ images:
# target: sagemaker
# context:
# <<: *TRAINING_CONTEXT
-  BuildSageMakerGPUPTTrainPy3DockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &PYTORCH_GPU_TRAINING_PY3 false
-    image_size_baseline: 21500
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py310
-    cuda_version: &CUDA_VERSION cu118
-    os_version: &OS_VERSION ubuntu20.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
-      *DEVICE_TYPE ]
-    target: sagemaker
-    context:
-      <<: *TRAINING_CONTEXT
+# BuildSageMakerGPUPTTrainPy3DockerImage:
+# <<: *TRAINING_REPOSITORY
+# build: &PYTORCH_GPU_TRAINING_PY3 false
+# image_size_baseline: 21500
+# device_type: &DEVICE_TYPE gpu
+# python_version: &DOCKER_PYTHON_VERSION py3
+# tag_python_version: &TAG_PYTHON_VERSION py310
+# cuda_version: &CUDA_VERSION cu118
+# os_version: &OS_VERSION ubuntu20.04
+# tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
+# docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
+# *DEVICE_TYPE ]
+# target: sagemaker
+# context:
+# <<: *TRAINING_CONTEXT
# BuildPyTorchExampleGPUTrainPy3cu121DockerImage:
# <<: *TRAINING_REPOSITORY
# build: &PYTORCH_GPU_TRAINING_PY3 false
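A note for readers unfamiliar with the buildspec conventions: !join is a custom YAML tag that concatenates the anchored values into the final image tag and Dockerfile path. The sketch below shows how such a constructor can be registered with PyYAML; the 2.0.1 version string is an illustrative assumption, not taken from this diff, and the repo's actual constructor may differ.

import yaml

# Minimal "!join" constructor: concatenate the items of a YAML sequence
# (aliases such as *VERSION are resolved before construction).
def join_constructor(loader, node):
    return "".join(str(item) for item in loader.construct_sequence(node))

yaml.SafeLoader.add_constructor("!join", join_constructor)

snippet = """
version: &VERSION 2.0.1
tag: !join [ *VERSION, "-", gpu, "-", py310, "-", cu121, "-", ubuntu20.04, "-ec2" ]
"""
print(yaml.safe_load(snippet)["tag"])  # -> 2.0.1-gpu-py310-cu121-ubuntu20.04-ec2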
15 changes: 13 additions & 2 deletions pytorch/training/docker/2.0/py3/cu121/Dockerfile.gpu
@@ -46,6 +46,7 @@ ENV PATH /opt/conda/bin:$PATH
# 5.2 is G3 EC2 instance, 7.5 is G4*, 7.0 is p3*, 8.0 is P4*, 8.6 is G5* and 9.0 is P5*
ENV TORCH_CUDA_ARCH_LIST="5.2;7.0+PTX;7.5;8.0;8.6;9.0"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
+ENV CUDNN_VERSION=8.9.3.28
ENV NCCL_VERSION=2.18.3
ENV EFA_VERSION=1.24.1
ENV GDRCOPY_VERSION=2.3.1
@@ -68,6 +69,8 @@ RUN apt-get update \
    build-essential \
    ca-certificates \
    cmake \
+    libcudnn8=$CUDNN_VERSION-1+cuda12.1 \
+    libcudnn8-dev=$CUDNN_VERSION-1+cuda12.1 \
    curl \
    emacs \
    git \
@@ -133,7 +136,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
    # Adding package for studio kernels
    ipykernel \
    # patch CVE
-    "cryptography>=41.0.2" \
+    "cryptography>=41.0.4" \
    # patch CVE
    "pillow>=9.4" \
    "mpi4py>=3.1.4,<3.2" \
@@ -268,7 +271,7 @@ RUN /opt/conda/bin/mamba install -y -c conda-forge \
    && /opt/conda/bin/mamba clean -afy

# Patches
-RUN pip install "pillow>=9.5" opencv-python
+RUN pip install "pillow>=9.5" opencv-python huggingface_hub
RUN /opt/conda/bin/mamba install -y -c conda-forge \
    "requests>=2.31.0" \
    && /opt/conda/bin/mamba clean -afy
@@ -292,6 +295,14 @@ RUN pip install packaging \
    && cd .. \
    && rm -rf apex

+# Install flash-attn and NVIDIA TransformerEngine
+ENV NVTE_FRAMEWORK=pytorch
+# Install flash-attn using instructions from https://github.com/Dao-AILab/flash-attention#installation-and-features
+# Set MAX_JOBS=4 to avoid OOM issues during the build process
+RUN MAX_JOBS=4 pip install flash-attn==2.0.4 --no-build-isolation
+# Install TE using instructions from https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/installation.html
+RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@release_v0.12
+
RUN HOME_DIR=/root \
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
&& unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
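With the two new installs in place, a quick smoke test along these lines confirms the packages import cleanly inside the built image. This is a sketch under two assumptions: it runs in the container on a GPU-capable host, and flash_attn exposes __version__ (recent releases do).

# Run inside the container on a GPU instance.
import torch
import flash_attn
import transformer_engine.pytorch as te  # built with NVTE_FRAMEWORK=pytorch

print("torch:", torch.__version__)
print("flash-attn:", flash_attn.__version__)
print("cuDNN seen by torch:", torch.backends.cudnn.version())
print("TE modules present:", hasattr(te, "Linear") and hasattr(te, "LayerNormLinear"))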
10 changes: 10 additions & 0 deletions test/dlc_tests/conftest.py
@@ -1025,6 +1025,11 @@ def skip_pt110():
    pass


+@pytest.fixture(scope="session")
+def pt21_and_above_only():
+    pass
+
+
@pytest.fixture(scope="session")
def pt18_and_above_only():
    pass
@@ -1154,6 +1159,10 @@ def framework_version_within_limit(metafunc_obj, image):
        "skip_pt110" in metafunc_obj.fixturenames
        and is_equal_to_framework_version("1.10.*", image, image_framework_name)
    )
+    pt21_requirement_failed = (
+        "pt21_and_above_only" in metafunc_obj.fixturenames
+        and is_below_framework_version("2.1", image, image_framework_name)
+    )
    pt18_requirement_failed = (
        "pt18_and_above_only" in metafunc_obj.fixturenames
        and is_below_framework_version("1.8", image, image_framework_name)
@@ -1181,6 +1190,7 @@ def framework_version_within_limit(metafunc_obj, image):
        or below_pt113_requirement_failed
        or pt111_requirement_failed
        or not_pt110_requirement_failed
+        or pt21_requirement_failed
        or pt18_requirement_failed
        or pt17_requirement_failed
        or pt16_requirement_failed
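The new fixture works purely by name: framework_version_within_limit checks whether pt21_and_above_only appears in a test's fixturenames and, if the image's framework version is below 2.1, the test is skipped for that image. A stripped-down sketch of the version gate follows; the repo's real is_below_framework_version takes the image URI and framework name, so this simplified signature is an assumption for illustration.

from packaging.version import Version

# Simplified stand-in for the repo's helper (illustrative signature).
def is_below_framework_version(required: str, image_version: str) -> bool:
    return Version(image_version) < Version(required)

assert is_below_framework_version("2.1", "2.0.1")      # PT 2.0 image: test is skipped
assert not is_below_framework_version("2.1", "2.1.0")  # PT 2.1 image: test runs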
12 changes: 12 additions & 0 deletions test/dlc_tests/container_tests/bin/transformerengine/testPTTransformerEngine
@@ -0,0 +1,12 @@
#!/bin/bash

set -ex

git clone --branch release_v0.12 https://github.com/NVIDIA/TransformerEngine.git
cd TransformerEngine/tests/pytorch

pip install pytest==6.2.5 onnxruntime==1.13.1 onnx
pytest -v -s test_sanity.py
PYTORCH_JIT=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s test_numerics.py
NVTE_TORCH_COMPILE=0 pytest -v -s test_onnx_export.py
pytest -v -s test_jit.py
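A note on the environment variables above: PYTORCH_JIT=0 disables PyTorch's TorchScript JIT for the numerics run. Going by the variable names, NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 restricts TransformerEngine to deterministic algorithms and NVTE_TORCH_COMPILE=0 keeps TE's torch.compile integration out of the ONNX export tests; treat those two readings as inferences from the names rather than documented behavior.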
44 changes: 44 additions & 0 deletions test/dlc_tests/ec2/pytorch/training/test_pytorch_training.py
@@ -620,3 +620,47 @@ def test_pytorch_standalone_hpu(
        container_name="ec2_training_habana_pytorch_container",
        enable_habana_async_execution=True,
    )


+@pytest.mark.usefixtures("feature_aws_framework_present")
+@pytest.mark.usefixtures("sagemaker")
+@pytest.mark.integration("cudnn")
+@pytest.mark.model("N/A")
+@pytest.mark.parametrize("ec2_instance_type", PT_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
+def test_pytorch_cudnn_match_gpu(
+    pytorch_training, ec2_connection, region, gpu_only, ec2_instance_type, pt21_and_above_only
+):
+    """
+    PT 2.1 reintroduces a dependency on cuDNN to support NVIDIA TransformerEngine. This test ensures that the cuDNN version reported by torch matches the system cuDNN installed in the container.
+    """
+    container_name = "pt_cudnn_test"
+    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
+    ec2_connection.run(f"docker pull -q {pytorch_training}", hide=True)
+    ec2_connection.run(
+        f"nvidia-docker run --name {container_name} -itd {pytorch_training}", hide=True
+    )
+    major_cmd = "cat /usr/include/cudnn_version.h | grep '#define CUDNN_MAJOR'"
+    minor_cmd = "cat /usr/include/cudnn_version.h | grep '#define CUDNN_MINOR'"
+    patch_cmd = "cat /usr/include/cudnn_version.h | grep '#define CUDNN_PATCHLEVEL'"
+    major = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} bash -c '{major_cmd}'", hide=True
+    ).stdout.split()[-1]
+    minor = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} bash -c '{minor_cmd}'", hide=True
+    ).stdout.split()[-1]
+    patch = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} bash -c '{patch_cmd}'", hide=True
+    ).stdout.split()[-1]
+
+    cudnn_from_torch = ec2_connection.run(
+        f"nvidia-docker exec --user root {container_name} python -c 'from torch.backends import cudnn; print(cudnn.version())'",
+        hide=True,
+    ).stdout.strip()
+
+    if len(patch) == 1:
+        patch = f"0{patch}"
+
+    system_cudnn = f"{major}{minor}{patch}"
+    assert (
+        system_cudnn == cudnn_from_torch
+    ), f"System cuDNN {system_cudnn} and torch cuDNN {cudnn_from_torch} do not match. Please downgrade the system cuDNN or recompile torch with the correct cuDNN version."
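For cuDNN 8.x, cudnn_version.h defines CUDNN_VERSION as CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL, which is the integer torch.backends.cudnn.version() reports, so the zero-padded concatenation above lines up digit for digit. A worked example against the CUDNN_VERSION=8.9.3.28 pin from the Dockerfile:

# cudnn_version.h for 8.9.3 defines CUDNN_MAJOR=8, CUDNN_MINOR=9, CUDNN_PATCHLEVEL=3.
major, minor, patch = "8", "9", "3"
if len(patch) == 1:
    patch = f"0{patch}"  # pad so the result is "8903" rather than "893"
system_cudnn = f"{major}{minor}{patch}"
assert system_cudnn == str(8 * 1000 + 9 * 100 + 3)  # torch reports 8903 here too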
33 changes: 33 additions & 0 deletions test/dlc_tests/ec2/test_transformerengine.py
@@ -0,0 +1,33 @@
import os

import pytest

import test.test_utils.ec2 as ec2_utils
from test.test_utils import CONTAINER_TESTS_PREFIX, is_pr_context, is_efa_dedicated
from test.test_utils.ec2 import get_efa_ec2_instance_type, filter_efa_instance_type

PT_TE_TESTS_CMD = os.path.join(
    CONTAINER_TESTS_PREFIX, "transformerengine", "testPTTransformerEngine"
)


EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION = get_efa_ec2_instance_type(
    default="p4d.24xlarge",
    filter_function=filter_efa_instance_type,
)


@pytest.mark.processor("gpu")
@pytest.mark.model("N/A")
@pytest.mark.integration("transformerengine")
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.allow_p4de_use
@pytest.mark.parametrize("ec2_instance_type,region", EC2_EFA_GPU_INSTANCE_TYPE_AND_REGION)
@pytest.mark.skipif(
    is_pr_context() and not is_efa_dedicated(),
    reason="Skip heavy instance test in PR context unless explicitly enabled",
)
def test_pytorch_transformerengine(
    pytorch_training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only
):
    ec2_utils.execute_ec2_training_test(ec2_connection, pytorch_training, PT_TE_TESTS_CMD)
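execute_ec2_training_test is the shared EC2 helper that runs a container_tests script inside the image on the remote instance. Roughly, and only as a sketch (the actual helper in test.test_utils.ec2 handles container naming, EFA options, and log collection; the mount path is an assumption):

# Simplified sketch of the helper's behavior; not the repo's implementation.
def execute_ec2_training_test(connection, image_uri, test_cmd,
                              container_name="ec2_training_container"):
    connection.run(f"docker pull -q {image_uri}", hide=True)
    # container_tests is synced to the instance and mounted into the container,
    # so CONTAINER_TESTS_PREFIX paths like .../transformerengine/testPTTransformerEngine resolve.
    return connection.run(
        f"nvidia-docker run --name {container_name} "
        f"-v $HOME/container_tests:/test {image_uri} {test_cmd}"
    )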
