Skip to content

Commit

Permalink
enable ci pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
gurpreet-dhami committed Nov 5, 2024
1 parent 1efaebf commit c390b79
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 39 deletions.
74 changes: 74 additions & 0 deletions Dockerfile_rocm
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Base image; override with --build-arg BASE_DOCKER=<image>.
# NOTE(review): `latest` is a moving tag - consider pinning a specific tag or digest.
ARG BASE_DOCKER=rocm/pytorch:latest
FROM ${BASE_DOCKER}

# GPU architecture consumed by the HIP_ARCHITECTURES / PYTORCH_ROCM_ARCH settings
# later in this file; override with --build-arg PYTORCH_ROCM_ARCH_OVERRIDE=<arch>.
ARG PYTORCH_ROCM_ARCH_OVERRIDE="gfx942"

# WORKSPACE_DIR is the top-level working directory; STAGE_DIR is where the
# dependency repositories are cloned and built.
ENV WORKSPACE_DIR=/workspace \
    STAGE_DIR=/workspace/installs
RUN mkdir -p "${WORKSPACE_DIR}" "${STAGE_DIR}"
WORKDIR ${WORKSPACE_DIR}

# Python packages installed from PyPI, including pytest and its plugins.
# - --no-cache-dir keeps pip's download cache out of the image layer.
# - The list is sorted alphabetically, one package per line, so additions and
#   duplicates are easy to spot (pytest_mock was previously listed twice).
# - Existing version pins (numpy, setuptools, tensorstore) are unchanged.
RUN pip3 install --no-cache-dir \
    datasets \
    einops \
    flask-restful \
    nltk \
    numpy==1.26.4 \
    pybind11 \
    pynvml \
    pytest \
    pytest-cov \
    pytest-csv \
    pytest-random-order \
    pytest_mock \
    scipy \
    sentencepiece \
    setuptools==69.5.1 \
    tensorstore==0.1.45 \
    tiktoken \
    wandb \
    wrapt \
    zarr

# Hugging Face hub client with its CLI extra.
RUN pip3 install --no-cache-dir "huggingface_hub[cli]"

# Fetch the NLTK "punkt_tab" data package at build time.
RUN python3 -m nltk.downloader punkt_tab


# Build and install causal-conv1d from a fresh clone of the upstream repository.
WORKDIR ${STAGE_DIR}

# Build-time switches for the causal-conv1d and mamba installs; the target GPU
# architecture comes from the PYTORCH_ROCM_ARCH_OVERRIDE build argument.
# NOTE(review): the *_FORCE_BUILD flags presumably force a source build instead
# of a prebuilt wheel - confirm against each project's setup.py.
ENV CAUSAL_CONV1D_FORCE_BUILD=TRUE \
    MAMBA_FORCE_BUILD=TRUE \
    HIP_ARCHITECTURES=${PYTORCH_ROCM_ARCH_OVERRIDE}

# `git show --oneline -s` prints the checked-out commit into the build log.
RUN git clone https://github.com/Dao-AILab/causal-conv1d causal-conv1d \
 && git -C causal-conv1d show --oneline -s \
 && pip install ./causal-conv1d

# Build and install mamba from a fresh clone of the upstream repository.
WORKDIR ${STAGE_DIR}

# The commit summary is printed into the build log before installing.
# --no-build-isolation makes pip build against the packages already installed
# in the image rather than in an isolated build environment.
RUN git clone https://github.com/state-spaces/mamba mamba \
 && git -C mamba show --oneline -s \
 && pip install --no-build-isolation ./mamba

# Clone TransformerEngine (with submodules) and install it from source.
WORKDIR ${STAGE_DIR}

# Build configuration read by the TransformerEngine install; the GPU
# architecture comes from the PYTORCH_ROCM_ARCH_OVERRIDE build argument.
ENV NVTE_FRAMEWORK=pytorch \
    PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH_OVERRIDE} \
    NVTE_USE_HIPBLASLT=1

# NOTE(review): the repository name ends in "-private"; the clone may need
# credentials that this Dockerfile does not provide - confirm.
RUN git clone --recursive https://github.com/ROCmSoftwarePlatform/TransformerEngine-private.git \
 && pip install ./TransformerEngine-private

# Clone Megatron-LM, switch to the rocm_dev branch and install it in editable mode.
# NOTE(review): the source comes from GitHub rather than the build context, so
# the image contains the tip of rocm_dev, not the checkout being built - confirm
# this is intended.
WORKDIR ${WORKSPACE_DIR}
RUN git clone https://github.com/ROCm/Megatron-LM.git Megatron-LM \
 && git -C Megatron-LM checkout rocm_dev \
 && pip install -e ./Megatron-LM

# Containers start inside the Megatron-LM checkout.
WORKDIR ${WORKSPACE_DIR}/Megatron-LM

# Record the installed package set in the build log.
RUN pip list

67 changes: 28 additions & 39 deletions Jenkinsfile
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -12,55 +12,44 @@ def show_node_info() {
"""
}

// Full Docker cleanup on the current node: lists all containers, kills the
// running ones, removes every container and every image, then prunes unused
// data including volumes. Each step ends in "|| true" so a failing command
// (e.g. nothing to remove) does not fail the pipeline step.
def clean_up_docker() {
sh 'docker ps -a || true' // "|| true" suppresses errors
// Kill every running container.
sh 'docker kill $(docker ps -q) || true'
// Remove all containers, running or stopped.
sh 'docker rm $(docker ps -a -q) || true'
// Remove all images.
sh 'docker rmi $(docker images -q) || true'
// Prune remaining unused data (-a: all unused images, -f: no prompt, --volumes: include volumes).
sh 'docker system prune -af --volumes || true'
}

// Lighter cleanup: lists all containers and kills the running ones.
// Containers and images are not removed here.
def clean_up_docker_container() {
sh 'docker ps -a || true' // "|| true" suppresses errors
// Kill every running container.
sh 'docker kill $(docker ps -q) || true'
}
DOCKER_IMAGE = "megatron-lm"
CONTAINER_NAME = "megatron-lm-container"
DOCKER_BUILD_ARGS = "--build-arg PYTORCH_ROCM_ARCH_OVERRIDE=gfx90a"
DOCKER_RUN_ARGS = "--workdir /workspace/Megatron-LM --entrypoint /workspace/Megatron-LM/run_unit_tests.sh"

//makes sure multiple builds are not triggered for branch indexing
def resetbuild() {
if(currentBuild.getBuildCauses().toString().contains('BranchIndexingCause')) {
def milestonesList = []
def build = currentBuild

while(build != null) {
if(build.getBuildCauses().toString().contains('BranchIndexingCause')) {
milestonesList.add(0, build.number)
}
build = build.previousBuildInProgress
}

for (buildNum in milestonesList) {
milestone(buildNum)
}
DOCKER_RUN_CMD= "docker run --rm -t --network host -u root --group-add video --cap-add=SYS_PTRACE --cap-add SYS_ADMIN --device /dev/fuse --security-opt seccomp=unconfined --security-opt apparmor=unconfined --ipc=host --device=/dev/kfd --device=/dev/dri"
pipeline {
parameters {
string(name: 'TEST_NODE_LABEL', defaultValue: 'MI250', description: 'Node or Label to launch Jenkins Job')
}
}

pipeline {
agent any
agent {node {label "${params.TEST_NODE_LABEL}"}}

stages {
stage('Build') {
stage('Build Docker Image') {
steps {
echo 'Building..'
show_node_info()
script {
sh "docker build -f Dockerfile_rocm -t ${DOCKER_IMAGE} ${DOCKER_BUILD_ARGS} ."
}
}
}
}
stage('Test') {

stage('Run Docker Container') {
steps {
echo 'Testing..'
script {
    // Start the test container: base `docker run` flags, then the workdir and
    // entrypoint overrides, then the container name and image.
    // Uses DOCKER_RUN_ARGS, the variable declared next to DOCKER_IMAGE; the
    // previous ${DOCKER_ARGS} is not defined in the visible part of this
    // Jenkinsfile and would fail when the string is interpolated.
    sh "${DOCKER_RUN_CMD} ${DOCKER_RUN_ARGS} --name ${CONTAINER_NAME} ${DOCKER_IMAGE} "
}
}
}
stage('Deploy') {
steps {
show_node_info()
}

post {
always {
//Cleanup
archiveArtifacts 'test_report.csv'
script {
sh "docker rmi ${DOCKER_IMAGE}"
}
}
}
Expand Down
5 changes: 5 additions & 0 deletions run_unit_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Runs the Megatron-LM unit tests through torchrun and writes a CSV report
# (test_report.csv) in the current directory.

# Echo every command so the CI log shows exactly what was run.
set -x

# Make devices 0-7 visible to HIP, matching --nproc_per_node=8 below.
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# The -m expression deselects tests carrying any of the listed markers.
# "internal" was previously misspelled "nternal", which matches no marker and
# therefore did not deselect tests marked internal.
# NOTE(review): the test path is relative; if this script runs with
# /workspace/Megatron-LM as the working directory (as the Jenkins --workdir
# suggests), "Megatron-LM/tests/unit_tests/" may need to be "tests/unit_tests/" - confirm.
torchrun --nproc_per_node=8 -m pytest -m "not flaky and not internal and not failing_on_rocm_mi250 and not failing_on_rocm" --csv test_report.csv Megatron-LM/tests/unit_tests/

0 comments on commit c390b79

Please sign in to comment.