Dockerfile.finetune

FROM pytorch/pytorch:2.2.1-cuda12.1-cudnn8-devel

WORKDIR /workspace

RUN apt-get update && apt-get -y install git git-lfs cmake libsndfile1 ffmpeg wget
# hugging face hub
RUN pip install -U "huggingface_hub[cli]"
# nemo toolkit
RUN pip install Cython packaging
RUN pip install "nemo_toolkit[all]"
# megatron-core
RUN pip install "megatron-core==0.9.0"
# nemo-run
RUN pip install git+https://github.com/NVIDIA/NeMo-Run.git@6b00501e68d56070bb4fb9808ab11bb8d0f03b51
# flash-attn
# see https://github.com/NVIDIA/TransformerEngine/issues/1236
RUN pip install "flash-attn<2.5.7"
# nvidia transformer-engine
RUN pip install "transformer-engine[pytorch]==1.11.0"
# nvidia apex
RUN pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings \
"--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex.git@24.04.01
# make sure networkx is pinned to a known compatible version
RUN pip install "networkx==3.4.2"
# get quickstart script
ADD "https://raw.githubusercontent.com/jxtngx/nemo-lab/refs/heads/main/scripts/tutorials/nemo/finetune_llama3_8b.py" /workspace/finetune_llama3_8b.py