Combining Inference and PEFT Tokens in a Batch #1153

Merged 306 commits on Sep 4, 2024

Commits (306)
d8e92e9
.
jiazhihao Oct 23, 2023
0a512d2
.
jiazhihao Oct 24, 2023
4ee710a
Update the default cublas behavior when CUDA_VERSION is not specified
jiazhihao Oct 24, 2023
2adca3a
Merge branch 'fix_cublas_default' of https://github.com/flexflow/Flex…
jiazhihao Oct 24, 2023
464424e
fix bugs in IncMHA peft_bwd kernel
jiazhihao Oct 24, 2023
82d6e58
resolve merge conflict
jiazhihao Oct 24, 2023
45c1e01
uncomment softmaxbackward
jiazhihao Oct 24, 2023
07636e8
add layernorm to align test
goliaro Oct 24, 2023
28a5e84
add peft test scripts
goliaro Oct 24, 2023
dd94370
fix import
goliaro Oct 24, 2023
3c01328
fix
goliaro Oct 24, 2023
fa56364
add code to convert peft models
goliaro Oct 26, 2023
a484100
add script to download peft for c++, fix bug
goliaro Oct 26, 2023
c83c376
fix
goliaro Oct 26, 2023
aa9f004
add script to fine-tune models
goliaro Oct 27, 2023
4609e9e
implement loading lora configs/weights from file
goliaro Oct 31, 2023
17fa6f3
remove peft_bwd assertion failure in embedding
goliaro Oct 31, 2023
cdc12e6
fix download script
goliaro Oct 31, 2023
eb9e2b8
add peft dependencies in dockerfile
goliaro Oct 31, 2023
3dfa14d
fix softmax backward
goliaro Oct 31, 2023
78523e8
fix bc print indentation
goliaro Nov 1, 2023
bf78ea4
Temporarily Revert "Update the default cublas behavior when CUDA_VERS…
goliaro Nov 2, 2023
b9e7f60
Fix cublas default (#1220)
goliaro Nov 2, 2023
463c757
fix bugs, work on align opt-lora
goliaro Nov 3, 2023
1c231ba
Merge branch 'inference' into peft
goliaro Nov 6, 2023
7c65521
update scripts
goliaro Nov 6, 2023
f4b3f8f
add code to output peft tensors in hf
goliaro Nov 6, 2023
9e5fea9
update, fixes
goliaro Nov 7, 2023
62edfaa
linting
goliaro Nov 7, 2023
ddb5c29
fix printing of tensors for numpy
goliaro Nov 7, 2023
d276496
update save_inference_tensors_to_file
goliaro Nov 8, 2023
bc79d3b
linting
goliaro Nov 8, 2023
8e34632
update
goliaro Nov 8, 2023
b11c5e9
fix issue with save_inference_tensors_to_file
goliaro Nov 8, 2023
fca16cc
fix layer names for save_inference_tensors_to_file
goliaro Nov 8, 2023
9095f2b
fix peft
goliaro Nov 9, 2023
9769604
fix bwd bugs
goliaro Nov 10, 2023
880ede8
linting
goliaro Nov 10, 2023
818375d
fixes
goliaro Nov 10, 2023
2990e20
fix
goliaro Nov 10, 2023
6959e68
fix
goliaro Nov 10, 2023
266368c
fix
goliaro Nov 10, 2023
06775bd
add bc fields for peft training
goliaro Nov 10, 2023
ca879e2
merge conflicts
goliaro Nov 10, 2023
9f60177
linting
goliaro Nov 10, 2023
9442b62
fix
goliaro Nov 10, 2023
11eccb1
remove ptr check
goliaro Nov 10, 2023
9bfc557
fix
goliaro Nov 10, 2023
bcfae08
implement save_operators for bwd
goliaro Nov 12, 2023
d86272c
fix bug
goliaro Nov 13, 2023
0a3258a
implement save tensors for bwd
goliaro Nov 13, 2023
e34c405
.
goliaro Nov 15, 2023
87fbada
bug fix
goliaro Nov 15, 2023
52759bd
fix
goliaro Nov 15, 2023
2a5371d
align linear
goliaro Nov 15, 2023
ed0be61
fix
goliaro Nov 16, 2023
8a0b6ea
bwd kernel updates
goliaro Nov 17, 2023
b0e686d
undo use of CUBLAS_COMPUTE_32F_FAST_16F for now
goliaro Nov 17, 2023
0daf232
only send dataset entry once
goliaro Nov 19, 2023
ec131c7
update peft test scripts
goliaro Nov 20, 2023
0431c73
loss
xinhaoc Nov 20, 2023
371dffd
.
xinhaoc Nov 20, 2023
da690ff
update generate/request api to take both inference and fine-tuning pr…
goliaro Nov 21, 2023
1e5bb72
linting
goliaro Nov 21, 2023
f3ff40b
alignment fixes in lora & linear layer
goliaro Nov 21, 2023
7efd3a7
alignment fix
goliaro Nov 21, 2023
b6fe334
diagonal
xinhaoc Nov 22, 2023
bcf8b19
fix
goliaro Nov 22, 2023
4bfee96
alignment fix ssm
goliaro Nov 22, 2023
efd1976
sigmoid-silu-multi now fully aligned
goliaro Nov 24, 2023
7ae195a
rms norm kernel updates
goliaro Nov 24, 2023
7030814
fix
goliaro Nov 26, 2023
eb3b6ab
in-place residual rms
goliaro Nov 26, 2023
9f26cc1
Merge branch 'inference' into peft
goliaro Nov 27, 2023
a122e30
bug fix and linting
goliaro Nov 28, 2023
53e737b
align backward of o_proj, attn_heads, qk_prods_softmax, and v_proj wi…
goliaro Nov 30, 2023
edc02af
cleanup
goliaro Nov 30, 2023
f00c7e0
finished all alignment fixes in attention backward kernel
goliaro Nov 30, 2023
3955b0b
fix
goliaro Nov 30, 2023
c534638
Update inc_multihead_self_attention.cu
goliaro Dec 3, 2023
fd956c9
Update inc_multihead_self_attention.cu
goliaro Dec 4, 2023
d9b154f
Merge branch 'inference' into peft
goliaro Dec 4, 2023
3a34c88
use grad to store peft in/output (#1241)
xinhaoc Dec 6, 2023
94230d9
format
jiazhihao Dec 6, 2023
b985cc9
enable peft request
jiazhihao Dec 6, 2023
b9c3926
several hacks for performance measurement; some of the changes should…
jiazhihao Dec 6, 2023
4d5c3e0
Update sigmoid_silu_multi.cu
goliaro Dec 16, 2023
7bf863a
RoPE backward
goliaro Dec 18, 2023
960654e
PEFT bug fixes and alignment (#1269)
goliaro Jan 10, 2024
2028900
Fuse bias + relu in OPT (#1271)
goliaro Jan 10, 2024
3bbde56
fix
goliaro Jan 10, 2024
2ebd7f4
fix
goliaro Jan 17, 2024
1b2018b
fix
goliaro Jan 17, 2024
bc61e9d
Peft alignment & debugging tools (#1288)
goliaro Jan 27, 2024
32f0a15
fix legion aliasing error
goliaro Jan 27, 2024
c97f63a
fix warnings
goliaro Jan 27, 2024
3d5a37c
fix
goliaro Jan 27, 2024
571f0d3
fix pipeline parallelism
goliaro Jan 29, 2024
f4a10f3
fix tp issue in combine op
goliaro Jan 29, 2024
ca683f7
fix lora weight loading with tensor parallelism
goliaro Jan 29, 2024
378bdb5
fixes, implement Combine::peft_bwd_task
goliaro Jan 29, 2024
afdae45
fix
goliaro Jan 29, 2024
5660f55
replicate peft bwd
goliaro Jan 29, 2024
a9bacd3
fixes
goliaro Jan 30, 2024
f3a97ff
fix
goliaro Jan 31, 2024
e0a58bb
fix combine and fwd-bwd pass dependencies
goliaro Jan 31, 2024
50fc13d
fix replicate bwd
goliaro Jan 31, 2024
f2c9a05
fix
goliaro Feb 1, 2024
cd68f5d
let user control amount of peft memory
goliaro Feb 3, 2024
64a59d8
only run peft_bwd if peft is enabled
goliaro Feb 3, 2024
32a0716
fix rms norm inference region reqs
goliaro Feb 6, 2024
a37b173
fix in-place fusion (part 1)
goliaro Feb 7, 2024
85f4d40
fix inplace fusion (part 2)
goliaro Feb 7, 2024
bb56a99
fix
goliaro Feb 7, 2024
63f1fce
disable automatic inplace rms norm for now
goliaro Feb 7, 2024
0d3aa7e
fix inf fusion inplace
goliaro Feb 8, 2024
b658061
fix rest input grads for peft without inplace residuals
goliaro Feb 9, 2024
3255fe4
fix
goliaro Feb 9, 2024
ec2002e
fix
goliaro Feb 15, 2024
098e880
fix residual rms
goliaro Feb 16, 2024
5688e16
fix
goliaro Feb 16, 2024
9225e0c
fix
goliaro Feb 16, 2024
e12bff1
enable inf debugging in fusion bwd
goliaro Feb 19, 2024
ed9afb7
hack to silence warning in fused bwd
goliaro Feb 19, 2024
96d0e9b
fix
goliaro Feb 19, 2024
fcbeea0
Merge branch 'inference' into peft
goliaro Feb 19, 2024
2cbc0b7
fix
goliaro Feb 19, 2024
36cb2b3
fix build
goliaro Feb 19, 2024
21b77f1
fix
goliaro Feb 19, 2024
9075d3f
fix
goliaro Feb 19, 2024
0b35b0c
add draft peft test
goliaro Mar 22, 2024
b6ada2f
Peft python interface (#1306)
goliaro Mar 27, 2024
29fcda7
Merge branch 'inference' into peft
goliaro Apr 8, 2024
0ed889a
fix
goliaro Apr 8, 2024
48c431a
update
goliaro Apr 11, 2024
40649ee
fix
goliaro Apr 12, 2024
0580d7e
fix to support prompts larger than max tokens per batch
goliaro Apr 13, 2024
0affe27
fixes to support benchmarking of finetuning throughput
goliaro Apr 14, 2024
d7ebeaf
many upgrades and updates related to finetuning
goliaro Apr 15, 2024
33e873d
add ttft statistics
goliaro Apr 15, 2024
2f92a65
add warmup phase
goliaro Apr 15, 2024
b1e97b1
add benchmarking code
goliaro Apr 16, 2024
e35ebb2
Add scripts for evaluation with Microsoft Azure trace (#1363)
Flechman Apr 17, 2024
f3f6226
Merge branch 'inference' into peft
goliaro Apr 24, 2024
b33f10f
fix
goliaro Apr 25, 2024
97562d6
fix
goliaro May 1, 2024
985c254
add peft tests to ci
goliaro May 1, 2024
33dbd3d
Merge branch 'inference' into peft
goliaro May 1, 2024
f033b4e
shellcheck
goliaro May 8, 2024
1011927
fix
goliaro May 9, 2024
9064c2b
fix python requirements
goliaro May 9, 2024
a125e86
fix
goliaro May 10, 2024
d74fe53
fix
goliaro May 11, 2024
0c6ae09
update ci test
goliaro May 17, 2024
93b6032
update alignment doc
goliaro May 17, 2024
9546239
fix cross entropy loss bug
goliaro May 19, 2024
ff4b703
update alignment test
goliaro May 19, 2024
b613666
update test
goliaro May 20, 2024
dde0b61
add llama peft alignment test to ci
goliaro May 20, 2024
1a31b65
Fix values for unused params in incr_decoding
Flechman May 24, 2024
7e3d111
Add PEFTModelID NO_ID singleton instead of None
Flechman May 24, 2024
079ba59
Fix PEFTModelID::NO_ID reference
Flechman May 25, 2024
f464eb8
reduce logging
goliaro May 25, 2024
8d89acd
fix
goliaro May 26, 2024
33c0fef
fix
goliaro May 29, 2024
6727d3a
Add peft demo
Flechman Jun 11, 2024
6d7c245
Add readme for demo
Flechman Jun 11, 2024
511fd64
fix alignment issue
goliaro Jun 20, 2024
9948b4e
Peft optimizer (#1290)
goliaro Jul 10, 2024
7df0ac5
Merge branch 'inference' into peft
goliaro Jul 11, 2024
2789705
Optimizers python interface (#1441)
goliaro Jul 15, 2024
4d0657a
initialize lora weights where needed
goliaro Jul 16, 2024
70eba7b
Add notebook
Flechman Jul 16, 2024
29a2121
Update demo to use dataset
Flechman Jul 16, 2024
69e5e06
Fix'
Flechman Jul 16, 2024
1c7fd1c
Save weights after end of finetuning (#1446)
goliaro Jul 17, 2024
fc4e3f5
Fully use notebook for demo
Flechman Jul 17, 2024
786c320
Parameterize generation and finetuning configs
Flechman Jul 17, 2024
b3c6242
Comment out inference for now
Flechman Jul 17, 2024
b174a2f
fix bug in lora inference only mode
goliaro Jul 17, 2024
a92daef
fix
goliaro Jul 17, 2024
c515caa
Add finetuning or inference only flags
Flechman Jul 18, 2024
5b7db51
fix
Flechman Jul 18, 2024
bcc9702
fix
goliaro Jul 18, 2024
ab9aa2a
fix
goliaro Jul 19, 2024
09e2471
PEFT model upload (#1450)
goliaro Jul 19, 2024
5a44604
Make demo_class.py executable
Flechman Jul 19, 2024
bac88c6
Merge branch 'peft' of https://github.com/flexflow/FlexFlow into peft
Flechman Jul 19, 2024
55eb7ed
fix
goliaro Jul 19, 2024
ad8308d
add base_model_name_or_path
Flechman Jul 20, 2024
8d7b35c
fix
goliaro Jul 20, 2024
f4aca75
fix
goliaro Jul 20, 2024
2b421e4
support llama-3 tokenizer
goliaro Jul 20, 2024
120bca9
print output tokens when not benchmarking
goliaro Jul 20, 2024
dc36d39
Use Llama3 in demo_class
Flechman Jul 20, 2024
ab547f7
Merge branch 'peft' of https://github.com/flexflow/FlexFlow into peft
Flechman Jul 20, 2024
6b904f6
Use Llama3 in demo
Flechman Jul 20, 2024
242bbe5
fix data loading for llama-3
goliaro Jul 20, 2024
7dfb3d0
Add download models to demo
Flechman Jul 21, 2024
4066467
return/print loss at each finetuning step
goliaro Jul 21, 2024
c1eae6d
fix
goliaro Jul 21, 2024
716bcf1
Adjust demo parameters
Flechman Jul 21, 2024
7c48428
Fix for finetuning
Flechman Jul 21, 2024
6394bff
pass finetuning losses to python interface
goliaro Jul 22, 2024
da6d516
Update demo
Flechman Jul 22, 2024
00d83fb
Fix upload
Flechman Jul 22, 2024
a9d2385
Refactor demo
Flechman Jul 22, 2024
6cd0650
rename demo_class to demo
Flechman Jul 22, 2024
03c2f2e
fix
Flechman Jul 22, 2024
e240537
remove epoch from loss print
Flechman Jul 22, 2024
fc731ec
Finish demo
Flechman Jul 22, 2024
7c371eb
fix test
goliaro Aug 4, 2024
52f4564
rocm fixes
goliaro Aug 4, 2024
556d564
more rocm fixes
goliaro Aug 5, 2024
5604c4d
fix rocm build
goliaro Aug 5, 2024
97a4898
docker fix
goliaro Aug 5, 2024
6936cb6
fix inference test
goliaro Aug 5, 2024
a8d9bc3
fix workflow
goliaro Aug 6, 2024
00a70f3
fix makefile
goliaro Aug 6, 2024
925027f
fix peft test
goliaro Aug 6, 2024
b0af3b8
fix all-reduce issue with lora for TP scenario
goliaro Aug 7, 2024
6d18f7b
fix bwd lm head
goliaro Aug 11, 2024
440ad3d
fixes
goliaro Aug 16, 2024
5cbe1a4
more fixes
goliaro Aug 16, 2024
9ca3687
update
goliaro Aug 16, 2024
d0e98ec
fix alignment up to input ln
goliaro Aug 20, 2024
6ebea46
finished aligning all backward (tp>1)
goliaro Aug 22, 2024
f98999c
align all peft
goliaro Aug 22, 2024
5f73328
Merge branch 'inference' into peft
goliaro Sep 2, 2024
b06ed1a
fix
goliaro Sep 2, 2024
3fe93dc
fix broken link
goliaro Sep 2, 2024
1a2fce3
formatting
goliaro Sep 2, 2024
cf4525f
fix
goliaro Sep 2, 2024
90b2c87
update
goliaro Sep 2, 2024
eae9b12
Revert "update"
goliaro Sep 3, 2024
828f72e
update
goliaro Sep 3, 2024
a8294e8
fix hip build
goliaro Sep 3, 2024
ccb28b1
fix gpu ci
goliaro Sep 3, 2024
ec472c2
fix gpu ci
goliaro Sep 3, 2024
aa1aa7b
update default gpu ci version to 12.0
goliaro Sep 3, 2024
9b2bd47
update ci to 12.0
goliaro Sep 3, 2024
f929cca
fix
goliaro Sep 3, 2024
39b8d49
fix
goliaro Sep 3, 2024
a60618b
update
goliaro Sep 3, 2024
b8be6f5
fix
goliaro Sep 3, 2024
0330272
fix
goliaro Sep 3, 2024
5ab1da5
update
goliaro Sep 3, 2024
08012b0
fix
goliaro Sep 3, 2024
e5785e6
add cleanup
goliaro Sep 3, 2024
c37f363
downgrade to cuda=11.8
goliaro Sep 4, 2024
Files changed
12 changes: 7 additions & 5 deletions .github/workflows/build.yml
@@ -52,13 +52,14 @@ jobs:
       run: .github/workflows/helpers/free_space_on_runner.sh

     - name: Install CUDA
-      uses: Jimver/cuda-toolkit@v0.2.11
+      uses: Jimver/cuda-toolkit@v0.2.16
       if: ${{ matrix.gpu_backend == 'cuda' }}
       id: cuda-toolkit
       with:
-        cuda: "11.8.0"
+        cuda: "12.1.1"
         # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement
         use-github-cache: "false"
+        log-file-suffix: 'cmake_${{matrix.gpu_backend}}.txt'

     - name: Install system dependencies
       run: .github/workflows/helpers/install_dependencies.sh
@@ -156,11 +157,12 @@ jobs:
       run: .github/workflows/helpers/free_space_on_runner.sh

     - name: Install CUDA
-      uses: Jimver/cuda-toolkit@v0.2.11
+      uses: Jimver/cuda-toolkit@v0.2.16
       id: cuda-toolkit
       with:
-        cuda: "11.8.0"
+        cuda: "12.1.1"
         use-github-cache: "false"
+        log-file-suffix: 'makefile_${{matrix.gpu_backend}}.txt'

     - name: Install system dependencies
       run: .github/workflows/helpers/install_dependencies.sh
@@ -169,7 +171,7 @@
       uses: conda-incubator/setup-miniconda@v2
       with:
         activate-environment: flexflow
-        environment-file: conda/environment.yml
+        environment-file: conda/flexflow.yml
         auto-activate-base: false

     - name: Build FlexFlow
10 changes: 10 additions & 0 deletions .github/workflows/gpu-ci.yml
@@ -181,6 +181,16 @@ jobs:
           ../config/config.linux
           make -j

+      - name: Run PEFT tests
+        run: |
+          export PATH=$CONDA_PREFIX/bin:$PATH
+          export CUDNN_DIR=/usr/local/cuda
+          export CUDA_DIR=/usr/local/cuda
+          export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib
+
+          source ./build/set_python_envs.sh
+          ./tests/peft_test.sh
+
       - name: Run inference tests
         env:
           CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }}
23 changes: 20 additions & 3 deletions .github/workflows/helpers/install_cudnn.sh
@@ -5,8 +5,11 @@ set -x
 # Cd into directory holding this script
 cd "${BASH_SOURCE[0]%/*}"

+ubuntu_version=$(lsb_release -rs)
+ubuntu_version=${ubuntu_version//./}
+
 # Install CUDNN
-cuda_version=${1:-11.8.0}
+cuda_version=${1:-12.1.1}
 cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.')
 echo "Installing CUDNN for CUDA version: ${cuda_version} ..."
 CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz
@@ -44,8 +47,11 @@ elif [[ "$cuda_version" == "11.7" ]]; then
 elif [[ "$cuda_version" == "11.8" ]]; then
   CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
   CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz
-elif [[ "$cuda_version" == "12.0" ]]; then
-  echo "CUDNN support for CUDA version 12.0 not yet added"
+elif [[ "$cuda_version" == "12.0" || "$cuda_version" == "12.1" || "$cuda_version" == "12.2" || "$cuda_version" == "12.3" || "$cuda_version" == "12.4" || "$cuda_version" == "12.5" ]]; then
+  CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb
+  CUDNN_TARBALL_NAME=cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb
 else
+  echo "CUDNN support for CUDA version above 12.5 not yet added"
   exit 1
 fi
 wget -c -q $CUDNN_LINK
@@ -55,6 +61,17 @@ if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version"
   sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* /usr/local/include
   sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* /usr/local/lib
   rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME"
+elif [[ "$CUDNN_TARBALL_NAME" == *.deb ]]; then
+  wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb"
+  sudo dpkg -i cuda-keyring_1.1-1_all.deb
+  sudo apt update -y
+  rm -f cuda-keyring_1.1-1_all.deb
+  sudo dpkg -i $CUDNN_TARBALL_NAME
+  sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/
+  sudo apt update -y
+  sudo apt install -y libcudnn8
+  sudo apt install -y libcudnn8-dev
+  sudo apt install -y libcudnn8-samples
 else
   sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local
 fi
8 changes: 4 additions & 4 deletions .github/workflows/helpers/install_nccl.sh
@@ -8,13 +8,13 @@ cd "${BASH_SOURCE[0]%/*}"
 # Add NCCL key ring
 ubuntu_version=$(lsb_release -rs)
 ubuntu_version=${ubuntu_version//./}
-wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"
-sudo dpkg -i cuda-keyring_1.0-1_all.deb
+wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb"
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
 sudo apt update -y
-rm -f cuda-keyring_1.0-1_all.deb
+rm -f cuda-keyring_1.1-1_all.deb

 # Install NCCL
-cuda_version=${1:-11.8.0}
+cuda_version=${1:-12.1.1}
 cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.')
 echo "Installing NCCL for CUDA version: ${cuda_version} ..."
6 changes: 3 additions & 3 deletions .github/workflows/multinode-test.yml
@@ -38,7 +38,7 @@ jobs:
     # 10h timeout, instead of default of 360min (6h)
     timeout-minutes: 600
     container:
-      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+      image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
       options: --gpus all --shm-size=8192m
     steps:
       - name: Install updated git version
@@ -87,7 +87,7 @@ jobs:
     runs-on: self-hosted
     needs: gpu-ci-concierge
     container:
-      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+      image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
       options: --gpus all --shm-size=8192m
     # 10h timeout, instead of default of 360min (6h)
     timeout-minutes: 600
@@ -138,7 +138,7 @@ jobs:
     runs-on: self-hosted
     needs: gpu-ci-concierge
     container:
-      image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+      image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest
       options: --gpus all --shm-size=8192m
     steps:
       - name: Install updated git version
4 changes: 2 additions & 2 deletions .github/workflows/pip-install.yml
@@ -44,10 +44,10 @@ jobs:
       run: .github/workflows/helpers/free_space_on_runner.sh

     - name: Install CUDA
-      uses: Jimver/cuda-toolkit@v0.2.11
+      uses: Jimver/cuda-toolkit@v0.2.16
       id: cuda-toolkit
       with:
-        cuda: "11.8.0"
+        cuda: "12.1.1"
         # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement
         use-github-cache: "false"
4 changes: 2 additions & 2 deletions .github/workflows/prebuild-legion.yml
@@ -23,13 +23,13 @@ jobs:
     strategy:
       matrix:
         gpu_backend: ["cuda", "hip_rocm"]
-        gpu_backend_version: ["11.8", "5.6"]
+        gpu_backend_version: ["12.0", "5.6"]
         python_version: ["3.11"]
         exclude:
           - gpu_backend: "cuda"
             gpu_backend_version: "5.6"
           - gpu_backend: "hip_rocm"
-            gpu_backend_version: "11.8"
+            gpu_backend_version: "12.0"
       fail-fast: false
     steps:
       - name: Checkout Git Repository
5 changes: 5 additions & 0 deletions .gitignore
@@ -187,4 +187,9 @@ gpt_tokenizer
 python/flexflow/version.txt

 inference_tensors
+hf_peft_tensors
+lora_training_logs

+Untitled-1.ipynb
+Untitled-2.ipynb
+tests/inference/python_test_configs/*.json
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -567,6 +567,7 @@ if(NOT BUILD_LEGION_ONLY)
   if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES)
     add_subdirectory(inference/spec_infer)
     add_subdirectory(inference/incr_decoding)
+    add_subdirectory(inference/peft)
   endif()
7 changes: 7 additions & 0 deletions conda/flexflow.yml
@@ -25,3 +25,10 @@ dependencies:
   - sentencepiece
   - einops
   - requests
+  - scipy
+  - bitsandbytes
+  - datasets
+  - accelerate
+  - loralib
+  - triton
+  - peft
2 changes: 1 addition & 1 deletion config/config.inc
@@ -197,7 +197,7 @@ fi

 # set ROCM path
 if [ -n "$ROCM_PATH" ]; then
-  SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}"
+  SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH} -DHIP_ROOT_DIR=${ROCM_PATH}"
 fi

 ADD_ROCM_TO_PATH=""
9 changes: 4 additions & 5 deletions docker/build.sh
@@ -56,15 +56,14 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the
     cuda_version_input=${cuda_version}.3
   elif [[ "$cuda_version" == @(11.8) ]]; then
     cuda_version_input=${cuda_version}.0
+  elif [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
+    # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available)
+    cuda_version=12.2
+    cuda_version_input=${cuda_version}.2
   else
     echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
     exit 1
   fi
-  # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available)
-  if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
-    cuda_version=12.2
-    cuda_version_input=${cuda_version}.2
-  fi
   echo "Building $image docker image with CUDA $cuda_version"
   ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04"
   gpu_backend_version="-${cuda_version}"
2 changes: 2 additions & 0 deletions docker/flexflow-environment/Dockerfile
@@ -94,6 +94,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1
 RUN conda install pytorch torchvision torchaudio -c pytorch
 RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops
 RUN pip3 install tensorflow notebook
+# PEFT-related
+RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft

 # Install Rust
 RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
2 changes: 1 addition & 1 deletion docker/run.sh
@@ -58,7 +58,7 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the
     fi
   fi
   # Check that CUDA version is supported
-  if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then
+  if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
     echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
     exit 1
   fi
42 changes: 38 additions & 4 deletions include/flexflow/batch_config.h
@@ -16,6 +16,7 @@
 #pragma once

 #include "flexflow/ffconst.h"
+#include "flexflow/fftype.h"
 #include "legion.h"
 #include <cstddef>
 #include <cstdlib>
@@ -36,13 +37,27 @@ using BeamSearchBatchConfigFuture = Legion::Future;
 using TreeVerifyBatchConfigFuture = Legion::Future;
 using BeamInferenceResultFuture = Legion::Future;

+struct OptimizerTasks {
+  bool compute_gradients = true;
+  bool reset_gradients_to_zero = false;
+  bool update_weights = false;
+  bool save_updated_weights = false;
+};
+
+void set_optimizer_tasks(OptimizerTasks &tasks,
+                         int max_training_steps,
+                         int completed_training_steps,
+                         int gradient_accumulation_steps);
+
 class BatchConfig {
 public:
   using RequestGuid = size_t;
   using TokenId = int;
   BatchConfig();
   int num_active_requests() const;
-  int num_active_tokens() const;
+  int num_active_infr_tokens() const;
+  int num_active_peft_tokens() const;
   static int max_requests_per_batch();
   static int max_tokens_per_batch();
   static int max_verify_tokens_per_batch();
@@ -56,26 +71,43 @@ class BatchConfig {
   // Maximum possible values for different parameters
   // These maximum values are used for copying BatchConfig
   // across workers
-  static int const MAX_NUM_REQUESTS = 64;
+  static int const MAX_NUM_REQUESTS = 65;
   static int const MAX_NUM_TOKENS = 1024;
   static int const MAX_SPEC_TREE_TOKEN_NUM = 64;

   // Set by update
-  int num_tokens;
+  int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0;
   // number of tokens in prompt phase, start offset of tokens in inc_decoding
   // phase. num_tokens - num_prompt_tokens = num_generation_tokens;
-  int num_generation_tokens;
+  int num_generation_tokens = 0;

   struct PerRequestInfo {
+    PerRequestInfo() {
+      first_token_depth_in_request = 0;
+      first_token_offset_in_batch = 0;
+      num_tokens_in_batch = 0;
+      max_sequence_length = 0;
+      request_guid = 0;
+      prompt_phase = false;
+      batch_config_request_id = -1;
+      peft_model_id = PEFTModelID::NO_ID;
+      peft_bwd = false;
+      optimizer_tasks = {true, false, false, false};
+    }
     int first_token_depth_in_request;
     int first_token_offset_in_batch;
     int num_tokens_in_batch;
     int max_sequence_length;

     // request id in batch config:
-    int batch_config_request_id;
+    int batch_config_request_id = -1;
+    bool prompt_phase = false;
     RequestGuid request_guid;
+    // PEFT fields
+    PEFTModelID peft_model_id;
+    bool peft_bwd;
+    OptimizerTasks optimizer_tasks;
   };
   struct PerTokenInfo {
     int abs_depth_in_request;
@@ -102,6 +134,7 @@ class BatchConfig {
   BitMask causalMask[MAX_NUM_REQUESTS];
   PerRequestInfo requestsInfo[MAX_NUM_REQUESTS];
   PerTokenInfo tokensInfo[MAX_NUM_TOKENS];
+  PerTokenInfo labelsInfo[MAX_NUM_TOKENS];

   bool request_completed[MAX_NUM_REQUESTS];
   bool request_running[MAX_NUM_REQUESTS];
@@ -129,6 +162,7 @@ class TreeVerifyBatchConfig : public BatchConfig {
 struct InferenceResult {
   static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS;
   BatchConfig::TokenId token_ids[MAX_NUM_TOKENS];
+  float finetuning_loss;
 };

 class BeamSearchBatchConfig : public BatchConfig {
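
The new OptimizerTasks flags let the request manager schedule gradient accumulation across finetuning steps. The diff above only declares set_optimizer_tasks, so the following is a minimal sketch of how the four flags might be derived from the training-step counters; the names come from the header diff, but the body is an assumption, not the PR's actual implementation.

// Sketch only: a plausible definition for the set_optimizer_tasks()
// declared in include/flexflow/batch_config.h above.
#include "flexflow/batch_config.h"
#include <cassert>

void set_optimizer_tasks(OptimizerTasks &tasks,
                         int max_training_steps,
                         int completed_training_steps,
                         int gradient_accumulation_steps) {
  assert(completed_training_steps >= 0 &&
         completed_training_steps < max_training_steps);
  // Gradients are computed on every finetuning step.
  tasks.compute_gradients = true;
  // Zero the accumulators at the first step of each accumulation window.
  tasks.reset_gradients_to_zero =
      (completed_training_steps % gradient_accumulation_steps == 0);
  // Apply the optimizer update at the last step of each window.
  tasks.update_weights =
      ((completed_training_steps + 1) % gradient_accumulation_steps == 0);
  // Persist updated LoRA weights only after the final training step.
  tasks.save_updated_weights =
      (completed_training_steps == max_training_steps - 1);
}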
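
And a hypothetical sketch of the mixed batch this PR enables: one decoding request and one LoRA finetuning request sharing a single BatchConfig. The field names come from the header diff; the helper name make_mixed_batch, the token counts, and the step counters are invented for illustration. After such a step runs, the loss would come back through the new InferenceResult::finetuning_loss field.

#include "flexflow/batch_config.h"  // header modified in this PR

// Hypothetical helper: build a batch mixing one inference request with one
// LoRA finetuning request. (tokensInfo / labelsInfo filling omitted.)
BatchConfig make_mixed_batch(PEFTModelID const &lora_id) {
  BatchConfig bc;

  // Slot 0: ordinary incremental-decoding request (no adapter, no backward).
  bc.requestsInfo[0].num_tokens_in_batch = 1;
  bc.requestsInfo[0].first_token_offset_in_batch = 0;
  bc.requestsInfo[0].peft_model_id = PEFTModelID::NO_ID;
  bc.requestsInfo[0].peft_bwd = false;
  bc.request_completed[0] = false;

  // Slot 1: PEFT finetuning request; its tokens sit after the inference
  // tokens in the same batch.
  bc.requestsInfo[1].num_tokens_in_batch = 128; // one dataset entry
  bc.requestsInfo[1].first_token_offset_in_batch = 1;
  bc.requestsInfo[1].prompt_phase = true;
  bc.requestsInfo[1].peft_model_id = lora_id;
  bc.requestsInfo[1].peft_bwd = true; // run the PEFT backward pass
  set_optimizer_tasks(bc.requestsInfo[1].optimizer_tasks,
                      /*max_training_steps=*/100,
                      /*completed_training_steps=*/0,
                      /*gradient_accumulation_steps=*/8);
  bc.request_completed[1] = false;

  bc.num_tokens = 1;        // inference tokens (see num_active_infr_tokens)
  bc.num_peft_tokens = 128; // finetuning tokens (see num_active_peft_tokens)
  return bc;
}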