diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d05856f1a9..ef5961bc87 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -52,13 +52,14 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.16 if: ${{ matrix.gpu_backend == 'cuda' }} id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement use-github-cache: "false" + log-file-suffix: 'cmake_${{matrix.gpu_backend}}.txt' - name: Install system dependencies run: .github/workflows/helpers/install_dependencies.sh @@ -156,11 +157,12 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.16 id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" use-github-cache: "false" + log-file-suffix: 'makefile_${{matrix.gpu_backend}}.txt' - name: Install system dependencies run: .github/workflows/helpers/install_dependencies.sh @@ -169,7 +171,7 @@ jobs: uses: conda-incubator/setup-miniconda@v2 with: activate-environment: flexflow - environment-file: conda/environment.yml + environment-file: conda/flexflow.yml auto-activate-base: false - name: Build FlexFlow diff --git a/.github/workflows/gpu-ci.yml b/.github/workflows/gpu-ci.yml index c7d0cd72cb..00ca2df603 100644 --- a/.github/workflows/gpu-ci.yml +++ b/.github/workflows/gpu-ci.yml @@ -181,6 +181,16 @@ jobs: ../config/config.linux make -j + - name: Run PEFT tests + run: | + export PATH=$CONDA_PREFIX/bin:$PATH + export CUDNN_DIR=/usr/local/cuda + export CUDA_DIR=/usr/local/cuda + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib + + source ./build/set_python_envs.sh + ./tests/peft_test.sh + - name: Run inference tests env: CPP_INFERENCE_TESTS: ${{ vars.CPP_INFERENCE_TESTS }} diff --git a/.github/workflows/helpers/install_cudnn.sh b/.github/workflows/helpers/install_cudnn.sh index 7c11a4a420..73b8e88418 100755 --- a/.github/workflows/helpers/install_cudnn.sh +++ b/.github/workflows/helpers/install_cudnn.sh @@ -5,8 +5,11 @@ set -x # Cd into directory holding this script cd "${BASH_SOURCE[0]%/*}" +ubuntu_version=$(lsb_release -rs) +ubuntu_version=${ubuntu_version//./} + # Install CUDNN -cuda_version=${1:-11.8.0} +cuda_version=${1:-12.1.1} cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') echo "Installing CUDNN for CUDA version: ${cuda_version} ..." 
CUDNN_LINK=http://developer.download.nvidia.com/compute/redist/cudnn/v8.0.5/cudnn-11.1-linux-x64-v8.0.5.39.tgz @@ -44,8 +47,11 @@ elif [[ "$cuda_version" == "11.7" ]]; then elif [[ "$cuda_version" == "11.8" ]]; then CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz CUDNN_TARBALL_NAME=cudnn-linux-x86_64-8.7.0.84_cuda11-archive.tar.xz -elif [[ "$cuda_version" == "12.0" ]]; then - echo "CUDNN support for CUDA version 12.0 not yet added" +elif [[ "$cuda_version" == "12.0" || "$cuda_version" == "12.1" || "$cuda_version" == "12.2" || "$cuda_version" == "12.3" || "$cuda_version" == "12.4" || "$cuda_version" == "12.5" ]]; then + CUDNN_LINK=https://developer.download.nvidia.com/compute/redist/cudnn/v8.8.0/local_installers/12.0/cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb + CUDNN_TARBALL_NAME=cudnn-local-repo-ubuntu2004-8.8.0.121_1.0-1_amd64.deb +else + echo "CUDNN support for CUDA version above 12.5 not yet added" exit 1 fi wget -c -q $CUDNN_LINK @@ -55,6 +61,17 @@ if [[ "$cuda_version" == "11.6" || "$cuda_version" == "11.7" || "$cuda_version" sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/include/* /usr/local/include sudo cp -r "$CUDNN_EXTRACTED_TARBALL_NAME"/lib/* /usr/local/lib rm -rf "$CUDNN_EXTRACTED_TARBALL_NAME" +elif [[ "$CUDNN_TARBALL_NAME" == *.deb ]]; then + wget -c -q "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" + sudo dpkg -i cuda-keyring_1.1-1_all.deb + sudo apt update -y + rm -f cuda-keyring_1.1-1_all.deb + sudo dpkg -i $CUDNN_TARBALL_NAME + sudo cp /var/cudnn-local-repo-ubuntu2004-8.8.0.121/cudnn-local-A9E17745-keyring.gpg /usr/share/keyrings/ + sudo apt update -y + sudo apt install -y libcudnn8 + sudo apt install -y libcudnn8-dev + sudo apt install -y libcudnn8-samples else sudo tar -xzf $CUDNN_TARBALL_NAME -C /usr/local fi diff --git a/.github/workflows/helpers/install_nccl.sh b/.github/workflows/helpers/install_nccl.sh index ca88668d84..ae6793ea2a 100755 --- a/.github/workflows/helpers/install_nccl.sh +++ b/.github/workflows/helpers/install_nccl.sh @@ -8,13 +8,13 @@ cd "${BASH_SOURCE[0]%/*}" # Add NCCL key ring ubuntu_version=$(lsb_release -rs) ubuntu_version=${ubuntu_version//./} -wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb" -sudo dpkg -i cuda-keyring_1.0-1_all.deb +wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.1-1_all.deb" +sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt update -y -rm -f cuda-keyring_1.0-1_all.deb +rm -f cuda-keyring_1.1-1_all.deb # Install NCCL -cuda_version=${1:-11.8.0} +cuda_version=${1:-12.1.1} cuda_version=$(echo "${cuda_version}" | cut -f1,2 -d'.') echo "Installing NCCL for CUDA version: ${cuda_version} ..." 
diff --git a/.github/workflows/multinode-test.yml b/.github/workflows/multinode-test.yml index 226f953b38..2fc527bf08 100644 --- a/.github/workflows/multinode-test.yml +++ b/.github/workflows/multinode-test.yml @@ -38,7 +38,7 @@ jobs: # 10h timeout, instead of default of 360min (6h) timeout-minutes: 600 container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version @@ -87,7 +87,7 @@ jobs: runs-on: self-hosted needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m # 10h timeout, instead of default of 360min (6h) timeout-minutes: 600 @@ -138,7 +138,7 @@ jobs: runs-on: self-hosted needs: gpu-ci-concierge container: - image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest + image: ghcr.io/flexflow/flexflow-environment-cuda-12.0:latest options: --gpus all --shm-size=8192m steps: - name: Install updated git version diff --git a/.github/workflows/pip-install.yml b/.github/workflows/pip-install.yml index 3562134987..d5acbfc2e1 100644 --- a/.github/workflows/pip-install.yml +++ b/.github/workflows/pip-install.yml @@ -44,10 +44,10 @@ jobs: run: .github/workflows/helpers/free_space_on_runner.sh - name: Install CUDA - uses: Jimver/cuda-toolkit@v0.2.11 + uses: Jimver/cuda-toolkit@v0.2.16 id: cuda-toolkit with: - cuda: "11.8.0" + cuda: "12.1.1" # Disable caching of the CUDA binaries, since it does not give us any significant performance improvement use-github-cache: "false" diff --git a/.github/workflows/prebuild-legion.yml b/.github/workflows/prebuild-legion.yml index 267daaee6b..633fb00eb8 100644 --- a/.github/workflows/prebuild-legion.yml +++ b/.github/workflows/prebuild-legion.yml @@ -23,13 +23,13 @@ jobs: strategy: matrix: gpu_backend: ["cuda", "hip_rocm"] - gpu_backend_version: ["11.8", "5.6"] + gpu_backend_version: ["12.0", "5.6"] python_version: ["3.11"] exclude: - gpu_backend: "cuda" gpu_backend_version: "5.6" - gpu_backend: "hip_rocm" - gpu_backend_version: "11.8" + gpu_backend_version: "12.0" fail-fast: false steps: - name: Checkout Git Repository diff --git a/.gitignore b/.gitignore index 7f6a3c4137..cc34c1a7b6 100644 --- a/.gitignore +++ b/.gitignore @@ -187,4 +187,9 @@ gpt_tokenizer python/flexflow/version.txt inference_tensors +hf_peft_tensors +lora_training_logs + +Untitled-1.ipynb +Untitled-2.ipynb tests/inference/python_test_configs/*.json diff --git a/CMakeLists.txt b/CMakeLists.txt index c82a53644e..f06969ae04 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -567,6 +567,7 @@ if(NOT BUILD_LEGION_ONLY) if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES) add_subdirectory(inference/spec_infer) add_subdirectory(inference/incr_decoding) + add_subdirectory(inference/peft) endif() diff --git a/conda/flexflow.yml b/conda/flexflow.yml index 67ef6b3419..091ba929e4 100644 --- a/conda/flexflow.yml +++ b/conda/flexflow.yml @@ -25,3 +25,10 @@ dependencies: - sentencepiece - einops - requests + - scipy + - bitsandbytes + - datasets + - accelerate + - loralib + - triton + - peft diff --git a/config/config.inc b/config/config.inc index 7d7b2db9cf..6431eaf136 100644 --- a/config/config.inc +++ b/config/config.inc @@ -197,7 +197,7 @@ fi # set ROCM path if [ -n "$ROCM_PATH" ]; then - SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH}" + SET_ROCM_PATH="-DROCM_PATH=${ROCM_PATH} 
-DHIP_ROOT_DIR=${ROCM_PATH}" fi ADD_ROCM_TO_PATH="" diff --git a/docker/build.sh b/docker/build.sh index 8ecacbc6d4..b68860712f 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -56,15 +56,14 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the cuda_version_input=${cuda_version}.3 elif [[ "$cuda_version" == @(11.8) ]]; then cuda_version_input=${cuda_version}.0 + elif [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then + # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) + cuda_version=12.2 + cuda_version_input=${cuda_version}.2 else echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi - # Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available) - if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then - cuda_version=12.2 - cuda_version_input=${cuda_version}.2 - fi echo "Building $image docker image with CUDA $cuda_version" ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04" gpu_backend_version="-${cuda_version}" diff --git a/docker/flexflow-environment/Dockerfile b/docker/flexflow-environment/Dockerfile index cef619ad68..3434916d6b 100644 --- a/docker/flexflow-environment/Dockerfile +++ b/docker/flexflow-environment/Dockerfile @@ -94,6 +94,8 @@ RUN conda install -c conda-forge cmake make pillow cmake-build-extension pybind1 RUN conda install pytorch torchvision torchaudio -c pytorch RUN conda install -c conda-forge onnx transformers>=4.31.0 sentencepiece einops RUN pip3 install tensorflow notebook +# PEFT-related +RUN pip3 install scipy bitsandbytes datasets accelerate loralib triton peft # Install Rust RUN curl https://sh.rustup.rs -sSf | sh -s -- -y diff --git a/docker/run.sh b/docker/run.sh index 666c8e1121..cf105a10c8 100755 --- a/docker/run.sh +++ b/docker/run.sh @@ -58,7 +58,7 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the fi fi # Check that CUDA version is supported - if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then + if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}" exit 1 fi diff --git a/include/flexflow/batch_config.h b/include/flexflow/batch_config.h index 009d1c250a..873fed0bdb 100644 --- a/include/flexflow/batch_config.h +++ b/include/flexflow/batch_config.h @@ -16,6 +16,7 @@ #pragma once #include "flexflow/ffconst.h" +#include "flexflow/fftype.h" #include "legion.h" #include #include @@ -36,6 +37,18 @@ using BeamSearchBatchConfigFuture = Legion::Future; using TreeVerifyBatchConfigFuture = Legion::Future; using BeamInferenceResultFuture = Legion::Future; +struct OptimizerTasks { + bool compute_gradients = true; + bool reset_gradients_to_zero = false; + bool update_weights = false; + bool save_updated_weights = false; +}; + +void set_optimizer_tasks(OptimizerTasks &tasks, + int max_training_steps, + int completed_training_steps, + int gradient_accumulation_steps); + class BatchConfig { public: using RequestGuid = size_t; @@ -43,6 +56,8 @@ class BatchConfig { BatchConfig(); int num_active_requests() const; int num_active_tokens() const; + int num_active_infr_tokens() const; + int num_active_peft_tokens() const; 
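// [Editor's note -- illustrative sketch, not part of the patch] The hunk above only
// declares set_optimizer_tasks(); its definition is not included in this diff. Based
// solely on the OptimizerTasks field names and the three step parameters, a plausible
// implementation could look like the following. The exact conditions are assumptions,
// not the committed code.
void set_optimizer_tasks(OptimizerTasks &tasks,
                         int max_training_steps,
                         int completed_training_steps,
                         int gradient_accumulation_steps) {
  // Gradients are computed on every finetuning step.
  tasks.compute_gradients = true;
  // Clear accumulated gradients at the start of each accumulation window.
  tasks.reset_gradients_to_zero =
      (completed_training_steps % gradient_accumulation_steps == 0);
  // Apply the optimizer update at the end of an accumulation window.
  tasks.update_weights =
      ((completed_training_steps + 1) % gradient_accumulation_steps == 0);
  // Persist the finetuned weights only after the last training step.
  tasks.save_updated_weights =
      (completed_training_steps == max_training_steps - 1);
}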
static int max_requests_per_batch(); static int max_tokens_per_batch(); static int max_verify_tokens_per_batch(); @@ -56,26 +71,43 @@ class BatchConfig { // Maximum possible values for different parameters // These maximum values are used for copying BatchConfig // across workers - static int const MAX_NUM_REQUESTS = 64; + static int const MAX_NUM_REQUESTS = 65; static int const MAX_NUM_TOKENS = 1024; static int const MAX_SPEC_TREE_TOKEN_NUM = 64; // Set by update - int num_tokens; + + int num_tokens = 0, num_peft_tokens = 0, num_peft_label_tokens = 0; // number of tokens in prompt phase, start offset of tokens in inc_decoding // phase. num_tokens - num_prompt_tokens = num_generation_tokens; - int num_generation_tokens; + int num_generation_tokens = 0; struct PerRequestInfo { + PerRequestInfo() { + first_token_depth_in_request = 0; + first_token_offset_in_batch = 0; + num_tokens_in_batch = 0; + max_sequence_length = 0; + request_guid = 0; + prompt_phase = false; + batch_config_request_id = -1; + peft_model_id = PEFTModelID::NO_ID; + peft_bwd = false; + optimizer_tasks = {true, false, false, false}; + } int first_token_depth_in_request; int first_token_offset_in_batch; int num_tokens_in_batch; int max_sequence_length; // request id in batch config: - int batch_config_request_id; + int batch_config_request_id = -1; bool prompt_phase = false; RequestGuid request_guid; + // PEFT fields + PEFTModelID peft_model_id; + bool peft_bwd; + OptimizerTasks optimizer_tasks; }; struct PerTokenInfo { int abs_depth_in_request; @@ -102,6 +134,7 @@ class BatchConfig { BitMask causalMask[MAX_NUM_REQUESTS]; PerRequestInfo requestsInfo[MAX_NUM_REQUESTS]; PerTokenInfo tokensInfo[MAX_NUM_TOKENS]; + PerTokenInfo labelsInfo[MAX_NUM_TOKENS]; bool request_completed[MAX_NUM_REQUESTS]; bool request_running[MAX_NUM_REQUESTS]; @@ -129,6 +162,7 @@ class TreeVerifyBatchConfig : public BatchConfig { struct InferenceResult { static int const MAX_NUM_TOKENS = BatchConfig::MAX_NUM_TOKENS; BatchConfig::TokenId token_ids[MAX_NUM_TOKENS]; + float finetuning_loss; }; class BeamSearchBatchConfig : public BatchConfig { diff --git a/include/flexflow/config.h b/include/flexflow/config.h index 2c11ae1131..dd9d657117 100644 --- a/include/flexflow/config.h +++ b/include/flexflow/config.h @@ -65,6 +65,25 @@ constexpr ParameterSyncType CHOSEN_SYNC_TYPE = ParameterSyncType::PS; #endif class FFConfig; +class MemoryAllocator; +class PEFTWeightAllocator; + +struct CombinedBatchConfigMetaStruct { + BatchConfig::PerTokenInfo tokens_info[BatchConfig::MAX_NUM_TOKENS]; + BatchConfig::PerRequestInfo requestsInfo[BatchConfig::MAX_NUM_REQUESTS]; + BatchConfig::BitMask causalMask[BatchConfig::MAX_NUM_REQUESTS]; + bool request_completed[BatchConfig::MAX_NUM_REQUESTS]; + + BeamSearchBatchConfig::BeamSearchPerTokenInfo + beamTokenInfo[BeamSearchBatchConfig::MAX_NUM_TOKENS + + BeamSearchBatchConfig::MAX_SPEC_TREE_TOKEN_NUM * + BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + BeamSearchBatchConfig::BeamSearchPerRequestInfo + beamRequestsInfo[BeamSearchBatchConfig::MAX_NUM_REQUESTS]; + + TreeVerifyBatchConfig::CommittedTokensInfo + committed_tokens[TreeVerifyBatchConfig::MAX_NUM_TOKENS]; +}; struct FFHandler { #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) @@ -76,18 +95,18 @@ struct FFHandler { #endif void *workSpace; size_t workSpaceSize; - void *batch_config_metadata; + CombinedBatchConfigMetaStruct *batch_config_metadata; // request info + token info + topolopgy mask info - size_t batch_config_metadata_size = - sizeof(BatchConfig::tokensInfo) + 
sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens) + - sizeof(BatchConfig::request_completed); + size_t batch_config_metadata_size = sizeof(CombinedBatchConfigMetaStruct); void *offload_reserve_space; size_t offload_reserve_space_size; + // PEFT related fields + MemoryAllocator *peft_activation_allocator; + size_t peft_activation_reserve_space_size; + PEFTWeightAllocator *peft_weight_allocator; + size_t peft_weight_reserve_space_size; + // Quantization fields DataType quantization_type; bool allowTensorOpMathConversion; #ifdef FF_USE_NCCL @@ -98,6 +117,8 @@ struct FFHandler { struct FFInitInfo { size_t workSpaceSize; size_t offload_reserve_space_size; + size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; DataType quantization_type; bool allowTensorOpMathConversion; // int myRank, allRanks; @@ -155,6 +176,10 @@ class FFConfig { bool cpu_offload; size_t offload_reserve_space_size; DataType quantization_type; + // PEFT related fields + bool enable_peft; + size_t peft_activation_reserve_space_size; + size_t peft_weight_reserve_space_size; // Control parallelizable dimensions bool only_data_parallel; bool enable_sample_parallel; diff --git a/include/flexflow/ffconst.h b/include/flexflow/ffconst.h index 512645e624..24b722c36f 100644 --- a/include/flexflow/ffconst.h +++ b/include/flexflow/ffconst.h @@ -46,6 +46,12 @@ enum LossType { LOSS_IDENTITY = 54, }; +enum OptimizerType { + OPTIMIZER_TYPE_NONE = 60, + OPTIMIZER_TYPE_SGD = 61, + OPTIMIZER_TYPE_ADAM = 62, +}; + enum CompMode { COMP_MODE_TRAINING = 70, COMP_MODE_INFERENCE = 71, @@ -72,6 +78,11 @@ enum InferenceMode { TREE_VERIFY_MODE = 2003, }; +enum RequestType { + REQ_INFERENCE = 4001, + REQ_FINETUNING = 4002, +}; + // This is consistent with TASO's OpType // https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138 enum OperatorType { @@ -172,6 +183,8 @@ enum OperatorType { OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION, OP_TREE_INC_MULTIHEAD_SELF_ATTENTION, OP_SAMPLING, + // PEFT Ops + OP_LORA, // Parallel Ops OP_REPARTITION, OP_COMBINE, @@ -179,6 +192,7 @@ enum OperatorType { OP_REDUCTION, OP_PIPELINE, OP_ALLREDUCE, + OP_PARALLEL_IDENTITY, OP_FUSED_PARALLEL, OP_INVALID, }; @@ -193,36 +207,37 @@ enum ModelType { }; enum PMParameter { - PM_OP_TYPE, // AnyOp - PM_NUM_INPUTS, // AnyOp - PM_NUM_OUTPUTS, // AnyOp - PM_GROUP, // Conv2D - PM_KERNEL_H, // Conv2D, Pool2D - PM_KERNEL_W, // Conv2D, Pool2D - PM_STRIDE_H, // Conv2D, Pool2D - PM_STRIDE_W, // Conv2D, Pool2D - PM_PADDING_H, // Conv2D, Pool2D - PM_PADDING_W, // Conv2D, Pool2D - PM_ACTI, // Conv2D, Pool2D - PM_NUMDIM, // Concat, Transpose - PM_AXIS, // Concat, Split - PM_PERM, // Transpose - PM_OUTSHUFFLE, // Transpose - PM_MERGE_GCONV_COUNT, // MergeGConv - PM_AXES, // Squeeze, Unsqueeze, Reduce* - PM_KEEP_DIMS, // Reduce* - PM_EPSILON, // BatchNorm - PM_REPARTITION_DIM, // Repartition - PM_REPARTITION_DEGREE, // Repartition - PM_REPLICATE_DIM, // Replicate - PM_REPLICATE_DEGREE, // Replicate - PM_COMBINE_DIM, // Combine - PM_COMBINE_DEGREE, // Combine - PM_REDUCTION_DIM, // Reduction - PM_REDUCTION_DEGREE, // Reduction - PM_ALLREDUCE_DIM, // AllReduce - PM_SOFTMAX_DIM, // Softmax - PM_NUM_HEADS, // MultiHeadAttention + PM_OP_TYPE, // AnyOp + PM_NUM_INPUTS, // AnyOp + PM_NUM_OUTPUTS, // AnyOp + PM_GROUP, // Conv2D + PM_KERNEL_H, // Conv2D, Pool2D + PM_KERNEL_W, // Conv2D, Pool2D + 
PM_STRIDE_H, // Conv2D, Pool2D + PM_STRIDE_W, // Conv2D, Pool2D + PM_PADDING_H, // Conv2D, Pool2D + PM_PADDING_W, // Conv2D, Pool2D + PM_ACTI, // Conv2D, Pool2D + PM_NUMDIM, // Concat, Transpose + PM_AXIS, // Concat, Split + PM_PERM, // Transpose + PM_OUTSHUFFLE, // Transpose + PM_MERGE_GCONV_COUNT, // MergeGConv + PM_AXES, // Squeeze, Unsqueeze, Reduce* + PM_KEEP_DIMS, // Reduce* + PM_EPSILON, // BatchNorm + PM_REPARTITION_DIM, // Repartition + PM_REPARTITION_DEGREE, // Repartition + PM_REPLICATE_DIM, // Replicate + PM_REPLICATE_DEGREE, // Replicate + PM_COMBINE_DIM, // Combine + PM_COMBINE_DEGREE, // Combine + PM_REDUCTION_DIM, // Reduction + PM_REDUCTION_DEGREE, // Reduction + PM_ALLREDUCE_DIM, // AllReduce + PM_PARALLEL_IDENTITY_DIM, // AllReduce + PM_SOFTMAX_DIM, // Softmax + PM_NUM_HEADS, // MultiHeadAttention PM_INVALID, PM_PARALLEL_DIM, PM_PARALLEL_DEGREE, @@ -268,5 +283,7 @@ enum { TENSOR_GUID_LAST_VALID = 3999999, PARALLEL_TENSOR_GUID_FIRST_VALID = 4000000, NODE_GUID_FIRST_VALID = 5000000, + PEFT_MODEL_ID_FIRST_VALID = 6000000, + PEFT_MODEL_ID_LAST_VALID = 6999999 }; #endif // _FLEXFLOW_CONST_H_ diff --git a/include/flexflow/fftype.h b/include/flexflow/fftype.h index 1cd90fda26..3e482b8d67 100644 --- a/include/flexflow/fftype.h +++ b/include/flexflow/fftype.h @@ -3,6 +3,8 @@ #include "flexflow/ffconst.h" #include +#include +#include namespace FlexFlow { @@ -18,6 +20,29 @@ class LayerID { size_t id, transformer_layer_id, model_id; }; +class PEFTModelID { +public: + static const PEFTModelID NO_ID; + PEFTModelID(); + PEFTModelID(size_t id); + bool is_valid_id() const; + friend bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs); + friend std::ostream &operator<<(std::ostream &os, + PEFTModelID const &peft_model_id); + +public: + size_t id; +}; + }; // namespace FlexFlow +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::PEFTModelID const &n) const { + return n.id; + } +}; +} // namespace std + #endif // _FF_TYPE_H diff --git a/include/flexflow/flexflow_c.h b/include/flexflow/flexflow_c.h index 0b74b7fce4..52b4b3d362 100644 --- a/include/flexflow/flexflow_c.h +++ b/include/flexflow/flexflow_c.h @@ -55,6 +55,11 @@ FF_NEW_OPAQUE_TYPE(flexflow_inference_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_request_manager_t); FF_NEW_OPAQUE_TYPE(flexflow_file_data_loader_t); FF_NEW_OPAQUE_TYPE(flexflow_generation_result_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_optimizer_config_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_sgd_optimizer_config_t); +// FF_NEW_OPAQUE_TYPE(flexflow_lora_adam_optimizer_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_lora_linear_config_t); +FF_NEW_OPAQUE_TYPE(flexflow_peft_model_id_t); // ----------------------------------------------------------------------- // FFConfig @@ -270,6 +275,7 @@ flexflow_tensor_t * bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( @@ -281,6 +287,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name); flexflow_tensor_t @@ -565,6 +572,7 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name); flexflow_tensor_t flexflow_model_add_arg_top_k(flexflow_model_t handle_, @@ -590,6 +598,9 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, bool beam_search, char const *name); +flexflow_peft_model_id_t 
flexflow_model_add_lora_layer( + flexflow_model_t handle_, const flexflow_lora_linear_config_t peft_config_); + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle, flexflow_sgd_optimizer_t optimizer); @@ -613,11 +624,16 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle, int id); void flexflow_model_generate(flexflow_model_t handle_, int num_requests, - char const **input_text, - int max_num_chars, - char **output_text, - int max_seq_length, - int **output_length_and_tokens); + enum RequestType *request_types, + char const **input_texts, + char **output_texts, + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int *training_steps, + int **output_length_and_tokens, + int *num_finetuning_losses, + float *finetuning_losses); void flexflow_model_set_position_offset(flexflow_model_t handle, int offset); @@ -978,6 +994,9 @@ void flexflow_request_manager_set_max_spec_tree_token_num( void flexflow_request_manager_set_max_sequence_length( flexflow_request_manager_t handle_, int max_seq_length); +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_); + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -1036,6 +1055,113 @@ void flexflow_file_data_loader_destroy(flexflow_file_data_loader_t handle_); void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, flexflow_model_t model_handle_); +// // ----------------------------------------------------------------------- +// // LoraSGDOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_sgd_optimizer_config_t +// flexflow_lora_sgd_optimizer_config_create( +// double lr, double momentum, bool nesterov, bool weight_decay); + +// void flexflow_lora_sgd_optimizer_config_destroy( +// flexflow_lora_sgd_optimizer_config_t handle_); + +// // ----------------------------------------------------------------------- +// // LoraAdamOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_adam_optimizer_config_t +// flexflow_lora_adam_optimizer_config_create(double alpha, +// double beta1, +// double beta2, +// double weight_decay, +// double epsilon); + +// void flexflow_lora_adam_optimizer_config_destroy( +// flexflow_lora_adam_optimizer_config_t handle_); + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// ----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_, + bool trainable, + bool init_lora_weights, + char const *base_model_name_or_path, + char const *precision, + int rank, + float lora_alpha, + float lora_dropout, + int num_target_modules, + char const **target_modules_, + enum OptimizerType optimizer_type, + float sgd_learning_rate, + float sgd_momentum, + bool sgd_nesterov, + float sgd_weight_decay, + float adam_alpha, + float adam_beta1, + float adam_beta2, + float adam_weight_decay, + float adam_epsilon); + +void flexflow_lora_linear_config_destroy(flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_cache_folder( + flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_peft_model_id( + flexflow_lora_linear_config_t handle_); + +int 
flexflow_lora_linear_config_get_rank(flexflow_lora_linear_config_t handle_); + +float flexflow_lora_linear_config_get_lora_alpha( + flexflow_lora_linear_config_t handle_); + +float flexflow_lora_linear_config_get_lora_dropout( + flexflow_lora_linear_config_t handle_); + +bool flexflow_lora_linear_config_get_trainable( + flexflow_lora_linear_config_t handle_); + +bool flexflow_lora_linear_config_get_init_lora_weights( + flexflow_lora_linear_config_t handle_); + +char const **flexflow_lora_linear_config_get_target_modules( + flexflow_lora_linear_config_t handle_, int *num_target_modules); + +char const *flexflow_lora_linear_config_get_base_model_name_or_path( + flexflow_lora_linear_config_t handle_); + +char const *flexflow_lora_linear_config_get_precision( + flexflow_lora_linear_config_t handle_); + +void flexflow_lora_linear_config_set_lora_alpha( + flexflow_lora_linear_config_t handle_, float value); + +void flexflow_lora_linear_config_set_lora_dropout( + flexflow_lora_linear_config_t handle_, float value); + +void flexflow_lora_linear_config_set_trainable( + flexflow_lora_linear_config_t handle_, bool value); + +void flexflow_lora_linear_config_set_init_lora_weights( + flexflow_lora_linear_config_t handle_, bool value); + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create(); + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(unsigned long id); + +flexflow_peft_model_id_t flexflow_peft_model_id_no_id(); + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_); + #ifdef __cplusplus } #endif diff --git a/include/flexflow/inference.h b/include/flexflow/inference.h index f24a797ffd..ba4101c173 100644 --- a/include/flexflow/inference.h +++ b/include/flexflow/inference.h @@ -40,6 +40,7 @@ struct GenerationResult { std::string output_text; std::vector input_tokens; std::vector output_tokens; + std::vector finetuning_losses; }; #include diff --git a/include/flexflow/layer.h b/include/flexflow/layer.h index 69a57e4e1c..c3dbcac422 100644 --- a/include/flexflow/layer.h +++ b/include/flexflow/layer.h @@ -49,7 +49,7 @@ class Layer { Tensor outputs[MAX_NUM_OUTPUTS]; Tensor inputs[MAX_NUM_INPUTS]; Tensor weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + // bool trainable_inputs[MAX_NUM_INPUTS]; int numInputs, numWeights, numOutputs; bool profiling; bool inference_debugging; diff --git a/include/flexflow/model.h b/include/flexflow/model.h index 6dda67bbfe..4ad735ef7d 100644 --- a/include/flexflow/model.h +++ b/include/flexflow/model.h @@ -108,19 +108,31 @@ enum TaskIDs { LAYERNORM_FWD_TASK_ID, LAYERNORM_INF_TASK_ID, LAYERNORM_BWD_TASK_ID, + LAYERNORM_PEFT_BWD_TASK_ID, RESIDUAL_LAYERNORM_INIT_TASK_ID, RESIDUAL_LAYERNORM_INF_TASK_ID, + RESIDUAL_LAYERNORM_BWD_TASK_ID, + RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, ADD_BIAS_RESIDUAL_LAYERNORM_INF_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, SIGMOID_SILU_MULTI_INIT_TASK_ID, SIGMOID_SILU_MULTI_INF_TASK_ID, + SIGMOID_SILU_MULTI_BWD_TASK_ID, + SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, LINEAR_INIT_TASK_ID, LINEAR_INIT_PARA_TASK_ID, LINEAR_INF_TASK_ID, + LINEAR_PEFT_BWD_TASK_ID, LINEAR_FWD_TASK_ID, LINEAR_BWD_TASK_ID, LINEAR_BWD2_TASK_ID, LINEAR_UPD_TASK_ID, + LORA_LINEAR_INIT_TASK_ID, + LORA_LINEAR_REG_TASK_ID, + LORA_LINEAR_INF_TASK_ID, + 
LORA_LINEAR_PEFT_BWD_TASK_ID, FLAT_INIT_TASK_ID, FLAT_FWD_TASK_ID, FLAT_BWD_TASK_ID, @@ -128,6 +140,7 @@ enum TaskIDs { SOFTMAX_FWD_TASK_ID, SOFTMAX_BWD_TASK_ID, SOFTMAX_INF_TASK_ID, + SOFTMAX_PEFT_BWD_TASK_ID, CONCAT_INIT_TASK_ID, CONCAT_FWD_TASK_ID, CONCAT_BWD_TASK_ID, @@ -163,20 +176,26 @@ enum TaskIDs { RMSNORM_INIT_TASK_ID, RMSNORM_FWD_TASK_ID, RMSNORM_INF_TASK_ID, + RMSNORM_BWD_TASK_ID, + RMSNORM_PEFT_BWD_TASK_ID, RESIDUAL_RMSNORM_INIT_TASK_ID, RESIDUAL_RMSNORM_INF_TASK_ID, + RESIDUAL_RMSNORM_BWD_TASK_ID, + RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, BEAM_TOPK_INIT_TASK_ID, BEAM_TOPK_INF_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_FWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_BWD_TASK_ID, INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, SPEC_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INIT_TASK_ID, TREE_INC_MULTIHEAD_SELF_ATTENTION_INF_TASK_ID, MSELOSS_BWD_TASK_ID, FUSEDOP_INIT_TASK_ID, + FUSEDOP_PEFT_BWD_TASK_ID, FUSEDOP_FWD_TASK_ID, FUSEDOP_BWD_TASK_ID, FUSEDOP_INF_TASK_ID, @@ -224,10 +243,13 @@ enum TaskIDs { REPARTITION_BWD_TASK_ID, COMBINE_INIT_TASK_ID, COMBINE_FWD_TASK_ID, + COMBINE_INF_TASK_ID, COMBINE_BWD_TASK_ID, + COMBINE_PEFT_BWD_TASK_ID, REPLICATE_INIT_TASK_ID, REPLICATE_FWD_TASK_ID, REPLICATE_BWD_TASK_ID, + REPLICATE_PEFT_BWD_TASK_ID, REDUCTION_INIT_TASK_ID, REDUCTION_FWD_TASK_ID, REDUCTION_BWD_TASK_ID, @@ -235,9 +257,15 @@ enum TaskIDs { PIPELINE_FWD_TASK_ID, PIPELINE_BWD_TASK_ID, ALLREDUCE_INIT_TASK_ID, - ALLREDUCE_INF_TASK_ID, ALLREDUCE_FWD_TASK_ID, ALLREDUCE_BWD_TASK_ID, + ALLREDUCE_INF_TASK_ID, + ALLREDUCE_PEFT_BWD_TASK_ID, + PARALLEL_IDENTITY_INIT_TASK_ID, + PARALLEL_IDENTITY_FWD_TASK_ID, + PARALLEL_IDENTITY_BWD_TASK_ID, + PARALLEL_IDENTITY_INF_TASK_ID, + PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, FUSED_PARALLELOP_INIT_TASK_ID, FUSED_PARALLELOP_FWD_TASK_ID, FUSED_PARALLELOP_BWD_TASK_ID, @@ -327,6 +355,7 @@ class ResidualLayerNorm; class AddBiasResidualLayerNorm; class SigmoidSiluMulti; class Linear; +class LoraLinear; class MultiHeadAttention; class IncMultiHeadSelfAttention; class TreeIncMultiHeadSelfAttention; @@ -349,9 +378,12 @@ class Repartition; class Reduction; class Replicate; class AllReduce; +class ParallelIdentity; class FusedParallelOp; class ParallelOpInfo; +struct Request; + // TODO: Move to an appropriate place /* This is used to create a type that recursively replaces value type @@ -561,6 +593,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a add_bias_residual_layer_norm layer @@ -571,6 +604,7 @@ class FFModel { bool elementwise_affine, float eps, bool use_bias = true, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a sigmoid_silu_multi layer @@ -599,6 +633,7 @@ class FFModel { Tensor *outputs, float eps, int dim, + bool inplace_residual = false, DataType data_type = DT_NONE, char const *name = NULL); // Add a beam search top k layer @@ -808,10 +843,13 @@ class FFModel { bool position_bias = false, char const *name = NULL); // ======================================== + // PEFT Layers + // ======================================== + PEFTModelID *add_lora_layer(LoraLinearConfig const peft_config); + // ======================================== // Inference APIs // ======================================== - std::vector generate(std::vector 
&prompts, - int max_seq_length); + std::vector generate(std::vector const &requests); Tensor create_tensor_legion_ordering(int num_dim, int const dims[], @@ -1103,6 +1141,9 @@ class FFModel { Legion::IndexSpace get_task_is(Legion::Domain const &domain) const; Legion::IndexSpace get_task_is(ParallelConfig const &pc) const; Legion::IndexSpace get_task_is(MachineView const &view) const; + bool need_to_add_combine(int layer_idx) const; + bool need_to_add_allreduce(int layer_idx) const; + bool need_to_add_parallel_identity(int layer_idx) const; bool is_mlp_block(int layer_idx) const; void create_operators_from_layers(); Op *create_operator_from_layer(Layer *layer, @@ -1117,7 +1158,7 @@ class FFModel { void clear_graph_search_cache(); public: - size_t op_global_guid, layer_global_guid; + size_t op_global_guid, layer_global_guid, peft_model_global_guid; size_t tensor_global_guid, parallel_tensor_global_guid, node_global_guid; size_t current_transformer_layer_id; // positional embedding start offset @@ -1137,6 +1178,12 @@ class FFModel { std::vector layers; std::vector operators; std::vector parameters; + // PEFT related + std::unordered_map base_layer_to_peft_layer; + std::unordered_map> peft_layer_to_peft_id; + std::unordered_map peft_configs; + // std::vector peft_operators; + FFHandler handlers[MAX_NUM_WORKERS]; Legion::Future current_metrics; // Cached operators: key: operator hash, value: operator pointer @@ -1195,6 +1242,10 @@ class FFModel { SigmoidSiluMulti *>, std::unordered_map, Linear *>, + std::unordered_map< + std::pair, + LoraLinearParams>, + LoraLinear *>, std::unordered_map, Pool2D *>, std::unordered_map, std::unordered_map, AllReduce *>, + std::unordered_map, + ParallelIdentity *>, std::unordered_map, FusedParallelOp *>> cached_ops; diff --git a/include/flexflow/op_meta.h b/include/flexflow/op_meta.h index 60785a1e29..d31c12b16c 100644 --- a/include/flexflow/op_meta.h +++ b/include/flexflow/op_meta.h @@ -9,7 +9,7 @@ class Op; class OpMeta { public: - OpMeta(FFHandler _handle); + // OpMeta(FFHandler _handle); OpMeta(FFHandler _handle, Op const *op); public: @@ -17,9 +17,11 @@ class OpMeta { bool profiling; // Measure the run time of the task bool inference_debugging; int decoding_step; + int bwd_step; char op_name[MAX_OPNAME]; LayerID layer_guid; - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; DataType input_type[MAX_NUM_INPUTS]; DataType weight_type[MAX_NUM_WEIGHTS]; DataType output_type[MAX_NUM_OUTPUTS]; diff --git a/include/flexflow/operator.h b/include/flexflow/operator.h index 311699d926..1a5af67b36 100644 --- a/include/flexflow/operator.h +++ b/include/flexflow/operator.h @@ -7,7 +7,9 @@ #include "flexflow/machine_view.h" #include "flexflow/parallel_tensor.h" #include "flexflow/utils/dot/record_formatter.h" +#include #include +namespace fs = std::filesystem; #include #include @@ -29,6 +31,11 @@ enum class MappingRecordType { INPUT_OUTPUT, INPUT_WEIGHT }; enum class MappingOperation { PARTITION, REPLICATE }; +fs::path get_dst_folder(std::string const &subdir, + int step_idx = 0, + int shard_idx = 0, + bool before_kernel = false); + /** @brief A class to keep track of a dimension relation between two tensors * used by an operator. 
* @@ -236,11 +243,18 @@ class Op { Legion::FutureMap empty_map; return empty_map; }; + virtual Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) { + assert(false); + } virtual void print_layer(FFModel const &model) = 0; template static std::string get_op_name_without_uid(OpMetaType *m) { std::string op_name_without_uid = std::string(m->op_name); - size_t last_underscore = op_name_without_uid.length() - 1; + size_t last_underscore = op_name_without_uid.length(); for (int i = op_name_without_uid.length() - 1; i > 0; i--) { if (!(std::isdigit(m->op_name[i]) || m->op_name[i] == '_')) { break; @@ -248,7 +262,9 @@ class Op { last_underscore = i; } } - op_name_without_uid.erase(last_underscore); + if (last_underscore < op_name_without_uid.length()) { + op_name_without_uid.erase(last_underscore); + } return op_name_without_uid; } template @@ -259,31 +275,42 @@ class Op { std::vector input_tensors, std::vector weight_tensors, std::vector output_tensors, + bool fwd_pass = true, bool before_kernel = false) { - // Check if output directory exists, and create it if it does not - char const *folder_path = "./inference_tensors"; - struct stat st = {0}; - if (stat(folder_path, &st) == -1) { - // Directory does not exist, create it - mkdir(folder_path, 0700); - } - // output base filepath, shared by all tensors from the same operator + // get operator name and print it std::string op_name_without_uid = get_op_name_without_uid(m); - std::string base_filepath = - "./inference_tensors/model_" + std::to_string(m->layer_guid.model_id) + - "_decoding-step_" + std::to_string(m->decoding_step) + "_layer-num_" + - std::to_string(m->layer_guid.transformer_layer_id) + "_layer-name_" + - op_name_without_uid + "_shard-id_" + std::to_string(shard_id); - if (before_kernel) { - base_filepath += "_pre"; + std::cout << (fwd_pass ? "INF " : "BWD ") << op_name_without_uid + << std::endl; + // build the path to save the tensor + fs::path dst_filepath; + if (fwd_pass) { + dst_filepath = + get_dst_folder("fwd", m->decoding_step, shard_id, before_kernel); + } else { + dst_filepath = + get_dst_folder("bwd", m->bwd_step, shard_id, before_kernel); + } + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + // save batch config, if passed if (bc != nullptr) { - bc->save_to_file(base_filepath + "_batch-config"); + bc->save_to_file(dst_filepath.string() + ".batch_config"); } + // save all inputs for (int i = 0; i < input_tensors.size(); i++) { - std::string filename = base_filepath + "_input_" + std::to_string(i); + std::string filename = dst_filepath.string() + ".input_"; + if (fwd_pass) { + filename += std::to_string(i); + } else { + filename += "gradient_" + std::to_string(i); + } if (input_tensors[i].data_type == DT_FLOAT) { save_tensor(input_tensors[i].get_float_ptr(), input_tensors[i].domain.get_volume(), @@ -304,10 +331,17 @@ class Op { assert(false && "Tensor data type not supported"); } } - // only dump the weights once - if (m->decoding_step == 0) { + + // only dump the weights in the forward pass, at the first step + // note that we do not save the weight gradients, since we only support + // finetuning LoRA weights, which are not FF tensors. 
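// [Editor's note] Summary of the dump layout built by the code above: debugging
// tensors are no longer written to a flat "./inference_tensors" folder. Each tensor
// now goes under a per-pass folder returned by get_dst_folder("fwd" | "bwd" | "weights",
// step, shard_id), with one file per tensor named
//   layers.<transformer_layer_id>.<op_name>.input_<i> / .output_<i>                       (forward)
//   layers.<transformer_layer_id>.<op_name>.input_gradient_<i> / .output_gradient_<i>     (backward)
// plus an optional ".batch_config" file when a BatchConfig is passed. The directory
// structure produced inside get_dst_folder() itself is defined elsewhere and is not
// shown in this diff.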
+ if (fwd_pass && m->decoding_step == 0) { + fs::path dst_filepath_weights = + get_dst_folder("weights", m->decoding_step, shard_id, before_kernel) / + layername; for (int i = 0; i < weight_tensors.size(); i++) { - std::string filename = base_filepath + "_weight_" + std::to_string(i); + std::string filename = + dst_filepath_weights.string() + ".weight_" + std::to_string(i); if (weight_tensors[i].data_type == DT_FLOAT) { save_tensor(weight_tensors[i].get_float_ptr(), weight_tensors[i].domain.get_volume(), @@ -329,9 +363,15 @@ class Op { } } } + // save all outputs for (int i = 0; i < output_tensors.size(); i++) { - std::string filename = base_filepath + "_output_" + std::to_string(i); + std::string filename = dst_filepath.string() + ".output_"; + if (fwd_pass) { + filename += std::to_string(i); + } else { + filename += "gradient_" + std::to_string(i); + } if (output_tensors[i].data_type == DT_FLOAT) { save_tensor(output_tensors[i].get_float_ptr(), output_tensors[i].domain.get_volume(), @@ -354,7 +394,11 @@ class Op { } // increase count of decoding steps if (!before_kernel) { - m->decoding_step++; + if (fwd_pass) { + m->decoding_step++; + } else { + m->bwd_step++; + } } } virtual bool measure_operator_cost(Simulator *sim, @@ -448,7 +492,8 @@ class Op { ParallelTensor outputs[MAX_NUM_OUTPUTS]; ParallelTensor inputs[MAX_NUM_INPUTS]; ParallelParameter weights[MAX_NUM_WEIGHTS]; - bool trainableInputs[MAX_NUM_INPUTS]; + bool trainable_inputs[MAX_NUM_INPUTS]; + bool reset_input_grads[MAX_NUM_INPUTS]; OpMeta *meta[MAX_NUM_WORKERS]; std::map inference_meta; int numInputs, numWeights, numOutputs; diff --git a/include/flexflow/operator_params.h b/include/flexflow/operator_params.h index 5b187839ef..673f78ad46 100644 --- a/include/flexflow/operator_params.h +++ b/include/flexflow/operator_params.h @@ -23,6 +23,7 @@ #include "flexflow/ops/inc_multihead_self_attention_params.h" #include "flexflow/ops/layer_norm_params.h" #include "flexflow/ops/linear_params.h" +#include "flexflow/ops/lora_linear_params.h" #include "flexflow/ops/pool_2d_params.h" #include "flexflow/ops/reduce_params.h" #include "flexflow/ops/reshape_params.h" @@ -40,6 +41,7 @@ #include "flexflow/parallel_ops/allreduce_params.h" #include "flexflow/parallel_ops/combine_params.h" #include "flexflow/parallel_ops/fused_parallel_op_params.h" +#include "flexflow/parallel_ops/parallel_identity_params.h" #include "flexflow/parallel_ops/partition_params.h" #include "flexflow/parallel_ops/reduction_params.h" #include "flexflow/parallel_ops/replicate_params.h" @@ -67,6 +69,7 @@ using OperatorParameters = mp::variant; tl::optional get_op_parameters(Op const *op); diff --git a/include/flexflow/ops/add_bias_residual_layer_norm.h b/include/flexflow/ops/add_bias_residual_layer_norm.h index bb470376c3..9510ac0f28 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm.h @@ -24,8 +24,10 @@ class AddBiasResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -38,6 +40,11 @@ class AddBiasResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = 
nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -61,6 +68,14 @@ class AddBiasResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -76,21 +91,55 @@ class AddBiasResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + static void inference_kernel_wrapper(AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + template + static void backward_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + ffStream_t stream); + static void + backward_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + template + static void peft_bwd_kernel(AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + static void + peft_bwd_kernel_wrapper(AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -105,8 +154,12 @@ class AddBiasResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/add_bias_residual_layer_norm_params.h b/include/flexflow/ops/add_bias_residual_layer_norm_params.h index 87fe2fb562..840f521b01 100644 --- a/include/flexflow/ops/add_bias_residual_layer_norm_params.h +++ b/include/flexflow/ops/add_bias_residual_layer_norm_params.h @@ -12,6 +12,7 @@ struct AddBiasResidualLayerNormParams { bool elementwise_affine; float eps; bool use_bias; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &) const; diff --git 
a/include/flexflow/ops/aggregate.h b/include/flexflow/ops/aggregate.h index 3ba4f414d1..283e9a4290 100644 --- a/include/flexflow/ops/aggregate.h +++ b/include/flexflow/ops/aggregate.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_MAX_BATCH_SIZE 64 #define AGGREGATE_MAX_N 128 +class Aggregate; + class AggregateMeta : public OpMeta { public: - AggregateMeta(FFHandler handle, int n); + AggregateMeta(FFHandler handle, Aggregate const *aggr); ~AggregateMeta(void); float **dev_exp_preds; float **dev_exp_grads; diff --git a/include/flexflow/ops/aggregate_spec.h b/include/flexflow/ops/aggregate_spec.h index 4302dd0733..a9f651b620 100644 --- a/include/flexflow/ops/aggregate_spec.h +++ b/include/flexflow/ops/aggregate_spec.h @@ -11,9 +11,11 @@ namespace FlexFlow { #define AGGREGATE_SPEC_MAX_BATCH_SIZE 32 #define AGGREGATE_SPEC_MAX_N 12 +class AggregateSpec; + class AggregateSpecMeta : public OpMeta { public: - AggregateSpecMeta(FFHandler handle, int n); + AggregateSpecMeta(FFHandler handle, AggregateSpec const *agg); ~AggregateSpecMeta(void); float **dev_region_ptrs; }; diff --git a/include/flexflow/ops/argmax.h b/include/flexflow/ops/argmax.h index 298059e3ed..eca9943d20 100644 --- a/include/flexflow/ops/argmax.h +++ b/include/flexflow/ops/argmax.h @@ -17,6 +17,7 @@ class ArgMaxMeta : public OpMeta { size_t temp_storage_bytes = 0; int *d_offsets; void *d_out; + float *d_loss; Realm::RegionInstance reserveInst; ArgMaxMeta(FFHandler handler, Op const *op, @@ -89,18 +90,22 @@ class ArgMax : public Op { CostMetrics &cost_metrics) const override; template static void forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent_ptr, int length, int batch_size, + float *loss, ffStream_t stream); static void forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size); + int batch_size, + float *loss); Params get_params() const; public: diff --git a/include/flexflow/ops/cache.h b/include/flexflow/ops/cache.h index 1fbb1fa059..4f0b94ee5c 100644 --- a/include/flexflow/ops/cache.h +++ b/include/flexflow/ops/cache.h @@ -5,9 +5,11 @@ namespace FlexFlow { +class Cache; + class CacheMeta : public OpMeta { public: - CacheMeta(FFHandler handle); + CacheMeta(FFHandler handle, Cache const *c); float cache_score; }; diff --git a/include/flexflow/ops/element_unary.h b/include/flexflow/ops/element_unary.h index ddef59549c..043b5d19a7 100644 --- a/include/flexflow/ops/element_unary.h +++ b/include/flexflow/ops/element_unary.h @@ -12,9 +12,11 @@ namespace FlexFlow { +class ElementUnary; + class ElementUnaryMeta : public OpMeta { public: - ElementUnaryMeta(FFHandler handle); + ElementUnaryMeta(FFHandler handle, ElementUnary const *unary); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, outputTensor; cudnnActivationDescriptor_t actiDesc; diff --git a/include/flexflow/ops/embedding.h b/include/flexflow/ops/embedding.h index ed89fcf37a..c90e1773e0 100644 --- a/include/flexflow/ops/embedding.h +++ b/include/flexflow/ops/embedding.h @@ -60,6 +60,11 @@ class Embedding : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = 
nullptr) override; // void update(const FFModel&); void print_layer(FFModel const &model) override { assert(0); diff --git a/include/flexflow/ops/experts.h b/include/flexflow/ops/experts.h index d68957d890..1ed4678a5b 100644 --- a/include/flexflow/ops/experts.h +++ b/include/flexflow/ops/experts.h @@ -6,20 +6,11 @@ namespace FlexFlow { +class Experts; + class ExpertsMeta : public OpMeta { public: - ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation); + ExpertsMeta(FFHandler handler, Experts const *e); ~ExpertsMeta(void); // Thrust helper arrays @@ -138,7 +129,7 @@ class Experts : public Op { float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim); diff --git a/include/flexflow/ops/fused.h b/include/flexflow/ops/fused.h index a8326e9ab4..02ab1db7b5 100644 --- a/include/flexflow/ops/fused.h +++ b/include/flexflow/ops/fused.h @@ -49,6 +49,11 @@ class FusedOp : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -60,6 +65,10 @@ class FusedOp : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/groupby.h b/include/flexflow/ops/groupby.h index ec6cdfb9ab..73025216cd 100644 --- a/include/flexflow/ops/groupby.h +++ b/include/flexflow/ops/groupby.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Group_by; + class GroupByMeta : public OpMeta { public: - GroupByMeta(FFHandler handle, int n, float _alpha); + GroupByMeta(FFHandler handle, Group_by const *gb); ~GroupByMeta(void); float alpha; float **dev_region_ptrs; diff --git a/include/flexflow/ops/inc_multihead_self_attention.h b/include/flexflow/ops/inc_multihead_self_attention.h index 43dc527bc8..f77df7c456 100644 --- a/include/flexflow/ops/inc_multihead_self_attention.h +++ b/include/flexflow/ops/inc_multihead_self_attention.h @@ -96,6 +96,11 @@ class IncMultiHeadSelfAttention : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -109,17 +114,27 @@ class IncMultiHeadSelfAttention : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const override; - - static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta const *m, + static void inference_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int 
shard_id, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output, GenericTensorAccessorR const &bias); + static void peft_bwd_kernel_wrapper(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias); Params get_params() const; public: @@ -204,6 +219,10 @@ class IncMultiHeadSelfAttentionMeta : public OpMeta { // typedef hipFloatComplex attFloatComplex; hipFloatComplex *complex_input; #endif + // PEFT specific fields + void *softmax_activation_buffer; + void *query_activation_buffer; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/batch_matmul_kernels.h b/include/flexflow/ops/kernels/batch_matmul_kernels.h index 4de774ee06..c3923c4d4b 100644 --- a/include/flexflow/ops/kernels/batch_matmul_kernels.h +++ b/include/flexflow/ops/kernels/batch_matmul_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class BatchMatmul; + class BatchMatmulMeta : public OpMeta { public: - BatchMatmulMeta(FFHandler handler); + BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm); int a_seq_length_dim, b_seq_length_dim; }; diff --git a/include/flexflow/ops/kernels/cast_kernels.h b/include/flexflow/ops/kernels/cast_kernels.h index 3001d913ca..d601601ea2 100644 --- a/include/flexflow/ops/kernels/cast_kernels.h +++ b/include/flexflow/ops/kernels/cast_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Cast; + class CastMeta : public OpMeta { public: - CastMeta(FFHandler handle); + CastMeta(FFHandler handle, Cast const *cast); DataType input_data_type, output_data_type; }; diff --git a/include/flexflow/ops/kernels/concat_kernels.h b/include/flexflow/ops/kernels/concat_kernels.h index 4da6aaf5e2..4562ae871a 100644 --- a/include/flexflow/ops/kernels/concat_kernels.h +++ b/include/flexflow/ops/kernels/concat_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Concat; + class ConcatMeta : public OpMeta { public: - ConcatMeta(FFHandler handle) : OpMeta(handle){}; + ConcatMeta(FFHandler handle, Concat const *cc); int legion_axis; }; diff --git a/include/flexflow/ops/kernels/conv_2d_kernels.h b/include/flexflow/ops/kernels/conv_2d_kernels.h index 7b2a0fe135..f83e4687d7 100644 --- a/include/flexflow/ops/kernels/conv_2d_kernels.h +++ b/include/flexflow/ops/kernels/conv_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Conv2D; + class Conv2DMeta : public OpMeta { public: - Conv2DMeta(FFHandler handler); + Conv2DMeta(FFHandler handler, Conv2D const *conv); #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) cudnnTensorDescriptor_t inputTensor, biasTensor, outputTensor; cudnnFilterDescriptor_t filterDesc; diff --git a/include/flexflow/ops/kernels/flat_kernels.h b/include/flexflow/ops/kernels/flat_kernels.h index caf817512d..6aa5a13b42 100644 --- a/include/flexflow/ops/kernels/flat_kernels.h +++ b/include/flexflow/ops/kernels/flat_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Flat; + class FlatMeta : public OpMeta { public: - FlatMeta(FFHandler handle) : OpMeta(handle){}; + FlatMeta(FFHandler handle, Flat const *flat); }; namespace Kernels { diff --git a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh index d1e0e050b2..3d122d4bc5 100644 --- 
a/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh +++ b/include/flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh @@ -385,6 +385,25 @@ inline __device__ void zero(T &dst) { dst = tmp.raw; } +template +__device__ __forceinline__ T WARP_SHFL(unsigned mask, T var, int srcLane, int width=warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width=warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + + template inline __device__ float qk_dot_(K_vec const (&q)[N], K_vec const (&k)[N]) { // use float32 to get better accuracy @@ -401,7 +420,7 @@ inline __device__ float qk_dot_(K_vec const (&q)[N], K_vec const (&k)[N]) { float qk = sum(qk_vec); #pragma unroll for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { - qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + qk += WARP_SHFL_XOR(uint32_t(-1), qk, mask); } return qk; } @@ -423,7 +442,7 @@ inline __device__ float block_sum(float *red_smem, float sum) { // Compute the sum per warp. #pragma unroll for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + sum += WARP_SHFL_XOR(uint32_t(-1), sum, mask); } // Warp leaders store the data to shared memory. @@ -442,11 +461,11 @@ inline __device__ float block_sum(float *red_smem, float sum) { // Parallel reduction inside the warp. #pragma unroll for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + sum += WARP_SHFL_XOR(uint32_t(-1), sum, mask); } // Broadcast to other threads. 
- return __shfl_sync(uint32_t(-1), sum, 0); + return WARP_SHFL(uint32_t(-1), sum, 0); } template diff --git a/include/flexflow/ops/kernels/linear_kernels.h b/include/flexflow/ops/kernels/linear_kernels.h index a5fdc7c602..90e50a0c9a 100644 --- a/include/flexflow/ops/kernels/linear_kernels.h +++ b/include/flexflow/ops/kernels/linear_kernels.h @@ -35,6 +35,9 @@ class LinearMeta : public OpMeta { float kernel_reg_lambda; bool use_bias, add_bias_only_once; Realm::RegionInstance reserveInst; + // PEFT related fields + void *output_activation_buffer; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -48,6 +51,23 @@ void forward_kernel_wrapper(LinearMeta const *m, int in_dim, int out_dim, int batch_size); +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *filter_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size); +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens); void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -73,6 +93,16 @@ void forward_kernel(LinearMeta const *m, int batch_size, ffStream_t stream); template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream); +template void backward_kernel(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -85,6 +115,7 @@ void backward_kernel(LinearMeta const *m, int out_dim, int batch_size, ffStream_t stream); + template __global__ void build_one_ptr(DT *one_ptr, int batch_size); } // namespace Internal diff --git a/include/flexflow/ops/kernels/lora_linear_kernels.h b/include/flexflow/ops/kernels/lora_linear_kernels.h new file mode 100644 index 0000000000..5360b5f8ea --- /dev/null +++ b/include/flexflow/ops/kernels/lora_linear_kernels.h @@ -0,0 +1,77 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H + +#include "flexflow/accessor.h" +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/ops/lora_linear.h" + +namespace FlexFlow { + +struct LoraLinearWeight { + // weights + void *w0_ptr, *w1_ptr; + // gradients + void *w0_grad_ptr, *w1_grad_ptr; + // v values for SGD optimizer (when using momentum) + void *w0_v_values_ptr, *w1_v_values_ptr; + int in_dim, out_dim, rank, num_shards; +}; + +struct LoraLinearModelState { + LoraLinearWeight weights; + LoraOptimizerConfig const *optimizer_config; + float lora_alpha; + std::string cache_folder; + // Huggingface model ID (for download and/or upload) + std::string peft_model_id; +}; + +class LoraLinearMeta : public OpMeta { +public: + LoraLinearMeta(FFHandler handle, LoraLinear const *li); + ~LoraLinearMeta(void); + // PEFT related fields + void *low_rank_activation; + void *input_activation; + std::unordered_map model_state; + size_t allocated_peft_buffer_size1 = 0, allocated_peft_buffer_size2 = 0; +}; + +namespace Kernels { +namespace LoraLinear { +void init_kernel_wrapper(LoraLinearMeta *m, int seed); +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +namespace Internal { +template +void init_kernel(LoraLinearMeta *m, int seed, ffStream_t stream); +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream); +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream); +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow +#endif // _FLEXFLOW_OPS_KERNELS_LORA_LINEAR_KERNELS_H diff --git a/include/flexflow/ops/kernels/pool_2d_kernels.h b/include/flexflow/ops/kernels/pool_2d_kernels.h index 7f73a8295d..c5a954763e 100644 --- a/include/flexflow/ops/kernels/pool_2d_kernels.h +++ b/include/flexflow/ops/kernels/pool_2d_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Pool2D; + class Pool2DMeta : public OpMeta { public: - Pool2DMeta(FFHandler handle); + Pool2DMeta(FFHandler handle, Pool2D const *pool); ffTensorDescriptor_t inputTensor, outputTensor; ffActivationDescriptor_t actiDesc; ffPoolingDescriptor_t poolDesc; diff --git a/include/flexflow/ops/kernels/reshape_kernels.h b/include/flexflow/ops/kernels/reshape_kernels.h index e6c8c4d569..5b6fa5be19 100644 --- a/include/flexflow/ops/kernels/reshape_kernels.h +++ b/include/flexflow/ops/kernels/reshape_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Reshape; + class ReshapeMeta : public OpMeta { public: - ReshapeMeta(FFHandler handler); + ReshapeMeta(FFHandler handler, Reshape const *reshape); DataType data_type; }; @@ -44,4 +46,4 @@ void backward_kernel(T *input_grad_ptr, } // namespace Kernels } // namespace FlexFlow -#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H \ No newline at end of file +#endif // _FLEXFLOW_OPS_KERNELS_RESHAPE_KERNELS_H diff --git a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h index 0eef4ca72b..fd4e0ecf1d 100644 --- a/include/flexflow/ops/kernels/residual_rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/residual_rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RESIDUAL_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -31,13 +32,14 @@ class ResidualRMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; - float alpha; - float beta; - + bool inplace_residual; int in_dim; int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -48,6 +50,28 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &residual_output, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output); +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + 
GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight); } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/rms_norm_kernels.h b/include/flexflow/ops/kernels/rms_norm_kernels.h index 35c5aa69fa..475b6d94ed 100644 --- a/include/flexflow/ops/kernels/rms_norm_kernels.h +++ b/include/flexflow/ops/kernels/rms_norm_kernels.h @@ -2,6 +2,7 @@ #define _FLEXFLOW_OPS_KERNELS_RMSNORM_KERNELS_H #include "flexflow/accessor.h" +#include "flexflow/batch_config.h" #include "flexflow/device.h" #include "flexflow/fftype.h" #include "flexflow/op_meta.h" @@ -31,13 +32,13 @@ class RMSNormMeta : public OpMeta { void *rms_ptr; void *norm_ptr; - float alpha; - float beta; - int in_dim; int batch_size; int num_elements; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; namespace Kernels { @@ -46,6 +47,22 @@ void forward_kernel_wrapper(RMSNormMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorR const &weight, GenericTensorAccessorW const &output); +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output); +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad); +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight); } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/ops/kernels/softmax_kernels.h b/include/flexflow/ops/kernels/softmax_kernels.h index 8cfaf3c586..0b7f1090f6 100644 --- a/include/flexflow/ops/kernels/softmax_kernels.h +++ b/include/flexflow/ops/kernels/softmax_kernels.h @@ -23,20 +23,30 @@ class SoftmaxMeta : public OpMeta { bool profiling; bool inference_debugging; int dim; - DataType input_type, output_type; }; namespace Kernels { namespace Softmax { -template + void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr); -template + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements); + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad); + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); namespace Internal { template @@ -46,10 +56,28 @@ void 
forward_kernel(SoftmaxMeta const *m, ffStream_t stream); template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, ffStream_t stream); + +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + ffStream_t stream); + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + ffStream_t stream); + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/include/flexflow/ops/kernels/transpose_kernels.h b/include/flexflow/ops/kernels/transpose_kernels.h index 7ff6163b30..a2c8ff0483 100644 --- a/include/flexflow/ops/kernels/transpose_kernels.h +++ b/include/flexflow/ops/kernels/transpose_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Transpose; + class TransposeMeta : public OpMeta { public: - TransposeMeta(FFHandler handler) : OpMeta(handler){}; + TransposeMeta(FFHandler handler, Transpose const *transpose); int num_dim; int perm[MAX_TENSOR_DIM]; }; diff --git a/include/flexflow/ops/layer_norm.h b/include/flexflow/ops/layer_norm.h index 9e48d81190..b5e9538ea6 100644 --- a/include/flexflow/ops/layer_norm.h +++ b/include/flexflow/ops/layer_norm.h @@ -37,6 +37,11 @@ class LayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -67,6 +72,10 @@ class LayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -81,11 +90,6 @@ class LayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void forward_kernel_wrapper(LayerNormMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW &output, - GenericTensorAccessorR const &gamma, - GenericTensorAccessorR const &beta); template static void backward_kernel(LayerNormMeta const *m, T const *output_grad_ptr, @@ -96,13 +100,34 @@ class LayerNorm : public Op { T *beta_grad_ptr, ffStream_t stream); template + static void peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + ffStream_t stream); + + static void forward_kernel_wrapper(LayerNormMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); static void backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr); + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + static void inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW 
&output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta); + static void peft_bwd_kernel_wrapper(LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias; @@ -124,6 +149,9 @@ class LayerNormMeta : public OpMeta { float eps; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/linear.h b/include/flexflow/ops/linear.h index a32df80537..ed2fad580f 100644 --- a/include/flexflow/ops/linear.h +++ b/include/flexflow/ops/linear.h @@ -52,6 +52,11 @@ class Linear : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override; bool get_int_parameter(PMParameter, int *) const override; static Op * @@ -66,6 +71,10 @@ class Linear : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, diff --git a/include/flexflow/ops/lora_linear.h b/include/flexflow/ops/lora_linear.h new file mode 100644 index 0000000000..9e83c3f90e --- /dev/null +++ b/include/flexflow/ops/lora_linear.h @@ -0,0 +1,99 @@ +#ifndef _FLEXFLOW_LORA_LINEAR_FIRST_H +#define _FLEXFLOW_LORA_LINEAR_FIRST_H + +#include "flexflow/inference.h" +#include "flexflow/node.h" +#include "flexflow/operator.h" +#include "flexflow/ops/lora_linear_params.h" +#include "flexflow/utils/memory_allocator.h" + +namespace FlexFlow { + +class FFModel; +class Layer; + +class LoraLinear : public Op { +public: + using Params = LoraLinearParams; + using Input = std::pair; + + LoraLinear( + FFModel &model, + LayerID const &layer_guid, + OperatorType type, + ParallelTensor const input, + ParallelTensor const output, + std::unordered_map const &_peft_configs, + char const *name = nullptr); + LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, + ParallelTensor const output); + LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name = nullptr); + + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void print_layer(FFModel const &model) override; + void map_output_tensors(FFModel &model) override; + static Op * + create_operator_from_layer(FFModel &model, + Layer const *layer, + std::vector const &inputs); + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + 
Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + void serialize(Legion::Serializer &) const override; + static PCG::Node deserialize(FFModel &ff, + Legion::Deserializer &d, + ParallelTensor inputs[], + int num_inputs); + Op *materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const override; + // size_t get_params_hash() const override; + LoraLinearParams get_params() const; + + std::unordered_map peft_configs; +}; + +}; // namespace FlexFlow + +#endif // _FLEXLOW_LORA_LINEAR_FIRST_H diff --git a/include/flexflow/ops/lora_linear_params.h b/include/flexflow/ops/lora_linear_params.h new file mode 100644 index 0000000000..70539271f2 --- /dev/null +++ b/include/flexflow/ops/lora_linear_params.h @@ -0,0 +1,150 @@ +#ifndef _FLEXFLOW_LORA_LINEAR_PARAMS_H +#define _FLEXFLOW_LORA_LINEAR_PARAMS_H + +#include "flexflow/ffconst.h" +#include "flexflow/fftype.h" +#include "flexflow/inference.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_tensor.h" +#include +#include +#include +#include + +namespace FlexFlow { + +class LoraOptimizerConfig { +public: + LoraOptimizerConfig(); + virtual ~LoraOptimizerConfig() {} +}; + +class LoraSGDOptimizerConfig : public LoraOptimizerConfig { +public: + LoraSGDOptimizerConfig(); + LoraSGDOptimizerConfig(double lr_, + double momentum_ = 0.0f, + bool nesterov_ = false, + bool weight_decay_ = 0.0f); + friend std::ostream &operator<<(std::ostream &os, + LoraSGDOptimizerConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE( + LoraSGDOptimizerConfig, lr, momentum, nesterov, weight_decay) + +public: + double lr = 0.001f; + double momentum = 0.0f; + bool nesterov = false; + double weight_decay = 0.0f; +}; + +class LoraAdamOptimizerConfig : public LoraOptimizerConfig { +public: + LoraAdamOptimizerConfig(); + LoraAdamOptimizerConfig(double alpha_, + double beta1_ = 0.9f, + double beta2_ = 0.999f, + double weight_decay_ = 0.0f, + double epsilon_ = 1e-8); + friend std::ostream &operator<<(std::ostream &os, + LoraAdamOptimizerConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE( + LoraAdamOptimizerConfig, alpha, beta1, beta2, weight_decay, epsilon) + +public: + // Adam + double alpha = 0.001f; + double beta1 = 0.9f; + double beta2 = 0.999f; + double weight_decay = 0.0f; + double epsilon = 1e-8; +}; + +// Serialization helpers +template +void serialize_to_json_file(T const &obj, fs::path const &filepath); + +// Function to deserialize JSON from file and create object +template +std::unique_ptr deserialize_from_json_file(fs::path const &filepath); + +class LoraLinearConfig { +public: + static const LoraLinearConfig EmptyConfig; + LoraLinearConfig(std::string const &cache_folder_, + std::string const &peft_model_id_, + bool trainable_ = false, + LoraOptimizerConfig *optimizer_config_ = nullptr, + bool init_lora_weights_ = false, + std::string const &base_model_name_or_path_ = "", + std::string const 
&precision_ = "fp16", + int rank_ = 8, + float lora_alpha_ = 8.0f, + float lora_dropout_ = 0.0f, + std::vector const &target_modules_ = {}); + // constructor used to support std::unordered_map + LoraLinearConfig(); + friend bool operator==(LoraLinearConfig const &lhs, + LoraLinearConfig const &rhs); + friend std::ostream &operator<<(std::ostream &os, + LoraLinearConfig const &llc); + + NLOHMANN_DEFINE_TYPE_INTRUSIVE(LoraLinearConfig, + cache_folder, + peft_model_id, + rank, + lora_alpha, + lora_dropout, + target_modules, + trainable, + init_lora_weights, + base_model_name_or_path, + precision) + + std::string cache_folder; + // Huggingface model ID (for download and/or upload) + std::string peft_model_id; + // Lora parameters + int rank; + float lora_alpha; + float lora_dropout; + std::vector target_modules; + // Training parameters + // whether the weights are trainable (fine-tuning scenario) or not + // (inference-only). If set to true, allocate space for the gradients + bool trainable = false; + LoraOptimizerConfig *optimizer_config; + // whether to initialize weights randomly (instead of attempting to load them + // from file) + bool init_lora_weights; + // parameters only used to upload model after finetuning + std::string base_model_name_or_path; + std::string precision; +}; + +class LoraLinearParams { +public: + LayerID layer_guid; + OperatorType type; + std::unordered_map peft_configs; + char name[MAX_OPNAME]; + + bool is_valid(std::pair const + &input_shape) const; + friend bool operator==(LoraLinearParams const &lhs, + LoraLinearParams const &rhs); +}; + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::LoraLinearParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_LORA_LINEAR_PARAMS_H diff --git a/include/flexflow/ops/residual_layer_norm.h b/include/flexflow/ops/residual_layer_norm.h index 0e9be82125..33a8e8be51 100644 --- a/include/flexflow/ops/residual_layer_norm.h +++ b/include/flexflow/ops/residual_layer_norm.h @@ -26,8 +26,10 @@ class ResidualLayerNorm : public Op { bool _elementwise_affine, bool _use_bias, float _eps, + bool inplace_residual, bool allocate_weights, char const *name); + void map_output_tensors(FFModel &ff) override; void init(FFModel const &) override; void init_inference(FFModel const &, std::vector const &, @@ -40,6 +42,11 @@ class ResidualLayerNorm : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -65,6 +72,14 @@ class ResidualLayerNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -78,7 +93,8 @@ class ResidualLayerNorm : public Op { T const *gamma_ptr, T const *beta_ptr, ffStream_t stream); - static void inference_kernel_wrapper(ResidualLayerNormMeta const *m, + static void inference_kernel_wrapper(ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, 
GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -86,11 +102,30 @@ class ResidualLayerNorm : public Op { GenericTensorAccessorW &output, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta); + static void + backward_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad); + + static void + peft_bwd_kernel_wrapper(ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma); public: bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; std::vector axes; }; @@ -105,8 +140,12 @@ class ResidualLayerNormMeta : public OpMeta { bool elementwise_affine, use_bias, use_two_residuals; int64_t effective_batch_size, effective_num_elements; float eps; + bool inplace_residual; void *mean_ptr, *rstd_ptr, *ds_ptr, *db_ptr, *scale_ptr, *bias_ptr; Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow diff --git a/include/flexflow/ops/residual_layer_norm_params.h b/include/flexflow/ops/residual_layer_norm_params.h index 949ae0c799..166d4b2b4e 100644 --- a/include/flexflow/ops/residual_layer_norm_params.h +++ b/include/flexflow/ops/residual_layer_norm_params.h @@ -13,6 +13,7 @@ struct ResidualLayerNormParams { float eps; bool use_bias; bool use_two_residuals; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid(std::tuple const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void print_layer(FFModel const &model) override { assert(0); } @@ -74,6 +81,14 @@ class ResidualRMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -82,6 +97,7 @@ class ResidualRMSNorm : public Op { float eps; int effective_batch_size; int dim, data_dim; + bool inplace_residual; }; } // namespace FlexFlow #endif // _FLEXFLOW_RESIDUAL_RMS_NORM_H diff --git a/include/flexflow/ops/residual_rms_norm_params.h b/include/flexflow/ops/residual_rms_norm_params.h index a4e4de59ab..8b8f666dc1 100644 --- a/include/flexflow/ops/residual_rms_norm_params.h +++ b/include/flexflow/ops/residual_rms_norm_params.h @@ -11,6 +11,7 @@ struct ResidualRMSNormParams { LayerID layer_guid; float eps; int dim; + bool inplace_residual; char name[MAX_OPNAME]; bool is_valid( std::pair const &input) const; diff --git a/include/flexflow/ops/rms_norm.h b/include/flexflow/ops/rms_norm.h index 
1dc940ebd3..384404d8a0 100644 --- a/include/flexflow/ops/rms_norm.h +++ b/include/flexflow/ops/rms_norm.h @@ -34,6 +34,11 @@ class RMSNorm : public Op { void init(FFModel const &) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) override; void init_inference(FFModel const &, std::vector const &, std::vector const &, @@ -73,6 +78,14 @@ class RMSNorm : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/sigmoid_silu_multi.h b/include/flexflow/ops/sigmoid_silu_multi.h index 604438260a..ac60ff15dd 100644 --- a/include/flexflow/ops/sigmoid_silu_multi.h +++ b/include/flexflow/ops/sigmoid_silu_multi.h @@ -1,5 +1,6 @@ #pragma once +#include "flexflow/batch_config.h" #include "flexflow/inference.h" #include "flexflow/model.h" #include "flexflow/utils/memory_allocator.h" @@ -27,6 +28,11 @@ class SigmoidSiluMulti : public Op { MachineView const *mv = nullptr) override; void forward(FFModel const &) override; void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &, std::vector const &, @@ -55,6 +61,14 @@ class SigmoidSiluMulti : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; @@ -65,10 +79,24 @@ class SigmoidSiluMulti : public Op { T const *input2_ptr, T *output_ptr, ffStream_t stream); - static void inference_kernel_wrapper(SigmoidSiluMultiMeta const *m, + static void inference_kernel_wrapper(SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output); + static void + backward_kernel_wrapper(SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); + static void + peft_bwd_kernel_wrapper(SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad); }; class SigmoidSiluMultiMeta : public OpMeta { @@ -80,6 +108,9 @@ class SigmoidSiluMultiMeta : public OpMeta { public: Realm::RegionInstance reserveInst; + // PEFT related fields + void *input_activation; + size_t allocated_peft_buffer_size = 0; }; }; // namespace FlexFlow 
diff --git a/include/flexflow/ops/softmax.h b/include/flexflow/ops/softmax.h index 61094f7361..82aff53766 100644 --- a/include/flexflow/ops/softmax.h +++ b/include/flexflow/ops/softmax.h @@ -33,6 +33,11 @@ class Softmax : public Op { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; void print_layer(FFModel const &model) override { @@ -58,6 +63,10 @@ class Softmax : public Op { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/ops/topk.h b/include/flexflow/ops/topk.h index 47144bf6d7..4b67692032 100644 --- a/include/flexflow/ops/topk.h +++ b/include/flexflow/ops/topk.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class TopK; + class TopKMeta : public OpMeta { public: - TopKMeta(FFHandler handle); + TopKMeta(FFHandler handle, TopK const *topk); bool sorted; }; diff --git a/include/flexflow/ops/transpose.h b/include/flexflow/ops/transpose.h index 3e6fb575c0..bca0b83460 100644 --- a/include/flexflow/ops/transpose.h +++ b/include/flexflow/ops/transpose.h @@ -6,6 +6,8 @@ namespace FlexFlow { +class TransposeMeta; + class Transpose : public Op { public: using Params = TransposeParams; diff --git a/include/flexflow/ops/tree_inc_multihead_self_attention.h b/include/flexflow/ops/tree_inc_multihead_self_attention.h index 02df0c0137..168ad5f618 100644 --- a/include/flexflow/ops/tree_inc_multihead_self_attention.h +++ b/include/flexflow/ops/tree_inc_multihead_self_attention.h @@ -144,7 +144,7 @@ class TreeIncMultiHeadSelfAttentionMeta : public IncMultiHeadSelfAttentionMeta { ~TreeIncMultiHeadSelfAttentionMeta(void); public: - int num_active_tokens; + int num_active_infr_tokens; Realm::RegionInstance committed_token_reserve_inst; TreeVerifyBatchConfig::CommittedTokensInfo *committed_token_infos; bool *request_completed; diff --git a/include/flexflow/parallel_ops/allreduce.h b/include/flexflow/parallel_ops/allreduce.h index 045f9b36a0..7e0e4362e2 100644 --- a/include/flexflow/parallel_ops/allreduce.h +++ b/include/flexflow/parallel_ops/allreduce.h @@ -34,12 +34,17 @@ class AllReduce : public ParallelOp { std::vector const &, MachineView const *mv = nullptr) override; void forward(FFModel const &) override; + void backward(FFModel const &) override; Legion::FutureMap inference(FFModel const &, BatchConfigFuture const &bc, std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; - void backward(FFModel const &) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( std::vector ¶llel_ops) const override; @@ -47,10 +52,6 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); - static void inference_task(Legion::Task const *task, - std::vector const ®ions, - Legion::Context ctx, - Legion::Runtime *runtime); static void 
forward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, @@ -59,6 +60,14 @@ class AllReduce : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); bool measure_operator_cost(Simulator *sim, MachineView const &pc, CostMetrics &cost_metrics) const override; diff --git a/include/flexflow/parallel_ops/combine.h b/include/flexflow/parallel_ops/combine.h index 2e4fdb86a9..1db776f59d 100644 --- a/include/flexflow/parallel_ops/combine.h +++ b/include/flexflow/parallel_ops/combine.h @@ -40,6 +40,11 @@ class Combine : public ParallelOp { std::vector const &, std::vector const &, MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; void backward(FFModel const &) override; bool get_int_parameter(PMParameter, int *) const override; bool append_parallel_op_info( @@ -52,10 +57,18 @@ class Combine : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); template static void forward_task_with_type(Legion::Task const *task, diff --git a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h index bdf7aae501..a4ccbee8a5 100644 --- a/include/flexflow/parallel_ops/kernels/allreduce_kernels.h +++ b/include/flexflow/parallel_ops/kernels/allreduce_kernels.h @@ -17,11 +17,6 @@ class AllReduceMeta : public OpMeta { namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output); - void forward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); @@ -30,6 +25,15 @@ void backward_kernel_wrapper(AllReduceMeta const *m, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad); +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); } // namespace AllReduce } // namespace Kernels } // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/kernels/combine_kernels.h b/include/flexflow/parallel_ops/kernels/combine_kernels.h index 456013cd81..4b2227b178 100644 --- a/include/flexflow/parallel_ops/kernels/combine_kernels.h +++ b/include/flexflow/parallel_ops/kernels/combine_kernels.h @@ -8,9 +8,11 @@ namespace FlexFlow { +class Combine; + class CombineMeta : public OpMeta { public: - CombineMeta(FFHandler handle); + CombineMeta(FFHandler handle, Combine const 
*comb); DataType data_type; }; diff --git a/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h b/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h new file mode 100644 index 0000000000..fd6778a37f --- /dev/null +++ b/include/flexflow/parallel_ops/kernels/parallel_identity_kernels.h @@ -0,0 +1,41 @@ +#ifndef _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H +#define _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H + +#include "flexflow/batch_config.h" +#include "flexflow/device.h" +#include "flexflow/fftype.h" +#include "flexflow/op_meta.h" +#include "flexflow/parallel_ops/parallel_identity.h" + +namespace FlexFlow { + +class ParallelIdentityMeta : public OpMeta { +public: + ParallelIdentityMeta(FFHandler handle, ParallelIdentity const *reduct); +}; + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output); + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad); +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow + +#endif // _FLEXFLOW_OPS_KERNELS_PARALLEL_IDENTITY_KERNELS_H diff --git a/include/flexflow/parallel_ops/kernels/partition_kernels.h b/include/flexflow/parallel_ops/kernels/partition_kernels.h index 81b190603a..1e77090d11 100644 --- a/include/flexflow/parallel_ops/kernels/partition_kernels.h +++ b/include/flexflow/parallel_ops/kernels/partition_kernels.h @@ -7,9 +7,11 @@ namespace FlexFlow { +class Repartition; + class RepartitionMeta : public OpMeta { public: - RepartitionMeta(FFHandler handle); + RepartitionMeta(FFHandler handle, Repartition const *repart); DataType data_type; }; diff --git a/include/flexflow/parallel_ops/parallel_identity.h b/include/flexflow/parallel_ops/parallel_identity.h new file mode 100644 index 0000000000..b3ca789f08 --- /dev/null +++ b/include/flexflow/parallel_ops/parallel_identity.h @@ -0,0 +1,83 @@ +#ifndef _FLEXFLOW_PARALLEL_IDENTITY_H +#define _FLEXFLOW_PARALLEL_IDENTITY_H + +#include "flexflow/layer.h" +#include "flexflow/node.h" +#include "flexflow/op_meta.h" +#include "flexflow/operator.h" +#include "flexflow/parallel_ops/parallel_identity_params.h" +#include "parallel_op.h" + +namespace FlexFlow { + +class ParallelIdentity : public ParallelOp { +public: + using Params = ParallelIdentityParams; + using Input = ParallelTensor; + + ParallelIdentity(FFModel &model, + const ParallelTensor input, + int parallel_identity_legion_dim, + char const *name = NULL); + ParallelIdentity(FFModel &model, + Params const ¶ms, + Input const input, + char const *name = nullptr); + void create_input_partition(FFModel &model) override; + void create_input_partition_inference( + FFModel &model, + std::vector const &batch_inputs, + std::vector const &batch_outputs) override; + void init(FFModel const &) override; + void init_inference(FFModel const &, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + void forward(FFModel const &) override; + void backward(FFModel const &) override; + 
Legion::FutureMap inference(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; + bool get_int_parameter(PMParameter, int *) const override; + bool append_parallel_op_info( + std::vector ¶llel_ops) const override; + static OpMeta *init_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void forward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void backward_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void inference_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); + bool measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const override; + + Params get_params() const; + +public: + int parallel_identity_dim; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_PARALLEL_IDENTITY_H diff --git a/include/flexflow/parallel_ops/parallel_identity_params.h b/include/flexflow/parallel_ops/parallel_identity_params.h new file mode 100644 index 0000000000..6eeed662ec --- /dev/null +++ b/include/flexflow/parallel_ops/parallel_identity_params.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H +#define _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H + +namespace FlexFlow { + +struct ParallelIdentityParams { + int parallel_identity_legion_dim; + char name[MAX_OPNAME]; + bool is_valid(ParallelTensorShape const &) const; +}; +bool operator==(ParallelIdentityParams const &, ParallelIdentityParams const &); + +} // namespace FlexFlow + +namespace std { +template <> +struct hash { + size_t operator()(FlexFlow::ParallelIdentityParams const &) const; +}; +} // namespace std + +#endif // _FLEXFLOW_PARALLEL_IDENTITY_PARAMS_H diff --git a/include/flexflow/parallel_ops/parallel_op.h b/include/flexflow/parallel_ops/parallel_op.h index 0bf573996c..39324c2a51 100644 --- a/include/flexflow/parallel_ops/parallel_op.h +++ b/include/flexflow/parallel_ops/parallel_op.h @@ -41,7 +41,7 @@ class ParallelOp : public Op { public: Legion::LogicalPartition input_lp, output_grad_lp; std::unordered_map - inference_input_lps; + inference_input_lps, inference_output_grad_lps; }; }; // namespace FlexFlow diff --git a/include/flexflow/parallel_ops/replicate.h b/include/flexflow/parallel_ops/replicate.h index 65d69d8564..c27616634f 100644 --- a/include/flexflow/parallel_ops/replicate.h +++ b/include/flexflow/parallel_ops/replicate.h @@ -54,10 +54,19 @@ class Replicate : public ParallelOp { std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + Legion::FutureMap peft_bwd(FFModel const &, + BatchConfigFuture const &bc, + std::vector const &, + std::vector const &, + MachineView const *mv = nullptr) override; static void backward_task(Legion::Task const *task, std::vector const ®ions, Legion::Context ctx, Legion::Runtime *runtime); + static void peft_bwd_task(Legion::Task const *task, + std::vector const ®ions, + Legion::Context ctx, + Legion::Runtime *runtime); static void forward_kernel_wrapper(ReplicateMeta const *m, 
GenericTensorAccessorR const &input, GenericTensorAccessorW const &output, diff --git a/include/flexflow/request_manager.h b/include/flexflow/request_manager.h index a38a3b2671..f0fab957ee 100644 --- a/include/flexflow/request_manager.h +++ b/include/flexflow/request_manager.h @@ -39,6 +39,7 @@ class InferenceManager { Legion::FutureMap inference(FFModel *model, int index, BatchConfig const &bc); Legion::FutureMap inference(FFModel *model, int index, BatchConfigFuture const &bc); + void peft_bwd(FFModel *model, int index, BatchConfigFuture const &bc); void load_input_tokens_from_batch_config(FFModel *model, BatchConfigFuture const &bc, ParallelTensor const input, @@ -65,15 +66,34 @@ struct Request { FINISHING = 104, // finishing request, but not yet verified }; BatchConfig::RequestGuid guid; - int max_sequence_length; + PEFTModelID peft_model_id = PEFTModelID::NO_ID; + int max_sequence_length = 128; int initial_len; int ssm_cache_size = 0; int llm_cache_size = 0; Status status = PENDING; std::vector tokens; - + std::string prompt; std::vector beam_trees; + // PEFT field + RequestType req_type = REQ_INFERENCE; + size_t processed_finetuning_tokens = 0; + int completed_training_steps = 0; + int dataset_entry_processed_tokens = 0; + int max_training_steps = 1; + // how many gradient accumulation steps to do before updating the weights. if + // left as -1, it will be set to the number of entries in the dataset + int gradient_accumulation_steps = -1; + int benchmarking_tokens = -1; + std::vector finetuning_tokens_per_batch; + bool warmup = false; + std::string dataset_filepath; + std::vector, + std::vector>> + dataset; + std::vector finetuning_losses; + friend std::ostream &operator<<(std::ostream &os, Request const &req); }; // store the result of beam search @@ -120,6 +140,8 @@ class RequestManager { void set_max_sequence_length(int max_seq_length); void push_spec_infer_tree_width(int tree_width); int get_max_sequence_length(); + void set_enable_peft_finetuning(bool enable_peft_finetuning_); + static void set_inference_finished(bool finished = true); int register_ssm_model(FFModel *model); void register_tokenizer(ModelType model_type, int bos_token_id, @@ -143,10 +165,9 @@ class RequestManager { void serve_incr_decoding(FFModel *model); void serve_spec_infer(FFModel *model); GenerationResult get_generation_result(RequestGuid const &guid); - RequestGuid register_new_request(std::string const &prompt, - int max_sequence_length); - RequestGuid register_new_request(std::vector const &prompt, - int max_sequence_length); + RequestGuid register_new_request(Request const &request_); + RequestGuid register_new_peft_request(Request const &request_); + // Methods to start and terminate request manager's background task void start_background_server(FFModel *model); bool is_background_server_terminated(); @@ -156,6 +177,8 @@ class RequestManager { bool is_request_completed(RequestGuid const &guid); void trigger_request_completion_future(RequestGuid const &guid); // Methods for preparing next batches + bool check_inf_req_completion(BatchConfig const &old_bc, int i); + void check_batch(BatchConfig const &old_bc, BatchConfig const &new_bc); BatchConfig prepare_next_batch(BatchConfig const &bc, InferenceResult const &result); BatchConfigFuture prepare_next_batch(BatchConfigFuture const &bc, @@ -265,6 +288,10 @@ class RequestManager { int max_sequence_length; Status request_manager_status; + // peft benchmarking + bool enable_peft_finetuning = false; + static bool inference_finished; + // tree width in each 
speculative step, if not specified 1 std::vector spec_infer_tree_width; @@ -275,7 +302,8 @@ class RequestManager { int bos_token_id; int eos_token_id; std::string output_filepath; - std::queue pending_request_queue; + std::queue pending_infr_request_queue; + std::queue pending_peft_request_queue; std::unordered_map all_requests; std::unordered_map request_generation_results; std::mutex request_queue_mutex; @@ -304,6 +332,8 @@ class RequestManager { int llm_decoding_steps; int ssm_decoding_steps; double start_time, finish_time; + double registration_time, first_token_time; + bool first_token_time_set = false; }; std::unordered_map profiling_requests; double total_request_run_time; diff --git a/include/flexflow/simulator.h b/include/flexflow/simulator.h index e410f66325..6cda96aa8b 100644 --- a/include/flexflow/simulator.h +++ b/include/flexflow/simulator.h @@ -33,21 +33,21 @@ namespace FlexFlow { #define MOD(a, b) ((a) % (b)) < 0 ? ((a) % (b)) + (b) : ((a) % (b)) -class Conv2DMeta; -class LinearMeta; -class Pool2DMeta; -class ElementUnaryMeta; -class ElementBinaryMeta; -class LayerNormMeta; -// class EmbeddingMeta; -// class SoftmaxMeta; -class BatchMatmulMeta; -// class BatchNormMeta; -class ConcatMeta; -// class DropoutMeta; -class TransposeMeta; -class Op; -class FFModel; +// class Conv2DMeta; +// class LinearMeta; +// class Pool2DMeta; +// class ElementUnaryMeta; +// class ElementBinaryMeta; +// class LayerNormMeta; +// class EmbeddingMeta; +// class SoftmaxMeta; +// class BatchMatmulMeta; +// class BatchNormMeta; +// class ConcatMeta; +// class DropoutMeta; +// class TransposeMeta; +// class Op; +// class FFModel; /** * @brief Costs of an operator. @@ -751,19 +751,19 @@ class Simulator { strict_hash_to_operator_cost; public: - Conv2DMeta *conv2d_meta; - LinearMeta *linear_meta; - Pool2DMeta *pool2d_meta; - ElementUnaryMeta *ele_unary_meta; - LayerNormMeta *layernorm_meta; - // ElementBinaryMeta *ele_binary_meta; - // EmbeddingMeta *embedding_meta; - // SoftmaxMeta *softmax_meta; - BatchMatmulMeta *batch_matmul_meta; - // BatchNormMeta *batch_norm_meta; - ConcatMeta *concat_meta; - // DropoutMeta *dropout_meta; - TransposeMeta *transpose_meta; + // Conv2DMeta *conv2d_meta; + // LinearMeta *linear_meta; + // Pool2DMeta *pool2d_meta; + // ElementUnaryMeta *ele_unary_meta; + // LayerNormMeta *layernorm_meta; + // ElementBinaryMeta *ele_binary_meta; + // EmbeddingMeta *embedding_meta; + // SoftmaxMeta *softmax_meta; + // BatchMatmulMeta *batch_matmul_meta; + // BatchNormMeta *batch_norm_meta; + // ConcatMeta *concat_meta; + // DropoutMeta *dropout_meta; + // TransposeMeta *transpose_meta; int segment_size; int max_num_segments; // simulation could be slow if the number of segments // are too large diff --git a/include/flexflow/utils/cuda_helper.h b/include/flexflow/utils/cuda_helper.h index f8bf67b3e1..486a65eb3d 100644 --- a/include/flexflow/utils/cuda_helper.h +++ b/include/flexflow/utils/cuda_helper.h @@ -75,8 +75,8 @@ inline int GET_BLOCKS(int const N) { return (ret > BLOCK_SIZE_LIMIT) ? 
BLOCK_SIZE_LIMIT : ret; } -__global__ void - scale_kernel(float *ptr, Legion::coord_t size, float a, float b); +template +__global__ void scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); @@ -156,10 +156,13 @@ template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); + +template +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); cudnnStatus_t cudnnSetTensorDescriptorFromDomain(cudnnTensorDescriptor_t tensor, Legion::Domain domain, @@ -179,3 +182,5 @@ ncclDataType_t ff_to_nccl_datatype(DataType type); cudaDataType_t cudnn_to_cuda_datatype(cudnnDataType_t type); cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type); #endif +void check_device_vs_host_ptr(void const *maybe_devicePtr); +void check_ptr_alignment(void const *ptr); diff --git a/include/flexflow/utils/hip_helper.h b/include/flexflow/utils/hip_helper.h index 5d3c831d4f..805cc46b4c 100644 --- a/include/flexflow/utils/hip_helper.h +++ b/include/flexflow/utils/hip_helper.h @@ -75,8 +75,8 @@ inline int GET_BLOCKS(int const N) { return (ret > BLOCK_SIZE_LIMIT) ? BLOCK_SIZE_LIMIT : ret; } -__global__ void - scale_kernel(float *ptr, Legion::coord_t size, float a, float b); +template +__global__ void scale_kernel(DT *ptr, Legion::coord_t size, DT a, DT b); __global__ void ones_kernel(float *ptr, Legion::coord_t size); @@ -86,6 +86,12 @@ __global__ void assign_kernel(DT *ptr, Legion::coord_t size, DT value); template __global__ void copy_kernel(DT *dst, const DT *src, Legion::coord_t size); +template +__global__ void copy_kernel_discrete(DT *dst, + const DT *src, + Legion::coord_t size, + size_t *index); + template __global__ void add_kernel(T *data_ptr, T const *grad_ptr, size_t size); @@ -135,16 +141,28 @@ __host__ void updateGAS(float *para_ptr, float learning_rate); template -void print_tensor(T const *ptr, size_t num_elements, char const *prefix); +void print_tensor(T const *ptr, + size_t num_elements, + char const *prefix, + int shard_id = 0); +template +void print_beam_tensor(T const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); template void save_tensor(T const *ptr, size_t num_elements, char const *file_name); template -T *download_tensor(T const *ptr, size_t num_elements); +T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements); + +template +void copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements); template -bool download_tensor(T const *ptr, T *dst, size_t num_elements); +void copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements); miopenStatus_t cudnnSetTensorDescriptorFromDomain(miopenTensorDescriptor_t tensor, @@ -153,7 +171,8 @@ miopenStatus_t miopenStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, - Legion::Domain domain); + Legion::Domain domain, + DataType data_type = DT_FLOAT); hipblasDatatype_t ff_to_cuda_datatype(DataType type); @@ -164,3 +183,5 @@ ncclDataType_t ff_to_nccl_datatype(DataType type); void handle_unimplemented_hip_kernel(OperatorType op_type); #endif +void check_device_vs_host_ptr(void const *maybe_devicePtr); +void check_ptr_alignment(void const *ptr); diff --git a/include/flexflow/utils/memory_allocator.h 
b/include/flexflow/utils/memory_allocator.h index 7091b159b2..fad7630770 100644 --- a/include/flexflow/utils/memory_allocator.h +++ b/include/flexflow/utils/memory_allocator.h @@ -54,6 +54,11 @@ class MemoryAllocator { return static_cast
(ptr); } + inline void free_all() { + reserved_allocated_size = 0; + instance_allocated_size = 0; + } + public: Legion::Memory memory; void *reserved_ptr; diff --git a/include/flexflow/utils/peft_weight_allocator.h b/include/flexflow/utils/peft_weight_allocator.h new file mode 100644 index 0000000000..dae46a8af1 --- /dev/null +++ b/include/flexflow/utils/peft_weight_allocator.h @@ -0,0 +1,92 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ +#define _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ + +#include "flexflow/config.h" +#include + +namespace FlexFlow { + +class PEFTWeightAllocator { +public: + PEFTWeightAllocator(void *_base_ptr, size_t _total_size) + : base_ptr(_base_ptr), total_size(_total_size), sync_offset(0), + local_offset(_total_size) {} + + inline void *allocate_sync_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + void *ptr = static_cast(base_ptr) + sync_offset; + off_t model_sync_weights_offset = sync_offset; + size_t model_sync_weights_size = datalen; + if (sync_weights.find(peft_model_id) != sync_weights.end()) { + // Assert that sync weights for each PEFT model is consecutive + std::pair offset_and_size = sync_weights[peft_model_id]; + assert(sync_offset == offset_and_size.first + offset_and_size.second); + model_sync_weights_offset = offset_and_size.first; + model_sync_weights_size = offset_and_size.second + datalen; + } + sync_offset += datalen; + assert(sync_offset < local_offset); + sync_weights[peft_model_id] = + std::make_pair(model_sync_weights_offset, model_sync_weights_size); + return ptr; + } + + std::pair + get_sync_weights_ptr_and_size(PEFTModelID const &peft_model_id) { + const std::lock_guard lock(peft_weight_allocator_mutex); + assert(sync_weights.find(peft_model_id) != sync_weights.end()); + std::pair offset_and_size = sync_weights[peft_model_id]; + return std::make_pair(static_cast(base_ptr) + offset_and_size.first, + offset_and_size.second); + } + + inline void *allocate_local_weights_untyped(PEFTModelID const &peft_model_id, + size_t datalen) { + const std::lock_guard lock(peft_weight_allocator_mutex); + local_offset -= datalen; + assert(sync_offset < local_offset); + void *ptr = static_cast(base_ptr) + local_offset; + return ptr; + } + + template + inline DT *allocate_sync_weights(PEFTModelID const &peft_model_id, + size_t count) { + return static_cast
<DT *>( + allocate_sync_weights_untyped(peft_model_id, sizeof(DT) * count)); + } + + template <typename DT> + inline DT *allocate_local_weights(PEFTModelID const &peft_model_id, + size_t count) { + return static_cast<DT *>
( + allocate_local_weights_untyped(peft_model_id, sizeof(DT) * count)); + } + +public: + void *base_ptr; + size_t total_size; + off_t sync_offset, local_offset; + std::unordered_map> sync_weights; + std::mutex peft_weight_allocator_mutex; +}; + +}; // namespace FlexFlow + +#endif // _FLEXFLOW_UTILS_PEFT_WEIGHT_ALLOCATOR_H_ diff --git a/inference/MODEL_WEIGHTS.md b/inference/MODEL_WEIGHTS.md deleted file mode 100644 index d78fb37be9..0000000000 --- a/inference/MODEL_WEIGHTS.md +++ /dev/null @@ -1,28 +0,0 @@ -To convert the weights of a HuggingFace LLM to SpecInfer's weight format, we first load the model and modify the tensor names to match SpecInfer's convention, and then convert these tensors to numpy arrays to store them in binary files. - -```python -from transformers import AutoModelForCausalLM -model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") - -for name, params in model.named_parameters(): - for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) - params.detach().cpu().numpy().tofile('weights/llama_7B_weights/' + name) -``` - diff --git a/inference/README.md b/inference/README.md new file mode 100644 index 0000000000..14c94e22ac --- /dev/null +++ b/inference/README.md @@ -0,0 +1,42 @@ +# Inference Examples +This folder contains the code to run inference examples in FlexFlow + +To create a sample prompt, call (from the `build` folder): + +```bash +mkdir -p ../inference/prompt +echo '["San Francisco is a "]' > ../inference/prompt/test.json +``` + +To download a model for use in C++, call: +```bash +huggingface-cli login # if needed +python ../inference/utils/download_hf_model.py meta-llama/Llama-2-7b-hf --half-precision-only +``` + +To run the incremental decoding example in C++, call: + +```bash +./inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` + +To run the speculative inference example in C++, call: + +```bash +./inference/spec_infer/spec_infer -ll:cpu 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../inference/prompt/test.json -tensor-parallelism-degree 4 +``` + +To run a PEFT model example in C++, call: + +```bash +./inference/peft/peft \ + -ll:gpu 4 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 4 \ + -ll:fsize 8192 -ll:zsize 12000 \ + -llm-model JackFram/llama-160m \ + -finetuning-dataset ../inference/prompt/peft_dataset.json \ + -peft-model goliaro/llama-160m-lora \ + -enable-peft \ + --use-full-precision \ + --inference-debugging +``` \ No newline at end of file diff --git a/inference/incr_decoding/incr_decoding.cc b/inference/incr_decoding/incr_decoding.cc index ec3dda3158..c9ffff5c07 100644 --- a/inference/incr_decoding/incr_decoding.cc +++ b/inference/incr_decoding/incr_decoding.cc @@ -264,15 +264,18 @@ void FlexFlow::top_level_task(Task const *task, /*parser_callback_t */ nullptr, /*allow_exceptions */ true, 
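For readers skimming the new `peft_weight_allocator.h` above: the allocator hands out "sync" weights by bumping an offset upward from the base of a reserved buffer and "local" weights by bumping a second offset downward from the end, asserting `sync_offset < local_offset` so the two regions never collide. Below is a standalone sketch of that two-ended scheme; the `TwoEndedArena` name and the host-memory buffer are illustrative only, not FlexFlow code.

```cpp
// Standalone illustration (not FlexFlow code) of the two-ended bump-allocation
// scheme used by PEFTWeightAllocator: "sync" weights grow upward from the base
// of the buffer, "local" weights grow downward from the end, and an assert
// guarantees the two regions never overlap.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

class TwoEndedArena { // hypothetical name, for illustration only
public:
  TwoEndedArena(void *base, size_t total)
      : base_ptr(static_cast<uint8_t *>(base)), total_size(total),
        sync_offset(0), local_offset(total) {}

  // Bump upward from the base (mirrors allocate_sync_weights_untyped).
  void *allocate_sync(size_t datalen) {
    void *ptr = base_ptr + sync_offset;
    sync_offset += datalen;
    assert(sync_offset < local_offset && "sync and local regions overlap");
    return ptr;
  }

  // Bump downward from the end (mirrors allocate_local_weights_untyped).
  void *allocate_local(size_t datalen) {
    assert(datalen <= local_offset && "arena exhausted");
    local_offset -= datalen;
    assert(sync_offset < local_offset && "sync and local regions overlap");
    return base_ptr + local_offset;
  }

private:
  uint8_t *base_ptr;
  size_t total_size;
  size_t sync_offset, local_offset;
};

int main() {
  std::vector<uint8_t> buffer(1024); // stand-in for the reserved weight region
  TwoEndedArena arena(buffer.data(), buffer.size());
  void *sync_w = arena.allocate_sync(256);   // lands at offset 0
  void *local_w = arena.allocate_local(256); // lands at offset 768
  std::cout << "sync at +" << (static_cast<uint8_t *>(sync_w) - buffer.data())
            << ", local at +"
            << (static_cast<uint8_t *>(local_w) - buffer.data()) << "\n";
  return 0;
}
```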
/*ignore_comments */ true); - std::vector prompts; + + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; - prompts.push_back(text); } - std::vector result = - model.generate(prompts, 128 /*max_sequence_length*/); + std::vector result = model.generate(requests); } // terminate the request manager by stopping the background thread diff --git a/inference/models/falcon.cc b/inference/models/falcon.cc index a529411ddb..195d6ba7e3 100644 --- a/inference/models/falcon.cc +++ b/inference/models/falcon.cc @@ -76,7 +76,7 @@ void FALCON::create_falcon_model(FFModel &ff, falcon_config.layer_norm_epsilon, true, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_layer_norm( @@ -89,8 +89,9 @@ void FALCON::create_falcon_model(FFModel &ff, true, falcon_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_input_layernorm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = res_ln_outputs[0]; att_norm = res_ln_outputs[1]; @@ -116,7 +117,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -141,7 +142,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -166,7 +167,7 @@ void FALCON::create_falcon_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attention") .c_str() /*name*/ ); break; @@ -187,7 +188,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_h_to_4h") + std::string("layers." + std::to_string(i) + ".mlp.dense_h_to_4h") .c_str()); dense_h_to_4h = ff.gelu(dense_h_to_4h); @@ -203,7 +204,7 @@ void FALCON::create_falcon_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_dense_4h_to_h") + std::string("layers." + std::to_string(i) + ".mlp.dense_4h_to_h") .c_str()); } // final normalization and linear @@ -216,6 +217,7 @@ void FALCON::create_falcon_model(FFModel &ff, true, falcon_config.layer_norm_epsilon, true, + false, DT_NONE, "ln_f"); Tensor ln_f = res_ln_outputs[1]; diff --git a/inference/models/llama.cc b/inference/models/llama.cc index 517f534438..cf26194597 100644 --- a/inference/models/llama.cc +++ b/inference/models/llama.cc @@ -58,7 +58,7 @@ void LLAMA::create_llama_model(FFModel &ff, use_full_precision ? 
DT_FLOAT : DT_HALF, NULL, embed_init, - "tok_embeddings"); + "embed_tokens"); Tensor w2 = nullptr; @@ -75,7 +75,7 @@ void LLAMA::create_llama_model(FFModel &ff, llama_config.rms_norm_eps, llama_config.hidden_size, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); } else { ff.residual_rms_norm( @@ -84,8 +84,9 @@ void LLAMA::create_llama_model(FFModel &ff, token_att_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_norm") + std::string("layers." + std::to_string(i) + ".input_layernorm") .c_str()); token = token_att_norm[0]; att_norm = token_att_norm[1]; @@ -94,10 +95,11 @@ void LLAMA::create_llama_model(FFModel &ff, Tensor mha; switch (mode) { case BEAM_SEARCH_MODE: { - mha = ff.spec_inc_multihead_self_attention( + mha = ff.spec_inc_multiquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -111,16 +113,17 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case TREE_VERIFY_MODE: { - mha = ff.inc_multihead_self_attention_verify( + mha = ff.inc_multiquery_self_attention_verify( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -134,16 +137,17 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; } case INC_DECODING_MODE: { - mha = ff.inc_multihead_self_attention( + mha = ff.inc_multiquery_self_attention( att_norm, llama_config.hidden_size, llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size / llama_config.num_attention_heads, llama_config.hidden_size / llama_config.num_attention_heads, 0.0f, /*dropout*/ @@ -157,7 +161,7 @@ void LLAMA::create_llama_model(FFModel &ff, 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -175,54 +179,56 @@ void LLAMA::create_llama_model(FFModel &ff, token_ff_norm, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, - std::string("layers_" + std::to_string(i) + "_ffn_norm").c_str()); + std::string("layers." 
+ std::to_string(i) + ".post_attention_layernorm") + .c_str()); token = token_ff_norm[0]; Tensor ff_norm = token_ff_norm[1]; - Tensor w1 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w1") - .c_str()); + Tensor w1 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.gate_proj").c_str()); - Tensor w3 = - ff.dense(ff_norm, - llama_config.intermediate_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w3") - .c_str()); + Tensor w3 = ff.dense( + ff_norm, + llama_config.intermediate_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.up_proj").c_str()); Tensor multi = ff.sigmoid_silu_multi(w1, w3); - w2 = - ff.dense(multi, - llama_config.hidden_size, - AC_MODE_NONE, - false, - DT_NONE, - nullptr, - nullptr, - nullptr, - REG_MODE_NONE, - 0.0f, - std::string("layers_" + std::to_string(i) + "_feed_forward_w2") - .c_str()); + w2 = ff.dense( + multi, + llama_config.hidden_size, + AC_MODE_NONE, + false, + DT_NONE, + nullptr, + nullptr, + nullptr, + REG_MODE_NONE, + 0.0f, + std::string("layers." + std::to_string(i) + ".mlp.down_proj").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + // ff.lora_linear(std::string("down_proj"), std::string("layers." + + // std::to_string(i) + ".mlp.down_proj.lora").c_str()); } // final normalization and linear Tensor final_rms_norm_output[2] = {nullptr, nullptr}; @@ -231,6 +237,7 @@ void LLAMA::create_llama_model(FFModel &ff, final_rms_norm_output, llama_config.rms_norm_eps, llama_config.hidden_size, + false, // inplace_residual DT_NONE, "norm"); @@ -244,7 +251,7 @@ void LLAMA::create_llama_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "output"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -261,7 +268,8 @@ void LLAMA::create_llama_model(FFModel &ff, output = ff.sampling(softmax, generation_config.topp); } else { // output = ff.arg_top_k(dense, /*k=*/1, false); - output = ff.argmax(dense, /*beam_Search*/ false); + Tensor softmax = ff.softmax(dense, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } } @@ -269,7 +277,7 @@ void LLAMA::create_llama_model(FFModel &ff, "", weight_file_path, llama_config.num_attention_heads, - llama_config.num_attention_heads, + llama_config.num_key_value_heads, llama_config.hidden_size, llama_config.hidden_size / llama_config.num_attention_heads, ff.config.tensor_parallelism_degree, diff --git a/inference/models/llama.h b/inference/models/llama.h index ba1f0236f9..edb78f1300 100644 --- a/inference/models/llama.h +++ b/inference/models/llama.h @@ -36,6 +36,11 @@ class LLAMA { num_hidden_layers = model_config["num_hidden_layers"]; vocab_size = model_config["vocab_size"]; num_attention_heads = model_config["num_attention_heads"]; + if (model_config.find("num_key_value_heads") != model_config.end()) { + num_key_value_heads = model_config["num_key_value_heads"]; + } else { + num_key_value_heads = num_attention_heads; + } hidden_size = model_config["hidden_size"]; rms_norm_eps = model_config["rms_norm_eps"]; intermediate_size = model_config["intermediate_size"]; @@ -61,6 +66,8 @@ 
class LLAMA { std::cout << "\tvocab_size: " << vocab_size << std::endl; std::cout << "\tnum_attention_heads: " << num_attention_heads << std::endl; + std::cout << "\tnum_key_value_heads: " << num_key_value_heads + << std::endl; std::cout << "\thidden_size: " << hidden_size << std::endl; std::cout << "\trms_norm_eps: " << rms_norm_eps << std::endl; std::cout << "\tintermediate_size: " << intermediate_size << std::endl; @@ -73,8 +80,8 @@ class LLAMA { // int max_seq_len, max_num_tokens; int max_beam_width, max_beam_depth; - int num_hidden_layers, vocab_size, num_attention_heads, hidden_size, - intermediate_size; + int num_hidden_layers, vocab_size, num_attention_heads, num_key_value_heads, + hidden_size, intermediate_size; float rms_norm_eps; }; diff --git a/inference/models/mpt.cc b/inference/models/mpt.cc index 70e2b5e9c5..e4a7e0056d 100644 --- a/inference/models/mpt.cc +++ b/inference/models/mpt.cc @@ -58,7 +58,7 @@ void MPT::create_mpt_model(FFModel &ff, use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor intermediate_output = nullptr, layernorm_output = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -74,7 +74,7 @@ void MPT::create_mpt_model(FFModel &ff, 1e-05, false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); } else { ff.residual_layer_norm( intermediate_output, @@ -86,8 +86,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_1").c_str()); + std::string("layers." + std::to_string(i) + ".norm_1").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; } @@ -113,7 +114,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -137,7 +138,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -161,7 +162,7 @@ void MPT::create_mpt_model(FFModel &ff, pow((mpt_config.hidden_size / mpt_config.n_heads), -0.5), /*qk_prod_scaling*/ false, /*position_bias*/ true, - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn") .c_str() /*name*/ ); break; @@ -181,8 +182,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_norm_2").c_str()); + std::string("layers." + std::to_string(i) + ".norm_2").c_str()); hidden_states = res_ln_outputs[0]; layernorm_output = res_ln_outputs[1]; @@ -198,7 +200,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_up_proj").c_str()); + std::string("layers." + std::to_string(i) + ".ffn.up_proj").c_str()); layernorm_output = ff.gelu(layernorm_output); intermediate_output = ff.dense( layernorm_output, @@ -211,7 +213,7 @@ void MPT::create_mpt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_ffn_down_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".ffn.down_proj").c_str()); } // final @@ -224,8 +226,9 @@ void MPT::create_mpt_model(FFModel &ff, true, 1e-05, false, + false, DT_NONE, - "transformer_norm_f"); + "norm_f"); Tensor all_final_norm = res_ln_outputs[1]; Tensor lm_head = ff.dense(all_final_norm, diff --git a/inference/models/opt.cc b/inference/models/opt.cc index 5677d5658e..b3f2ef4e17 100644 --- a/inference/models/opt.cc +++ b/inference/models/opt.cc @@ -94,8 +94,9 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_attention_layer_norm") + std::string("layers." + std::to_string(i) + ".self_attn_layer_norm") .c_str()); Tensor residual = res_ln_outputs[0]; Tensor hidden_states = res_ln_outputs[1]; @@ -121,7 +122,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -145,7 +146,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -169,7 +170,7 @@ void OPT::create_opt_model(FFModel &ff, -0.5), /*scaling factor*/ false, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".self_attn") .c_str() /*name*/ ); break; @@ -186,9 +187,10 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + - "_add_bias_residual_layer_norm") + std::string("layers." + std::to_string(i) + + ".add_bias_residual_layer_norm") .c_str()); added = res_ln_outputs[0]; Tensor final_norm = res_ln_outputs[1]; @@ -205,7 +207,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc1").c_str()); + std::string("layers." + std::to_string(i) + ".fc1").c_str()); fc2 = ff.dense(fc1, opt_config.hidden_size, AC_MODE_NONE, @@ -216,7 +218,10 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_fc2").c_str()); + std::string("layers." + std::to_string(i) + ".fc2").c_str()); + // Low-Rank Adapter (LoRA) for the second linear layer + // ff.lora_linear(std::string("fc2"), std::string("layers." 
+ + // std::to_string(i) + ".fc2.lora").c_str()); } // final @@ -229,6 +234,7 @@ void OPT::create_opt_model(FFModel &ff, opt_config.layer_norm_elementwise_affine, 1e-05, true, + false, DT_NONE, "final_layer_norm"); Tensor all_final_norm = res_ln_outputs[1]; @@ -243,7 +249,7 @@ void OPT::create_opt_model(FFModel &ff, nullptr, REG_MODE_NONE, 0.0f, - "embed_tokens_weight_lm_head"); + "lm_head"); Tensor output; if (mode == BEAM_SEARCH_MODE) { @@ -252,7 +258,8 @@ void OPT::create_opt_model(FFModel &ff, output = ff.argmax(softmax, /*beam_Search*/ true); } else { // output = ff.arg_top_k(lm_head, /*k=*/1, false); - output = ff.argmax(lm_head, /*beam_Search*/ false); + Tensor softmax = ff.softmax(lm_head, -1); + output = ff.argmax(softmax, /*beam_Search*/ false); } FileDataLoader *fileloader = new FileDataLoader( diff --git a/inference/models/starcoder.cc b/inference/models/starcoder.cc index 8b0dc1098c..cd8bf3a9a7 100644 --- a/inference/models/starcoder.cc +++ b/inference/models/starcoder.cc @@ -66,7 +66,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wte"); + "wte"); Tensor positional_embedding = ff.embedding(position_input, @@ -76,7 +76,7 @@ void STARCODER::create_starcoder_model( use_full_precision ? DT_FLOAT : DT_HALF, NULL, embed_init, - "transformer_wpe"); + "wpe"); Tensor residual = nullptr, c_proj = nullptr; Tensor res_ln_outputs[2] = {nullptr, nullptr}; @@ -96,8 +96,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_1").c_str()); + std::string("layers." + std::to_string(i) + ".ln_1").c_str()); Tensor hidden_states = res_ln_outputs[0]; Tensor ln_1 = res_ln_outputs[1]; @@ -124,7 +125,7 @@ void STARCODER::create_starcoder_model( 1.0f, /*scaling factor*/ true, /*qk_prod_scaling*/ false, /*position_bias*/ - std::string("layers_" + std::to_string(i) + "_attention") + std::string("layers." + std::to_string(i) + ".attn.c_attn") .c_str() /*name*/ ); break; @@ -144,8 +145,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - std::string("layers_" + std::to_string(i) + "_ln_2").c_str()); + std::string("layers." + std::to_string(i) + ".ln_2").c_str()); residual = res_ln_outputs[0]; Tensor l2_norm = res_ln_outputs[1]; @@ -161,7 +163,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_fc").c_str()); + std::string("layers." + std::to_string(i) + ".mlp.c_fc").c_str()); c_fc = ff.gelu(c_fc); @@ -176,7 +178,7 @@ void STARCODER::create_starcoder_model( nullptr, REG_MODE_NONE, 0.0f, - std::string("layers_" + std::to_string(i) + "_mlp_c_proj").c_str()); + std::string("layers." 
+ std::to_string(i) + ".mlp.c_proj").c_str()); } // final normalization and linear ff.residual_layer_norm(residual, @@ -188,8 +190,9 @@ void STARCODER::create_starcoder_model( true, startcoder_config.layer_norm_epsilon, true, + false, DT_NONE, - "transformer_ln_f"); + "ln_f"); Tensor ln_f = res_ln_outputs[1]; Tensor lm_head = ff.dense(ln_f, diff --git a/inference/peft/CMakeLists.txt b/inference/peft/CMakeLists.txt new file mode 100644 index 0000000000..e0bad79cab --- /dev/null +++ b/inference/peft/CMakeLists.txt @@ -0,0 +1,139 @@ +cmake_minimum_required(VERSION 3.10) + +project(FlexFlow_Peft) + +# Normal PEFT +set(project_target1 peft) +set(CPU_SRC1 + ${FLEXFLOW_CPP_DRV_SRC} + peft.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC1} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target1} ${CPU_SRC1}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target1} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target1} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target1} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target1} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target1} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target1} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target1} DESTINATION ${BIN_DEST}) + +# FWD benchmark +set(project_target2 peft_fwd_benchmark) +set(CPU_SRC2 + ${FLEXFLOW_CPP_DRV_SRC} + peft_fwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target2} ${CPU_SRC2}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC2} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target2} ${CPU_SRC2}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target2} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target2} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target2} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target2} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target2} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target2} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target2} DESTINATION ${BIN_DEST}) + +# BWD benchmark +set(project_target3 peft_bwd_benchmark) +set(CPU_SRC3 + ${FLEXFLOW_CPP_DRV_SRC} + 
peft_bwd_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC3} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target3} ${CPU_SRC3}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target3} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target3} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target3} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target3} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target3} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target3} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target3} DESTINATION ${BIN_DEST}) + +# Online peft +set(project_target4 req_rate_benchmark) +set(CPU_SRC4 + ${FLEXFLOW_CPP_DRV_SRC} + req_rate_benchmark.cc + ../models/llama.cc + ../models/opt.cc + ../models/falcon.cc + ../models/starcoder.cc + ../models/mpt.cc) + +if (FF_GPU_BACKEND STREQUAL "cuda" OR FF_GPU_BACKEND STREQUAL "hip_cuda") + cuda_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_GPU_BACKEND STREQUAL "hip_cuda") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_NVIDIA__) + endif() +elseif(FF_GPU_BACKEND STREQUAL "hip_rocm") + set_source_files_properties(${CPU_SRC4} PROPERTIES LANGUAGE HIP) + hip_add_executable(${project_target4} ${CPU_SRC4}) + if (FF_HIP_ARCH STREQUAL "") + message(FATAL_ERROR "FF_HIP_ARCH is empty!") + endif() + set_property(TARGET ${project_target4} PROPERTY HIP_ARCHITECTURES "${FF_HIP_ARCH}") + target_compile_definitions(${project_target4} PRIVATE __HIP_PLATFORM_AMD__) +else() + message(FATAL_ERROR "Compilation of ${project_target4} for ${FF_GPU_BACKEND} backend not yet supported") +endif() + +target_include_directories(${project_target4} PRIVATE ${FLEXFLOW_INCLUDE_DIRS} ${CMAKE_INSTALL_INCLUDEDIR}) +target_include_directories(${project_target4} PRIVATE ${CMAKE_SOURCE_DIR}/inference) +target_link_libraries(${project_target4} -Wl,--whole-archive flexflow -Wl,--no-whole-archive ${FLEXFLOW_EXT_LIBRARIES}) +set(BIN_DEST "bin") +install(TARGETS ${project_target4} DESTINATION ${BIN_DEST}) diff --git a/inference/peft/Makefile b/inference/peft/Makefile new file mode 100644 index 0000000000..0e4b79f51f --- /dev/null +++ b/inference/peft/Makefile @@ -0,0 +1,37 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Flags for directing the runtime makefile what to include +DEBUG ?= 0 # Include debugging symbols +MAX_DIM ?= 4 # Maximum number of dimensions +OUTPUT_LEVEL ?= LEVEL_DEBUG # Compile time logging level +USE_CUDA ?= 1 # Include CUDA support (requires CUDA) +USE_GASNET ?= 0 # Include GASNet support (requires GASNet) +USE_HDF ?= 1 # Include HDF5 support (requires HDF5) +ALT_MAPPERS ?= 0 # Include alternative mappers (not recommended) + +# Put the binary file name here +OUTFILE ?= llama_pipeline +# List all the application source files here +ifndef CUDA_HOME +CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc | head -1)) +endif + + +ifndef FF_HOME +$(error FF_HOME variable is not defined, aborting build) +endif + +include $(FF_HOME)/FlexFlow.mk diff --git a/inference/peft/peft.cc b/inference/peft/peft.cc new file mode 100644 index 0000000000..c55f2c0bfd --- /dev/null +++ b/inference/peft/peft.cc @@ -0,0 +1,387 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string dataset_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // dataset for finetuning + if (!strcmp(argv[i], "-finetuning-dataset")) { + paths.dataset_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], 
"--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 1; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + bool enable_peft_finetuning = true; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + LoraOptimizerConfig *optim_config = nullptr; + if (enable_peft_finetuning) { + // float sgd_learning_rate = 2e-1; + float sgd_learning_rate = 1.0f; + optim_config = new LoraSGDOptimizerConfig(sgd_learning_rate); + } + LoraLinearConfig peft_config_finetuning = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, + peft_model_name, + true /*trainable*/, + optim_config, + false /*init_lora_weights*/, + llm_model_name, + use_full_precision ? 
"fp32" : "fp16"); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr, *peft_model_id_finetuning = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + if (enable_peft_finetuning) { + peft_model_id_finetuning = model.add_lora_layer(peft_config_finetuning); + } + } + + // Start background server + rm->start_background_server(&model); + + // Run workload + { + std::vector requests; + + // Add inference requests + if (!file_paths.prompt_file_path.empty()) { + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + int total_num_requests = 0; + for (auto &prompt : prompt_json) { + std::string text = prompt.get(); + printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str()); + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + total_num_requests++; + } + } + + // Add fine-tuning request + if (enable_peft_finetuning) { + assert(!file_paths.dataset_file_path.empty() && + "Dataset file path is required for fine-tuning."); + printf("Finetuning request with dataset %s\n", + file_paths.dataset_file_path.c_str()); + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.peft_model_id = (peft_model_id_finetuning != nullptr) + ? 
*peft_model_id_finetuning + : PEFTModelID::NO_ID; + fine_tuning_req.dataset_filepath = file_paths.dataset_file_path; + fine_tuning_req.max_training_steps = 2; + requests.push_back(fine_tuning_req); + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/peft_bwd_benchmark.cc b/inference/peft/peft_bwd_benchmark.cc new file mode 100644 index 0000000000..86d6d8cbbf --- /dev/null +++ b/inference/peft/peft_bwd_benchmark.cc @@ -0,0 +1,391 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], 
"--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." 
+ << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + 
inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector lengths; + int index = 0; + for (auto &entry : prompt_json) { + if (index == max_requests_to_run) { + break; + } + int prompt_length = entry.get(); + assert(prompt_length > 0 && "Prompt length must be greater than 0."); + assert(prompt_length <= 1024 && + "Prompt length must be less than or equal to 1024."); + lengths.push_back(prompt_length); + index++; + } + printf("Total number of finetuning requests: %ld", lengths.size()); + + // Add fine-tuning requests + for (int i = 0; i < lengths.size(); i++) { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = lengths[i]; + fine_tuning_req.max_sequence_length = lengths[i]; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + } + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------finetuning finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/peft_fwd_benchmark.cc b/inference/peft/peft_fwd_benchmark.cc new file mode 100644 index 0000000000..9ff042c157 --- /dev/null +++ b/inference/peft/peft_fwd_benchmark.cc @@ -0,0 +1,363 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
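The warmup stage of `peft_bwd_benchmark.cc` above fills one batch with synthetic inference requests plus a single fine-tuning request, sized via `benchmarking_tokens` instead of real prompt text. A condensed sketch of that pattern is below; it reuses the `Request` fields introduced in this PR and assumes a FlexFlow build, so it is a sketch rather than a drop-in file:

```cpp
// Condensed sketch of the warmup pattern used by the PEFT benchmarks: a batch
// of dummy inference requests plus a single fine-tuning request, all marked
// warmup = true. Field names come from this PR; a FlexFlow build is assumed.
#include "flexflow/inference.h"
#include "flexflow/request_manager.h"

#include <vector>

using namespace FlexFlow;

std::vector<Request> make_warmup_batch(PEFTModelID const *peft_model_id) {
  std::vector<Request> requests;
  for (int i = 0; i < 100; i++) {
    Request inference_req;
    inference_req.benchmarking_tokens = 128; // synthetic prompt length
    inference_req.max_sequence_length = 256;
    inference_req.warmup = true;
    inference_req.peft_model_id =
        (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
    requests.push_back(inference_req);
  }
  Request fine_tuning_req;
  fine_tuning_req.req_type = RequestType::REQ_FINETUNING;
  fine_tuning_req.benchmarking_tokens = 1024; // synthetic training sequence
  fine_tuning_req.max_sequence_length = 1024;
  fine_tuning_req.warmup = true;
  fine_tuning_req.max_training_steps = 1;
  fine_tuning_req.peft_model_id =
      (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
  requests.push_back(fine_tuning_req);
  return requests;
}

// Usage (inside top_level_task, after rm->start_background_server(&model)):
//   auto results = model.generate(make_warmup_batch(peft_model_id));
//   rm->set_inference_finished(false); // reset the flag before the timed run
```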
+ */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "models/falcon.h" +#include "models/llama.h" +#include "models/mpt.h" +#include "models/opt.h" +#include "models/starcoder.h" +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_requests_to_run) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-to-run")) { + max_requests_to_run = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_requests_to_run = 1000000000; + bool enable_peft_finetuning = false; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_requests_to_run); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + // Start background server + rm->start_background_server(&model); + + // Run workload + { + std::vector requests; + + // Add inference requests + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + std::vector> prompts; + int index = 0; + for (auto &entry : prompt_json) { + if (index >= max_requests_to_run) { + break; + } + int prompt_length = entry["human"]; + int sequence_length = entry["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + index++; + } + printf("Total number of prompts: %ld", prompts.size()); + for (auto &prompt : prompts) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + std::vector result = model.generate(requests); + } + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + if (peft_model_id != nullptr) { + free(peft_model_id); + } + + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/peft/req_rate_benchmark.cc b/inference/peft/req_rate_benchmark.cc new file mode 100644 index 0000000000..43008e74fe --- /dev/null +++ b/inference/peft/req_rate_benchmark.cc @@ -0,0 +1,518 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/inference.h" +#include "flexflow/request_manager.h" +#include "inference/models/falcon.h" +#include "inference/models/llama.h" +#include "inference/models/mpt.h" +#include "inference/models/opt.h" +#include "inference/models/starcoder.h" +#include +#include +#include +#include + +#include + +using namespace FlexFlow; +using namespace Legion; +using json = nlohmann::json; + +Legion::Logger log_app("llama"); + +class ConcurrentQueue { +public: + std::queue inf_queue; + std::queue peft_queue; + std::mutex request_queue_mutex; + bool producer_finished = false; +}; + +ConcurrentQueue *common_guids_singleton = nullptr; +int nb_millisecs = 1000; // Default bucket timeframe is 1 second + +ConcurrentQueue *get_common_guids_queue() { + if (common_guids_singleton == nullptr) { + common_guids_singleton = new ConcurrentQueue(); + } + return common_guids_singleton; +} + +void consume() { + RequestManager *rm = RequestManager::get_request_manager(); + ConcurrentQueue *guids = get_common_guids_queue(); + bool producer_is_finished = false; + bool queue_is_empty = false; + // int i=0; + while (!producer_is_finished || !queue_is_empty) { + RequestManager::RequestGuid guid = RequestManager::INVALID_GUID; + { + const std::lock_guard lock(guids->request_queue_mutex); + queue_is_empty = guids->inf_queue.empty(); + producer_is_finished = guids->producer_finished; + if (!queue_is_empty) { + guid = guids->inf_queue.front(); + guids->inf_queue.pop(); + } + } + if (guid != RequestManager::INVALID_GUID) { + GenerationResult result = rm->get_generation_result(guid); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(nb_millisecs)); + } + // i++; + // cout << "Iteration " << i; + } + rm->set_inference_finished(); + + while (guids->peft_queue.size() > 0) { + GenerationResult result = + rm->get_generation_result(guids->peft_queue.front()); + guids->peft_queue.pop(); + } +} + +struct FilePaths { + std::string cache_folder_path; + std::string prompt_file_path; + std::string output_file_path; +}; + +void parse_input_args(char **argv, + int argc, + FilePaths &paths, + 
std::string &llm_model_name, + std::string &peft_model_name, + bool &use_full_precision, + bool &verbose, + bool &do_sample, + bool &enable_peft, + float &temperature, + float &topp, + int &max_requests_per_batch, + int &max_tokens_per_batch, + int &max_sequence_length, + int &max_buckets_to_run, + int &bucket_timeframe) { + for (int i = 1; i < argc; i++) { + // llm model type + if (!strcmp(argv[i], "-llm-model")) { + llm_model_name = std::string(argv[++i]); + for (char &c : llm_model_name) { + c = std::tolower(c); + } + continue; + } + if (!strcmp(argv[i], "-enable-peft")) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-model")) { + peft_model_name = std::string(argv[++i]); + for (char &c : peft_model_name) { + c = std::tolower(c); + } + continue; + } + // cache folder + if (!strcmp(argv[i], "-cache-folder")) { + paths.cache_folder_path = std::string(argv[++i]); + continue; + } + // prompts + if (!strcmp(argv[i], "-prompt")) { + paths.prompt_file_path = std::string(argv[++i]); + continue; + } + // output file + if (!strcmp(argv[i], "-output-file")) { + paths.output_file_path = std::string(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--use-full-precision")) { + use_full_precision = true; + continue; + } + // verbose logging to stdout + if (!strcmp(argv[i], "--verbose")) { + verbose = true; + continue; + } + if (!strcmp(argv[i], "--do-sample")) { + do_sample = true; + continue; + } + if (!strcmp(argv[i], "--temperature")) { + temperature = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--topp")) { + topp = std::stof(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-requests-per-batch")) { + max_requests_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-tokens-per-batch")) { + max_tokens_per_batch = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-sequence-length")) { + max_sequence_length = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--max-buckets-to-run")) { + max_buckets_to_run = std::stoi(argv[++i]); + continue; + } + if (!strcmp(argv[i], "--bucket-timeframe")) { + bucket_timeframe = std::stoi(argv[++i]); + continue; + } + } + if (paths.cache_folder_path.empty()) { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + paths.cache_folder_path = ff_cache_path ? 
std::string(ff_cache_path) + : std::string("~/.cache/flexflow"); + } + // Expand ~ to the home directory if needed + wordexp_t p; + wordexp(paths.cache_folder_path.c_str(), &p, 0); + paths.cache_folder_path = p.we_wordv[0]; + wordfree(&p); +} + +void FlexFlow::top_level_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + FFConfig ffconfig; + if (ffconfig.cpu_offload == false && ffconfig.quantization_type != DT_NONE) { + assert(false && "Doesn't support quantization in non-offload mode"); + } + FilePaths file_paths; + std::string llm_model_name, peft_model_name; + bool use_full_precision = false; + bool verbose = false; + bool do_sample = false; + bool enable_peft = false; + float temperature = 0.0f; + float topp = 0.0f; + int max_requests_per_batch = 8; + int max_tokens_per_batch = 128; + int max_sequence_length = 256; + int max_buckets_to_run = 1000000000; + bool enable_peft_finetuning = false; + int bucket_timespan = 1; + + InputArgs const &command_args = HighLevelRuntime::get_input_args(); + char **argv = command_args.argv; + int argc = command_args.argc; + parse_input_args(argv, + argc, + file_paths, + llm_model_name, + peft_model_name, + use_full_precision, + verbose, + do_sample, + enable_peft, + temperature, + topp, + max_requests_per_batch, + max_tokens_per_batch, + max_sequence_length, + max_buckets_to_run, + bucket_timespan); + assert(ffconfig.data_parallelism_degree * ffconfig.tensor_parallelism_degree * + ffconfig.pipeline_parallelism_degree == + ffconfig.numNodes * ffconfig.workersPerNode); + + std::string config_filepath = join_path( + {file_paths.cache_folder_path, "configs", llm_model_name, "config.json"}); + std::string tokenizer_filepath = + join_path({file_paths.cache_folder_path, "tokenizers", llm_model_name}); + std::string weights_filepath = + join_path({file_paths.cache_folder_path, + "weights", + llm_model_name, + use_full_precision ? "full-precision" : "half-precision"}); + std::ifstream config_file_handle(config_filepath); + if (!config_file_handle.good()) { + std::cout << "Model config file " << config_filepath << " not found." + << std::endl; + assert(false); + } + if (enable_peft && peft_model_name.empty()) { + std::cout << "PEFT enabled, but no PEFT model id passed" << std::endl; + assert(false); + } else if (!enable_peft && !peft_model_name.empty()) { + std::cout << "PEFT model id passed, but PEFT is not enabled" << std::endl; + assert(false); + } + + json model_config = json::parse(config_file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + ModelType model_type = ModelType::UNKNOWN; + auto architectures = model_config["architectures"]; + for (auto const &str : architectures) { + if (str == "LlamaForCausalLM" || str == "LLaMAForCausalLM") { + model_type = ModelType::LLAMA; + break; + } else if (str == "OPTForCausalLM") { + model_type = ModelType::OPT; + break; + } else if (str == "RWForCausalLM" || str == "FalconForCausalLM") { + model_type = ModelType::FALCON; + break; + } else if (str == "GPTBigCodeForCausalLM") { + model_type = ModelType::STARCODER; + break; + } else if (str == "MPTForCausalLM") { + model_type = ModelType::MPT; + break; + } + } + int bos_token_id = model_config.find("bos_token_id") == model_config.end() + ? -1 + : (int)model_config.at("bos_token_id"); + int eos_token_id = model_config.find("eos_token_id") == model_config.end() + ? 
-1 + : (int)model_config.at("eos_token_id"); + + assert(model_type != ModelType::UNKNOWN && + "Invalid LLM model type passed (or no type was passed)."); + + // load PEFT config + LoraLinearConfig peft_config = + peft_model_name.empty() + ? LoraLinearConfig::EmptyConfig + : LoraLinearConfig(file_paths.cache_folder_path, peft_model_name); + + GenerationConfig generationConfig(do_sample, temperature, topp); + RequestManager *rm = RequestManager::get_request_manager(); + rm->set_max_requests_per_batch( + max_requests_per_batch + + (int)enable_peft_finetuning); // add one slot for finetuning if needed + rm->set_max_tokens_per_batch(max_tokens_per_batch); + rm->set_max_sequence_length(max_sequence_length); + rm->register_tokenizer( + model_type, bos_token_id, eos_token_id, tokenizer_filepath); + rm->register_output_filepath(file_paths.output_file_path); + rm->set_enable_peft_finetuning(enable_peft_finetuning); + + FFModel model(ffconfig, ffconfig.cpu_offload); + if (model_type == ModelType::LLAMA) { + LLAMA::create_llama_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::OPT) { + OPT::create_opt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::FALCON) { + FALCON::create_falcon_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + use_full_precision); + } else if (model_type == ModelType::STARCODER) { + STARCODER::create_starcoder_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else if (model_type == ModelType::MPT) { + MPT::create_mpt_model(model, + config_filepath, + weights_filepath, + INC_DECODING_MODE, + generationConfig, + use_full_precision); + } else { + assert(false && "unknow model type"); + } + + // Add PEFT layer + PEFTModelID *peft_model_id = nullptr; + if (!peft_model_name.empty()) { + peft_model_id = model.add_lora_layer(peft_config); + } + + rm->start_background_server(&model); + + // Warmup stage + { + std::vector requests; + for (int i = 0; i < 100; i++) { + Request inference_req; + inference_req.benchmarking_tokens = 128; + inference_req.max_sequence_length = 256; + inference_req.warmup = true; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.warmup = true; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1; + requests.push_back(fine_tuning_req); + std::vector result = model.generate(requests); + } + + rm->set_inference_finished(false); // reset inference finished flag + std::cout << "----------warmup finished--------------" << std::endl; + + // Now run online workload! 
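+  // The trace replay below is the producer half of a simple producer/consumer
+  // scheme: each bucket of (prompt length, response length) pairs is submitted
+  // once its arrival time (bucket index * nb_millisecs) has elapsed, and the
+  // guids returned by register_new_request() are pushed onto the shared
+  // ConcurrentQueue. The consumer thread created below runs consume(), which
+  // drains that queue, waits on each guid's GenerationResult, and calls
+  // set_inference_finished() on the RequestManager once the producer has
+  // marked itself finished and the queue is empty.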
+ + nb_millisecs = nb_millisecs * bucket_timespan; + int total_num_requests = 0; + int num_arrival_buckets = 0; + ConcurrentQueue *guids = get_common_guids_queue(); + std::thread consumer{consume}; + { + + // Load all requests in advance + using json = nlohmann::json; + std::ifstream file_handle(file_paths.prompt_file_path); + assert(file_handle.good() && "Prompt file does not exist."); + json prompt_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + auto const &lists = prompt_json.get>>(); + std::vector bucket_arrival_times_s; + std::vector>> buckets; + + size_t index = 0; + for (auto const &list : lists) { + if (!list.empty()) { + bucket_arrival_times_s.push_back(index); + std::vector> prompts; + for (auto const &dict : list) { + int prompt_length = dict["human"]; + int sequence_length = dict["gpt"]; + assert(prompt_length + sequence_length <= max_sequence_length && + "Prompt + sequence length exceeds max sequence length"); + prompts.push_back(std::make_pair(prompt_length, sequence_length)); + } + buckets.push_back(prompts); + } + index++; + } + assert(bucket_arrival_times_s.size() == buckets.size() && + "Bucket arrival times and buckets are not the same size"); + // for (int i=0; i<10; i++) { + // printf("bucket_arrival_times_s[%i]: %i\n", i, + // bucket_arrival_times_s[i]); printf("bucket[%i]: %i\n", i, + // buckets[i].size()); for (const auto& prompt : buckets[i]) { + // printf("\tprompt: %i, %i\n", prompt.first, prompt.second); + // } + // } + + // Add fine-tuning request + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.benchmarking_tokens = 1024; + fine_tuning_req.max_sequence_length = 1024; + fine_tuning_req.peft_model_id = + (peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID; + fine_tuning_req.max_training_steps = 1000000000; + RequestManager::RequestGuid ft_guid = + rm->register_new_peft_request(fine_tuning_req); + if (ft_guid != RequestManager::INVALID_GUID) { + const std::lock_guard lock(guids->request_queue_mutex); + guids->peft_queue.push(ft_guid); + } + + // Replay the trace of inference requests + auto start_time = std::chrono::steady_clock::now(); + for (int i = 0; i < bucket_arrival_times_s.size(); i++) { + if (bucket_arrival_times_s[i] >= max_buckets_to_run) { + break; + } + // sleep until bucket arrives + auto bucket_arrival_time = + start_time + + std::chrono::milliseconds(bucket_arrival_times_s[i] * nb_millisecs); + std::this_thread::sleep_until(bucket_arrival_time); + + // create inference requests for the bucket + std::vector requests; + for (auto const &prompt : buckets[i]) { + // printf("Prompt length: %d, sequence length: %d\n", prompt_length, + // sequence_length); + Request inference_req; + inference_req.benchmarking_tokens = prompt.first; + inference_req.max_sequence_length = prompt.second + prompt.first; + inference_req.peft_model_id = + (peft_model_id != nullptr) ? 
*peft_model_id : PEFTModelID::NO_ID; + requests.push_back(inference_req); + } + + { + const std::lock_guard lock(guids->request_queue_mutex); + for (int i = 0; i < requests.size(); i++) { + RequestManager::RequestGuid guid = + rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + guids->inf_queue.push(guid); + } + } + } + } + + { // Notify the consumer that no more requests are incoming + const std::lock_guard lock(guids->request_queue_mutex); + guids->producer_finished = true; + } + } + + // Wait for consumer to finish + consumer.join(); + + // terminate the request manager by stopping the background thread + rm->terminate_background_server(); + + // Execution fence + { + Future future = runtime->issue_execution_fence(ctx); + future.get_void_result(); + } + + // float* data + std::cout << "----------inference finished--------------" << std::endl; + + // free tokenizer space in memory +} + +void FlexFlow::register_custom_tasks() {} diff --git a/inference/python/ff_peft.py b/inference/python/ff_peft.py new file mode 100644 index 0000000000..a7d38a66b6 --- /dev/null +++ b/inference/python/ff_peft.py @@ -0,0 +1,189 @@ +# Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import flexflow.serve as ff +import argparse, json, os +from types import SimpleNamespace + + +def get_configs(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-config-file", + help="The path to a JSON file with the configs. 
If omitted, a sample model and configs will be used instead.", + type=str, + default="", + ) + args = parser.parse_args() + + # Load configs from JSON file (if specified) + if len(args.config_file) > 0: + if not os.path.isfile(args.config_file): + raise FileNotFoundError(f"Config file {args.config_file} not found.") + try: + with open(args.config_file) as f: + return json.load(f) + except json.JSONDecodeError as e: + print("JSON format error:") + print(e) + else: + # Define sample configs + ff_init_configs = { + # required parameters + "num_gpus": 2, + "memory_per_gpu": 14000, + "zero_copy_memory_per_node": 10000, + # optional parameters + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 2, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": True, + "fusion": False, + } + model_configs = { + # required parameters + "base_model": "JackFram/llama-160m", + "inference_peft_model_id": "goliaro/llama-160m-lora", + "finetuning_peft_model_id": "goliaro/llama-160m-lora", + # "base_model": "meta-llama/Meta-Llama-3-8B", + # "inference_peft_model_id": "goliaro/llama-3-8b-lora", + # "finetuning_peft_model_id": "goliaro/llama-3-8b-lora-dolly", + # optional parameters + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": True, + "prompt": "", + "finetuning_dataset": os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "../prompt/peft_dataset.json", + ), + "output_file": "", + } + # Merge dictionaries + ff_init_configs.update(model_configs) + return ff_init_configs + + +def main(): + configs_dict = get_configs() + configs = SimpleNamespace(**configs_dict) + + # Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs + ff.init(configs_dict) + + # Create the FlexFlow LLM + ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF + ) + llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, + ) + # Add inference and/or finetuning lora + lora_inference_config = None + lora_finetuning_config = None + if len(configs.prompt) > 0: + lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + base_model_name_or_path=configs.base_model, + ) + llm.add_peft(lora_inference_config) + if len(configs.finetuning_dataset) > 0: + # lora_finetuning_config = ff.LoraLinearConfig( + # llm.cache_path, + # configs.finetuning_peft_model_id, + # target_modules=["down_proj"], + # rank=16, + # lora_alpha=16, + # trainable=True, + # init_lora_weights=True, + # optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + # ) + lora_finetuning_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + trainable=True, + base_model_name_or_path=configs.base_model, + optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + optimizer_kwargs={ + "learning_rate": 0.001, + "momentum": 0.0, + "weight_decay": 0.0, + "nesterov": False, + }, + ) + llm.add_peft(lora_finetuning_config) + + # Compile the LLM for inference and load the weights into memory + generation_config = ff.GenerationConfig( + do_sample=False, temperature=0.9, topp=0.8, topk=1 + ) + enable_peft_finetuning = len(configs.finetuning_dataset) > 0 + llm.compile( + generation_config, + enable_peft_finetuning=enable_peft_finetuning, + max_requests_per_batch=1 if not enable_peft_finetuning else 2, + max_seq_length=256, + max_tokens_per_batch=128, + ) + + llm.start_server() + + requests = [] + # Serving + if len(configs.prompt) > 0: + prompts = [s for s in json.load(open(configs.prompt))] + inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, + prompt=prompt, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(lora_inference_config), + ) + for prompt in prompts + ] + requests += inference_requests + # Finetuning + if len(configs.finetuning_dataset) > 0: + finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=128, + peft_model_id=llm.get_ff_peft_id(lora_finetuning_config), + dataset_filepath=configs.finetuning_dataset, + max_training_steps=2, + ) + requests.append(finetuning_request) + + results = llm.generate(requests) + + llm.stop_server() + + +if __name__ == "__main__": + print("flexflow PEFT example") + main() diff --git a/inference/python/incr_decoding.py b/inference/python/incr_decoding.py index 05599ea6b9..f888982f2c 100644 --- a/inference/python/incr_decoding.py +++ b/inference/python/incr_decoding.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/inference/python/peft_demo/INSTRUCTIONS.md b/inference/python/peft_demo/INSTRUCTIONS.md new file mode 100644 index 0000000000..9b2a7a53b2 --- /dev/null +++ 
b/inference/python/peft_demo/INSTRUCTIONS.md @@ -0,0 +1,25 @@ +## Peft Demo +* `git clone -b peft --recursive https://github.com/flexflow/FlexFlow.git` +* `cd FlexFlow/` + +* If you wish to run the demo by installing FlexFlow + * `conda env create -f conda/flexflow.yml` + * `conda activate flexflow` + +* If you wish to run the demo using a Docker container + * `export FF_CUDA_ARCH=all && export cuda_version=12.0 && ./docker/build.sh flexflow && ./docker/run.sh flexflow` + +* Then, install the Llama2 model (the `meta-llama/Llama-2-7b-hf` model is gated, so make sure to add your HF access token) + + * `export HUGGINGFACE_TOKEN="[Your token]"` + * `huggingface-cli login --token "$HUGGINGFACE_TOKEN"` + * `python3 inference/utils/download_peft_model.py "goliaro/llama-2-7b-lora-full" --base_model_name "meta-llama/Llama-2-7b-hf"` + +* Run the demo + ``` + mkdir inference/output + cd inference/python/peft_demo/ + python3 demo.py -config-file demo_config.json + ``` + + diff --git a/inference/python/peft_demo/demo.ipynb b/inference/python/peft_demo/demo.ipynb new file mode 100644 index 0000000000..dfb5193a1d --- /dev/null +++ b/inference/python/peft_demo/demo.ipynb @@ -0,0 +1,1907 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FlexFlow Co-Serving Demo\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import json, random, subprocess, os\n", + "from datasets import load_dataset\n", + "from types import SimpleNamespace\n", + "from huggingface_hub import HfFolder\n", + "import flexflow.serve as ff\n", + "import matplotlib.pyplot as plt\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def create_datasets(finetune_dataset_size=2, inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'):\n", + " \"\"\"Creates the inference and finetuning datasets according to the data from https://huggingface.co/datasets/databricks/databricks-dolly-15k.\n", + " Only the 'open_qa' and 'closed_qa' prompts without context are kept.\n", + " The datasets are saved into the files given as arguments.\n", + "\n", + " Keyword arguments:\n", + " dataset_size -- the number of prompts to consider\n", + " inference_file_path -- the file in which to save the inference data\n", + " finetuning_file_path -- the file in which to save the finetuning data\n", + " \"\"\"\n", + " dataset = load_dataset(\"databricks/databricks-dolly-15k\", split=\"train\")\n", + " inference_data = []\n", + " finetuning_data = []\n", + " for row in dataset:\n", + " if len(finetuning_data) == finetune_dataset_size:\n", + " break\n", + " if (\"open_qa\" in row['category'] or \"closed_qa\" in row['category']) and len(row['context']) == 0:\n", + " inference_data.append(row['instruction'])\n", + " finetuning_data.append(row['instruction'] + \" \" + row['response'])\n", + " with open(inference_file_path, 'w') as file:\n", + " json.dump(inference_data[:1], file)\n", + " with open(finetuning_file_path, 'w') as file:\n", + " json.dump(finetuning_data[:1], file, indent=2, separators=(',', ': '))" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuration fields" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "configs_dict = {\n", + " \"num_gpus\": 1,\n", + " \"memory_per_gpu\": 21000,\n", + " \"zero_copy_memory_per_node\": 40000,\n", + " \"num_cpus\": 4,\n", + " \"legion_utility_processors\": 4,\n", + " \"data_parallelism_degree\": 1,\n", + " \"tensor_parallelism_degree\": 1,\n", + " \"pipeline_parallelism_degree\": 1,\n", + " \"offload\": False,\n", + " \"offload_reserve_space_size\": 8 * 1024, # 8GB\n", + " \"use_4bit_quantization\": False,\n", + " \"use_8bit_quantization\": False,\n", + " \"enable_peft\": True,\n", + " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", + " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", + " \"profiling\": False,\n", + " \"inference_debugging\": False,\n", + " \"fusion\": False,\n", + " \"max_requests_per_batch\": 1,\n", + " \"max_sequence_length\": 128,\n", + " \"max_tokens_per_batch\": 128,\n", + " \"max_training_steps\": 100,\n", + " \"seed\": 42,\n", + "}\n", + "model_configs = {\n", + " \"base_model\": \"meta-llama/Meta-Llama-3-8B\",\n", + " \"inference_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"finetuning_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"cache_path\": os.environ.get(\"FF_CACHE_PATH\", \"\"),\n", + " \"refresh_cache\": False,\n", + " \"full_precision\": False,\n", + " # relative paths\n", + " \"inference_dataset\": \"inference_dataset.json\",\n", + " \"finetuning_dataset\": \"/usr/FlexFlow/inference/prompt/peft_dataset.json\",\n", + " \"output_file\": \"peft_demo.txt\",\n", + "}\n", + "generation_configs = {\n", + " \"do_sample\": False,\n", + " \"temperature\": 0.9,\n", + " \"topp\": 0.8,\n", + " \"topk\": 1,\n", + "}\n", + "finetuning_configs = {\n", + " \"learning_rate\": 0.001,\n", + " \"momentum\": 0.0,\n", + " \"weight_decay\": 0.0,\n", + " \"nesterov\": False,\n", + "}\n", + "# Merge dictionaries\n", + "configs_dict.update(model_configs)\n", + "configs_dict.update(generation_configs)\n", + "configs_dict.update(finetuning_configs)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "random.seed(configs_dict[\"seed\"])\n", + "\n", + "configs = SimpleNamespace(**configs_dict)\n", + "\n", + "create_datasets(inference_file_path=configs_dict[\"inference_dataset\"], \n", + " finetuning_file_path=configs_dict[\"finetuning_dataset\"])\n", + "\n", + "# Clear output file\n", + "with open(configs.output_file, 'w') as file:\n", + " file.write('')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download base and peft inference models" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n" + ] + }, + { + "data": { + "text/plain": [ + "CompletedProcess(args=['python', '../../utils/download_peft_model.py', 'goliaro/llama-3-8b-lora', '--base_model_name', 'meta-llama/Meta-Llama-3-8B'], returncode=0)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model]\n", + "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize FlexFlow runtime and LLM object" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 - 7f4d49d21280] 0.672934 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.672995 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673107 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673118 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7f4d49d21280] 0.673124 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "workSpaceSize (128 MB)\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Saving goliaro/llama-3-8b-lora configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora/config.json...\n", + "Loading tokenizer...\n", + "Adding layer layers.0.mlp.down_proj.lora\n", + "Adding layer layers.1.mlp.down_proj.lora\n", + "Adding layer layers.2.mlp.down_proj.lora\n", + "Adding layer layers.3.mlp.down_proj.lora\n", + "Adding layer layers.4.mlp.down_proj.lora\n", + "Adding layer layers.5.mlp.down_proj.lora\n", + "Adding layer layers.6.mlp.down_proj.lora\n", + "Adding layer layers.7.mlp.down_proj.lora\n", + "Adding layer layers.8.mlp.down_proj.lora\n", + "Adding layer layers.9.mlp.down_proj.lora\n", + "Adding layer layers.10.mlp.down_proj.lora\n", + "Adding layer layers.11.mlp.down_proj.lora\n", + "Adding layer layers.12.mlp.down_proj.lora\n", + "Adding layer layers.13.mlp.down_proj.lora\n", + "Adding layer layers.14.mlp.down_proj.lora\n", + "Adding layer layers.15.mlp.down_proj.lora\n", + "Adding layer layers.16.mlp.down_proj.lora\n", + "Adding layer layers.17.mlp.down_proj.lora\n", + "Adding layer layers.18.mlp.down_proj.lora\n", + "Adding layer layers.19.mlp.down_proj.lora\n", + "Adding layer layers.20.mlp.down_proj.lora\n", + "Adding layer layers.21.mlp.down_proj.lora\n", + "Adding layer layers.22.mlp.down_proj.lora\n", + "Adding layer layers.23.mlp.down_proj.lora\n", + "Adding layer layers.24.mlp.down_proj.lora\n", + "Adding layer layers.25.mlp.down_proj.lora\n", + "Adding layer layers.26.mlp.down_proj.lora\n", + "Adding layer layers.27.mlp.down_proj.lora\n", + "Adding layer layers.28.mlp.down_proj.lora\n", + "Adding layer layers.29.mlp.down_proj.lora\n", + "Adding layer layers.30.mlp.down_proj.lora\n", + "Adding layer layers.31.mlp.down_proj.lora\n" + ] + } + ], + "source": [ + "# Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs\n", + "ff.init(configs_dict)\n", + "\n", + "# Create the FlexFlow LLM\n", + "ff_data_type = (\n", + " ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF\n", + ")\n", + "llm = ff.LLM(\n", + " configs.base_model,\n", + " data_type=ff_data_type,\n", + " cache_path=configs.cache_path,\n", + " refresh_cache=configs.refresh_cache,\n", + " output_file=configs.output_file,\n", + ")\n", + "# Add inference and/or finetuning lora\n", + "lora_inference_config = None\n", + "lora_finetuning_config = None\n", + "if len(configs.inference_dataset) > 0:\n", + " lora_inference_config = ff.LoraLinearConfig(\n", + " llm.cache_path, \n", + " configs.inference_peft_model_id,\n", + " base_model_name_or_path=configs.base_model\n", + " )\n", + " llm.add_peft(lora_inference_config)\n", + "if len(configs.finetuning_dataset) > 0:\n", + " lora_finetuning_config = ff.LoraLinearConfig(\n", + " llm.cache_path,\n", + " configs.finetuning_peft_model_id,\n", + " trainable=True,\n", + " init_lora_weights=False,\n", + " rank=16,\n", + " lora_alpha=16.0,\n", + " # target_modules = [\"down_proj\"],\n", + " base_model_name_or_path=configs.base_model,\n", + " optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD,\n", + " optimizer_kwargs={\n", + " \"learning_rate\": configs.learning_rate,\n", + " \"momentum\": configs.momentum,\n", + " \"weight_decay\": configs.weight_decay,\n", + " \"nesterov\": configs.nesterov,\n", + " },\n", + " )\n", + " llm.add_peft(lora_finetuning_config)\n", + "\n", + "# Compile the LLM for inference and load the weights into memory\n", + "generation_config = ff.GenerationConfig(\n", + " do_sample=configs.do_sample,\n", + " temperature=configs.temperature,\n", + " topp=configs.topp,\n", + " topk=configs.topk\n", + ")\n", + "enable_peft_finetuning = len(configs.finetuning_dataset) > 0\n", + "llm.compile(\n", + " generation_config,\n", + " enable_peft_finetuning=enable_peft_finetuning,\n", + " max_requests_per_batch=configs.max_requests_per_batch+int(enable_peft_finetuning),\n", + " max_seq_length=configs.max_sequence_length,\n", + " max_tokens_per_batch=configs.max_tokens_per_batch,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start the LLM Co-serving system" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Background server started.\n", + "2024-07-22 06:45:43 - ###PEFT DEBUGGING### Starting background serving task.\n", + "2024-07-22 06:45:43 - ###PEFT DEBUGGING### Updated models' configuration.\n", + "###PEFT DEBUGGING### LLM Model object exists.\n", + "###PEFT DEBUGGING### Model object exists.\n", + "###PEFT DEBUGGING### Model object still exists.\n", + "###PEFT DEBUGGING### Entering compile_inference.\n", + "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node.\n" + ] + } + ], + "source": [ + "llm.start_server()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate inference" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "###PEFT DEBUGGING### Launching graph optimization task.\n", + "[]\n", + "num_nodes = 1 num_gpus_per_node = 1\n", + "[0]10445\n", + "[1]649\n", + "[2]6730\n", + "[3]2053\n", + "[4]18167\n", + "[5]369\n", + "[6]1317\n", + "[7]2085\n", + "[8]3090\n", + "[9]30\n", + "No small 
speculative model registered, using incremental decoding.\n", + "[0 - 7f4d49d21280] 1.600215 {3}{RequestManager}: [1000000]New request tokens: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30\n", + "optimal_views.size = 262\n", + "views.size() = 262\n", + "###PEFT DEBUGGING### Operators reconstructed from optimized graph.\n", + "###PEFT DEBUGGING### Starting inplace optimizations.\n", + "###PEFT DEBUGGING### Mapping output tensors.\n", + "ndim(1) dims[1 0 0 0]\n", + "###PEFT DEBUGGING### Setting up NCCL communications.\n", + "###PEFT DEBUGGING### compile_inference completed successfully.\n", + "Loading weight file embed_tokens.weight\n", + "Loading weight file layers.0.input_layernorm.weight\n", + "Loading weight file layers.0.self_attn.q_proj.weight\n", + "Loading weight file layers.0.self_attn.k_proj.weight\n", + "Loading weight file layers.0.self_attn.v_proj.weight\n", + "Loading weight file layers.0.self_attn.o_proj.weight\n", + "Loading weight file layers.0.post_attention_layernorm.weight\n", + "Loading weight file layers.0.mlp.gate_proj.weight\n", + "Loading weight file layers.0.mlp.up_proj.weight\n", + "Loading weight file layers.0.mlp.down_proj.weight\n", + "Loading weight file layers.1.input_layernorm.weight\n", + "Loading weight file layers.1.self_attn.q_proj.weight\n", + "Loading weight file layers.1.self_attn.k_proj.weight\n", + "Loading weight file layers.1.self_attn.v_proj.weight\n", + "Loading weight file layers.1.self_attn.o_proj.weight\n", + "Loading weight file layers.1.post_attention_layernorm.weight\n", + "Loading weight file layers.1.mlp.gate_proj.weight\n", + "Loading weight file layers.1.mlp.up_proj.weight\n", + "Loading weight file layers.1.mlp.down_proj.weight\n", + "Loading weight file layers.2.input_layernorm.weight\n", + "Loading weight file layers.2.self_attn.q_proj.weight\n", + "Loading weight file layers.2.self_attn.k_proj.weight\n", + "Loading weight file layers.2.self_attn.v_proj.weight\n", + "Loading weight file layers.2.self_attn.o_proj.weight\n", + "Loading weight file layers.2.post_attention_layernorm.weight\n", + "Loading weight file layers.2.mlp.gate_proj.weight\n", + "Loading weight file layers.2.mlp.up_proj.weight\n", + "Loading weight file layers.2.mlp.down_proj.weight\n", + "Loading weight file layers.3.input_layernorm.weight\n", + "Loading weight file layers.3.self_attn.q_proj.weight\n", + "Loading weight file layers.3.self_attn.k_proj.weight\n", + "Loading weight file layers.3.self_attn.v_proj.weight\n", + "Loading weight file layers.3.self_attn.o_proj.weight\n", + "Loading weight file layers.3.post_attention_layernorm.weight\n", + "Loading weight file layers.3.mlp.gate_proj.weight\n", + "Loading weight file layers.3.mlp.up_proj.weight\n", + "Loading weight file layers.3.mlp.down_proj.weight\n", + "Loading weight file layers.4.input_layernorm.weight\n", + "Loading weight file layers.4.self_attn.q_proj.weight\n", + "Loading weight file layers.4.self_attn.k_proj.weight\n", + "Loading weight file layers.4.self_attn.v_proj.weight\n", + "Loading weight file layers.4.self_attn.o_proj.weight\n", + "Loading weight file layers.4.post_attention_layernorm.weight\n", + "Loading weight file layers.4.mlp.gate_proj.weight\n", + "Loading weight file layers.4.mlp.up_proj.weight\n", + "Loading weight file layers.4.mlp.down_proj.weight\n", + "Loading weight file layers.5.input_layernorm.weight\n", + "Loading weight file layers.5.self_attn.q_proj.weight\n", + "Loading weight file layers.5.self_attn.k_proj.weight\n", + "Loading weight file 
+ "Loading weight file layers.5.self_attn.v_proj.weight\n",
+ "... (repeated 'Loading weight file layers.<N>.*' lines for layers 5 through 31, then norm.weight and lm_head.weight, omitted for brevity) ...\n",
+ "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n",
+ "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n",
+ "... (analogous 'Loading LORA weight layers.<N>.mlp.down_proj.lora_{A,B}.weight' lines for layers 1 through 27, omitted for brevity) ...\n",
+ "Loading LORA weight layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n",
+ "... (remaining 'Loading LORA weight' lines for layers 28 through 31 and repeated '{RequestManager}: Output token is: <id>' decoding logs, omitted for brevity) ...\n",
+ "[0 - 7f4ce0178740] 29.945878 {3}{RequestManager}: [Done] guid(1000000) final_length(128)\n",
+ "[0 - 7f4ce0178740] 29.945889 {3}{RequestManager}: Final output: <|begin_of_text|>Why can camels survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without water? What is the reason behind the fact that camels can survive for long without\n",
+ "[0 - 7f4ce0178740] 29.945900 {3}{RequestManager}: [Profile] guid(1000000) llm_decoding_steps(117) start(23696232.0) finish(29945893.0) latency(6249661.0) ttft(22415078.0)\n"
+ ]
+ }
+ ],
+ "source": [
+ "prompts = [s for s in json.load(open(configs.inference_dataset))]\n",
+ "inference_requests = [\n",
+ " ff.Request(\n",
+ " ff.RequestType.REQ_INFERENCE,\n",
+ " prompt=prompt,\n",
+ " max_sequence_length=configs.max_sequence_length,\n",
+ " peft_model_id=llm.get_ff_peft_id(lora_inference_config),\n",
+ " )\n",
+ " for prompt in prompts\n",
+ "]\n",
+ "inf_req_res_1 = llm.generate(inference_requests)\n",
+ "with open(\"before_finetuning.txt\", \"w\") as file:\n",
+ " file.write(str(inf_req_res_1[0].output_text))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Perform finetuning on the dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[]\n",
+ "No small speculative model registered, using incremental decoding.\n",
+ "[0 - 7f4d49d21280] 29.957050 {3}{RequestManager}: [0] input: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30 8215 2053 1005 279 8834 304 872 305 12055 311 2567 1124 10409 449 4907 323 88000 369 1317 18852 315 892 13\n",
+ "[0 - 7f4d49d21280] 29.957061 {3}{RequestManager}: [0] output:\n",
+ "Loss: 2.6536\n",
+ "... (per-step 'Loss: <value>' lines decreasing monotonically over 100 training steps, omitted for brevity) ...\n",
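Note on the inference cell above: the notebook runs the same set of prompts through the base model (with the inference-only LoRA adapter attached) and saves the first completion to `before_finetuning.txt`, so it can be compared against the output produced after finetuning. A minimal sketch of that pre-/post-comparison step is shown below; it assumes the `ff`, `llm`, `configs`, and `lora_inference_config` objects defined earlier in the notebook, and the file names are illustrative.

```python
# Sketch: capture baseline generations before finetuning so they can be
# diffed against post-finetuning generations. Assumes `ff`, `llm`, `configs`,
# and `lora_inference_config` already exist (set up earlier in the notebook).
import json

prompts = json.load(open(configs.inference_dataset))
requests = [
    ff.Request(
        ff.RequestType.REQ_INFERENCE,
        prompt=p,
        max_sequence_length=configs.max_sequence_length,
        peft_model_id=llm.get_ff_peft_id(lora_inference_config),
    )
    for p in prompts
]
results = llm.generate(requests)

# Persist the first completion; "before_finetuning.txt" is the notebook's choice
# of filename, reused here so a later cell can compare against it.
with open("before_finetuning.txt", "w") as f:
    f.write(str(results[0].output_text))
```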
+ "Loss: 1.3562\n",
+ "[0 - 7f4ce0190740] 38.933268 {3}{RequestManager}: [Finetuning] guid(1000001) completed_training_steps(100) processed_finetuning_tokens(3400) latency(38933176.0)\n"
+ ]
+ }
+ ],
+ "source": [
+ "finetuning_request = ff.Request(\n",
+ " ff.RequestType.REQ_FINETUNING,\n",
+ " max_sequence_length=configs.max_sequence_length,\n",
+ " peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),\n",
+ " dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset),\n",
+ " max_training_steps=configs.max_training_steps,\n",
+ ")\n",
+ "ft_res = llm.generate([finetuning_request])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "... (base64-encoded matplotlib PNG of the per-step training-loss curve, omitted for brevity) ...",
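The finetuning cell above submits a single `REQ_FINETUNING` request pointing at the LoRA adapter registered for training and a local dataset file; the RequestManager then reports the per-step loss and a final summary line. A minimal sketch of that cell, with the loss values it returns plotted afterwards, is shown below. It assumes the same `ff`, `llm`, `configs`, and `lora_finetuning_config` objects from earlier cells, and the `finetuning_losses` attribute name is taken from the notebook's plotting cell context rather than guaranteed by this diff.

```python
# Sketch: run LoRA finetuning as a single FlexFlow request and plot the loss curve.
# Assumes `ff`, `llm`, `configs`, and `lora_finetuning_config` from earlier cells;
# the `finetuning_losses` field is an assumption based on the notebook's plot cell.
import os
import matplotlib.pyplot as plt

finetuning_request = ff.Request(
    ff.RequestType.REQ_FINETUNING,
    max_sequence_length=configs.max_sequence_length,
    peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
    dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset),
    max_training_steps=configs.max_training_steps,
)
ft_res = llm.generate([finetuning_request])

# Plot the per-step training loss reported by the RequestManager.
losses = ft_res[0].finetuning_losses  # hypothetical attribute name
plt.plot(range(1, len(losses) + 1), losses)
plt.xlabel("Training step")
plt.ylabel("Loss")
plt.title("LoRA finetuning loss")
plt.show()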
NPPVVnn322br/9dv33v/+1nx8bG6uHH35Y7733njp06GBP67L9o9CTCRMm6OjRo1qxYoUkaf369SosLHSoXTl27JjuvfdepaWlKTY2Vu3atVNSUpJKSkp06NChunwUdj/99JO+/fZbJSUlOXzZgrc9e/bUev0111yjY8eO6e2335YkHTlyRO+++67Gjh1r/0dt//79NXr0aM2ZM0ft2rXTiBEjtHDhQo81VN6aPXu2du/erWeffTYg94uLi1NSUpLDsVatWik1NdXpH+qtWrVy+Q/wU045xeF5ixYtlJycbK+jsQXkl156qdNn/+GHHzp97tHR0UpNTfU4dl/muD9OOeUUDRo0yOkrISHB4bxu3bo5fVa2OWX7DH755Re347S9Lkk///yzTjvtNK+6TJ500kkOz21BlO33qK4/twAiB4ETAHipX79++vnnn/Xiiy/qrLPO0j//+U+df/75+uc//2k/JycnRz/++KPmzp2ruLg43XPPPTr99NP11Vdfebz/VVddpVatWtmbC+Tm5ioqKkrjxo2zn3PLLbfogQceUFZWlpYuXaoPP/xQ+fn5atu2rSorK2u9f81/tNrYmjXYVFZW6uyzz3a5ipCfn6+pU6fW+j6XXHKJ0tPTtXTpUknSO++8o2PHjumaa65xGEteXp69zXpRUZFuuOEGXXDBBQ4rVf7q16+fBgwY4HbVydvPwiYqKsqn40Yte3K5Y/v9e+WVV1x+7raA2iY2NlZNmvDXuCfe/B7V5ecWQOSgOQSAiJaUlKT4+Hht2bLF6bUffvhBTZo0UVpamv1YYmKirr/+el1//fU6cuSI+vXrp9mzZzukcXXt2lV/+MMf9Ic//EE//fSTzj33XP3tb3/Tq6++WutYYmNjNWbMGL388sv69ddf9cYbb+jSSy9Vx44d7efk5eVp0qRJ+tvf/mY/dvz4cZWUlHj8Xtu0aePyvF9++cUhhatr1676z3/+o8suu8xtgOFJVlaWnnjiCZWWlur1119Xenq6LrnkEqfzLrnkEl1yySV64IEHlJubqwkTJmjJkiUOn6e/Zs+erQEDBui5555zes226lDz87CtaNSHn376SQMHDrQ/P3LkiIqLi+17INkaGrRv316DBg0K2Pv6Osfry//+9z8ZhuEwp3788UdJsjdg6Ny5s9tx2l6XzM/q3//+tyoqKhzSH+vC359bAJGD/6oCENGioqI0ZMgQrVixwqH18K+//qrc3Fz17dvXnnK0f/9+h2tbtGihbt262dPLysrKdPz4cYdzunbtqpYtW3qdgjZhwgRVVFTo97//vfbu3evUYjoqKsppNeMf//iH25WSmmPZuHGjTpw4YT+2cuVKpzStrKwsFRUV6fnnn3e6x7Fjx+w1WLW55pprVF5erpdeeknvv/++Qx2QZKZJ1fw+zj33XEly+Kx+/vlnl+2lvdG/f38NGDBADz/8sNPvS+fOnRUVFaU1a9Y4HH/mmWf8ei9vLFiwwKGea/78+frtt980bNgwSdLQoUOVkJCgBx980GXdl22PJV/5Msfr065du7R8+XL789LSUr388ss699xz7f85cMUVV+izzz7Thg0b7OcdPXpUCxYsUHp6ur1uavTo0dq3b5+eeuopp/fxdbUvED+3ACIDK04AIsKLL76o999/3+n4jBkzdP/99ys/P199+/bV1KlTFR0dreeee07l5eWaN2+e/dwzzjhDAwYM0AUXXKDExERt2rRJeXl5mj59uiTzf88vu+wyZWVl6YwzzlB0dLSWL1+uX3/91SHdrjb9+/dXamqqVqxYoWbNmikzM9Ph9auuukqvvPKKWrVqpTPOOEMbNmzQRx99pLZt23q89+9+9zvl5eXp8ssvV1ZWln7++We9+uqrDq2bJem6667T0qVLddNNN2nVqlXq06ePrFarfvjhBy1dulQffPCBevbsWet7nX/++erWrZvuvvtulZeXO6TpSdJLL72kZ555RqNGjVLXrl11+PBhPf/880pISLCvwEjSZZddJkl+76cza9Ysh1Uem1atWmns2LH6xz/+IYvFoq5du2rlypUe67fq4sSJE/b5sWXLFj3zzDPq27evrr76aklms4T58+fruuuu0/nnn69x48YpKSlJ27dv17/+9S/16dPHZaDgDW/nuD++/PJLl6syXbt2Va9evezPTz31VN144436/PPP1aFDB7344ov69ddftXDhQvs5d955pxYvXqxhw4bp1ltvVWJiol566SVt27ZNb775pj01ceLEiXr55Zc1c+ZMffbZZ8rIyNDRo0f10UcfaerUqRoxYoTX4w/Ezy2ACBHEjn4AUO9sraDdfe3YscMwDMP48ssvjaFDhxotWrQw4uPjjYEDBxrr1693uNf9999vXHTRRUbr1q2NZs2aGd27dzceeOABe4vpffv2GdOmTTO6d+9uNG/e3GjVqpVx8cUXG0uXLvVpzLfffrshycjKynJ67eDBg8b1119vtGvXzmjRooUxdOhQ44cffnBqNe6u3fbf/vY3IyUlxYiNjTX69OljbNq0yakduWEYxokTJ4yHH37YOPPMM43Y2FijTZs2xgUXXGDMmTPHOHTokFffx913321IMrp16+b02pdffmmMHz/eOOmkk4zY2Fijffv2xlVXXWVs2rTJ4bzOnTsbnTt39vhe1duR19S/f39DklMr+b179xqjR4824uPjjTZt2hi///3vjW+++cZlO/LmzZu7vO+ZZ57pdLxm63PbHFy9erUxZcoUo02bNkaLFi2MCRMmGPv373e6ftWqVcbQoUONVq1aGXFxcUbXrl2NyZMnO3w27sZUG2/meCDbkVefj7bP5IMPPjDOOeccIzY21ujevbvxxhtvON33559/NsaMGWO0bt3aiIuLMy666CJj5cqVTueVlZUZd999t9GlSxcjJibG6NixozFmzBh7y/XavhdVaxkfqJ9bAI2fxTD8qGAFAADwUnp6us466yytXLky2EMBAL9R4wQAAAAAHhA4AQAAAIAHBE4AAAAA4AE1TgAAAADgAStOAAAAAOABgRMAAAAAeBBxG+BWVlZq165datmypSwWS7CHAwAAACBIDMPQ4cOH1alTJ/sm2+5EXOC0a9cupaWlBXsYAAAAAELEjh07lJqaWus5ERc4tWzZUpL54SQkJAR5NFJFRYU+/PBDDRkyRDExMcEeDsIE8wb+YN7AX8wd+IN5A3809LwpLS1VWlqaPUaoTcQFTrb0vISEhJAJnOLj45WQkMAfKvAa8wb+YN7AX8wd+IN5A38Ea954U8JDcwgAAAAA8IDACQAAAAA8IHACAAAAAA8InAAAAADAAwInAAAAAPCAwAkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMAJAAAAADwgcAIAAAAADwicAAAAAMADAicAAAAA8IDAKYisVmn1aovWrEnR6tUWWa3BHhEAAAAAVwicgmTZMik9XRo8OFqPPdZTgwdHKz3dPA4AAAAgtBA4BcGyZdKYMdLOnY7Hi4rM4wRPAAAAQGghcGpgVqs0Y4ZkGM6v2Y7l5Ii0P
QAAACCEEDg1sLVrnVeaqjMMaccO8zwAAAAAoYHAqYEVFwf2PAAAAAD1j8CpgSUnB/Y8AAAAAPWPwKmBZWRIqamSxeL6dYtFSkszzwMAAAAQGgicGlhUlPTEE+avawZPtuePP26eBwAAACA0EDgFQWamlJcnpaQ4Hm/Z0jyemRmccQEAAABwjcApSDIzpcJCKT//N11xxVZJUuvW0siRwRwVAAAAAFcInIIoKkrq39/QpEnfKiHB0Pbt0po1wR4VAAAAgJoInEJAbGylxowxd7996aUgDwYAAACAEwKnEHHddZWSzBqno0eDPBgAAAAADgicQkTv3oZOPlk6ckRavjzYowEAAABQHYFTiLBYpIkTzV+TrgcAAACEFgKnEGILnD7+WNqxI7hjAQAAAFCFwCmEdOki9esnGYb02mvBHg0AAAAAm6AGTnPnztWFF16oli1bqn379ho5cqS2bNni8bqSkhJNmzZNycnJio2N1amnnqp33323AUZc/6qn6xlGcMcCAAAAwBTUwGn16tWaNm2aNm7cqPz8fFVUVGjIkCE6WktbuRMnTmjw4MEqLCxUXl6etmzZoueff14pKSkNOPL6M3as1KyZ9MMP0rPPSosXSwUFktUa7JEBAAAAkSs6mG/+/vvvOzxftGiR2rdvry+++EL9+vVzec2LL76oAwcOaP369YqJiZEkpaen1/dQG0xCgtSzp7R2rTR1atXx1FTpiSekzMzgjQ0AAACIVEENnGo6dOiQJCkxMdHtOW+//bZ69eqladOmacWKFUpKSlJ2drb+9Kc/KSoqyun88vJylZeX25+XlpZKkioqKlRRURHg78B3tjHYHpcvt2jt2ihJFofziooMjRkjLVli1ahR5PBFuprzBvAG8wb+Yu7AH8wb+KOh540v72MxjNCopKmsrNTVV1+tkpISrVu3zu153bt3V2FhoSZMmKCpU6fqf//7n6ZOnapbb71Vs2bNcjp/9uzZmjNnjtPx3NxcxcfHB/R7qCurVZoyZYj2749TzcDJZKhdu2N67rl8uYgRAQAAAPigrKxM2dnZOnTokBISEmo9N2QCp5tvvlnvvfee1q1bp9TUVLfnnXrqqTp+/Li2bdtmX2F67LHH9Mgjj6i4uNjpfFcrTmlpadq3b5/HD6chVFRUKD8/X4MHD9b69U01eLDnRcD8/N/Uv39I/LYhSKrPG1vKKuAJ8wb+Yu7AH8wb+KOh501paanatWvnVeAUEql606dP18qVK7VmzZpagyZJSk5OVkxMjENa3umnn67du3frxIkTatq0qcP5sbGxio2NdbpPTExMSP0Qx8TEaO9e73479u6NVggNHUEUavMY4YF5A38xd+AP5g380VDzxpf3CGpXPcMwNH36dC1fvlyffPKJunTp4vGaPn366H//+58qKyvtx3788UclJyc7BU3hJjk5sOcBAAAACIygBk7Tpk3Tq6++qtzcXLVs2VK7d+/W7t27dezYMfs5EydO1F133WV/fvPNN+vAgQOaMWOGfvzxR/3rX//Sgw8+qGnTpgXjWwiojAyze57FVXmTzONpaeZ5AAAAABpOUAOn+fPn69ChQxowYICSk5PtX6+//rr9nO3btzvULqWlpemDDz7Q559/rnPOOUe33nqrZsyYoTvvvDMY30JARUWZLccl98HT44+LxhAAAABAAwtqjZM3fSkKCgqcjvXq1UsbN26shxEFX2amlJcnzZgh7dxZdbxFC+mll9jHCQAAAAiGoK44wbXMTKmwUFq1SrItpEVHS1dcEdRhAQAAABGLwClERUVJAwZI999v1j2VlEhvvx3sUQEAAACRicApxEVFSRMnmr9etCioQwEAAAAiFoFTGJg82Xz84AOpqCioQwEAAAAiEoFTGDjlFKlvX6myUnrllWCPBgAAAIg8BE5h4vrrzceFCyUvmhECAAAACCACpzAxdqwUHy/9+KPUSDuxAwAAACGLwClMtGwpjRlj/nrhwuCOBQAAAIg0BE5hxJaut2SJVFYW3LEAAAAAkSQ62AOA9/r1k7p0kbZtkx54QDrrLCk5WcrIMNuWAwAAAKgfBE5hpEkT6aKLzMDpwQerjqemSk88IWVmBm9sAAAAQGNGql4YWbZMWrrU+XhRkVn/tGxZw48JAAAAiAQETmHCapVmzHDditx2LCfHPA8AAABAYBE4hYm1a6WdO92/bhjSjh3meQAAAAACi8ApTBQXB/Y8AAAAAN4jcAoTycmBPQ8AAACA9wicwkRGhtk9z2Jxf05amnkeAAAAgMAicAoTUVFmy3HJffA0Ywb7OQEAAAD1gcApjGRmSnl5UkqK4/G4OPPxqaekPXukggJp8WLzkS57AAAAQN2xAW6YycyURowwu+cVF5s1TWeeKV1yibR1q3TSSVJ5edX5bI4LAAAA1B0rTmEoKkoaMEAaP958TEqSpk0zX6seNElsjgsAAAAEAoFTI2C1Sn//u+vX2BwXAAAAqDsCp0aAzXEBAACA+kXg1AiwOS4AAABQvwicGgE2xwUAAADqF4FTI8DmuAAAAED9InBqBLzZHPeee9gcFwAAAPAXgVMj4W5z3JgY8/HVV6UTJ9gcFwAAAPAHG+A2Iq42x+3YUbrwQmnNGnO/p9LSqvPZHBcAAADwDitOjUzNzXG7d5duuMF8rXrQJLE5LgAAAOAtAqdGzmp1HxixOS4AAADgHQKnRo7NcQEAAIC6I3Bq5NgcFwAAAKg7AqdGjs1xAQAAgLqjq14jZ9sct6ioqqapptRUqXdvs0W5rRtfRgb7PgEAAAA2BE6NnG1z3DFjzM1xXQVPMTHSySebwZUNrcoBAACAKqTqRQB3m+MmJUnR0dK2bY5Bk0SrcgAAAKA6AqcIkZkpFRZKq1ZJubnm486dUps2rs+nVTkAAABQhVS9CGLbHNemoEDau9f9+dVblVe/DgAAAIg0BE4RzNsW5EVFNI4AAABAZCNwimDetiC/7TbHlSkaRwAAACDSUOMUwWytyi2W2s+rmc5H4wgAAABEGgKnCGZrVS55Dp6qo3EEAAAAIg2BU4SrrVV5bao3jgAAAAAaO2qcoMxMacQIMwiyNYAoKpKuvdbztd42mAAAAADCGYETJLluVe4NbxtMAAAAAOGMVD245E3jCFtrcgAAAKCxI3CCS940jvjtN7POqaBAWrzYfKRZBAAAABojUvXglq1xxIwZ0s6dVcc7dZIqK6Xdu6Vu3RyDJfZ4AgAAQGPEihNqlZkpFRZKq1ZJubnm4/bt0pw55us1V5jY4wkAAACNEStO8Khm4wirVbrvPtfnGoaZ2peTY3bqi4pqiBECAAAA9YsVJ/hs7VrH1L2aqu/xZLVSAwUAAIDwx4oTfObt3k0rVkjXXecYZFEDBQAAgHDEihN85u3eTY8/7rwyRQ0UAAAAwlFQA6e5c+fqwgsvVMuWLdW+fXuNHDlSW7Zs8fr6JUuWyGKxaOTIkfU3SDjxZo8ndwzDfMzJkU6cII0PAAAA4SGogdPq1as1bdo0bdy4Ufn5+aqoqNCQIUN09OhRj9cWFhbqj3/8ozLYgbXB1bbH
kzfBlK0GKjVVGjhQys42H9PTWYkCAABAaApq4PT+++9r8uTJOvPMM9WjRw8tWrRI27dv1xdffFHrdVarVRMmTNCcOXN08sknN9BoUZ1tj6eUFMfjqanmapI39u51fE4aHwAAAEJVSDWHOHTokCQpMTGx1vP++te/qn379rrxxhu1du3aWs8tLy9XeXm5/XlpaakkqaKiQhUVFXUccd3ZxhAKY/HV8OHSFVdI69ZZVFxs1j717Wto3TqLHn/c96lltjI3NGOGdMUVv9HKvBbhPG8QPMwb+Iu5A38wb+CPhp43vryPxTBsVSfBVVlZqauvvlolJSVat26d2/PWrVuncePGafPmzWrXrp0mT56skpISvfXWWy7Pnz17tubYdmutJjc3V/Hx8YEaPqqxWqUpU4Zo//44SX4UQkmaM2edmjSRDh6MU5s2x3XGGfsJpAAAABBQZWVlys7O1qFDh5SQkFDruSETON1888167733tG7dOqWmpro85/DhwzrnnHP0zDPPaNiwYZLkMXByteKUlpamffv2efxwGkJFRYXy8/M1ePBgxcTEBHs4AbN8uUXjxpmRjmFUD54MeRNMJSYaOnCg6ryUFEOPPWbVqFEhMV2DrrHOG9Qv5g38xdyBP5g38EdDz5vS0lK1a9fOq8ApJFL1pk+frpUrV2rNmjVugyZJ+vnnn1VYWKjhw4fbj1VWVkqSoqOjtWXLFnXt2tXhmtjYWMXGxjrdKyYmJqR+iENtPHWVlSVFR0szZji2JE9KsjjVNrlSPWiSpF27LBo3Llp5edKIEebmurb0wIwMRexqVGObN2gYzBv4i7kDfzBv4I+Gmje+vEdQAyfDMHTLLbdo+fLlKigoUJcuXWo9v3v37vr6668djv3lL3/R4cOH9cQTTygtLa0+hwsfZWY6Bzm9e0tdu5qNIHxZ6zTrn6QpU5yDMTbVBQAAQH0LauA0bdo05ebmasWKFWrZsqV2794tSWrVqpWaNWsmSZo4caJSUlI0d+5cxcXF6ayzznK4R+vWrSXJ6ThCQ1SUNGCA47EnnjC751ksvgdP+/c7H7d148vLI3gCAABA/QhqO/L58+fr0KFDGjBggJKTk+1fr7/+uv2c7du3q7i4OIijRKC5a2XuoZmiW2yqCwAAgPoW9FQ9TwoKCmp9fdGiRYEZDBqUqzQ+q1UaNMi/+1XfVLd6DRVpfAAAAAiEkGgOgchUM43PajUDHV/rn6pzt6kuaXwAAACoi6Cm6gHVRUWZq0OSWf8UCNXT+EjbAwAAgL8InBBS3NU/paZKbdv6F1DZ0vjWrg3MGAEAABB5SNVDyHFV/5SRIa1Y4V83PpuiIrNhBPs/AQAAwFcETghJrtqY21ajnDfVda5tcuW222gcAQAAAP+QqoewkpkpFRZKq1ZJubnm486dZhDkKY3PXeOIZcvqbbgAAABoJAicEHZsq1Hjx5uPTZv611SCxhEAAADwFoETGgV3TSWSkmq/ztY4oqCAjXMBAADgHjVOaDRcNZUoKpKuvdbztVlZ0oEDVc+pfwIAAEB1BE5oVGo2lSgo8O666kGTxMa5AAAAcESqHhq1jAzvGkfUVL3+6cQJ0vgAAAAiHStOaNSiosyUO3/2f7LVP6Wmum9jbrU67zfF3lAAAACNDytOaPTcNY5ITPTuendtzO+4Q0pPlwYOlLKzzcf0dNqbAwAANEasOCEiuGocYbVKgwb5fi/bqtUjjzi/VrM2ihUpAACAxoHACRGjZuMIq9VMuysq8i2FrzaGYaYE5uRIlZXSbbeZG/Ta0K0PAAAgPJGqh4hlq3+SfG8eURtbbdTYsY5Bk1S1IkU6HwAAQHghcEJE83fjXH9V79ZHdz4AAIDwQeCEiJeZKRUWSqtWSbm55uPOnf61MfeGbUVq7drA3xsAAAD1gxonQM71T5L/bcy9VVRk7gtF4wgAAIDQx4oT4Ia7NL60NOn2282AquaKlC8rVLfdRitzAACAcEHgBNTCVRrftm3SvHmug6rUVGnpUu/S/NztD0XwBAAAEHpI1QM8cJXGJ7neG8qWbhcV5XuaX/VW5lddJa1fTxofAABAqCBwAuqgtqAqL0+aMcOxJXlSkvNKU3W2xhGpqY7nsf8TAABAcJGqB9QTV2l+f/+7d9eSxgcAABBaWHEC6lHNFamCAv/uUzONb80ai9asSVHz5hYNHEgaHwAAQH0jcAIaUEaGmXZXVOR7i3PHNL5oST312GOOaXxWq+uaKwAAANQNqXpAA4qKMoMcyf/Ndd2l8d1xh9nSnBbnAAAAgUfgBDQwd/tDJSX5dz/DML8eecSxEYVEbRQAAECgEDgBQeCqccTOnd7t/+QLWzpgTo6ZxgcAAAD/UOMEBImrVuZPPOH7/k+e2Gqj1q513TodAAAAnrHiBISQQKfxVVdcXPd7AAAARCpWnIAQk5kpjRjh2B2vd2+pa1f/uvHZtG9vtkOn4x4AAIDvCJyAEBToNL64OGnSJDPwsqnexhwAAAC1I1UPCBPu0vjS0qTbbzcDKneNJY4fdwyaJMeOe1aruRq1eLH5WL2RRG2vAQAARApWnIAwYkvjW7XqN7333mYNG3auBg6MVlSUdMkl0owZji3JU1OlkhLpyBHnexmGGWhNmeL6Ott+U+5eY6UKAABEEgInIMxERUn9+xs6erRI/fv3sNcpuaqNslqlQYPc38swpP37nY8XFUmjR7u+xrZSlZdH8AQAACIHgRPQiNSsjVq82L/71FZDZVupyskxAzUaTAAAgEhAjRPQiCUn1899q+8NRQ0UAACIBAROQCOWkWHWJLlrGlFXK1ZI6enSwIFSdrb5mJ5uNpwAAABoTAicgEYsKqqqyUN9BE+PP+7YOEJy7NYHAADQWBA4AY2cuzbmqalS27aBD6hs9VE5OdKJE6TxAQCAxoHmEEAEcNVxLyPDTLVztalu9ee1veaOrQYqNVXau7fqOK3MAQBAuGLFCYgQto5748ebj1FRta9Gvfmm+eXqtZwc796zetAkkcYHAADCFytOQIRztxplazPu6rW1a836Jl/RyhwAAIQrAicATvs/eXrN1q2vqMhz2l5N1VuZu3tPAACAUEPgBMBntm59ruqjvFVUZDaMcLXKZbW6XwEDAAAIBgInAH6x1UfNmOHYkjwpybm2yZXbbnPdOEJyvidNJQAAQLAROAHwm6v6qN69pa5dPafxuWocMXq063NtTSXy8gieAABAcNBVD0Cd1OzW17Spf5vu1hZkVd8bir2gAABAMBA4AQg4d23Ok5L8v2f1phIAAAANjVQ9APXCVRpfUZF07bV1u29xcWDGBwAA4AsCJwD1pmYr84KCut+zfXu68QEAgIZH4ASgwdRl/ydJio6WrrvOcdWJbnwAAKAhUOMEoMHY9n+SnBtHVH/urqnEb785p+rZuvGNHu0YNNleGzNGWrasbuMGAAAIauA0d+5cXXjhhWrZsqXat2+vkSNHasuWLbVe8/zzzysjI0Nt2rRRmzZtNGjQIH322WcNNGIAdeWucURqqvTmm+aXq9dat3Z9P7rxAQCAhhDUwGn16tW
aNm2aNm7cqPz8fFVUVGjIkCE6evSo22sKCgo0fvx4rVq1Shs2bFBaWpqGDBmioqKiBhw5gLrIzJQKC6VVq6TcXPNx2zbzuKvXFi2SSkr8ey+68QEAgEAIao3T+++/7/B80aJFat++vb744gv169fP5TWvvfaaw/N//vOfevPNN/Xxxx9r4sSJ9TZWAIFVs3FEba8tXlz39ysqct9UAgAAwJOQag5x6NAhSVJiYqLX15SVlamiosLtNeXl5SovL7c/Ly0tlSRVVFSooqKiDqMNDNsYQmEsCB+RNm+Skiyq6x9XOTmG9u2rKp5KSTH02GNWjRrlR5eKMBVp8waBw9yBP5g38EdDzxtf3sdiGP70tgq8yspKXX311SopKdG6deu8vm7q1Kn64IMP9O233youLs7p9dmzZ2vOnDlOx3NzcxUfH1+nMQNoGFarNGXKEO3fHyfJVecI2x9jbrpKuHzdPPanP32uiy4q1nfftdXBg3Fq0+a4zjhjP6tRAABEgLKyMmVnZ+vQoUNKSEio9dyQCZxuvvlmvffee1q3bp1SU1O9uuahhx7SvHnzVFBQoHPOOcflOa5WnNLS0rRv3z6PH05DqKioUH5+vgYPHqyYmJhgDwdhIhLnzfLlFo0bZ0YzhlEVAFkshr0JhMXi+JqngMpiMZSYKMXFSUVFjX81KhLnDQKDuQN/MG/gj4aeN6WlpWrXrp1XgVNIpOpNnz5dK1eu1Jo1a7wOmh599FE99NBD+uijj9wGTZIUGxur2NhYp+MxMTEh9UMcauNBeIikeZOVZe7j5LxXk0WPP27+uuZrSUkW7d3r/p6GYdH+/c7Hd+2yaNy4aOXlNc49oCJp3iCwmDvwB/MG/mioeePLewQ1cDIMQ7fccouWL1+ugoICdenSxavr5s2bpwceeEAffPCBevbsWc+jBBAqMjOlESPMDnmumjzUfK2oSLr2Wt/fxzDM1aucHPOepO0BAICgBk7Tpk1Tbm6uVqxYoZYtW2r37t2SpFatWqlZs2aSpIkTJyolJUVz586VJD388MO69957lZubq/T0dPs1LVq0UIsWLYLzjQBoML504yso8P99bG3MCwrM+9KNDwCAyBbUfZzmz5+vQ4cOacCAAUpOTrZ/vf766/Zztm/fruLiYodrTpw4oTFjxjhc8+ijjwbjWwAQwjIyzM1zLe56RnghK0saOFDKzjYf09OlZcsCNkQAABAmgp6q50lBjf8yLiwsrJ/BAGh0oqKkJ56QxoyxNY7w/R4HDjg+Lyoy79dY658AAIBrQV1xAoD6lplpBjkpKY7HU1Oltm19X42yBV85OWabdAAAEBkInAA0epmZUmGhtGqVlJtrPhYWSgsWmK/7Ezzt2GE2orBazTqoxYvNR4IpAAAap5BoRw4A9c1VUwnbalTNNuaJic4peq6sWCFdd13N9uhmemBmphlEuesACAAAwguBE4CI5qrFudUqDRrk+Vrb/lHV2Wqg/vhHcxXKXVAFAADCC4ETgIhXczXKajWDnKIi3xtK2M5/5BHn12gsAQBA+KLGCQBqsHXjk5zrn+rS2pzGEgAAhC8CJwBwobZufDk5/t+3emMJAAAQPgicAMANV934tm0za6Lqqtq+3gAAIAxQ4wQAtXDVjS8jw/8aKJv27c325XTcAwAgPBA4AYCPbDVQY8aYNU++Bk9xcdKkSWbgZUPHPQAAQhupegDgB3c1UGlp0u23mwGVu0YSx487Bk1SVce9ZcvYVBcAgFDEihMA+MnVHlC2lLtLLnHeWDc1VSopkY4ccb6XYZiB1pQprq9jNQoAgOAicAKAOnBVAyX5t7GuYUj79zsfZ/8nAACCj8AJAOpJzaBq8WL/7mNbjcrJMYMxmkgAANDwCJwAoIEkJ/t/rW3/p4ICM3CiGx8AAA2LwAkAGkgg2phnZUkHDlQ9r17/ZLW6rrcCAAB1R1c9AGggtjbmkvuOe55UD5qkqvqnO+6Q0tOlgQOl7GzzMT3d7NIHAADqjsAJABqQuzbmqalS27a+B1SGYX498ohjJz7JscU5AACoGwInAGhgmZlSYaG0apWUm2s+FhZKCxaYr/u7GlWTLR0wJ0c6cUJavdqiNWtStHq1hb2hAADwETVOABAErtqY21ajau7jlJjonKLnLVtTidRUae/eaEk99dhj7A0FAICvWHECgBDiajVq6dK633fvXsfnpPEBAOCbOq04HT9+XHFxcYEaCwBAzqtRVmvdu/HVxN5QAAD4xucVp8rKSt13331KSUlRixYttHXrVknSPffcoxdeeCHgAwSASBeIbnyuVN8bqqDA3KC3oEDUPwEA4ILPgdP999+vRYsWad68eWratKn9+FlnnaV//vOfAR0cAMDkrhtfWpp0++1mQOVvUJWVRRtzAAA88Tlwevnll7VgwQJNmDBBUdVyO3r06KEffvghoIMDAFRxVf+0bZs0b57roCopybv7utsbiuAJAIAqPtc4FRUVqVu3bk7HKysrVVFREZBBAQBcc9WNTzKDqhEjpLVrpeJiKTlZ6t1b6trV99qomvVPkuN9MzKoiQIARB6fA6czzjhDa9euVefOnR2O5+Xl6bzzzgvYwAAAvnEVVD3xhLl6ZLH4Hjzt2CE98ID0/POO7dFpZQ4AiEQ+B0733nuvJk2apKKiIlVWVmrZsmXasmWLXn75Za1cubI+xggA8FNd94aaNcv5mC2VLy+P4AkAEDl8rnEaMWKE3nnnHX300Udq3ry57r33Xn3//fd65513NHjw4PoYIwCgDmy1Ufn5v2nmzE3Kz/+tTntD2VaucnKkEyfoyAcAiAx+7eOUkZGh/Pz8QI8FAFBPoqKk/v0NHT1apP79e6hJk7rtDWVL5UtNddxclzQ+AEBj5fOKEwAg/NW2N5Qvbc2rB00SHfkAAI2Xz4FTkyZNFBUV5fYLABAe3O0NlZoqzZnj3z1J4wMANFY+p+otX77c4XlFRYW++uorvfTSS5rj79+0AICgcNXGPCPDfO355/1L5SONDwDQGPkcOI2wbepRzZgxY3TmmWfq9ddf14033hiQgQEAGoa7vaH8bWVu4y6Nj258AIBwFLAap0suuUQff/xxoG4HAAgyd6l8SUn+3a96Gh9pewCAcONXV72ajh07pieffFIpNf92BQCENVepfL17S1271i2Nr6DAXOmqnh5ImSwAIJT5HDi1adNGlmotlwzD0OHDhxUfH69XX301oIMDAASfq1S+uqbxZWU5bsBbvf7JanWuuSKoAgAEm8+B09///neHwKlJkyZKSkrSxRdfrDZt2gR0cACA0GRL45sxQ9q5s+p4UpJzbZMr1YMmqar+6Y9/NLvwVb8nTSUAAKHA58Bp8uTJ9TAMAEC4CWQan+3cRx5xfo2mEgCAUOBV4PTf//7X6xuec845fg8GABBe6iONrybDMO+VkyNddZW0fj1pfACAhudV4HTuuefKYrHI8PA3oMVikZVWSQAQ0dyl8SUmOqfoeYu9oQAAweZV4LRt27b6HgcAoBFxlcZntUqDBtXtvuwNBQAIFq8Cp86dO9f3OAAAjUzNND
6r1Vwh8qeNuTvV0/hGjCBtDwBQf/zex+m7777T9u3bdeLECYfjV199dZ0HBQBofKKiAl//JFWl8a1d61xvBQBAoPgcOG3dulWjRo3S119/7VD3ZGtRTo0TAMAdd/VPaWnSuHHSo4+az/0JqoqKzI11aRwBAKgPTXy9YMaMGerSpYv27Nmj+Ph4ffvtt1qzZo169uypgoKCehgiAKAxycyUCgulVauk3Fzzcds2ad48M6hKSXE8PynJu/vedps0cKCUnW0+pqdLy5YFevQAgEjl84rThg0b9Mknn6hdu3Zq0qSJmjRpor59+2ru3Lm69dZb9dVXX9XHOAEAjYirNuZS3faGonEEAKA++bziZLVa1bJlS0lSu3bttGvXLklmA4ktW7YEdnQAgIhjC6rGjzcfmzY1a6MkszbKW7YgKydHOnHCTONbvNh8JKscAOArn1eczjrrLP3nP/9Rly5ddPHFF2vevHlq2rSpFixYoJNPPrk+xggAiHDuaqOSkpxXmqpj/ycAQKD4HDj95S9/0dGjRyVJf/3rX3XVVVcpIyNDbdu21euvvx7wAQIAILlO4ysqkq691vO1pPEBAOrK68CpZ8+e+t3vfqfs7GwlJCRIkrp166YffvhBBw4cUJs2beyd9QAAqA81a6P87UnE/k8AAF95XePUo0cP3XHHHUpOTtbEiRMdOuglJiYSNAEAGlxGhpl2589fQbY0voIC6p8AAJ55HTi98MIL2r17t55++mlt375dl112mbp166YHH3xQRUVF9TlGAABcsm2qK/kXPElSVhZtzAEAnvnUVS8+Pl6TJ09WQUGBfvzxR40bN07PPfec0tPTdeWVV2qZj3/TzJ07VxdeeKFatmyp9u3ba+TIkV515nvjjTfUvXt3xcXF6eyzz9a7777r0/sCABoPW+MIf/d/OnDA8bmt/ongCQBQnc/tyG26du2q+++/X4WFhVq8eLE2btyosWPH+nSP1atXa9q0adq4caPy8/NVUVGhIUOG2JtPuLJ+/XqNHz9eN954o7766iuNHDlSI0eO1DfffOPvtwIACHOuNtXdudO/NL7qbcytVvOLVD4AgM9d9aorKCjQwoUL9eabbyo6Olr/93//59P177//vsPzRYsWqX379vriiy/Ur18/l9c88cQTuvzyy3X77bdLku677z7l5+frqaee0rPPPuvfNwIACHuuNtV94glz9chiqX3z3Jps9U8PPCA9/7xjC3RamQNAZPI5cNq5c6cWLVqkRYsWaevWrcrIyNAzzzyjsWPHqlmzZnUazKFDhySZzSbc2bBhg2bOnOlwbOjQoXrrrbdcnl9eXq7y8nL789LSUklSRUWFKioq6jTeQLCNIRTGgvDBvIE/InHeDB8uLVli0cyZUSoqqlp6Skw0dOCA56WoWbNs0VbVuUVFhsaMkZYssWrUKB+isTAWiXMHdce8gT8aet748j4Ww/Du/+CWLl2qF198UR9//LHat2+vSZMm6YYbblC3bt38Hmh1lZWVuvrqq1VSUqJ169a5Pa9p06Z66aWXNH78ePuxZ555RnPmzNGvv/7qdP7s2bM1Z84cp+O5ubmKj48PyNgBAKHNapW++66tDh6MU5s2x1VZKc2a1deLKw1VD5qqH2/X7pieey6fVuYAEMbKysqUnZ2tQ4cO2bdccsfrFadrr71WV155pZYvX64rrrhCTZr4XR7l0rRp0/TNN9/UGjT546677nJYoSotLVVaWpqGDBni8cNpCBUVFcrPz9fgwYMVExMT7OEgTDBv4I9InzfDh1f92mqVFiwwtGuXZBiuAyMzYHK3KmXRvn3xat78SkVFVW3I27ev0SgDqUifO/AP8wb+aOh5Y8tG84bXgdPOnTvVvn17vwbkyfTp07Vy5UqtWbNGqamptZ7bsWNHp5WlX3/9VR07dnR5fmxsrGJjY52Ox8TEhNQPcaiNB+GBeQN/MG+kmBjpySdd1z+Zz73rKJGdHe3Qla+x1z8xd+AP5g380VDzxpf38HrZqD6CJsMwNH36dC1fvlyffPKJunTp4vGaXr166eOPP3Y4lp+fr169egV8fACAxstdG/PUVMlFhrdLtDIHgMhRp656dTVt2jTl5uZqxYoVatmypXbv3i1JatWqlb3RxMSJE5WSkqK5c+dKkmbMmKH+/fvrb3/7m6688kotWbJEmzZt0oIFC4L2fQAAwlNmpjRihLR2bVW6XUaG+drzz5uBkK/d+CwWs5X5VVdJ69c73rcxpvEBQKQIauA0f/58SdKAGv1jFy5cqMmTJ0uStm/f7lBP1bt3b+Xm5uovf/mL/vznP+uUU07RW2+9pbPOOquhhg0AaERctTGX6t7KPDVV2ru36nhjT+MDgMYuqIGTNw39CgoKnI6NHTvW5812AQDwhS2Vb8YMx32cEhOdU/RcqR40SVVpfHl5BE8AEI58bo23Y8cO7az2N8hnn32mnJwcUuUAAI1OZqZUWCitWiXl5pqPS5f6dy/b/xXm5Jhd/QAA4cXnFafs7GxNmTJF1113nXbv3q3BgwfrzDPP1Guvvabdu3fr3nvvrY9xAgAQFDVT+axWM+3O1/onqSqNr6BADm3MqX8CgNDn84rTN998o4suukiSuSnuWWedpfXr1+u1117TokWLAj0+AABCSlSUWaskmfVP/sjKkgYOlLKzzcf0dDrxAUCo8zlwqqiosO+L9NFHH+nqq6+WJHXv3l3FxcWBHR0AACHIXSvzpCTvrqeNOQCEH58DpzPPPFPPPvus1q5dq/z8fF1++eWSpF27dqlt27YBHyAAAKHIVf3Tzp1mGp+vK1HUPwFA6PO5xunhhx/WqFGj9Mgjj2jSpEnq0aOHJOntt9+2p/ABABAJXLUyr2sbc+qfACA0+Rw4DRgwQPv27VNpaanatGljPz5lyhTFx8cHdHAAAISburYxz8pyPI/9nwAgNPicqnfs2DGVl5fbg6ZffvlFjz/+uLZs2aL27dsHfIAAAISburQxp/4JAEKTzytOI0aMUGZmpm666SaVlJTo4osvVkxMjPbt26fHHntMN998c32MEwCAsBKoNuaGYab95eRIV10lrV9PGh8ABIPPK05ffvmlMjIyJEl5eXnq0KGDfvnlF7388st68sknAz5AAAAag7q0MbfVP6Wm0sYcAILF58CprKxMLVu2lCR9+OGHyszMVJMmTXTJJZfol19+CfgAAQBoLNy1MU9M9O76vXsdn5PGBwANx+fAqVu3bnrrrbe0Y8cOffDBBxoyZIgkac+ePUpISAj4AAEAaEzqUv9UE23MAaDh+FzjdO+99yo7O1u33XabLr30UvXq1UuSufp03nnnBXyAAAA0NoGqf5JoYw4ADcXnwGnMmDHq27eviouL7Xs4SdJll12mUaNGBXRwAABEAlv9kz/7P9nQxhwA6pfPqXqS1LFjR5133nnatWuXdv7/TSouuugide/ePaCDAwAgUrirf0pK8u562pgDQP3yOXCqrKzUX//6V7Vq1UqdO3dW586d1bp1a913332qrKysjzECABARXNU/7dxprh7504lPMuufT
pwwU/kWLzYfqYcCAN/5nKp3991364UXXtBDDz2kPn36SJLWrVun2bNn6/jx43rggQcCPkgAACJFzfonyf80vuptzKt35CONDwB85/OK00svvaR//vOfuvnmm3XOOefonHPO0dSpU/X8889r0aJF9TBEAAAiG23MASD4fA6cDhw44LKWqXv37jpQM8EaAAAEBG3MASC4fE7V69Gjh5566ik9+eSTDsefeuophy57AAAgsOqjjfnatWbr8rVraWUOALXxOXCaN2+errzySn300Uf2PZw2bNigHTt26N133w34AAEAgGuBaGO+YoV03XVmEwobaqAAwJnPqXr9+/fXjz/+qFGjRqmkpEQlJSXKzMzUli1blJGRUR9jBAAAbtS1jfnjjzsGTRI1UADgis8rTpLUqVMnp+55O3fu1JQpU7RgwYKADAwAAHgnM1MaMcIx3a53b6lrV//T+CwWswZqxIh6GTIAhB2/NsB1Zf/+/XrhhRcCdTsAAOADW/3T+PHmY9OmZrqd5LwHlDd7QtlqoAoKpNWrLVqzJkWrV1toJgEgYgUscAIAAKHFXRpfaqq5muSNrCxp8OBoPfZYTw0eHK30dFL4AEQmAicAABoxV23Mt23zPgWv5k4j1D8BiFR+1TgBAIDwUbONuWS2HPenlXnN+ifalgOIFF4HTpkeepKWlJTUdSwAAKCB1KWVefX6p6go9n8CEBm8DpxatWrl8fWJEyfWeUAAAKBh2GqgZsxwbEmemOicoudKVpbjeez/BKAx8zpwWrhwYX2OAwAABIGrVuZWqzRokOdr3dU/5eURPAFofKhxAgAgwtWsgbJaqX8CgJoInAAAgAPqnwDAGe3IAQCAE3d7QCUmend9VpY0cKCUnW0+sv8TgHBH4AQAAFyy7QGVn/+bZs7cpPz837R0qXfXsv8TgMaGVD0AAOBWVJTUv7+ho0eL1L9/DzVpUvf6p6uuktavJ40PQHghcAIAAF4LRP1Taqq0d2/VcdqYAwgHpOoBAACf1LX+qXrQJJHGByA8sOIEAAB8Vpf9n2oijQ9AOCBwAgAAfgnU/k8SaXwAQh+pegAAICBs9U+SuYLkD9L4AIQqAicAABAw7uqfkpL8u59t5Sonx1zRAoBgIXACAAABZdv/adUqKTfXfNy500y782clypbGt3ZtwIcKAF6jxgkAAARczfonyf825jbFxQEZGgD4hRUnAADQIOqaxte+vVRQIC1ebD6SugegIbHiBAAAGoyrNua9e0tdu9beja9FC2nyZDPlz4aOewAaEoETAABoUP6k8R05Yn5VZ+u4l5dH8ASg/pGqBwAAgs5dGl9KihQX5/qa6h33TpwgjQ9A/WLFCQAAhARXaXxWqzRokPtr2DgXQEMhcAIAACGjZhrf4sXeXedu41zS+AAECql6AAAgZCUn+3cdG+cCCDRWnAAAQMjKyDDT7mrruOeOLY2voMBcybKl/2VkmM8BwBcETgAAIGRFRdV949ysLOnAgarn1D8B8AepegAAIKTVdePc6kGTVFX/tGxZYMYHIDIQOAEAgJCXmSkVFkqrVkm5uebjzp3m6pHF4tu9atY/Wa20MgfgWVADpzVr1mj48OHq1KmTLBaL3nrrLY/XvPbaa+rRo4fi4+OVnJysG264Qfv376//wQIAgKCyddwbP958bNrUTLmT/AueduyQHnhASk+XBg6UsrPNx/R0VqMAOAtq4HT06FH16NFDTz/9tFfnf/rpp5o4caJuvPFGffvtt3rjjTf02Wef6f/+7//qeaQAACAUuUvjS0z07vpZs8yVq+pI5QPgSlCbQwwbNkzDhg3z+vwNGzYoPT1dt956qySpS5cu+v3vf6+HH364voYIAABCnD8b59bGMMwVrJwc87504AMghVlXvV69eunPf/6z3n33XQ0bNkx79uxRXl6errjiCrfXlJeXq7y83P68tLRUklRRUaGKiop6H7MntjGEwlgQPpg38AfzBv4Kl7nTp0/Vr61WKSUlWrt2SYbhKo/PkOQ+v8+Wyrdq1W/q39+PVn4Im3mD0NLQ88aX97EYhj+NPQPPYrFo+fLlGjlyZK3nvfHGG7rhhht0/Phx/fbbbxo+fLjefPNNxcTEuDx/9uzZmjNnjtPx3NxcxcfHB2LoAAAgBG3YkKyHH77w/z+rHiQZLo65lpOzSW3bHtfBg3Fq0+a4zjhjPytQQCNSVlam7OxsHTp0SAkJCbWeG1aB03fffadBgwbptttu09ChQ1VcXKzbb79dF154oV544QWX17hacUpLS9O+ffs8fjgNoaKiQvn5+Ro8eLDb4A+oiXkDfzBv4K9wnjvLl1s0c2aUioqqgqTUVEM33FCpv/7VcwTUrp2hffuqrk1JMfTYY1aNGhUS/3wKaeE8bxA8DT1vSktL1a5dO68Cp7BK1Zs7d6769Omj22+/XZJ0zjnnqHnz5srIyND999+v5ORkp2tiY2MVGxvrdDwmJiakfohDbTwID8wb+IN5A3+F49zJypJGj3asf8rIsEiK0osvmo0gavsv5OpBkyTt2mXRuHHRystzrqvKyKAeypVwnDcIvoaaN768R1gFTmVlZYqOdhxy1P//EypEFs4AAECIsbUxr+mJJ8zueRZL7cFTdbbGEVOmSDNmOHbkS00175mZGZBhAwgxQW1HfuTIEW3evFmbN2+WJG3btk2bN2/W9u3bJUl33XWXJk6caD9/+PDhWrZsmebPn6+tW7fq008/1a233qqLLrpInTp1Csa3AAAAwpS7VuZJSbVfZxjS/v20MQciTVBXnDZt2qSBAwfan8+cOVOSNGnSJC1atEjFxcX2IEqSJk+erMOHD+upp57SH/7wB7Vu3VqXXnop7cgBAIBfXLUyLyqSrr3W93vRxhxo3IIaOA0YMKDWFLtFixY5Hbvlllt0yy231OOoAABAJKmZyldQ4P+9bG3M1651nR4IIHwFNVUPAAAg1GRkmPVKFs/dyt0qLg7ceACEBgInAACAaqKizCYPkv/BU/v25srV4sXmo9UaqNEBCBYCJwAAgBrcNY5ITZXatq09oGrWTJo0SRo4UMrONh/T02kaAYQ7AicAAAAXMjOlwkJp1SopN9d8LCyUFiwwX3cXPB07ZjaYqI6Oe0D4C6t9nAAAABqSqz2gbKtRrvZxKimRjhxxvk/1jntXXSWtX8/GuUC4IXACAADwkas25larNGiQ+2tsHfdSU6W9e6uOs3EuEB4InAAAAPxQczVq8WLvrqseNElVaXx5eQRPQCijxgkAACAAkpP9u862pWVODt33gFBG4AQAABAAddn/qfrGuQBCE4ETAABAAARi/yc2zgVCF4ETAABAgLjb/ykpybvr2TgXCF00hwAAAAggVx33eveWunY1G0HYappqat5cmjzZucU5HfeA0EDgBAAAEGCu9n964gmze57F4jp4OnrU/KqOjntA6CBVDwAAoAG4S+NLTZXi411fU73j3okTpPEBwcSKEwAAQANh41wgfBE4AQAANCA2zgXCE6l6AAAAQcTGuUB4IHACAAAIIjbOBcIDgRMAAEAQBWLj
3KIiGkcA9Y3ACQAAIMjqunHubbdJAwdK2dnmY3q6tGxZwIcJRDQCJwAAgBCQmSkVFkqrVkm5uebjzp3epfG5axxB8AQEDoETAABAiLB13Bs/3nxs2tS/ND4aRwCBR+AEAAAQwvxN47M1jigooP4JCAT2cQIAAAhxrjbOLSqSrr3W87VZWdKBA1XP2TgX8A+BEwAAQBiouXFuQYF311UPmiQ2zgX8RaoeAABAGPJ3/yfqnwD/EDgBAACEobrs/8TGuYDvCJwAAADClLvGEYmJ3l1fXBz4MQGNFTVOAAAAYcxV4wirVRo0yPO1tnOrX5uRYa5mAXBE4AQAABDmajaOsFrN+qeioqqaJleee0667jpzo10buu4BrpGqBwAA0MjUVv9U/fmSJY5Bk1TVdW/ZsvodIxBuCJwAAAAaIXf1T6mp0tKlUps2rq+r3nXvxAk2zwVsSNUDAABopFzVP2VkmM8PHnR/na3rXmqqtHdv1XHS+BDJCJwAAAAasZr1T5L33fSqB00Sm+cispGqBwAAEGGSk/27js1zEclYcQIAAIgwGRnedd1zxZbGV1BgrmbRxhyRgsAJAAAgwti67o0ZY3bZ8zV4kqSsLOnAgarn1D+hsSNVDwAAIAK567qXlOTd9dWDJok25mj8CJwAAAAiVGamVFgorVol5eaajzt3mqtHNfd/8oT6JzR2pOoBAABEMFdd9/xN46te/2QYFq1Zk6LmzS0aOJD6J4Q/VpwAAADgwF0aX2Kid9dnZUmDB0frscd6avDgaKWnk8KH8EfgBAAAACeu0viWLvXuWuqf0BiRqgcAAACXaqbxWa3+tTE3DDPtLydHGjGCtD2EJ1acAAAA4BVbG3PJv+YRO3ZIa9cGflxAQyBwAgAAgNfqWv9UVGQ2j1i82HykAx/CBal6AAAA8Elmpplyt3atVFwsJSebAdCgQZ6vve02ae/equdsnItwQeAEAAAAn/lb/1Q9aJKqGkfk5RE8IbSRqgcAAIA687f+iY1zES4InAAAABAQ7uqfkpJqv47GEQgHBE4AAAAIGNv+T/n5v2nmzE3Kz/9Nf/+7d9cWF9fr0IA6ocYJAAAAARUVJfXvb+jo0SL1799Dn37q3XXt25ud9mwNJzIy2PMJoYPACQAAAPUqI8Nz44joaOm66xxXnei4h1BCqh4AAADqlTeNI377zTlVz9Zxb9kys3EE+z8hmAicAAAAUO/cNY5ITZVat3Z9jW11asoUKT1dGjhQys42H9PTzYAKaChBDZzWrFmj4cOHq1OnTrJYLHrrrbc8XlNeXq67775bnTt3VmxsrNLT0/Xiiy/W/2ABAABQJ7bGEatWSbm55uOiRVJJiftrDEPav1/audPxePXVKKAhBLXG6ejRo+rRo4duuOEGZXqZvJqVlaVff/1VL7zwgrp166bi4mJVVlbW80gBAAAQCDU3zl282L/7GIaZ9peTI40YQRMJ1L+gBk7Dhg3TsGHDvD7//fff1+rVq7V161YlJiZKktLT0+tpdAAAAKhvycn+X1t9/6fqwRhQH8Kqq97bb7+tnj17at68eXrllVfUvHlzXX311brvvvvUrFkzl9eUl5ervLzc/ry0tFSSVFFRoYqKigYZd21sYwiFsSB8MG/gD+YN/MXcgT+8nTeXXCKlpERr1y7JMNx0jvBgx47fVFHhpl0fwkpD/3njy/uEVeC0detWrVu3TnFxcVq+fLn27dunqVOnav/+/Vq4cKHLa+bOnas5c+Y4Hf/www8VHx9f30P2Wn5+frCHgDDEvIE/mDfwF3MH/vBm3lx7bbIefvhCSYak6sFTzeeubdu2UQ8/LB08GKc2bY7rjDP2k7oX5hrqz5uysjKvz7UYhrtu+g3LYrFo+fLlGjlypNtzhgwZorVr12r37t1q1aqVJGnZsmUaM2aMjh496nLVydWKU1pamvbt26eEhISAfx++qqioUH5+vgYPHqyYmJhgDwdhgnkDfzBv4C/mDvzh67xZvtyimTOjVFRUFSilpBg6dkw6eND9alTTpobatZN27XK87rHHrBo1KiT+mQsfNPSfN6WlpWrXrp0OHTrkMTYIqxWn5ORkpaSk2IMmSTr99NNlGIZ27typU045xema2NhYxcbGOh2PiYkJqT/8Q208CA/MG/iDeQN/MXfgD2/nTVaWNHq0Wa9UXGzWPmVkWLRihdk9z2JxvXnuiRMW7drleGzXLovGjYtWXh6b54arhvrzxpf3CKt9nPr06aNdu3bpyJEj9mM//vijmjRpotTU1CCODAAAAHVl67g3frz5GBVV+/5P7hYIbAFWTg4b5SJwgho4HTlyRJs3b9bmzZslSdu2bdPmzZu1fft2SdJdd92liRMn2s/Pzs5W27Ztdf311+u7777TmjVrdPvtt+uGG25w2xwCAAAA4c3d/k//v+eXS9U77lmtUkGB2fq8oIBgCv4Jaqrepk2bNHDgQPvzmTNnSpImTZqkRYsWqbi42B5ESVKLFi2Un5+vW265RT179lTbtm2VlZWl+++/v8HHDgAAgIbj7/5PK1ZI113nuIFuaqr0xBOk8cE3QQ2cBgwYoNp6UyxatMjpWPfu3enqAwAAEOG83f/p8cedjxUVmXVT1EDBF2FV4wQAAABIUkaGuXJk8WPrJ2qg4A8CJwAAAISdqCgz3U5yDp68Caaq10AB3iBwAgAAQFiqreNeTo539ygqonEEvBNW+zgBAAAA1WVmSiNG1Nz/yXzuqr6ppttuk/burXpO4wi4Q+AEAACAsFaz455UVQNVVOR641yb6kGTROMIuEeqHgAAABqd2mqgakPjCLhD4AQAAIBGyV0NVFJS7dfROAKukKoHAACARstVDVRRkXTttZ6vtTWOqF47FRVV70NGiCJwAgAAQKNWswaqoMC762gcgepI1QMAAEBE8XbzXHeNI5Ytq7+xIXQROAEAACCi0DgC/iBwAgAAQMShcQR8RY0TAAAAIhKNI+ALAicAAABELBpHwFuk6gEAAAD/H40j4A6BEwAAAPD/0TgC7hA4AQAAANXQOAKuUOMEAAAA1EDjCNRE4AQAAAC4QOMIVEeqHgAAAOAFGkdENgInAAAAwAs0johsBE4AAACAl2gcEbmocQIAAAB8QOOIyETgBAAAAPiIxhGRh1Q9AAAAoI5oHNH4ETgBAAAAdUTjiMaPwAkAAAAIgLo2jigoML8WLzYfCaRCCzVOAAAAQIDUpXFEVpZ04EDVc+qfQguBEwAAABBA/jaOqB40SVX1T3l5BE+hgFQ9AAAAoB552ziiJuqfQguBEwAAAFCP/G0cIbFxbighcAIAAADqmbvGEYmJ3l1fXBz4McE31DgBAAAADcBV4wirVRo0yPO17dubtVK26zIyzJUsNBwCJwAAAKCB1GwcYbWa9U9FRVU1Ta7QcS/4SNUDAAAAgqS2+qfqz9113Fu2rH7HhyoETgAAAEAQuat/SkmR2rZ1fQ0d9xoeqXoAAABAkPlT/2TruFdQYK5cUf9UvwicAAAAgBBQs/5p8WLvrqP+qWGQqgcAAACEoORk786j/qlhEDgBAAA
AISgjw1w98mfTXIn6p0AjcAIAAABCUG0d9zyx1T+tXRv4cUUqAicAAAAgRLnruJeY6N31xcXmqlNBgVkzVVDAKpS/aA4BAAAAhDB/Ou7Z/PSTlJ4u7dxZdYzmEf4hcAIAAABCXM2Oe1arGQAVFVXVNLkya5bzMVvziLw8gidfkKoHAAAAhJna6p881UPRPMI/BE4AAABAGHJX/5SaKs2ZU/u1NI/wHYETAAAAEKYyM6XCQmnVKik313zctk065RTvri8urtfhNSrUOAEAAABhrGb9k+T95rnengcCJwAAAKDRsW2eW1vziE6dpN69zRbltm59GRlmIAZnBE4AAABAI2NrHjFmjNkswlXwdPSo1LmztHt31TFalbtHjRMAAADQCLlrHpGcLCUkSIcOOQZNUlWr8mXLGm6c4YLACQAAAGikXDWPKCyUmjd3fT6tyt0jVQ8AAABoxGo2j7DVNLlja1VeUGBeS/2TicAJAAAAiCDetiDPypIOHKh6Hun1T0FN1VuzZo2GDx+uTp06yWKx6K233vL62k8//VTR0dE699xz6218AAAAQGPjbQvy6kGT5Fj/ZLWaK1KLF5uPkZDWF9TA6ejRo+rRo4eefvppn64rKSnRxIkTddlll9XTyAAAAIDGydaq3GLx7Tpb/dOUKVJ6ujRwoJSdbT6mpzf+hhJBTdUbNmyYhg0b5vN1N910k7KzsxUVFeXTKhUAAAAQ6bxpVe6OYUj79zsft61G5eU13lS+sKtxWrhwobZu3apXX31V999/v8fzy8vLVV5ebn9eWloqSaqoqFBFRUW9jdNbtjGEwlgQPpg38AfzBv5i7sAfzJvQNny4tGSJRTNnRqmoqGrpKTHR0IEDPi5FyQyoLBZDM2ZIV1zxm99NJBp63vjyPmEVOP3000+68847tXbtWkVHezf0uXPnas6cOU7HP/zwQ8XHxwd6iH7Lz88P9hAQhpg38AfzBv5i7sAfzJvQFRsrPfmk9N13bXXwYJzatDmuykpp1qy+ft3PMCzauVN69NF/6+yzXSxL+aCh5k1ZWZnX54ZN4GS1WpWdna05c+bo1FNP9fq6u+66SzNnzrQ/Ly0tVVpamoYMGaKEhIT6GKpPKioqlJ+fr8GDBysmJibYw0GYYN7AH8wb+Iu5A38wb8LH8OFVv7ZapQULDO3aZQZC/ujc+RJdcYUP+X/VNPS8sWWjeSNsAqfDhw9r06ZN+uqrrzR9+nRJUmVlpQzDUHR0tD788ENdeumlTtfFxsYqNjbW6XhMTExI/RCH2ngQHpg38AfzBv5i7sAfzJvwEhNjrkL5U/9kk5YWrbr+ljfUvPHlPcImcEpISNDXX3/tcOyZZ57RJ598ory8PHXp0iVIIwMAAAAaj8xMs8nDjBnSzp1Vx1NTpWPHzDbl7gKq1FSza19jFNTA6ciRI/rf//5nf75t2zZt3rxZiYmJOumkk3TXXXepqKhIL7/8spo0aaKzzjrL4fr27dsrLi7O6TgAAAAA/2VmSiNGSGvXmhvmJiebAdGKFbWvRp10kpnuV/M6f5tFhJKgBk6bNm3SwIED7c9ttUiTJk3SokWLVFxcrO3btwdreAAAAEDEioqSBgxwPOZuNSopyVyJWr9eSkyUjh6tei011Wx/Hu5tyoO6Ae6AAQNkGIbT16JFiyRJixYtUkFBgdvrZ8+erc2bNzfIWAEAAACYAVBhobRqlZSbaz4WF0s5Oebr1YMmqWqPp3DfIDdsapwAAAAAhIaaq1FWq/T6667PNfd4MgOrESPCN20vqCtOAAAAAMLf2rWOqXs1GYa0Y4d5XrhixQkAAABAnRQXe3deUZFUUBCejSMInAAAAADUSXKyd+fl5Ej79lU9D6fGEaTqAQAAAKiTjAwzCLJYaj+vetAkhVfjCAInAAAAAHUSFWWuHEmeg6fqbHtB5eSYDSZCGYETAAAAgDqz7fGUkuJ4PCmp9uvCpXEENU4AAAAAAiIz02w5vnZtVQOIoiLp2ms9X+ttg4lgIXACAAAAEDA193gqKPDuOm8bTAQLqXoAAAAA6o2nxhEWi5SWZp4XygicAAAAANSb2hpH2J4//njo7+dE4AQAAACgXrlrHJGaah4Ph32cqHECAAAAUO9cNY7IyAj9lSYbAicAAAAADaJm44hwQqoeAAAAAHhA4AQAAAAAHhA4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBAAAAAAeEDgBAAAAgAcETgAAAADgAYETAAAAAHgQHewBNDTDMCRJpaWlQR6JqaKiQmVlZSotLVVMTEywh4MwwbyBP5g38BdzB/5g3sAfDT1vbDGBLUaoTcQFTocPH5YkpaWlBXkkAAAAAELB4cOH1apVq1rPsRjehFeNSGVlpXbt2qWWLVvKYrEEezgqLS1VWlqaduzYoYSEhGAPB2GCeQN/MG/gL+YO/MG8gT8aet4YhqHDhw+rU6dOatKk9iqmiFtxatKkiVJTU4M9DCcJCQn8oQKfMW/gD+YN/MXcgT+YN/BHQ84bTytNNjSHAAAAAAAPCJwAAAAAwAMCpyCLjY3VrFmzFBsbG+yhIIwwb+AP5g38xdyBP5g38Ecoz5uIaw4BAAAAAL5ixQkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMApiJ5++mmlp6crLi5OF198sT777LNgDwkhZO7cubrwwgvVsmVLtW/fXiNHjtSWLVsczjl+/LimTZumtm3bqkWLFho9erR+/fXXII0Yoeihhx6SxWJRTk6O/RjzBu4UFRXp2muvVdu2bdWsWTOdffbZ2rRpk/11wzB07733Kjk5Wc2aNdOgQYP0008/BXHECDar1ap77rlHXbp0UbNmzdS1a1fdd999qt57jHkDSVqzZo2GDx+uTp06yWKx6K233nJ43Zt5cuDAAU2YMEEJCQlq3bq1brzxRh05cqTBvgcCpyB5/fXXNXPmTM2aNUtffvmlevTooaFDh2rPnj3BHhpCxOrVqzVt2jRt3LhR+fn5qqio0JAhQ3T06FH7ObfddpveeecdvfHGG1q9erV27dqlzMzMII4aoeTzzz/Xc889p3POOcfhOPMGrhw8eFB9+vRRTEyM3nvvPX333Xf629/+pjZt2tjPmTdvnp588kk9++yz+ve//63mzZtr6NChOn78eBBHjmB6+OGHNX/+fD311FP6/vvv9fDDD2vevHn6xz/+YT+HeQNJOnr0qHr06KGnn37a5evezJMJEybo22+/VX5+vlauXKk1a9ZoypQpDfUtSAaC4qKLLjKmTZtmf261Wo1OnToZc+fODeKoEMr27NljSDJWr15tGIZhlJSUGDExMcYbb7xhP+f77783JBkbNmwI1jARIg4fPmyccsopRn5+vtG/f39jxowZhmEwb+Den/70J6Nv375uX6+srDQ6duxoPPLII/ZjJSUlRmxsrLF48eKGGCJC0JVXXmnccMMNDscyMzONCRMmGIbBvIFrkozly5fbn3szT7777jtDkvH555/bz3nvvf
cMi8ViFBUVNci4WXEKghMnTuiLL77QoEGD7MeaNGmiQYMGacOGDUEcGULZoUOHJEmJiYmSpC+++EIVFRUO86h79+466aSTmEfQtGnTdOWVVzrMD4l5A/fefvtt9ezZU2PHjlX79u113nnn6fnnn7e/vm3bNu3evdth7rRq1UoXX3wxcyeC9e7dWx9//LF+/PFHSdJ//vMfrVu3TsOGDZPEvIF3vJknGzZsUOvWrdWzZ0/7OYMGDVKTJk3073//u0HGGd0g7wIH+/btk9VqVYcOHRyOd+jQQT/88EOQRoVQVllZqZycHPXp00dnnXWWJGn37t1q2rSpWrdu7XBuhw4dtHv37iCMEqFiyZIl+vLLL/X55587vca8gTtbt27V/PnzNXPmTP35z3/W559/rltvvVVNmzbVpEmT7PPD1d9dzJ3Ideedd6q0tFTdu3dXVFSUrFarHnjgAU2YMEGSmDfwijfzZPfu3Wrfvr3D69HR0UpMTGywuUTgBISBadOm6ZtvvtG6deuCPRSEuB07dmjGjBnKz89XXFxcsIeDMFJZWamePXvqwQcflCSdd955+uabb/Tss89q0qRJQR4dQtXSpUv12muvKTc3V2eeeaY2b96snJwcderUiXmDRodUvSBo166doqKinLpY/frrr+rYsWOQRoVQNX36dK1cuVKrVq1Samqq/XjHjh114sQJlZSUOJzPPIpsX3zxhfbs2aPzzz9f0dHRio6O1urVq/Xkk08qOjpaHTp0YN7ApeTkZJ1xxhkOx04//XRt375dkuzzg7+7UN3tt9+uO++8U+PGjdPZZ5+t6667Trfddpvmzp0riXkD73gzTzp27OjURO23337TgQMHGmwuETgFQdOmTXXBBRfo448/th+rrKzUxx9/rF69egVxZAglhmFo+vTpWr58uT755BN16dLF4fULLrhAMTExDvNoy5Yt2r59O/Mogl122WX6+uuvtXnzZvtXz549NWHCBPuvmTdwpU+fPk5bHvz444/q3LmzJKlLly7q2LGjw9wpLS3Vv//9b+ZOBCsrK1OTJo7/nIyKilJlZaUk5g2848086dWrl0pKSvTFF1/Yz/nkk09UWVmpiy++uGEG2iAtKOBkyZIlRmxsrLFo0SLju+++M6ZMmWK0bt3a2L17d7CHhhBx8803G61atTIKCgqM4uJi+1dZWZn9nJtuusk46aSTjE8++cTYtGmT0atXL6NXr15BHDVCUfWueobBvIFrn332mREdHW088MADxk8//WS89tprRnx8vPHqq6/az3nooYeM1q1bGytWrDD++9//GiNGjDC6dOliHDt2LIgjRzBNmjTJSElJMVauXGls27bNWLZsmdGuXTvjjjvusJ/DvIFhmN1ev/rqK+Orr74yJBmPPfaY8dVXXxm//PKLYRjezZPLL7/cOO+884x///vfxrp164xTTjnFGD9+fIN9DwROQfSPf/zDOOmkk4ymTZsaF110kbFx48ZgDwkhRJLLr4ULF9rPOXbsmDF16lSjTZs2Rnx8vDFq1CijuLg4eINGSKoZODFv4M4777xjnHXWWUZsbKzRvXt3Y8GCBQ6vV1ZWGvfcc4/RoUMHIzY21rjsssuMLVu2BGm0CAWlpaXGjBkzjJNOOsmIi4szTj75ZOPuu+82ysvL7ecwb2AYhrFq1SqX/66ZNGmSYRjezZP9+/cb48ePN1q0aGEkJCQY119/vXH48OEG+x4shlFta2cAAAAAgBNqnAAAAADAAwInAAAAAPCAwAkAAAAAPCBwAgAAAAAPCJwAAAAAwAMCJwAAAADwgMAJAAAAADwgcAIAAAAADwicAACohcVi0VtvvRXsYQAAgozACQAQsiZPniyLxeL0dfnllwd7aACACBMd7AEAAFCbyy+/XAsXLnQ4FhsbG6TRAAAiFStOAICQFhsbq44dOzp8tWnTRpKZRjd//nwNGzZMzZo108knn6y8vDyH67/++mtdeumlatasmdq2baspU6boyJEjDue8+OKLOvPMMxUbG6vk5GRNnz7d4fV9+/Zp1KhRio+P1ymnnKK3337b/trBgwc1YcIEJSUlqVmzZjrllFOcAj0AQPgjcAIAhLV77rlHo0eP1n/+8x9NmDBB48aN0/fffy9JOnr0qIYOHao2bdro888/1xtvvKGPPvrIITCaP3++pk2bpilTpujrr7/W22+/rW7dujm8x5w5c5SVlaX//ve/uuKKKzRhwgQdOHDA/v7fffed3nvvPX3//feaP3++2rVr13AfAACgQVgMwzCCPQgAAFyZPHmyXn31VcXFxTkc//Of/6w///nPslgsuummmzR//nz7a5dcconOP/98PfPMM3r++ef1pz/9STt27FDz5s0lSe+++66GDx+uXbt2qUOHDkpJSdH111+v+++/3+UYLBaL/vKXv+i+++6TZAZjLVq00HvvvafLL79cV199tdq1a6cXX3yxnj4FAEAooMYJABDSBg4c6BAYSVJiYqL917169XJ4rVevXtq8ebMk6fvvv1ePHj3sQZMk9enTR5WVldqyZYssFot27dqlyy67rNYxnHPOOfZfN2/eXAkJCdqzZ48k6eabb9bo0aP15ZdfasiQIRo5cqR69+7t1/cKAAhdBE4AgJDWvHlzp9S5QGnWrJlX58XExDg8t1gsqqyslCQNGzZMv/zyi959913l5+frsssu07Rp0/Too48GfLwAgOChxgkAENY2btzo9Pz000+XJJ1++un6z3/+o6NHj9pf//TTT9WkSROddtppatmypdLT0/Xxxx/XaQxJSUmaNGmSXn31VT3++ONasGBBne4HAAg9rDgBAEJaeXm5du/e7XAsOjra3oDhjTfeUM+ePdW3b1+99tpr+uyzz/TCCy9IkiZMmKBZs2Zp0qRJmj17tvbu3atbbrlF1113nTp06CBJmj17tm666Sa1b99ew4YN0+HDh/Xpp5/qlltu8Wp89957ry644AKdeeaZKi8v18qVK+2BGwCg8SBwAgCEtPfff1/JyckOx0477TT98MMPksyOd0uWLNHUqVOVnJysxYsX64wzzpAkxcfH64MPPtCMGTN04YUXKj4+XqNHj9Zjjz1mv9ekSZN0/Phx/f3vf9cf//hHtWvXTmPGjPF6fE2bNtVdd92lwsJCNWvWTBkZGVqyZEkAvnMAQCihqx4AIGxZLBYtX75cI0eODPZQAACNHDVOAAAAAOABgRMAAAAAeECNEwAgbJFtDgBoKKw4AQAAAIAHBE4AAAAA4AGBEwAAAAB4QOAEAAAAAB4QOAEAAACABwROAAAAAOABgRMAAAAAeEDgBAAAAAAe/D9KcbfSZkpy3gAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "epochs = list(range(configs_dict[\"max_training_steps\"]))\n", + "loss_values = ft_res[0].finetuning_losses\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(epochs, loss_values, marker='o', linestyle='-', color='b')\n", + "\n", + "# Set plot labels and title\n", + "plt.xlabel('Epochs')\n", + "plt.ylabel('Loss Value')\n", + "plt.title('Loss Value vs. Number of Epochs')\n", + "\n", + "plt.grid(True)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save finetuned model to HuggingFace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subprocess.run(['python', '../../utils/upload_peft_model.py'] + f\"--peft-model-id {configs.finetuning_peft_model_id} --upload-peft-model-id {configs.finetuning_peft_model_id}-dolly\".split())\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Stop LLM Co-serving system" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2024-07-22 06:46:20 - ###PEFT DEBUGGING### Background serving task completed.\n", + "Background server stopped.\n" + ] + } + ], + "source": [ + "llm.stop_server()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference all over again with the finetuned model" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. 
If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "[0 - 7ff1caf83280] 0.270628 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270673 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270699 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270744 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "[0 - 7ff1caf83280] 0.270753 {3}{Mapper}: Enabled Control Replication Optimizations.\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "workSpaceSize (128 MB)\n", + "Creating directory /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b (if it doesn't exist)...\n", + "Saving meta-llama/Meta-Llama-3-8B configs to file /root/.cache/flexflow/configs/meta-llama/meta-llama-3-8b/config.json...\n", + "Saving goliaro/llama-3-8b-lora-dolly configs to file /root/.cache/flexflow/configs/goliaro/llama-3-8b-lora-dolly/config.json...\n", + "Loading tokenizer...\n", + "Adding layer layers.0.mlp.down_proj.lora\n", + "Adding layer layers.1.mlp.down_proj.lora\n", + "Adding layer layers.2.mlp.down_proj.lora\n", + "Adding layer layers.3.mlp.down_proj.lora\n", + "Adding layer layers.4.mlp.down_proj.lora\n", + "Adding layer layers.5.mlp.down_proj.lora\n", + "Adding layer layers.6.mlp.down_proj.lora\n", + "Adding layer layers.7.mlp.down_proj.lora\n", + "Adding layer layers.8.mlp.down_proj.lora\n", + "Adding layer layers.9.mlp.down_proj.lora\n", + "Adding layer layers.10.mlp.down_proj.lora\n", + "Adding layer layers.11.mlp.down_proj.lora\n", + "Adding layer layers.12.mlp.down_proj.lora\n", + "Adding layer layers.13.mlp.down_proj.lora\n", + "Adding layer layers.14.mlp.down_proj.lora\n", + "Adding layer layers.15.mlp.down_proj.lora\n", + "Adding layer layers.16.mlp.down_proj.lora\n", + "Adding layer layers.17.mlp.down_proj.lora\n", + "Adding layer layers.18.mlp.down_proj.lora\n", + "Adding layer layers.19.mlp.down_proj.lora\n", + "Adding layer layers.20.mlp.down_proj.lora\n", + "Adding layer layers.21.mlp.down_proj.lora\n", + "Adding layer layers.22.mlp.down_proj.lora\n", + "Adding layer layers.23.mlp.down_proj.lora\n", + "Adding layer layers.24.mlp.down_proj.lora\n", + "Adding layer layers.25.mlp.down_proj.lora\n", + "Adding layer layers.26.mlp.down_proj.lora\n", + "Adding layer layers.27.mlp.down_proj.lora\n", + "Adding layer layers.28.mlp.down_proj.lora\n", 
+ "Adding layer layers.29.mlp.down_proj.lora\n", + "Adding layer layers.30.mlp.down_proj.lora\n", + "Adding layer layers.31.mlp.down_proj.lora\n", + "Background server started.\n", + "[]\n", + "2024-07-22 06:42:43 - ###PEFT DEBUGGING### Starting background serving task.\n", + "2024-07-22 06:42:43 - ###PEFT DEBUGGING### Updated models' configuration.\n", + "###PEFT DEBUGGING### LLM Model object exists.\n", + "###PEFT DEBUGGING### Model object exists.\n", + "###PEFT DEBUGGING### Model object still exists.\n", + "###PEFT DEBUGGING### Entering compile_inference.\n", + "###PEFT DEBUGGING### Configuration check passed: At least four CPU cores per node.\n", + "###PEFT DEBUGGING### Launching graph optimization task.\n", + "num_nodes = 1 num_gpus_per_node = 1\n", + "[0]10445\n", + "[1]649\n", + "[2]6730\n", + "[3]2053\n", + "[4]18167\n", + "[5]369\n", + "[6]1317\n", + "[7]2085\n", + "[8]3090\n", + "[9]30\n", + "No small speculative model registered, using incremental decoding.\n", + "[0 - 7ff1caf83280] 1.100415 {3}{RequestManager}: [1000000]New request tokens: 128000 10445 649 6730 2053 18167 369 1317 2085 3090 30\n", + "optimal_views.size = 262\n", + "views.size() = 262\n", + "###PEFT DEBUGGING### Operators reconstructed from optimized graph.\n", + "###PEFT DEBUGGING### Starting inplace optimizations.\n", + "###PEFT DEBUGGING### Mapping output tensors.\n", + "ndim(1) dims[1 0 0 0]\n", + "###PEFT DEBUGGING### Setting up NCCL communications.\n", + "###PEFT DEBUGGING### compile_inference completed successfully.\n", + "Loading weight file embed_tokens.weight\n", + "Loading weight file layers.0.input_layernorm.weight\n", + "Loading weight file layers.0.self_attn.q_proj.weight\n", + "Loading weight file layers.0.self_attn.k_proj.weight\n", + "Loading weight file layers.0.self_attn.v_proj.weight\n", + "Loading weight file layers.0.self_attn.o_proj.weight\n", + "Loading weight file layers.0.post_attention_layernorm.weight\n", + "Loading weight file layers.0.mlp.gate_proj.weight\n", + "Loading weight file layers.0.mlp.up_proj.weight\n", + "Loading weight file layers.0.mlp.down_proj.weight\n", + "Loading weight file layers.1.input_layernorm.weight\n", + "Loading weight file layers.1.self_attn.q_proj.weight\n", + "Loading weight file layers.1.self_attn.k_proj.weight\n", + "Loading weight file layers.1.self_attn.v_proj.weight\n", + "Loading weight file layers.1.self_attn.o_proj.weight\n", + "Loading weight file layers.1.post_attention_layernorm.weight\n", + "Loading weight file layers.1.mlp.gate_proj.weight\n", + "Loading weight file layers.1.mlp.up_proj.weight\n", + "Loading weight file layers.1.mlp.down_proj.weight\n", + "Loading weight file layers.2.input_layernorm.weight\n", + "Loading weight file layers.2.self_attn.q_proj.weight\n", + "Loading weight file layers.2.self_attn.k_proj.weight\n", + "Loading weight file layers.2.self_attn.v_proj.weight\n", + "Loading weight file layers.2.self_attn.o_proj.weight\n", + "Loading weight file layers.2.post_attention_layernorm.weight\n", + "Loading weight file layers.2.mlp.gate_proj.weight\n", + "Loading weight file layers.2.mlp.up_proj.weight\n", + "Loading weight file layers.2.mlp.down_proj.weight\n", + "Loading weight file layers.3.input_layernorm.weight\n", + "Loading weight file layers.3.self_attn.q_proj.weight\n", + "Loading weight file layers.3.self_attn.k_proj.weight\n", + "Loading weight file layers.3.self_attn.v_proj.weight\n", + "Loading weight file layers.3.self_attn.o_proj.weight\n", + "Loading weight file layers.3.post_attention_layernorm.weight\n", + 
"Loading weight file layers.3.mlp.gate_proj.weight\n", + "Loading weight file layers.3.mlp.up_proj.weight\n", + "Loading weight file layers.3.mlp.down_proj.weight\n", + "Loading weight file layers.4.input_layernorm.weight\n", + "Loading weight file layers.4.self_attn.q_proj.weight\n", + "Loading weight file layers.4.self_attn.k_proj.weight\n", + "Loading weight file layers.4.self_attn.v_proj.weight\n", + "Loading weight file layers.4.self_attn.o_proj.weight\n", + "Loading weight file layers.4.post_attention_layernorm.weight\n", + "Loading weight file layers.4.mlp.gate_proj.weight\n", + "Loading weight file layers.4.mlp.up_proj.weight\n", + "Loading weight file layers.4.mlp.down_proj.weight\n", + "Loading weight file layers.5.input_layernorm.weight\n", + "Loading weight file layers.5.self_attn.q_proj.weight\n", + "Loading weight file layers.5.self_attn.k_proj.weight\n", + "Loading weight file layers.5.self_attn.v_proj.weight\n", + "Loading weight file layers.5.self_attn.o_proj.weight\n", + "Loading weight file layers.5.post_attention_layernorm.weight\n", + "Loading weight file layers.5.mlp.gate_proj.weight\n", + "Loading weight file layers.5.mlp.up_proj.weight\n", + "Loading weight file layers.5.mlp.down_proj.weight\n", + "Loading weight file layers.6.input_layernorm.weight\n", + "Loading weight file layers.6.self_attn.q_proj.weight\n", + "Loading weight file layers.6.self_attn.k_proj.weight\n", + "Loading weight file layers.6.self_attn.v_proj.weight\n", + "Loading weight file layers.6.self_attn.o_proj.weight\n", + "Loading weight file layers.6.post_attention_layernorm.weight\n", + "Loading weight file layers.6.mlp.gate_proj.weight\n", + "Loading weight file layers.6.mlp.up_proj.weight\n", + "Loading weight file layers.6.mlp.down_proj.weight\n", + "Loading weight file layers.7.input_layernorm.weight\n", + "Loading weight file layers.7.self_attn.q_proj.weight\n", + "Loading weight file layers.7.self_attn.k_proj.weight\n", + "Loading weight file layers.7.self_attn.v_proj.weight\n", + "Loading weight file layers.7.self_attn.o_proj.weight\n", + "Loading weight file layers.7.post_attention_layernorm.weight\n", + "Loading weight file layers.7.mlp.gate_proj.weight\n", + "Loading weight file layers.7.mlp.up_proj.weight\n", + "Loading weight file layers.7.mlp.down_proj.weight\n", + "Loading weight file layers.8.input_layernorm.weight\n", + "Loading weight file layers.8.self_attn.q_proj.weight\n", + "Loading weight file layers.8.self_attn.k_proj.weight\n", + "Loading weight file layers.8.self_attn.v_proj.weight\n", + "Loading weight file layers.8.self_attn.o_proj.weight\n", + "Loading weight file layers.8.post_attention_layernorm.weight\n", + "Loading weight file layers.8.mlp.gate_proj.weight\n", + "Loading weight file layers.8.mlp.up_proj.weight\n", + "Loading weight file layers.8.mlp.down_proj.weight\n", + "Loading weight file layers.9.input_layernorm.weight\n", + "Loading weight file layers.9.self_attn.q_proj.weight\n", + "Loading weight file layers.9.self_attn.k_proj.weight\n", + "Loading weight file layers.9.self_attn.v_proj.weight\n", + "Loading weight file layers.9.self_attn.o_proj.weight\n", + "Loading weight file layers.9.post_attention_layernorm.weight\n", + "Loading weight file layers.9.mlp.gate_proj.weight\n", + "Loading weight file layers.9.mlp.up_proj.weight\n", + "Loading weight file layers.9.mlp.down_proj.weight\n", + "Loading weight file layers.10.input_layernorm.weight\n", + "Loading weight file layers.10.self_attn.q_proj.weight\n", + "Loading weight file 
layers.10.self_attn.k_proj.weight\n", + "Loading weight file layers.10.self_attn.v_proj.weight\n", + "Loading weight file layers.10.self_attn.o_proj.weight\n", + "Loading weight file layers.10.post_attention_layernorm.weight\n", + "Loading weight file layers.10.mlp.gate_proj.weight\n", + "Loading weight file layers.10.mlp.up_proj.weight\n", + "Loading weight file layers.10.mlp.down_proj.weight\n", + "Loading weight file layers.11.input_layernorm.weight\n", + "Loading weight file layers.11.self_attn.q_proj.weight\n", + "Loading weight file layers.11.self_attn.k_proj.weight\n", + "Loading weight file layers.11.self_attn.v_proj.weight\n", + "Loading weight file layers.11.self_attn.o_proj.weight\n", + "Loading weight file layers.11.post_attention_layernorm.weight\n", + "Loading weight file layers.11.mlp.gate_proj.weight\n", + "Loading weight file layers.11.mlp.up_proj.weight\n", + "Loading weight file layers.11.mlp.down_proj.weight\n", + "Loading weight file layers.12.input_layernorm.weight\n", + "Loading weight file layers.12.self_attn.q_proj.weight\n", + "Loading weight file layers.12.self_attn.k_proj.weight\n", + "Loading weight file layers.12.self_attn.v_proj.weight\n", + "Loading weight file layers.12.self_attn.o_proj.weight\n", + "Loading weight file layers.12.post_attention_layernorm.weight\n", + "Loading weight file layers.12.mlp.gate_proj.weight\n", + "Loading weight file layers.12.mlp.up_proj.weight\n", + "Loading weight file layers.12.mlp.down_proj.weight\n", + "Loading weight file layers.13.input_layernorm.weight\n", + "Loading weight file layers.13.self_attn.q_proj.weight\n", + "Loading weight file layers.13.self_attn.k_proj.weight\n", + "Loading weight file layers.13.self_attn.v_proj.weight\n", + "Loading weight file layers.13.self_attn.o_proj.weight\n", + "Loading weight file layers.13.post_attention_layernorm.weight\n", + "Loading weight file layers.13.mlp.gate_proj.weight\n", + "Loading weight file layers.13.mlp.up_proj.weight\n", + "Loading weight file layers.13.mlp.down_proj.weight\n", + "Loading weight file layers.14.input_layernorm.weight\n", + "Loading weight file layers.14.self_attn.q_proj.weight\n", + "Loading weight file layers.14.self_attn.k_proj.weight\n", + "Loading weight file layers.14.self_attn.v_proj.weight\n", + "Loading weight file layers.14.self_attn.o_proj.weight\n", + "Loading weight file layers.14.post_attention_layernorm.weight\n", + "Loading weight file layers.14.mlp.gate_proj.weight\n", + "Loading weight file layers.14.mlp.up_proj.weight\n", + "Loading weight file layers.14.mlp.down_proj.weight\n", + "Loading weight file layers.15.input_layernorm.weight\n", + "Loading weight file layers.15.self_attn.q_proj.weight\n", + "Loading weight file layers.15.self_attn.k_proj.weight\n", + "Loading weight file layers.15.self_attn.v_proj.weight\n", + "Loading weight file layers.15.self_attn.o_proj.weight\n", + "Loading weight file layers.15.post_attention_layernorm.weight\n", + "Loading weight file layers.15.mlp.gate_proj.weight\n", + "Loading weight file layers.15.mlp.up_proj.weight\n", + "Loading weight file layers.15.mlp.down_proj.weight\n", + "Loading weight file layers.16.input_layernorm.weight\n", + "Loading weight file layers.16.self_attn.q_proj.weight\n", + "Loading weight file layers.16.self_attn.k_proj.weight\n", + "Loading weight file layers.16.self_attn.v_proj.weight\n", + "Loading weight file layers.16.self_attn.o_proj.weight\n", + "Loading weight file layers.16.post_attention_layernorm.weight\n", + "Loading weight file 
layers.16.mlp.gate_proj.weight\n", + "Loading weight file layers.16.mlp.up_proj.weight\n", + "Loading weight file layers.16.mlp.down_proj.weight\n", + "Loading weight file layers.17.input_layernorm.weight\n", + "Loading weight file layers.17.self_attn.q_proj.weight\n", + "Loading weight file layers.17.self_attn.k_proj.weight\n", + "Loading weight file layers.17.self_attn.v_proj.weight\n", + "Loading weight file layers.17.self_attn.o_proj.weight\n", + "Loading weight file layers.17.post_attention_layernorm.weight\n", + "Loading weight file layers.17.mlp.gate_proj.weight\n", + "Loading weight file layers.17.mlp.up_proj.weight\n", + "Loading weight file layers.17.mlp.down_proj.weight\n", + "Loading weight file layers.18.input_layernorm.weight\n", + "Loading weight file layers.18.self_attn.q_proj.weight\n", + "Loading weight file layers.18.self_attn.k_proj.weight\n", + "Loading weight file layers.18.self_attn.v_proj.weight\n", + "Loading weight file layers.18.self_attn.o_proj.weight\n", + "Loading weight file layers.18.post_attention_layernorm.weight\n", + "Loading weight file layers.18.mlp.gate_proj.weight\n", + "Loading weight file layers.18.mlp.up_proj.weight\n", + "Loading weight file layers.18.mlp.down_proj.weight\n", + "Loading weight file layers.19.input_layernorm.weight\n", + "Loading weight file layers.19.self_attn.q_proj.weight\n", + "Loading weight file layers.19.self_attn.k_proj.weight\n", + "Loading weight file layers.19.self_attn.v_proj.weight\n", + "Loading weight file layers.19.self_attn.o_proj.weight\n", + "Loading weight file layers.19.post_attention_layernorm.weight\n", + "Loading weight file layers.19.mlp.gate_proj.weight\n", + "Loading weight file layers.19.mlp.up_proj.weight\n", + "Loading weight file layers.19.mlp.down_proj.weight\n", + "Loading weight file layers.20.input_layernorm.weight\n", + "Loading weight file layers.20.self_attn.q_proj.weight\n", + "Loading weight file layers.20.self_attn.k_proj.weight\n", + "Loading weight file layers.20.self_attn.v_proj.weight\n", + "Loading weight file layers.20.self_attn.o_proj.weight\n", + "Loading weight file layers.20.post_attention_layernorm.weight\n", + "Loading weight file layers.20.mlp.gate_proj.weight\n", + "Loading weight file layers.20.mlp.up_proj.weight\n", + "Loading weight file layers.20.mlp.down_proj.weight\n", + "Loading weight file layers.21.input_layernorm.weight\n", + "Loading weight file layers.21.self_attn.q_proj.weight\n", + "Loading weight file layers.21.self_attn.k_proj.weight\n", + "Loading weight file layers.21.self_attn.v_proj.weight\n", + "Loading weight file layers.21.self_attn.o_proj.weight\n", + "Loading weight file layers.21.post_attention_layernorm.weight\n", + "Loading weight file layers.21.mlp.gate_proj.weight\n", + "Loading weight file layers.21.mlp.up_proj.weight\n", + "Loading weight file layers.21.mlp.down_proj.weight\n", + "Loading weight file layers.22.input_layernorm.weight\n", + "Loading weight file layers.22.self_attn.q_proj.weight\n", + "Loading weight file layers.22.self_attn.k_proj.weight\n", + "Loading weight file layers.22.self_attn.v_proj.weight\n", + "Loading weight file layers.22.self_attn.o_proj.weight\n", + "Loading weight file layers.22.post_attention_layernorm.weight\n", + "Loading weight file layers.22.mlp.gate_proj.weight\n", + "Loading weight file layers.22.mlp.up_proj.weight\n", + "Loading weight file layers.22.mlp.down_proj.weight\n", + "Loading weight file layers.23.input_layernorm.weight\n", + "Loading weight file layers.23.self_attn.q_proj.weight\n", + "Loading 
weight file layers.23.self_attn.k_proj.weight\n", + "Loading weight file layers.23.self_attn.v_proj.weight\n", + "Loading weight file layers.23.self_attn.o_proj.weight\n", + "Loading weight file layers.23.post_attention_layernorm.weight\n", + "Loading weight file layers.23.mlp.gate_proj.weight\n", + "Loading weight file layers.23.mlp.up_proj.weight\n", + "Loading weight file layers.23.mlp.down_proj.weight\n", + "Loading weight file layers.24.input_layernorm.weight\n", + "Loading weight file layers.24.self_attn.q_proj.weight\n", + "Loading weight file layers.24.self_attn.k_proj.weight\n", + "Loading weight file layers.24.self_attn.v_proj.weight\n", + "Loading weight file layers.24.self_attn.o_proj.weight\n", + "Loading weight file layers.24.post_attention_layernorm.weight\n", + "Loading weight file layers.24.mlp.gate_proj.weight\n", + "Loading weight file layers.24.mlp.up_proj.weight\n", + "Loading weight file layers.24.mlp.down_proj.weight\n", + "Loading weight file layers.25.input_layernorm.weight\n", + "Loading weight file layers.25.self_attn.q_proj.weight\n", + "Loading weight file layers.25.self_attn.k_proj.weight\n", + "Loading weight file layers.25.self_attn.v_proj.weight\n", + "Loading weight file layers.25.self_attn.o_proj.weight\n", + "Loading weight file layers.25.post_attention_layernorm.weight\n", + "Loading weight file layers.25.mlp.gate_proj.weight\n", + "Loading weight file layers.25.mlp.up_proj.weight\n", + "Loading weight file layers.25.mlp.down_proj.weight\n", + "Loading weight file layers.26.input_layernorm.weight\n", + "Loading weight file layers.26.self_attn.q_proj.weight\n", + "Loading weight file layers.26.self_attn.k_proj.weight\n", + "Loading weight file layers.26.self_attn.v_proj.weight\n", + "Loading weight file layers.26.self_attn.o_proj.weight\n", + "Loading weight file layers.26.post_attention_layernorm.weight\n", + "Loading weight file layers.26.mlp.gate_proj.weight\n", + "Loading weight file layers.26.mlp.up_proj.weight\n", + "Loading weight file layers.26.mlp.down_proj.weight\n", + "Loading weight file layers.27.input_layernorm.weight\n", + "Loading weight file layers.27.self_attn.q_proj.weight\n", + "Loading weight file layers.27.self_attn.k_proj.weight\n", + "Loading weight file layers.27.self_attn.v_proj.weight\n", + "Loading weight file layers.27.self_attn.o_proj.weight\n", + "Loading weight file layers.27.post_attention_layernorm.weight\n", + "Loading weight file layers.27.mlp.gate_proj.weight\n", + "Loading weight file layers.27.mlp.up_proj.weight\n", + "Loading weight file layers.27.mlp.down_proj.weight\n", + "Loading weight file layers.28.input_layernorm.weight\n", + "Loading weight file layers.28.self_attn.q_proj.weight\n", + "Loading weight file layers.28.self_attn.k_proj.weight\n", + "Loading weight file layers.28.self_attn.v_proj.weight\n", + "Loading weight file layers.28.self_attn.o_proj.weight\n", + "Loading weight file layers.28.post_attention_layernorm.weight\n", + "Loading weight file layers.28.mlp.gate_proj.weight\n", + "Loading weight file layers.28.mlp.up_proj.weight\n", + "Loading weight file layers.28.mlp.down_proj.weight\n", + "Loading weight file layers.29.input_layernorm.weight\n", + "Loading weight file layers.29.self_attn.q_proj.weight\n", + "Loading weight file layers.29.self_attn.k_proj.weight\n", + "Loading weight file layers.29.self_attn.v_proj.weight\n", + "Loading weight file layers.29.self_attn.o_proj.weight\n", + "Loading weight file layers.29.post_attention_layernorm.weight\n", + "Loading weight file 
layers.29.mlp.gate_proj.weight\n", + "Loading weight file layers.29.mlp.up_proj.weight\n", + "Loading weight file layers.29.mlp.down_proj.weight\n", + "Loading weight file layers.30.input_layernorm.weight\n", + "Loading weight file layers.30.self_attn.q_proj.weight\n", + "Loading weight file layers.30.self_attn.k_proj.weight\n", + "Loading weight file layers.30.self_attn.v_proj.weight\n", + "Loading weight file layers.30.self_attn.o_proj.weight\n", + "Loading weight file layers.30.post_attention_layernorm.weight\n", + "Loading weight file layers.30.mlp.gate_proj.weight\n", + "Loading weight file layers.30.mlp.up_proj.weight\n", + "Loading weight file layers.30.mlp.down_proj.weight\n", + "Loading weight file layers.31.input_layernorm.weight\n", + "Loading weight file layers.31.self_attn.q_proj.weight\n", + "Loading weight file layers.31.self_attn.k_proj.weight\n", + "Loading weight file layers.31.self_attn.v_proj.weight\n", + "Loading weight file layers.31.self_attn.o_proj.weight\n", + "Loading weight file layers.31.post_attention_layernorm.weight\n", + "Loading weight file layers.31.mlp.gate_proj.weight\n", + "Loading weight file layers.31.mlp.up_proj.weight\n", + "Loading weight file layers.31.mlp.down_proj.weight\n", + "Loading weight file norm.weight\n", + "Loading weight file lm_head.weight\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.0.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.1.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.2.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.3.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.4.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.5.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.6.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.7.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.8.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight 
layers.9.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.9.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.10.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.11.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.12.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.13.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.14.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.15.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.16.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.17.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.18.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.19.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.20.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.21.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.22.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 
0\n", + "Loading LORA weight layers.23.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.24.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.25.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.26.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.27.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.28.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.29.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.30.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_A.weight, num_rows: 14336, num_cols: 16, num_shards: 1, shard_id: 0\n", + "Loading LORA weight layers.31.mlp.down_proj.lora_B.weight, num_rows: 16, num_cols: 4096, num_shards: 1, shard_id: 0\n", + "[0 - 7ff1680b6740] 16.224181 {3}{RequestManager}: Output token is: 3639\n", + "[0 - 7ff1680b6740] 16.321885 {3}{RequestManager}: Output token is: 374\n", + "[0 - 7ff168092740] 16.407712 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7ff1680b6740] 16.492788 {3}{RequestManager}: Output token is: 2944\n", + "[0 - 7ff168092740] 16.563500 {3}{RequestManager}: Output token is: 4920\n", + "[0 - 7ff168092740] 16.624616 {3}{RequestManager}: Output token is: 279\n", + "[0 - 7ff168092740] 16.675778 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 16.725625 {3}{RequestManager}: Output token is: 13272\n", + "[0 - 7ff168092740] 16.776205 {3}{RequestManager}: Output token is: 315\n", + "[0 - 7ff168092740] 16.827883 {3}{RequestManager}: Output token is: 41389\n", + "[0 - 7ff168092740] 16.878348 {3}{RequestManager}: Output token is: 2715\n", + "[0 - 7ff168092740] 16.929025 {3}{RequestManager}: Output token is: 288\n", + "[0 - 7ff168092740] 16.979287 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 17.029879 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 17.078696 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 17.127942 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 17.177796 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 17.227023 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 17.277136 
{3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 17.328143 {3}{RequestManager}: Output token is: 64614\n", + "[0 - 7ff1680b6740] 17.378508 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 17.430618 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 17.482129 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 17.533479 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 17.584503 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 17.634591 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 17.685727 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 17.736768 {3}{RequestManager}: Output token is: 14535\n", + "[0 - 7ff168092740] 17.785909 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 17.836515 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 17.886526 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 17.936502 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 17.986222 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 18.037888 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.088468 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 18.138261 {3}{RequestManager}: Output token is: 25212\n", + "[0 - 7ff168092740] 18.187102 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 18.237270 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 18.289979 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 18.340895 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 18.391145 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 18.441155 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.499716 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 18.552423 {3}{RequestManager}: Output token is: 97814\n", + "[0 - 7ff168092740] 18.603261 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 18.654986 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 18.706227 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 18.756543 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 18.807690 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 18.857508 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 18.907649 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 18.958208 {3}{RequestManager}: Output token is: 41759\n", + "[0 - 7ff168092740] 19.009971 {3}{RequestManager}: Output token is: 388\n", + "[0 - 7ff168092740] 19.060626 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 19.112370 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 19.161425 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 19.206435 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 19.254004 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 19.306102 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 19.356853 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 19.408861 {3}{RequestManager}: Output token is: 89435\n", + "[0 - 7ff1680b6740] 19.460391 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 19.511207 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 19.565692 
{3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 19.617057 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 19.669739 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 19.722325 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 19.773583 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 19.824646 {3}{RequestManager}: Output token is: 68550\n", + "[0 - 7ff1680b6740] 19.876650 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff1680b6740] 19.926939 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 19.977325 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.028247 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 20.078419 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.128614 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.179748 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 20.230542 {3}{RequestManager}: Output token is: 18311\n", + "[0 - 7ff1680b6740] 20.281634 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 20.330089 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 20.375491 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.422220 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 20.475078 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.526058 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.577651 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 20.628505 {3}{RequestManager}: Output token is: 7013\n", + "[0 - 7ff168092740] 20.681354 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 20.734160 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 20.786299 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 20.837268 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 20.888265 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 20.939708 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 20.990707 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.041260 {3}{RequestManager}: Output token is: 18742\n", + "[0 - 7ff1680b6740] 21.091386 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.145432 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff168092740] 21.197149 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 21.249242 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 21.301514 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 21.352632 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 21.404018 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.455101 {3}{RequestManager}: Output token is: 56994\n", + "[0 - 7ff1680b6740] 21.506371 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.559369 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 21.611370 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff168092740] 21.663655 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff1680b6740] 21.715270 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff168092740] 21.766481 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff168092740] 21.818563 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff168092740] 21.872108 
{3}{RequestManager}: Output token is: 29505\n", + "[0 - 7ff168092740] 21.922670 {3}{RequestManager}: Output token is: 30\n", + "[0 - 7ff168092740] 21.973973 {3}{RequestManager}: Output token is: 8595\n", + "[0 - 7ff1680b6740] 22.024297 {3}{RequestManager}: Output token is: 656\n", + "[0 - 7ff1680b6740] 22.076266 {3}{RequestManager}: Output token is: 1063\n", + "[0 - 7ff168092740] 22.127594 {3}{RequestManager}: Output token is: 10099\n", + "[0 - 7ff1680b6740] 22.179008 {3}{RequestManager}: Output token is: 617\n", + "[0 - 7ff1680b6740] 22.230414 {3}{RequestManager}: Output token is: 1317\n", + "[0 - 7ff1680b6740] 22.281805 {3}{RequestManager}: Output token is: 993\n", + "[0 - 7ff1680b6740] 22.282235 {3}{RequestManager}: [Done] guid(1000000) final_length(128)\n", + "[0 - 7ff1680b6740] 22.282243 {3}{RequestManager}: Final output: <|begin_of_text|>Why can camels survive for long without water? What is the reason behind the long neck of giraffes? Why do some animals have long tails? Why do some animals have long legs? Why do some animals have long ears? Why do some animals have long noses? Why do some animals have long whiskers? Why do some animals have long tongues? Why do some animals have long claws? Why do some animals have long teeth? Why do some animals have long hair? Why do some animals have long fur? Why do some animals have long feathers? Why do some animals have long scales? Why do some animals have long sp\n", + "[0 - 7ff1680b6740] 22.282250 {3}{RequestManager}: [Profile] guid(1000000) llm_decoding_steps(117) start(15892528.0) finish(22282245.0) latency(6389717.0) ttft(15123707.0)\n", + "2024-07-22 06:43:05 - ###PEFT DEBUGGING### Background serving task completed.\n", + "Background server stopped.\n" + ] + } + ], + "source": [ + "import json, random, subprocess, os\n", + "from datasets import load_dataset\n", + "from types import SimpleNamespace\n", + "from huggingface_hub import HfFolder\n", + "import flexflow.serve as ff\n", + "import matplotlib.pyplot as plt\n", + "\n", + "configs_dict = {\n", + " \"num_gpus\": 1,\n", + " \"memory_per_gpu\": 21000,\n", + " \"zero_copy_memory_per_node\": 40000,\n", + " \"num_cpus\": 4,\n", + " \"legion_utility_processors\": 4,\n", + " \"data_parallelism_degree\": 1,\n", + " \"tensor_parallelism_degree\": 1,\n", + " \"pipeline_parallelism_degree\": 1,\n", + " \"offload\": False,\n", + " \"offload_reserve_space_size\": 8 * 1024, # 8GB\n", + " \"use_4bit_quantization\": False,\n", + " \"use_8bit_quantization\": False,\n", + " \"enable_peft\": True,\n", + " \"peft_activation_reserve_space_size\": 1024, # 1GB\n", + " \"peft_weight_reserve_space_size\": 1024, # 1GB\n", + " \"profiling\": False,\n", + " \"inference_debugging\": False,\n", + " \"fusion\": False,\n", + " \"max_requests_per_batch\": 1,\n", + " \"max_sequence_length\": 128,\n", + " \"max_tokens_per_batch\": 128,\n", + " \"max_training_steps\": 100,\n", + " \"seed\": 42,\n", + "}\n", + "model_configs = {\n", + " \"base_model\": \"meta-llama/Meta-Llama-3-8B\",\n", + " \"inference_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"finetuning_peft_model_id\": \"goliaro/llama-3-8b-lora\",\n", + " \"cache_path\": os.environ.get(\"FF_CACHE_PATH\", \"\"),\n", + " \"refresh_cache\": False,\n", + " \"full_precision\": False,\n", + " # relative paths\n", + " \"inference_dataset\": \"inference_dataset.json\",\n", + " \"finetuning_dataset\": \"/usr/FlexFlow/inference/prompt/peft_dataset.json\",\n", + " \"output_file\": \"peft_demo.txt\",\n", + "}\n", + "generation_configs = {\n", + " \"do_sample\": 
False,\n", + " \"temperature\": 0.9,\n", + " \"topp\": 0.8,\n", + " \"topk\": 1,\n", + "}\n", + "finetuning_configs = {\n", + " \"learning_rate\": 0.001,\n", + " \"momentum\": 0.0,\n", + " \"weight_decay\": 0.0,\n", + " \"nesterov\": False,\n", + "}\n", + "# Merge dictionaries\n", + "configs_dict.update(model_configs)\n", + "configs_dict.update(generation_configs)\n", + "configs_dict.update(finetuning_configs)\n", + "\n", + "configs = SimpleNamespace(**configs_dict)\n", + "\n", + "\n", + "args = [configs.finetuning_peft_model_id+\"-dolly\", '--base_model_name', configs.base_model]\n", + "subprocess.run(['python', '../../utils/download_peft_model.py'] + args)\n", + "\n", + "# Initialize the FlexFlow runtime. ff.init() takes a dictionary or the path to a JSON file with the configs\n", + "ff.init(configs_dict)\n", + "\n", + "# Create the FlexFlow LLM\n", + "ff_data_type = (\n", + " ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF\n", + ")\n", + "llm = ff.LLM(\n", + " configs.base_model,\n", + " data_type=ff_data_type,\n", + " cache_path=configs.cache_path,\n", + " refresh_cache=configs.refresh_cache,\n", + " output_file=configs.output_file,\n", + ")\n", + "\n", + "lora_inference_config2 = ff.LoraLinearConfig(\n", + " llm.cache_path, \n", + " configs.finetuning_peft_model_id+\"-dolly\",\n", + " base_model_name_or_path=configs.base_model\n", + ")\n", + "llm.add_peft(lora_inference_config2)\n", + "\n", + "\n", + "# Compile the LLM for inference and load the weights into memory\n", + "generation_config = ff.GenerationConfig(\n", + " do_sample=configs.do_sample,\n", + " temperature=configs.temperature,\n", + " topp=configs.topp,\n", + " topk=configs.topk\n", + ")\n", + "llm.compile(\n", + " generation_config,\n", + " max_requests_per_batch=configs.max_requests_per_batch,\n", + " max_seq_length=configs.max_sequence_length,\n", + " max_tokens_per_batch=configs.max_tokens_per_batch,\n", + ")\n", + "\n", + "llm.start_server()\n", + "\n", + "prompts = [s for s in json.load(open(configs.inference_dataset))]\n", + "inference_requests = [\n", + " ff.Request(\n", + " ff.RequestType.REQ_INFERENCE,\n", + " prompt=prompt,\n", + " max_sequence_length=configs.max_sequence_length,\n", + " peft_model_id=llm.get_ff_peft_id(lora_inference_config2),\n", + " )\n", + " for prompt in prompts\n", + "]\n", + "inf_req_res_2 = llm.generate(inference_requests)\n", + "\n", + "llm.stop_server()\n", + "\n", + "with open(\"after_finetuning.txt\", \"w\") as file:\n", + " file.write(str(inf_req_res_2[0].output_text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/inference/python/peft_demo/demo.py b/inference/python/peft_demo/demo.py new file mode 100644 index 0000000000..9e01b4645b --- /dev/null +++ b/inference/python/peft_demo/demo.py @@ -0,0 +1,240 @@ +import json, random, subprocess +from datasets import load_dataset +from types import SimpleNamespace +from huggingface_hub import HfFolder +import os +import flexflow.serve as ff +import matplotlib.pyplot as plt + + +def create_datasets(finetune_dataset_size=2, 
inference_file_path='inference_dataset.json', finetuning_file_path='finetuning_dataset.json'): + """Creates the inference and finetuning datasets according to the data from https://huggingface.co/datasets/databricks/databricks-dolly-15k. + Only the 'open_qa' and 'closed_qa' prompts without context are kept. + The datasets are saved into the files given as arguments. + + Keyword arguments: + dataset_size -- the number of prompts to consider + inference_file_path -- the file in which to save the inference data + finetuning_file_path -- the file in which to save the finetuning data + """ + dataset = load_dataset("databricks/databricks-dolly-15k", split="train") + inference_data = [] + finetuning_data = [] + for row in dataset: + if len(finetuning_data) == finetune_dataset_size: + break + if ("open_qa" in row['category'] or "closed_qa" in row['category']) and len(row['context']) == 0: + inference_data.append(row['instruction']) + finetuning_data.append(row['instruction'] + " " + row['response']) + with open(inference_file_path, 'w') as file: + json.dump(inference_data[:1], file) + with open(finetuning_file_path, 'w') as file: + json.dump(finetuning_data[:1], file, indent=2, separators=(',', ': ')) + + +configs_dict = { + "num_gpus": 1, + "memory_per_gpu": 21000, + "zero_copy_memory_per_node": 40000, + "num_cpus": 4, + "legion_utility_processors": 4, + "data_parallelism_degree": 1, + "tensor_parallelism_degree": 1, + "pipeline_parallelism_degree": 1, + "offload": False, + "offload_reserve_space_size": 8 * 1024, # 8GB + "use_4bit_quantization": False, + "use_8bit_quantization": False, + "enable_peft": True, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB + "profiling": False, + "inference_debugging": False, + "fusion": False, + "max_requests_per_batch": 1, + "max_sequence_length": 128, + "max_tokens_per_batch": 128, + "max_training_steps": 100, + "seed": 42, +} +model_configs = { + "base_model": "meta-llama/Meta-Llama-3-8B", + "inference_peft_model_id": "goliaro/llama-3-8b-lora", + "finetuning_peft_model_id": "goliaro/llama-3-8b-lora", + "cache_path": os.environ.get("FF_CACHE_PATH", ""), + "refresh_cache": False, + "full_precision": False, + # relative paths + "inference_dataset": "inference_dataset.json", + "finetuning_dataset": "/usr/FlexFlow/inference/prompt/peft_dataset.json", + "output_file": "peft_demo.txt", +} +generation_configs = { + "do_sample": False, + "temperature": 0.9, + "topp": 0.8, + "topk": 1, +} +finetuning_configs = { + "learning_rate": 0.001, + "momentum": 0.0, + "weight_decay": 0.0, + "nesterov": False, +} +# Merge dictionaries +configs_dict.update(model_configs) +configs_dict.update(generation_configs) +configs_dict.update(finetuning_configs) + + +random.seed(configs_dict["seed"]) + +create_datasets(inference_file_path=configs_dict["inference_dataset"], + finetuning_file_path=configs_dict["finetuning_dataset"]) + +configs = SimpleNamespace(**configs_dict) + +# Clear output file +with open(configs.output_file, 'w') as file: + file.write('') + +# Download base and peft inference models +args = [configs.inference_peft_model_id, '--base_model_name', configs.base_model] +# hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +subprocess.run(['python', '../../utils/download_peft_model.py'] + args) + + +# Initialize the FlexFlow runtime. 
ff.init() takes a dictionary or the path to a JSON file with the configs +ff.init(configs_dict) + +# Create the FlexFlow LLM +ff_data_type = ( + ff.DataType.DT_FLOAT if configs.full_precision else ff.DataType.DT_HALF +) +llm = ff.LLM( + configs.base_model, + data_type=ff_data_type, + cache_path=configs.cache_path, + refresh_cache=configs.refresh_cache, + output_file=configs.output_file, +) +# Add inference and/or finetuning lora +lora_inference_config = None +lora_finetuning_config = None +if len(configs.inference_dataset) > 0: + lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.inference_peft_model_id, + base_model_name_or_path=configs.base_model + ) + llm.add_peft(lora_inference_config) +if len(configs.finetuning_dataset) > 0: + lora_finetuning_config = ff.LoraLinearConfig( + llm.cache_path, + configs.finetuning_peft_model_id, + trainable=True, + init_lora_weights=False, + rank=16, + lora_alpha=16.0, + # target_modules = ["down_proj"], + base_model_name_or_path=configs.base_model, + optimizer_type=ff.OptimizerType.OPTIMIZER_TYPE_SGD, + optimizer_kwargs={ + "learning_rate": configs.learning_rate, + "momentum": configs.momentum, + "weight_decay": configs.weight_decay, + "nesterov": configs.nesterov, + }, + ) + llm.add_peft(lora_finetuning_config) + +# Compile the LLM for inference and load the weights into memory +generation_config = ff.GenerationConfig( + do_sample=configs.do_sample, + temperature=configs.temperature, + topp=configs.topp, + topk=configs.topk +) +enable_peft_finetuning = len(configs.finetuning_dataset) > 0 +llm.compile( + generation_config, + enable_peft_finetuning=enable_peft_finetuning, + max_requests_per_batch=configs.max_requests_per_batch+int(enable_peft_finetuning), + max_seq_length=configs.max_sequence_length, + max_tokens_per_batch=configs.max_tokens_per_batch, +) + + +llm.start_server() + + +# prompts = [s for s in json.load(open(configs.inference_dataset))] +# inference_requests = [ +# ff.Request( +# ff.RequestType.REQ_INFERENCE, +# prompt=prompt, +# max_sequence_length=configs.max_sequence_length, +# peft_model_id=llm.get_ff_peft_id(lora_inference_config), +# ) +# for prompt in prompts +# ] +# inf_req_res_1 = llm.generate(inference_requests) + + +finetuning_request = ff.Request( + ff.RequestType.REQ_FINETUNING, + max_sequence_length=configs.max_sequence_length, + peft_model_id=llm.get_ff_peft_id(lora_finetuning_config), + dataset_filepath=os.path.join(os.getcwd(), configs.finetuning_dataset), + max_training_steps=configs.max_training_steps, +) +ft_res = llm.generate([finetuning_request]) +for res in ft_res: + print(res.finetuning_losses) + +# exit(0) +# hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +subprocess.run(['python', '../../utils/upload_peft_model.py'] + f"--peft-model-id {configs.finetuning_peft_model_id} --upload-peft-model-id {configs.finetuning_peft_model_id}-dolly".split()) + + + +lora_inference_config = ff.LoraLinearConfig( + llm.cache_path, + configs.finetuning_peft_model_id, + base_model_name_or_path=configs.base_model +) +llm.add_peft(lora_inference_config) + +args = [configs.finetuning_peft_model_id, '--base_model_name', configs.base_model] +#hf_token = input("Please enter your HuggingFace personal access token: ") +# subprocess.run(['huggingface-cli', 'login', '--token', hf_token]) +# subprocess.run(['python', '../../utils/download_peft_model.py'] + args) + + +prompts = [s for s in json.load(open(configs.inference_dataset))] 
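For reference, the finetuning request above reads its training data from a plain JSON list of strings, each entry being an instruction followed by its response, which is exactly what create_datasets() writes. A minimal sketch that produces such a file by hand (the two sample strings below are made up):

import json

# Each entry is one training text: instruction + " " + response, mirroring create_datasets().
samples = [
    "Why can camels survive for long without water? They store fat in their humps and lose very little water.",
    "What is the capital of France? The capital of France is Paris.",
]
with open("finetuning_dataset.json", "w") as file:
    json.dump(samples, file, indent=2, separators=(",", ": "))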
+inference_requests = [ + ff.Request( + ff.RequestType.REQ_INFERENCE, + prompt=prompt, + max_sequence_length=configs.max_sequence_length, + peft_model_id=llm.get_ff_peft_id(lora_inference_config), + ) + for prompt in prompts +] +inf_req_res_2 = llm.generate(inference_requests) + + +llm.stop_server() + + +print("==Inference result before finetuning: ", inf_req_res_1[0].output_text) +print("==Inference result after finetuning: ", inf_req_res_2[0].output_text) + + +epochs = list(range(configs_dict["max_training_steps"])) +loss_values = ft_res[0].finetuning_losses + +plt.figure(figsize=(10, 6)) +plt.plot(epochs, loss_values, marker='o', linestyle='-', color='b') \ No newline at end of file diff --git a/inference/python/spec_infer.py b/inference/python/spec_infer.py index a6dfa8042e..39529abda3 100644 --- a/inference/python/spec_infer.py +++ b/inference/python/spec_infer.py @@ -51,9 +51,12 @@ def get_configs(): "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 2, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, @@ -76,7 +79,7 @@ def get_configs(): "full_precision": False, } ], - # "prompt": "", + "prompt": "", "output_file": "", } # Merge dictionaries diff --git a/inference/spec_infer/spec_infer.cc b/inference/spec_infer/spec_infer.cc index 60233ac8d1..9689080825 100644 --- a/inference/spec_infer/spec_infer.cc +++ b/inference/spec_infer/spec_infer.cc @@ -414,15 +414,18 @@ void FlexFlow::top_level_task(Task const *task, /*allow_exceptions */ true, /*ignore_comments */ true); - std::vector prompts; + std::vector requests; for (auto &prompt : prompt_json) { std::string text = prompt.get(); printf("Prompt[%d]: %s\n", total_num_requests, text.c_str()); + // Add inference request + Request inference_req; + inference_req.prompt = text; + inference_req.max_sequence_length = 128; + requests.push_back(inference_req); total_num_requests++; - prompts.push_back(text); - // tree_model.generate(text, 128 /*max_sequence_length*/); } - tree_model.generate(prompts, 128 /*max_sequence_length*/); + tree_model.generate(requests); } // terminate the request manager by stopping the background thread diff --git a/inference/utils/download_peft_model.py b/inference/utils/download_peft_model.py new file mode 100644 index 0000000000..38dd577574 --- /dev/null +++ b/inference/utils/download_peft_model.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +import flexflow.serve as ff +import argparse, os + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--base_model_name", type=str, help="Name of the model to download" + ) + parser.add_argument( + "peft_model_ids", + type=str, + nargs="+", + help="Name of the PEFT model(s) to download", + ) + parser.add_argument( + "--cache-folder", + type=str, + help="Folder to use to store the model(s) assets in FlexFlow format", + default=os.environ.get("FF_CACHE_PATH", ""), + ) + parser.add_argument( + "--refresh-cache", + action="store_true", + help="Use this flag to force the refresh of the model(s) weights/tokenizer cache", + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + "--full-precision-only", + action="store_true", + help="Only download the full precision version of the weights", + ) + 
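The loss plot at the end of demo.py pairs range(max_training_steps) with the returned losses, which assumes exactly one loss per configured step. A small self-contained variant (placeholder loss values and a hypothetical output filename) that derives the x-axis from the returned data and writes the figure to disk:

import matplotlib.pyplot as plt

# Placeholder values standing in for ft_res[0].finetuning_losses.
loss_values = [2.10, 1.84, 1.67, 1.55, 1.48]
steps = list(range(len(loss_values)))  # x-axis taken from the data itself, not max_training_steps

plt.figure(figsize=(10, 6))
plt.plot(steps, loss_values, marker="o", linestyle="-", color="b")
plt.xlabel("finetuning step")
plt.ylabel("loss")
plt.savefig("finetuning_loss.png")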
group.add_argument( + "--half-precision-only", + action="store_true", + help="Only download the half precision version of the weights", + ) + args = parser.parse_args() + return args + + +def main(args): + if args.full_precision_only: + data_types = (ff.DataType.DT_FLOAT,) + elif args.half_precision_only: + data_types = (ff.DataType.DT_HALF,) + else: + data_types = (ff.DataType.DT_FLOAT, ff.DataType.DT_HALF) + + for data_type in data_types: + llm = ff.LLM( + args.base_model_name, + data_type=data_type, + cache_path=args.cache_folder, + refresh_cache=args.refresh_cache, + ) + for peft_model_id in args.peft_model_ids: + lora_config = ff.LoraLinearConfig(llm.cache_path, peft_model_id) + llm.add_peft(lora_config) + llm.download_hf_weights_if_needed() + llm.download_hf_config() + llm.download_hf_tokenizer_if_needed() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/inference/utils/upload_peft_model.py b/inference/utils/upload_peft_model.py new file mode 100644 index 0000000000..7098d72f98 --- /dev/null +++ b/inference/utils/upload_peft_model.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +import argparse, os +from huggingface_hub import HfApi, HfFolder +from transformers import AutoModelForCausalLM +from peft import LoraConfig, PeftModel +import torch +import numpy as np +import flexflow.serve as ff +from peft import LoraConfig, get_peft_model + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Download a PEFT model with FlexFlow, process it, and upload it to the Hugging Face Hub." + ) + parser.add_argument( + "--peft-model-id", + type=str, + required=True, + help="(Local) Hugging Face model ID of the PEFT model to upload.", + ) + parser.add_argument( + "--upload-peft-model-id", + type=str, + required=True, + help="(Remote) Hugging Face model ID of the PEFT model to upload.", + ) + parser.add_argument( + "--cache-folder", + type=str, + default=os.environ.get( + "FF_CACHE_PATH", os.path.expanduser("~/.cache/flexflow") + ), + help="Path to the FlexFlow cache folder", + ) + parser.add_argument( + "--private", + action="store_true", + help="Whether to upload the processed PEFT model as a private model on Hugging Face Hub.", + ) + return parser.parse_args() + + +def main(): + args = parse_args() + + # Ensure Hugging Face CLI is logged in + if not HfFolder.get_token(): + raise RuntimeError( + "Hugging Face token not found. Please login using `huggingface-cli login`." 
+ ) + + lora_config_filepath = os.path.join( + args.cache_folder, + "finetuned_models", + args.peft_model_id, + "config", + "ff_config.json", + ) + peft_config = ff.LoraLinearConfig.from_jsonfile(lora_config_filepath) + print(peft_config) + hf_peft_config = peft_config.to_hf_config() + print(hf_peft_config) + if peft_config.precision != "fp32" and peft_config.precision != "fp16": + raise ValueError(f"Unsupported precision: {peft_config.precision}") + model = AutoModelForCausalLM.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if peft_config.precision == "fp32" else torch.float16, + device_map="auto", + ) + model = get_peft_model(model, hf_peft_config) + in_dim = model.config.intermediate_size + out_dim = model.config.hidden_size + + weight_folder = os.path.join( + args.cache_folder, "finetuned_models", args.peft_model_id, "weights", "shard_0" + ) + num_shards = 1 + while os.path.exists(weight_folder.replace("shard_0", f"shard_{num_shards}")): + num_shards += 1 + if not in_dim % num_shards == 0: + raise ValueError( + f"Number of shards ({num_shards}) must divide the input dimension ({in_dim})" + ) + lora_weight_files = os.listdir(weight_folder) + for lora_file in sorted(lora_weight_files): + lora_filename = ".weight".join(lora_file.split(".weight")[:-1]) + hf_parameter_name = f"base_model.model.model.{lora_filename}.default.weight" + if hf_parameter_name not in model.state_dict().keys(): + raise KeyError(f"Parameter {lora_file} not found in HF model.") + + ff_dtype = np.float32 if peft_config.precision == "fp32" else np.float16 + weight_path = os.path.join(weight_folder, lora_file) + # LoRA_A: [in_dim, rank] + # LoRA_B: [rank, out_dim] + if "lora_A" in lora_file: + weight_data = [] + for shard_id in range(num_shards): + weight_path_shard = weight_path.replace("shard_0", f"shard_{shard_id}") + weight_data_shard = np.fromfile(weight_path_shard, dtype=ff_dtype) + print("===in_dim:", in_dim) + print("===out_dim:", out_dim) + print("===rank:", peft_config.rank) + print("===num_shards:", num_shards) + weight_data_shard = weight_data_shard.reshape( + (in_dim // num_shards, peft_config.rank), order="F" + ) + weight_data.append(weight_data_shard) + weight_data = np.concatenate(weight_data, axis=0).T + elif "lora_B" in lora_file: + weight_data = np.fromfile(weight_path, dtype=ff_dtype) + weight_data = weight_data.reshape((peft_config.rank, out_dim), order="F").T + weight_tensor = torch.from_numpy(weight_data) + + param = model.state_dict()[hf_parameter_name] + + actual_numel = weight_tensor.numel() + expected_numel = param.numel() + if actual_numel != expected_numel: + raise ValueError( + f"Parameter {lora_file} has unexpected parameter count: {actual_numel} (actual) != {expected_numel} (expected)" + ) + + if weight_tensor.shape != param.shape: + raise ValueError( + f"Parameter {lora_file} has unexpected shape: {weight_tensor.shape} (actual) != {param.shape} (expected)" + ) + if weight_tensor.dtype != param.dtype: + raise ValueError( + f"Parameter {lora_file} has unexpected dtype: {weight_tensor.dtype} (actual) != {param.dtype} (expected)" + ) + + with torch.no_grad(): + param.copy_(weight_tensor) + + model.push_to_hub(f"{args.upload_peft_model_id}", use_auth_token=True, private=args.private) + + print("Upload process completed.") + + +if __name__ == "__main__": + main() diff --git a/python/flexflow/core/__init__.py b/python/flexflow/core/__init__.py index 2820cf485a..b8ed15eaea 100644 --- a/python/flexflow/core/__init__.py +++ b/python/flexflow/core/__init__.py @@ 
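upload_peft_model.py rebuilds Hugging Face-layout LoRA tensors from FlexFlow's flat shard files through a column-major reshape followed by a transpose. A small numpy round-trip check of that conversion for LoRA_A, under the same layout assumption the script encodes in its comments (a [in_dim, rank] matrix stored column-major, with the input dimension split across shards); the dimensions are toy values:

import numpy as np

rank, in_dim, num_shards = 4, 12, 2  # toy sizes; the real ones come from the model config

# Hugging Face PEFT keeps lora_A.weight as a row-major [rank, in_dim] matrix.
hf_lora_A = np.arange(rank * in_dim, dtype=np.float32).reshape(rank, in_dim)

# Assumed FlexFlow shard layout: each shard holds its [in_dim / num_shards, rank]
# slice of the transposed matrix, flattened in column-major (Fortran) order.
rows_per_shard = in_dim // num_shards
shards = [
    hf_lora_A.T[i * rows_per_shard:(i + 1) * rows_per_shard].flatten(order="F")
    for i in range(num_shards)
]

# Reconstruction, mirroring the lora_A branch of upload_peft_model.py.
pieces = [s.reshape((rows_per_shard, rank), order="F") for s in shards]
recovered = np.concatenate(pieces, axis=0).T  # back to [rank, in_dim]

assert np.array_equal(recovered, hf_lora_A)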
-88,7 +88,10 @@ "offload": "-offload", "offload_reserve_space_size": "-offload-reserve-space-size", "use_4bit_quantization": "--4bit-quantization", - "use_8bit_quantization": "--8bit-quantization" + "use_8bit_quantization": "--8bit-quantization", + "enable_peft": "-enable-peft", + "peft_activation_reserve_space_size": "-peft-activation-reserve-space-size", + "peft_weight_reserve_space_size": "-peft-weight-reserve-space-size", } diff --git a/python/flexflow/core/flexflow_cffi.py b/python/flexflow/core/flexflow_cffi.py index 14cf4eebf7..7692ccb88f 100644 --- a/python/flexflow/core/flexflow_cffi.py +++ b/python/flexflow/core/flexflow_cffi.py @@ -28,6 +28,8 @@ CompMode, MetricsType, InferenceMode, + RequestType, + OptimizerType, ModelType, OpType, ParameterSyncType, @@ -36,6 +38,9 @@ ) from flexflow.config import * from .flexflowlib import ffi, flexflow_library +from typing import Union, List +from peft import LoraConfig +import json def ffc(): @@ -1243,1009 +1248,935 @@ def get_weights(self, ffmodel): # ----------------------------------------------------------------------- -# FFModel +# SGDOptimizer # ----------------------------------------------------------------------- -class FFModel(object): - """ """ +class SGDOptimizer(object): + __slots__ = ["handle", "_handle"] - __slots__ = [ - "handle", - "_handle", - "_layers", - "_nb_layers", - "_ffconfig", - "_tracing_id", - "initializers", - "attr_tensors", - ] + def __init__( + self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 + ): + self.handle = ffc().flexflow_sgd_optimizer_create( + ffmodel.handle, lr, momentum, nesterov, weight_decay + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) - def __init__(self, ffconfig): - """Constructor of FFModel. + def set_learning_rate(self, learning_rate): + ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) - :param ffconfig: configurations of FlexFlow and the created model. - :type ffconfig: FFConfig - :returns: FFModel -- the model. - """ - self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) - self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) - self._layers = dict() - self._nb_layers = 0 - self._ffconfig = ffconfig - global ff_tracing_id - self._tracing_id = ff_tracing_id - ff_tracing_id += 1 - self.initializers = {} - self.attr_tensors = {} +# ----------------------------------------------------------------------- +# AdamOptimizer +# ----------------------------------------------------------------------- - def get_layers(self): - return self._layers - def add_layer(self, op_type, name): - layer_id = self._nb_layers - op_handle = ffc().flexflow_model_get_last_layer(self.handle) - self._layers[self._nb_layers] = convert_op_handle_to_op( - op_type, op_handle, idx=layer_id, name=name +class AdamOptimizer(object): + __slots__ = ["handle", "_handle"] + + def __init__( + self, + ffmodel, + alpha=0.001, + beta1=0.9, + beta2=0.999, + weight_decay=0.0, + epsilon=1e-8, + ): + self.handle = ffc().flexflow_adam_optimizer_create( + ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon ) - self._nb_layers += 1 + self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) - def create_tensor(self, dims, data_type, create_grad=True): - """Instantiate a FlexFlow tensor. + def set_learning_rate(self, learning_rate): + ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) - :param x: a shape tuple/list (integers), including the batch size. 
- :type x: list of int - :param data_type: the datatype of the created tensor. Options are - DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. - :type data_type: DataType +# ----------------------------------------------------------------------- +# Initializer +# ----------------------------------------------------------------------- +class Initializer(object): + __slots__ = ["handle", "p_handle"] - :param create_grad: weather the tensor creates a gradients vector. - If you don't specify anything, a gradients vector is used. - :type create_grad: bool + def __init__(self, handle, p_handle=0): + self.p_handle = ffi.new("flexflow_initializer_t *") + if handle == None: + self.p_handle.impl = ffi.NULL + else: + self.p_handle.impl = handle.impl + self.handle = self.p_handle[0] + assert ffi.typeof(self.handle) == ffi.typeof( + "flexflow_initializer_t" + ), "Initializer handle is wrong" - :returns: Tensor -- the output tensor. - """ - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_tensor_create( - self.handle, num_dims, c_dims, c_data_type, create_grad - ) - return Tensor(handle) - def map_tensor(self, tensor, parallel_op=None): - op_handle = self.__get_op_handle(parallel_op) - ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) +# ----------------------------------------------------------------------- +# GlorotUniform +# ----------------------------------------------------------------------- - def create_constant(self, dims, value, data_type): - c_dims = ffi.new("int[]", dims) - c_data_type = enum_to_int(DataType, data_type) - num_dims = len(dims) - handle = ffc().flexflow_constant_create( - self.handle, num_dims, c_dims, value, c_data_type - ) - return Tensor(handle) - def exp(self, x, name=None): - """Exponential activation function. +class GlorotUniformInitializer(Initializer): + __slots__ = ["glorot_handle", "_glorot_handle"] - :param x: the input Tensor. - :type x: Tensor + def __init__(self, seed): + self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) + self._glorot_handle = ffi.gc( + self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + ) + super(GlorotUniformInitializer, self).__init__(self.glorot_handle) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) - self.add_layer(OpType.EXP, name) - return Tensor(handle, owner_op_type=OpType.EXP) +# ----------------------------------------------------------------------- +# ZeroInitializer +# ----------------------------------------------------------------------- - def sin(self, x, name=None): - """Elementwise sine function. - :param x: the input Tensor. - :type x: Tensor +class ZeroInitializer(Initializer): + __slots__ = ["zero_handle", "_zero_handle"] - :param name: the name of the layer. Default is None. - :type name: string + def __init__(self): + self.zero_handle = ffc().flexflow_zero_initializer_create() + self._zero_handle = ffi.gc( + self.zero_handle, ffc().flexflow_zero_initializer_destroy + ) + super(ZeroInitializer, self).__init__(self.zero_handle) - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) - self.add_layer(OpType.SIN, name) - return Tensor(handle, owner_op_type=OpType.SIN) - def cos(self, x, name=None): - """Elementwise cosine function. +# ----------------------------------------------------------------------- +# UniformInitializer +# ----------------------------------------------------------------------- - :param x: the input Tensor. - :type x: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class UniformInitializer(Initializer): + __slots__ = ["uniform_handle", "_uniform_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) - self.add_layer(OpType.COS, name) - return Tensor(handle, owner_op_type=OpType.COS) + def __init__(self, seed, minv, maxv): + self.uniform_handle = ffc().flexflow_uniform_initializer_create( + seed, minv, maxv + ) + self._uniform_handle = ffi.gc( + self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + ) + super(UniformInitializer, self).__init__(self.uniform_handle) - def add(self, x, y, inplace_a=False, name=None): - """Layer that adds two input Tensors, :attr:`output = x + y`. - :param x: the first input Tensor. - :type x: Tensor +# ----------------------------------------------------------------------- +# NormInitializer +# ----------------------------------------------------------------------- - :param y: the second input Tensor. - :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +class NormInitializer(Initializer): + __slots__ = ["norm_handle", "_norm_handle"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_add( - self.handle, x.handle, y.handle, inplace_a, c_name + def __init__(self, seed, mean, stddev): + self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) + self._norm_handle = ffi.gc( + self.norm_handle, ffc().flexflow_norm_initializer_destroy ) - self.add_layer(OpType.ADD, name) - return Tensor(handle, owner_op_type=OpType.ADD) - - def subtract(self, x, y, inplace_a=False, name=None): - """Layer that subtracts two input Tensors, :attr:`output = x * y`. + super(NormInitializer, self).__init__(self.norm_handle) - :param x: the first input Tensor. - :type x: Tensor - :param y: the second input Tensor. - :type y: Tensor +# ----------------------------------------------------------------------- +# PerfMetrics +# ----------------------------------------------------------------------- - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_subtract( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.SUBTRACT, name) - return Tensor(handle, owner_op_type=OpType.SUBTRACT) +class PerfMetrics(object): + __slots__ = ["handle", "_handle"] - def multiply(self, x, y, inplace_a=False, name=None): - """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. + def __init__(self, handle): + self.handle = handle + self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) - :param x: the first input Tensor. - :type x: Tensor + def get_accuracy(self): + return ffc().flexflow_per_metrics_get_accuracy(self.handle) - :param y: the second input Tensor. 
- :type y: Tensor - :param name: the name of the layer. Default is None. - :type name: string +# ----------------------------------------------------------------------- +# NetConfig +# ----------------------------------------------------------------------- - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_multiply( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.MULTIPLY) - def divide(self, x, y, inplace_a=False, name=None): - """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - - :param x: the first input Tensor. - :type x: Tensor - - :param y: the second input Tensor. - :type y: Tensor +class NetConfig(object): + def __init__(self): + self.handle = ffc().flexflow_net_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) + cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cpath) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_divide( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.DIVIDE, name) - return Tensor(handle, owner_op_type=OpType.DIVIDE) +# ----------------------------------------------------------------------- +# DLRMConfig +# ----------------------------------------------------------------------- - def max(self, x, y, inplace_a=False, name=None): - """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - :param x: the first input Tensor. - :type x: Tensor +class DLRMConfig(object): + def __init__(self): + self.handle = ffc().flexflow_dlrm_config_create() + self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) - :param y: the second input Tensor. - :type y: Tensor + cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) + self.dataset_path = ffi.string(cstr) - :param name: the name of the layer. Default is None. - :type name: string + cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) + self.arch_interaction_op = ffi.string(cstr) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_max( - self.handle, x.handle, y.handle, inplace_a, c_name + self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( + self.handle ) - self.add_layer(OpType.MAX, name) - return Tensor(handle, owner_op_type=OpType.MAX) + self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) + self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) + self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( + self.handle + ) + self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) - def min(self, x, y, inplace_a=False, name=None): - """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. + mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) + self.mlp_bot = [] + for i in range(0, mlp_bot_c[0]): + self.mlp_bot.append(mlp_bot_c[i + 1]) - :param x: the first input Tensor. - :type x: Tensor + mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) + self.mlp_top = [] + for i in range(0, mlp_top_c[0]): + self.mlp_top.append(mlp_top_c[i + 1]) - :param y: the second input Tensor. 
- :type y: Tensor + embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) + self.embedding_size = [] + for i in range(0, embedding_size_c[0]): + self.embedding_size.append(embedding_size_c[i + 1]) - :param name: the name of the layer. Default is None. - :type name: string - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_min( - self.handle, x.handle, y.handle, inplace_a, c_name - ) - self.add_layer(OpType.MIN, name) - return Tensor(handle, owner_op_type=OpType.MIN) +# ----------------------------------------------------------------------- +# Single DataLoader +# ----------------------------------------------------------------------- - def reduce_sum(self, input, axes, keepdims=False, name=None): - """Layer that computes the sum of the input Tensor along given axes. - :param input: the input Tensor. - :type input: Tensor +class SingleDataLoader(object): + __slots__ = ["handle", "_handle"] - :param axes: the axes along which reduction is applied - :type axes: List[int] + def __init__(self, ffmodel, input, full_input, num_samples, data_type): + assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" + assert type(input) is Tensor, "SingleDataLoader input is wrong" + if type(full_input) is Tensor: + self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) + else: + self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) + self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) - :param name: the name of the layer. Default is None. - :type name: string + def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): + assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create( + ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type + ) - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_reduce_sum( - self.handle, input.handle, c_axes, len(axes), keepdims, c_name + def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): + # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" + c_data_type = enum_to_int(DataType, data_type) + self.handle = ffc().flexflow_single_dataloader_create2( + ffmodel.handle, input.handle, full_input, num_samples, c_data_type ) - self.add_layer(OpType.REDUCE_SUM, name) - return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - def rsqrt(self, input, name=None): - """Layer that computes the element-wise reciprocal square-root. + @property + def num_samples(self): + return ffc().flexflow_single_dataloader_get_num_samples(self.handle) - :param input: the input Tensor. - :type input: Tensor + @num_samples.setter + def num_samples(self, samples): + ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) - :param name: the name of the layer. Default is None. - :type name: string + def next_batch(self, ffmodel): + """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. - :returns: Tensor -- the output tensor. + :returns: None -- no returns. 
""" - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) - self.add_layer(OpType.RSQRT, name) - return Tensor(handle, owner_op_type=OpType.RSQRT) + ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) - def pow(self, input, exponent, name=None): - """Layer that computes the element-wise power. + def reset(self): + """Reset the current position of the dataloder to 0. - :param input: the input Tensor. - :type input: Tensor + :returns: None -- no returns. + """ + ffc().flexflow_single_dataloader_reset(self.handle) - :param exponent: exponent to raise each element in the input tensor. - :type exponent: float - :param name: the name of the layer. Default is None. - :type name: string +class RegionNdarray(object): + __slots__ = ["__array_interface__"] - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_pow( - self.handle, input.handle, exponent, c_name - ) - self.add_layer(OpType.POW, name) - return Tensor(handle, owner_op_type=OpType.POW) + def __init__(self, shape, data_type, base_ptr, strides, read_only): + # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html + if data_type == DataType.DT_HALF: + field_type = " 0: + raise ValueError( + "Target modules can only be specified when init_lora_weights=True" + ) + else: + if init_lora_weights: + raise ValueError( + "LORA weights initialization from scratch not supported in inference model" + ) + if len(target_modules) > 0: + raise ValueError( + "Target modules can only be specified when trainable=True" + ) + + # Check rank, lora_alpha, lora_dropout values + if rank is not None or lora_alpha is not None or lora_dropout is not None: + if not trainable or not init_lora_weights: + raise ValueError( + "rank, lora_alpha, and lora_dropout can only be set when trainable=True and init_lora_weights=True" + ) + rank = rank if rank is not None else 8 + lora_alpha = lora_alpha if lora_alpha is not None else 8.0 + lora_dropout = lora_dropout if lora_dropout is not None else 0.0 + + # If passed, check if the values of rank, lora_alpha, and lora_dropout are valid + if rank < 1 or type(rank) != int: + raise ValueError("Rank must be >= 1 and an integer") + if lora_alpha <= 0: + raise ValueError("Lora_alpha must be > 0") + if lora_dropout < 0 or lora_dropout > 1: + raise ValueError("Lora_dropout must be in the interval [0, 1]") + + self.ff_initialized = False + self._cache_folder = cache_folder + self._peft_model_id = peft_model_id + self._trainable = trainable + self._init_lora_weights = init_lora_weights + self._base_model_name_or_path = base_model_name_or_path + self._precision = precision + self._rank = rank + self._lora_alpha = lora_alpha + self._lora_dropout = lora_dropout + self._target_modules = target_modules + self.optimizer_type = optimizer_type + self.optimizer_kwargs = optimizer_kwargs + + def ff_compile(self): + c_cache_folder = get_c_name(os.path.expanduser(self.cache_folder)) + peft_model_id = get_c_name(self.peft_model_id) + base_model_name_or_path = get_c_name(self.base_model_name_or_path) + precision = get_c_name(self.precision) + c_target_modules = [ + get_c_name(target_module) for target_module in self.target_modules + ] + c_optimizer_type = enum_to_int(OptimizerType, self.optimizer_type) + # SGD optional optimizer args + sgd_learning_rate = self.optimizer_kwargs.get("learning_rate", 0.001) + sgd_momentum = self.optimizer_kwargs.get("momentum", 0.0) + sgd_nesterov = 
self.optimizer_kwargs.get("nesterov", False) + sgd_weight_decay = self.optimizer_kwargs.get("weight_decay", 0.0) + # Adam optional optimizer args + adam_alpha = self.optimizer_kwargs.get("alpha", 0.001) + adam_beta1 = self.optimizer_kwargs.get("beta1", 0.9) + adam_beta2 = self.optimizer_kwargs.get("beta2", 0.999) + adam_weight_decay = self.optimizer_kwargs.get("weight_decay", 0.0) + adam_epsilon = self.optimizer_kwargs.get("epsilon", 1e-8) + self.handle = ffc().flexflow_lora_linear_config_create( + c_cache_folder, + peft_model_id, + self.trainable, + self.init_lora_weights, + base_model_name_or_path, + precision, + self.rank, + self.lora_alpha, + self.lora_dropout, + len(self.target_modules), + c_target_modules, + c_optimizer_type, + sgd_learning_rate, + sgd_momentum, + sgd_nesterov, + sgd_weight_decay, + adam_alpha, + adam_beta1, + adam_beta2, + adam_weight_decay, + adam_epsilon, + ) + self._handle = ffi.gc(self.handle, ffc().flexflow_lora_linear_config_destroy) + self.ff_initialized = True + + @classmethod + def from_jsonfile(self, jsonfile: str): + with open(jsonfile, "r") as file: + config = json.load(file) + config_dict = dict(config) + config_dict["optimizer_type"] = OptimizerType.OPTIMIZER_TYPE_SGD + return LoraLinearConfig(**config_dict) + + def to_hf_config(self) -> LoraConfig: + return LoraConfig( + base_model_name_or_path=self.base_model_name_or_path, + r=self.rank, + target_modules=self.target_modules, + lora_alpha=self.lora_alpha, + lora_dropout=self.lora_dropout, + ) - :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. - :type padding_h: int + @property + def cache_folder(self): + if self.ff_initialized: + c_cache_folder = ffc().flexflow_lora_linear_config_get_cache_folder( + self.handle + ) + return ffi.string(c_cache_folder).decode("utf-8") + else: + return self._cache_folder - :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. - :type padding_w: int + @property + def peft_model_id(self): + if self.ff_initialized: + c_peft_model_id = ffc().flexflow_lora_linear_config_get_peft_model_id( + self.handle + ) + return ffi.string(c_peft_model_id).decode("utf-8") + else: + return self._peft_model_id - :param activation: Tyoe of pooling function to use. If you don't specify anything, PoolType.POOL_MAX is applied. - :type activation: PoolType + @property + def rank(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_rank(self.handle) + else: + return self._rank - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode + @property + def lora_alpha(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_lora_alpha(self.handle) + else: + return self._lora_alpha - :param name: the name of the layer. Default is None. - :type name: string + @property + def lora_dropout(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_lora_dropout(self.handle) + else: + return self._lora_dropout - :returns: Tensor -- the output tensor. 
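LoraLinearConfig.from_jsonfile() and to_hf_config() connect FlexFlow's on-disk adapter metadata to the Hugging Face peft.LoraConfig that upload_peft_model.py needs. A short usage sketch, with the cache folder and adapter id as placeholders and the ff_config.json path layout taken from upload_peft_model.py:

import os
import flexflow.serve as ff

cache_folder = os.path.expanduser("~/.cache/flexflow")      # placeholder
peft_model_id = "goliaro/llama-3-8b-lora"                    # placeholder
ff_config_path = os.path.join(
    cache_folder, "finetuned_models", peft_model_id, "config", "ff_config.json"
)

# Load the FlexFlow-side LoRA config and convert it to a Hugging Face LoraConfig.
peft_config = ff.LoraLinearConfig.from_jsonfile(ff_config_path)
hf_config = peft_config.to_hf_config()
print(hf_config.r, hf_config.target_modules, hf_config.lora_alpha)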
- """ - c_name = get_c_name(name) - c_pool_type = enum_to_int(PoolType, pool_type) - c_activation = enum_to_int(ActiMode, activation) - handle = ffc().flexflow_model_add_pool2d( - self.handle, - input.handle, - kernel_h, - kernel_w, - stride_h, - stride_w, - padding_h, - padding_w, - c_pool_type, - c_activation, - c_name, - ) - self.add_layer(OpType.POOL2D, name) - return Tensor(handle, owner_op_type=OpType.POOL2D) + @property + def trainable(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_trainable(self.handle) + else: + return self._trainable - def batch_norm(self, input, relu=True, name=None): - """Layer that normalizes its inputs. + @property + def init_lora_weights(self): + if self.ff_initialized: + return ffc().flexflow_lora_linear_config_get_init_lora_weights(self.handle) + else: + return self._init_lora_weights - Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. + @property + def base_model_name_or_path(self): + if self.ff_initialized: + c_base_model_name_or_path = ( + ffc().flexflow_lora_linear_config_get_base_model_name_or_path( + self.handle + ) + ) + return ffi.string(c_base_model_name_or_path).decode("utf-8") + else: + return self._base_model_name_or_path - :param input: the list of input Tensors. - :type input: Tensor + @property + def precision(self): + if self.ff_initialized: + c_precision = ffc().flexflow_lora_linear_config_get_precision(self.handle) + return ffi.string(c_precision).decode("utf-8") + else: + return self._precision - :param relu: whether a ReLU function is applied. Default is True. - :type relu: bool + @property + def target_modules(self): + if self.ff_initialized: + num_target_modules = ffi.new("int *") + c_target_modules = ffc().flexflow_lora_linear_config_get_target_modules( + self.handle, num_target_modules + ) + target_modules = [] + for i in range(num_target_modules[0]): + target_modules.append(ffi.string(c_target_modules[i]).decode("utf-8")) + return target_modules + else: + return self._target_modules - :param name: the name of the layer. Default is None. - :type name: string + @cache_folder.setter + def cache_folder(self, value: str): + self._cache_folder = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_cache_folder(self.handle, value) - :returns: Tensor -- the output tensor. 
- """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_batch_norm( - self.handle, input.handle, relu, c_name - ) - self.add_layer(OpType.BATCH_NORM, name) - return Tensor(handle, owner_op_type=OpType.BATCH_NORM) + @peft_model_id.setter + def peft_model_id(self, value: str): + self._peft_model_id = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_peft_model_id(self.handle, value) - def layer_norm( - self, input, axes, elementwise_affine=True, eps=1e-5, use_bias=True, name=None - ): - """Add a LayerNorm layer + @rank.setter + def rank(self, value: int): + self._rank = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_rank(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: Union[int, List[int]] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: _type_, optional - :return: The LayerNorm output tensor - :rtype: Tensor - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handle = ffc().flexflow_model_add_layer_norm( - self.handle, - input.handle, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.LAYER_NORM, name) - return Tensor(handle, owner_op_type=OpType.LAYER_NORM) + @lora_alpha.setter + def lora_alpha(self, value: float): + self._lora_alpha = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_lora_alpha(self.handle, value) - def residual_layer_norm( - self, - input, - residual1, - residual2, - use_two_residuals, - axes, - elementwise_affine=True, - eps=1e-5, - use_bias=True, - name=None, - ): - """Add a fused LayerNorm + Residual layer. This operator uses a single kernel, resulting in - better efficiency compared to using separate element-wise add and LayerNorm operators. 
+ @lora_dropout.setter + def lora_dropout(self, value: float): + self._lora_dropout = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_lora_dropout(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param residual1: The residual tensor to add to the input before computing the LayerNorm - :type residual1: Tensor - :param residual2: An optional second residual tensor to add to the input (in addition to residual1) before computing the LayerNorm - :type residual2: Tensor - :param use_two_residuals: A boolean that should be set to True if using the second optional residual, False otherwise - :type use_two_residuals: bool - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: List[int] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: str, optional - :return: A tensor with the sum of the input and residual(s), and the LayerNorm output - :rtype: (Tensor, Tensor) - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - residual2_handle = ( - residual1.handle - ) # This is intentional. Data will be ignored, and we cannot pass None - if use_two_residuals: - assert residual2 is not None - residual2_handle = residual2.handle - handles_array = ffc().flexflow_model_add_residual_layer_norm( - self.handle, - input.handle, - residual1.handle, - residual2_handle, - use_two_residuals, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.RESIDUAL_LAYERNORM, name) - return Tensor( - handles_array[0], owner_op_type=OpType.RESIDUAL_LAYERNORM - ), Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_LAYERNORM) + @trainable.setter + def trainable(self, value: bool): + self._trainable = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_trainable(self.handle, value) - def add_bias_residual_layer_norm( - self, - input, - residual, - axes, - elementwise_affine=True, - eps=1e-5, - use_bias=True, - name=None, - ): - """Add a Attention Bias + Residual + LayerNorm layer. This operator uses a single kernel, - resulting in better efficiency compared to using separate attention bias addition + - element-wise residual addition + LayerNorm operators. 
+ @init_lora_weights.setter + def init_lora_weights(self, value: bool): + self._init_lora_weights = value + if self.ff_initialized: + ffc().flexflow_lora_linear_config_set_init_lora_weights(self.handle, value) - :param input: The input tensor - :type input: Tensor - :param residual: The residual tensor - :type residual: Tensor - :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over - :type axes: Union[int, List[int]] - :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True - :type elementwise_affine: bool, optional - :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 - :type eps: float, optional - :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True - :type use_bias: bool, optional - :param name: Name of the operator, also used for loading weights in inference mode, defaults to None - :type name: _type_, optional - :return: A tensor with the sum of the attention bias, input and residual(s), and the LayerNorm output - :rtype: (Tensor, Tensor) - """ - c_name = get_c_name(name) - c_axes = ffi.new("int[]", axes) - handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm( - self.handle, - input.handle, - residual.handle, - len(axes), - c_axes, - elementwise_affine, - eps, - use_bias, - c_name, - ) - self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) - return Tensor( - handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM - ), Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM) - def sigmoid_silu_multi(self, input1, input2, name=None): - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sigmoid_silu_multi( - self.handle, input1.handle, input2.handle, c_name - ) - self.add_layer(OpType.SIGMOID_SILU_MULTI, name) - return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) +# ----------------------------------------------------------------------- +# PEFTModelID +# ----------------------------------------------------------------------- - def batch_matmul( - self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None - ): - """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. - :param A: the first input Tensor. - :type A: Tensor +class PEFTModelID(object): + __slots__ = ["handle", "_handle"] - :param B: the second input Tensor. - :type B: Tensor + __no_id_h = None - :param a_seq_length_dim: an int when set indicating the a_seq_length_dim dimention of A is a sequence_length dimension - :type a_seq_length_dim: int + def __init__(self, id=None): + if id is None: + self.handle = ffc().flexflow_peft_model_id_create() + else: + self.handle = ffc().flexflow_peft_model_id_create_id(id) + self._handle = ffi.gc(self.handle, ffc().flexflow_peft_model_id_destroy) - :param b_seq_length_dim: an int when set indicating the b_seq_length_dim dimention of B is a sequence_length dimension - :type b_seq_length_dim: int + @staticmethod + def no_id_handle(): + if PEFTModelID.__no_id_h is None: + PEFTModelID.__no_id_h = ffc().flexflow_peft_model_id_no_id() + return PEFTModelID.__no_id_h - :param name: the name of the layer. Default is None. 
- :type name: string - :param name: Whether to add use bias in layer normalization - :type name: bool +# ----------------------------------------------------------------------- +# Request +# ----------------------------------------------------------------------- - :returns: Tensor -- the output tensor. - """ - if a_seq_length_dim is None: - a_seq_length_dim = -1 - if b_seq_length_dim is None: - b_seq_length_dim = -1 - handle = ffc().flexflow_model_add_batch_matmul( - self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim - ) - self.add_layer(OpType.BATCH_MATMUL, name) - return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) - def dense( +class Request: + """A class to record the metadata of an inference or finetuning request.""" + + def __init__( self, - input, - out_dim, - activation=ActiMode.AC_MODE_NONE, - use_bias=True, - datatype=DataType.DT_NONE, - shared_op=None, - kernel_initializer=None, - bias_initializer=None, - kernel_regularizer=None, - name=None, + req_type: RequestType, + prompt: str = None, + max_sequence_length: int = 128, + peft_model_id: PEFTModelID = None, + dataset_filepath: str = None, + max_training_steps: int = 1, ): - """Dense implements the operation: :attr:`output = activation(dot(input, kernel) + bias)` where - :attr:`activation` is the element-wise activation function passed as the activation argument, - :attr:`kernel` is a weights matrix created by the layer, and - :attr:`bias` is a bias vector created by the layer (only applicable if :attr:`use_bias` is True). + self.req_type = req_type + self.prompt = prompt + self.max_sequence_length = max_sequence_length + self.peft_model_id = peft_model_id + self.dataset_filepath = dataset_filepath + self.max_training_steps = max_training_steps - The size of input tensor is :math:`(N, C_{in})` and the size of output tensor - is :math:`(N, C_{out})`, where :math:`C_{out} = out\_dim` - - :param input: the input Tensor. - :type input: Tensor - :param out\_dim: dimensionality of the output space. - :type out\_dim: int +# ----------------------------------------------------------------------- +# FFModel +# ----------------------------------------------------------------------- - :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. - :type activation: ActiMode - :param use_bias: whether the layer uses a bias vector. Default is True. - :type use_bias: bool +class FFModel(object): + """ """ - :param shared_op: the layer whose parameters are shared with. Default is None. - :type shared_op: Op + __slots__ = [ + "handle", + "_handle", + "_layers", + "_nb_layers", + "_ffconfig", + "_tracing_id", + "initializers", + "attr_tensors", + ] - :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + def __init__(self, ffconfig): + """Constructor of FFModel. - :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. - :type bias_initializer: Initializer + :param ffconfig: configurations of FlexFlow and the created model. + :type ffconfig: FFConfig - :param kernel_regularizer: Regularizer for the kernel weights matrix - :type bias_initializer: Regularizer + :returns: FFModel -- the model. 
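As a usage sketch next to the Request class, the two request flavours accepted by LLM.generate(); the adapter handle and file path are placeholders, and a real finetuning request needs a registered trainable adapter:

import flexflow.serve as ff

inference_req = ff.Request(
    ff.RequestType.REQ_INFERENCE,
    prompt="Why can camels survive for long without water?",
    max_sequence_length=128,
    peft_model_id=None,  # or llm.get_ff_peft_id(lora_config) to route through an adapter
)

finetune_req = ff.Request(
    ff.RequestType.REQ_FINETUNING,
    max_sequence_length=128,
    peft_model_id=None,  # placeholder for a trainable adapter handle
    dataset_filepath="finetuning_dataset.json",
    max_training_steps=100,
)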
+ """ + self.handle = ffc().flexflow_model_create(ffconfig.handle, ffconfig.cpu_offload) + self._handle = ffi.gc(self.handle, ffc().flexflow_model_destroy) + self._layers = dict() + self._nb_layers = 0 + self._ffconfig = ffconfig + global ff_tracing_id + self._tracing_id = ff_tracing_id + ff_tracing_id += 1 + self.initializers = {} + self.attr_tensors = {} - :param name: the name of the layer. Default is None. - :type name: string + def get_layers(self): + return self._layers - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - shared_op_handle = self.__get_op_handle(shared_op) - c_activation = enum_to_int(ActiMode, activation) - c_datatype = enum_to_int(DataType, datatype) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - bias_init_handle = self.__get_initializer_handle(bias_initializer) - if kernel_regularizer: - c_kernel_reg_type = enum_to_int(RegularizerMode, kernel_regularizer.type) - kernel_reg_lambda = kernel_regularizer._lambda - else: - c_kernel_reg_type = enum_to_int( - RegularizerMode, RegularizerMode.REG_MODE_NONE - ) - kernel_reg_lambda = 0.0 - handle = ffc().flexflow_model_add_dense( - self.handle, - input.handle, - out_dim, - c_activation, - use_bias, - c_datatype, - shared_op_handle, - kernel_init_handle, - bias_init_handle, - c_kernel_reg_type, - kernel_reg_lambda, - c_name, + def add_layer(self, op_type, name): + layer_id = self._nb_layers + op_handle = ffc().flexflow_model_get_last_layer(self.handle) + self._layers[self._nb_layers] = convert_op_handle_to_op( + op_type, op_handle, idx=layer_id, name=name ) - self.add_layer(OpType.LINEAR, name) - return Tensor(handle, owner_op_type=OpType.LINEAR) - - def concat(self, tensors, axis, name=None): - """Layer that concatenates a list of inputs. + self._nb_layers += 1 - It takes as input a list of tensors, all of the same shape except for the concatenation axis, and returns a single tensor that is the concatenation of all inputs. + def create_tensor(self, dims, data_type, create_grad=True): + """Instantiate a FlexFlow tensor. - :param input: the list of input Tensors. - :type input: List of Tensors + :param x: a shape tuple/list (integers), including the batch size. + :type x: list of int - :param axis: the dimension along which to concatenate. - :type axis: int + :param data_type: the datatype of the created tensor. Options are + DT_FLOAT, DT_DOUBLE, DT_INT32, DT_INT64, DT_BOOLEAN. + :type data_type: DataType - :param name: the name of the layer. Default is None. - :type name: string + :param create_grad: weather the tensor creates a gradients vector. + If you don't specify anything, a gradients vector is used. + :type create_grad: bool :returns: Tensor -- the output tensor. 
""" - assert type(tensors) is list, "tensors should be a list" - tensor_handle_list = [] - n = len(tensors) - assert n <= 256, "Please increase MAX_NUM_INPUTS" - for tensor in tensors: - tensor_handle_list.append(tensor.handle) - c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_concat( - self.handle, n, c_tensor_handle_list, axis, c_name + c_dims = ffi.new("int[]", dims) + c_data_type = enum_to_int(DataType, data_type) + num_dims = len(dims) + handle = ffc().flexflow_tensor_create( + self.handle, num_dims, c_dims, c_data_type, create_grad ) - self.add_layer(OpType.CONCAT, name) - return Tensor(handle, owner_op_type=OpType.CONCAT) + return Tensor(handle) - def split(self, input, sizes, axis, name=None): - """Layer that splits a :attr:`input` tensor into a list of tensors. + def map_tensor(self, tensor, parallel_op=None): + op_handle = self.__get_op_handle(parallel_op) + ffc().flexflow_tensor_map(self.handle, tensor.handle, op_handle) - :param input: the input Tensor. - :type input: Tensor + def create_constant(self, dims, value, data_type): + c_dims = ffi.new("int[]", dims) + c_data_type = enum_to_int(DataType, data_type) + num_dims = len(dims) + handle = ffc().flexflow_constant_create( + self.handle, num_dims, c_dims, value, c_data_type + ) + return Tensor(handle) - :param sizes: either an int indicating the number of splits along axis or a Python list containing the sizes of each output tensor along axis. If a scalar, then it must evenly divide :attr:`input.dims[axis]`; otherwise the sum of sizes along the split axis must match that of the :attr:`input`. - :type sizes: int or list of int + def exp(self, x, name=None): + """Exponential activation function. - :param axis: the dimension along which to split. - :type axis: int + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string - :returns: list of Tensors -- the output tensors. + :returns: Tensor -- the output tensor. """ - if type(sizes) is list: - split = sizes - else: - assert input.dims[axis] % sizes == 0, "Split dimension is not divisible" - split = [input.dims[axis] // sizes for i in range(sizes)] - n = len(split) - assert n <= 256, "Please increase MAX_NUM_OUTPUTS" - c_split = ffi.new("int[]", split) - c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") c_name = get_c_name(name) - ffc().flexflow_model_add_split( - self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name - ) - output_tensor_list = [] - for i in range(n): - tensor_p_handle = ffi.new("flexflow_tensor_t*") - tensor_p_handle.impl = c_outputs_handle_list[i].impl - output_tensor_list.append( - Tensor(None, owner_op_type=OpType.SPLIT, p_handle=tensor_p_handle) - ) - self.add_layer(OpType.SPLIT, name) - del c_outputs_handle_list - return output_tensor_list + handle = ffc().flexflow_model_add_exp(self.handle, x.handle, c_name) + self.add_layer(OpType.EXP, name) + return Tensor(handle, owner_op_type=OpType.EXP) - def flat(self, input, name=None): - """Flattens the input. Does not affect the batch size. + def sin(self, x, name=None): + """Elementwise sine function. - :param input: the input Tensor. - :type input: Tensor + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2253,15 +2184,15 @@ def flat(self, input, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) - self.add_layer(OpType.FLAT, name) - return Tensor(handle, owner_op_type=OpType.FLAT) + handle = ffc().flexflow_model_add_sin(self.handle, x.handle, c_name) + self.add_layer(OpType.SIN, name) + return Tensor(handle, owner_op_type=OpType.SIN) - def softmax(self, input, axis=-1, name=None): - """Softmax activation function. + def cos(self, x, name=None): + """Elementwise cosine function. - :param input: the input Tensor. - :type input: Tensor + :param x: the input Tensor. + :type x: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2269,23 +2200,18 @@ def softmax(self, input, axis=-1, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_softmax( - self.handle, input.handle, axis, c_name - ) - self.add_layer(OpType.SOFTMAX, name) - return Tensor(handle, owner_op_type=OpType.SOFTMAX) - - def reshape(self, input, shape, name=None): - """Layer that reshapes inputs into the given shape. + handle = ffc().flexflow_model_add_cos(self.handle, x.handle, c_name) + self.add_layer(OpType.COS, name) + return Tensor(handle, owner_op_type=OpType.COS) - Given a :attr:`input` tensor, this operation returns a output tensor that has the same values as tensor in the same order, - except with a new shape given by :attr:`shape`. + def add(self, x, y, inplace_a=False, name=None): + """Layer that adds two input Tensors, :attr:`output = x + y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param shape: A list defining the shape of the output tensor. - :type shape: list of int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2293,45 +2219,41 @@ def reshape(self, input, shape, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - c_shape = ffi.new("int[]", shape) - handle = ffc().flexflow_model_add_reshape( - self.handle, input.handle, len(shape), c_shape, c_name + handle = ffc().flexflow_model_add_add( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.RESHAPE, name) - return Tensor(handle, owner_op_type=OpType.RESHAPE) - - def gather(self, input, index, dim, name=None): - """Layer that gathers values along the dim axis. + self.add_layer(OpType.ADD, name) + return Tensor(handle, owner_op_type=OpType.ADD) - :param input: the input tensor - :type input: Tensor + def subtract(self, x, y, inplace_a=False, name=None): + """Layer that subtracts two input Tensors, :attr:`output = x * y`. - :param index: the index tensor, which specifies the indices of elements to gather - :type index: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param dim: the axis along which to index - :type dim: int + :param y: the second input Tensor. + :type y: Tensor - :param name: the name of the layer. Default is None + :param name: the name of the layer. Default is None. :type name: string - :returns: Tensor -- the output tensor + :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_gather( - self.handle, input.handle, index.handle, dim, c_name + handle = ffc().flexflow_model_add_subtract( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.GATHER, name) - return Tensor(handle, owner_op_type=OpType.GATHER) + self.add_layer(OpType.SUBTRACT, name) + return Tensor(handle, owner_op_type=OpType.SUBTRACT) - def transpose(self, input, perm, name=None): - """Transposes the :attr:`input` tensor. Permutes the dimensions according to perm + def multiply(self, x, y, inplace_a=False, name=None): + """Layer that multiplies (element-wise) two input Tensors, :attr:`output = x * y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param perm: A permutation of the dimensions of a. - :type perm: List of int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2339,23 +2261,20 @@ def transpose(self, input, perm, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - c_perm = ffi.new("int[]", perm) - handle = ffc().flexflow_model_add_transpose( - self.handle, input.handle, len(perm), c_perm, c_name + handle = ffc().flexflow_model_add_multiply( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.TRANSPOSE, name) - return Tensor(handle, owner_op_type=OpType.TRANSPOSE) - - def reverse(self, input, axis, name=None): - """Layer that reverses specific dimensions of a tensor. + self.add_layer(OpType.MULTIPLY, name) + return Tensor(handle, owner_op_type=OpType.MULTIPLY) - Given a :attr:`input` tensor, this operation reverses the dimension :attr:`axis`. + def divide(self, x, y, inplace_a=False, name=None): + """Layer that divides (element-wise) two input Tensors, :attr:`output = x / y`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param axis: the dimension to reverse. - :type axis: int + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2363,20 +2282,20 @@ def reverse(self, input, axis, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_reverse( - self.handle, input.handle, axis, c_name + handle = ffc().flexflow_model_add_divide( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.REVERSE, name) - return Tensor(handle, owner_op_type=OpType.REVERSE) + self.add_layer(OpType.DIVIDE, name) + return Tensor(handle, owner_op_type=OpType.DIVIDE) - def scalar_multiply(self, input, scalar, inplace=True, name=None): - """Scalar multiplication of a tensor by an scalar. + def max(self, x, y, inplace_a=False, name=None): + """Layer that computes the max (element-wise) two input Tensors, :attr:`output = max(x,y)`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param input: the scalar - :type scalar: float + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2384,20 +2303,20 @@ def scalar_multiply(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_multiply( - self.handle, input.handle, scalar, inplace, c_name + handle = ffc().flexflow_model_add_max( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.SCALAR_MULTIPLY, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) + self.add_layer(OpType.MAX, name) + return Tensor(handle, owner_op_type=OpType.MAX) - def scalar_add(self, input, scalar, inplace=True, name=None): - """Scalar addition of a scalar to each entry of a tensor. + def min(self, x, y, inplace_a=False, name=None): + """Layer that computes the min (element-wise) two input Tensors, :attr:`output = min(x,y)`. - :param input: the input Tensor. - :type input: Tensor + :param x: the first input Tensor. + :type x: Tensor - :param input: the scalar - :type scalar: float + :param y: the second input Tensor. + :type y: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -2405,20 +2324,20 @@ def scalar_add(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_add( - self.handle, input.handle, scalar, inplace, c_name + handle = ffc().flexflow_model_add_min( + self.handle, x.handle, y.handle, inplace_a, c_name ) - self.add_layer(OpType.SCALAR_ADD, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) + self.add_layer(OpType.MIN, name) + return Tensor(handle, owner_op_type=OpType.MIN) - def scalar_sub(self, input, scalar, inplace=True, name=None): - """Scalar subtraction of a scalar to each entry of a tensor. + def reduce_sum(self, input, axes, keepdims=False, name=None): + """Layer that computes the sum of the input Tensor along given axes. :param input: the input Tensor. :type input: Tensor - :param input: the scalar - :type scalar: float + :param axes: the axes along which reduction is applied + :type axes: List[int] :param name: the name of the layer. Default is None. :type name: string @@ -2426,215 +2345,234 @@ def scalar_sub(self, input, scalar, inplace=True, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_sub( - self.handle, input.handle, scalar, inplace, c_name + c_axes = ffi.new("int[]", axes) + handle = ffc().flexflow_model_add_reduce_sum( + self.handle, input.handle, c_axes, len(axes), keepdims, c_name ) - self.add_layer(OpType.SCALAR_SUB, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) + self.add_layer(OpType.REDUCE_SUM, name) + return Tensor(handle, owner_op_type=OpType.REDUCE_SUM) - def scalar_true_divide(self, input, scalar, inplace=True, name=None): - """Scalar regular division of a tensor by an scalar. + def rsqrt(self, input, name=None): + """Layer that computes the element-wise reciprocal square-root. :param input: the input Tensor. :type input: Tensor - :param input: the scalar - :type scalar: float - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - handle = ffc().flexflow_model_add_scalar_truediv( - self.handle, input.handle, scalar, inplace, c_name - ) - self.add_layer(OpType.SCALAR_TRUEDIV, name) - return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) + handle = ffc().flexflow_model_add_rsqrt(self.handle, input.handle, c_name) + self.add_layer(OpType.RSQRT, name) + return Tensor(handle, owner_op_type=OpType.RSQRT) - def gelu(self, input, inplace=True, name=None): - """Gaussian Error Linear Unit activation function. + def pow(self, input, exponent, name=None): + """Layer that computes the element-wise power. :param input: the input Tensor. :type input: Tensor + :param exponent: exponent to raise each element in the input tensor. + :type exponent: float + :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) - self.add_layer(OpType.GELU, name) - return Tensor(handle, owner_op_type=OpType.GELU) + handle = ffc().flexflow_model_add_pow( + self.handle, input.handle, exponent, c_name + ) + self.add_layer(OpType.POW, name) + return Tensor(handle, owner_op_type=OpType.POW) - def relu(self, input, inplace=True, name=None): - """Rectified Linear Unit activation function. + def mean(self, input, dims, keepdims=False, name=None): + """Layer that computes the mean of the input tensor across the given + dimensions. :param input: the input Tensor. :type input: Tensor + :param dims: dimensions to take the mean over. + :type dims: list + + :param keepdims: keeps the dimensions in :attr:`dims` as size 1 if True and + collapses the dimension if False. Default is False. + :type keepdims: bool + :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ + dims = list(dims) + c_dims = ffi.new("int[]", dims) c_name = get_c_name(name) - handle = ffc().flexflow_model_add_relu( - self.handle, input.handle, inplace, c_name + handle = ffc().flexflow_model_add_mean( + self.handle, input.handle, c_dims, len(dims), keepdims, c_name ) - self.add_layer(OpType.RELU, name) - return Tensor(handle, owner_op_type=OpType.RELU) + self.add_layer(OpType.MEAN, name) + return Tensor(handle, owner_op_type=OpType.MEAN) - def identity(self, input, name=None): - """Identity function. + def conv2d( + self, + input, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + activation=ActiMode.AC_MODE_NONE, + groups=1, + use_bias=True, + shared_op=None, + kernel_initializer=None, + bias_initializer=None, + name=None, + ): + """This layer creates a 2D convolution kernel that is convolved with the layer :attr:`input` + to produce a tensor of :attr:`output`. - :param input: the input Tensor. - :type input: Tensor + The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor + is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: - :param name: the name of the layer. Default is None. - :type name: string + .. math:: + C_{out} = out\_channels - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) - self.add_layer(OpType.IDENTITY, name) - return Tensor(handle, owner_op_type=OpType.IDENTITY) + .. math:: + K_{H} = kernel\_h - def sigmoid(self, input, name=None): - """Sigmoid activation function, :math:`sigmoid(x) = 1 / (1 + exp(-x))`. + .. 
math:: + K_{W} = kernel\_w - :param input: the input Tensor. - :type input: Tensor + .. math:: + S_{H} = stride\_h - :param name: the name of the layer. Default is None. - :type name: string + .. math:: + S_{W} = stride\_w - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) - self.add_layer(OpType.SIGMOID, name) - return Tensor(handle, owner_op_type=OpType.SIGMOID) + .. math:: + P_{H} = padding\_h - def tanh(self, input, name=None): - """Hyperbolic tangent activation function. + .. math:: + P_{S} = padding\_s + + .. math:: + H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 + + .. math:: + W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 :param input: the input Tensor. :type input: Tensor - :param name: the name of the layer. Default is None. - :type name: string + :param out\_channels: the dimensionality of the output space (i.e. the number of output filters in the convolution). + :type out\_channels: int - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) - self.add_layer(OpType.TANH, name) - return Tensor(handle, owner_op_type=OpType.TANH) + :param kernel_h: the height of the 2D convolution window: :math:`K_{H}`. + :type kernel_h: int - def elu(self, input, inplace=True, name=None): - """Exponential Linear Unit. activation function. + :param kernel_w: the width of the 2D convolution window: :math:`K_{W}`. + :type kernel_w: int - :param input: the input Tensor. - :type input: Tensor + :param stride_h: the stride of the convolution along the height: :math:`S_{H}`. + :type stride_h: int - :param name: the name of the layer. Default is None. - :type name: string + :param stride_w: the stride of the convolution along the width: :math:`S_{W}`. + :type stride_w: int - :returns: Tensor -- the output tensor. - """ - c_name = get_c_name(name) - handle = ffc().flexflow_model_add_elu( - self.handle, input.handle, inplace, c_name - ) - self.add_layer(OpType.ELU, name) - return Tensor(handle, owner_op_type=OpType.ELU) + :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. + :type padding_h: int - def dropout(self, input, rate, seed, name=None): - """The Dropout layer randomly sets input units to 0 with - a frequency of :attr:`rate` at each step during training time, - which helps prevent overfitting. - Inputs not set to 0 are scaled up by 1/(1 - rate) such that the - sum over all inputs is unchanged. + :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. + :type padding_w: int - :param input: the input Tensor. - :type input: Tensor + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode - :param rate: Fraction of the input units to drop. - :type rate: float(0-1) + :param groups: the number of groups in this convolution + :type groups: int - :param seed: random seed. - :type seed: int + :param use_bias: whether the layer uses a bias vector. Default is True. + :type use_bias: bool + + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op + + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. 
+ :type bias_initializer: Initializer :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ + shared_op_handle = self.__get_op_handle(shared_op) + c_activation = enum_to_int(ActiMode, activation) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + bias_init_handle = self.__get_initializer_handle(bias_initializer) c_name = get_c_name(name) - handle = ffc().flexflow_model_add_dropout( - self.handle, input.handle, rate, seed, c_name + handle = ffc().flexflow_model_add_conv2d( + self.handle, + input.handle, + out_channels, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + c_activation, + groups, + use_bias, + shared_op_handle, + kernel_init_handle, + bias_init_handle, + c_name, ) - self.add_layer(OpType.DROPOUT, name) - return Tensor(handle, owner_op_type=OpType.DROPOUT) + self.add_layer(OpType.CONV2D, name) + return Tensor(handle, owner_op_type=OpType.CONV2D) - def multihead_attention( + def embedding( self, - query, - key, - value, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, + input, + num_embeddings, + embedding_dim, + aggr, + dtype=DataType.DT_FLOAT, + shared_op=None, kernel_initializer=None, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, - and returns the dot-product attention between them:. - - :param query: the query Tensor. - :type query: Tensor - - :param key: the key Tensor. - :type key: Tensor - - :param value: the value Tensor. - :type value: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int + """Layer that turns positive integers into dense vectors of fixed size - :param kdim: total number of features in key. Default is 0 - :type kdim: int + :param input: the input Tensor. + :type input: Tensor - :param vdim: total number of features in value. Default is 0 - :type vdim: int + :param num_embeddings: size of the vocabulary, i.e. maximum integer index + 1 + :type num_embeddings: int - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param embedding_dim: dimension of the dense embedding. + :type embedding_dim: int - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param aggr: aggregation mode. Options are AGGR_MODE_NONE, AGGR_MODE_SUM and AGGR_MODE_AVG. + :type aggr: AggrMode - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param dtype: the tensor data type. Options are DT_BOOLEAN, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_INT4, DT_INT8, DT_NONE + :type dtype: DataType - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer :param name: the name of the layer. Default is None. 
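The conv2d binding mirrors the C API argument for argument. A sketch with arbitrary ResNet-style hyper-parameters, continuing the earlier setup:

    image = ffmodel.create_tensor([ffconfig.batch_size, 3, 224, 224], DataType.DT_FLOAT)

    conv1 = ffmodel.conv2d(
        image,
        out_channels=64,
        kernel_h=7, kernel_w=7,
        stride_h=2, stride_w=2,
        padding_h=3, padding_w=3,
        activation=ActiMode.AC_MODE_RELU,
        use_bias=True,
        name="conv_stem",
    )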
@@ -2643,97 +2581,105 @@ def multihead_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - handle = ffc().flexflow_model_add_multihead_attention( + shared_op_handle = self.__get_op_handle(shared_op) + c_aggr = enum_to_int(AggrMode, aggr) + c_dtype = enum_to_int(DataType, dtype) + if kernel_initializer is None: + kernel_initializer = GlorotUniformInitializer(42) + assert ( + (type(kernel_initializer) is GlorotUniformInitializer) + or (type(kernel_initializer) is ZeroInitializer) + or (type(kernel_initializer) is UniformInitializer) + or (type(kernel_initializer) is NormInitializer) + ), f"Unknown initializer type: {kernel_initializer}" + handle = ffc().flexflow_model_add_embedding( self.handle, - query.handle, - key.handle, - value.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - kernel_init_handle, + input.handle, + num_embeddings, + embedding_dim, + c_aggr, + c_dtype, + shared_op_handle, + kernel_initializer.handle, c_name, ) - self.add_layer(OpType.MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) + # NOTE: We must keep a reference to the initializer or else it will be + # immediately destructed + self.initializers[name] = kernel_initializer + self.add_layer(OpType.EMBEDDING, name) + return Tensor(handle, owner_op_type=OpType.EMBEDDING) - def inc_multihead_self_attention( + def pool2d( self, input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + pool_type=PoolType.POOL_MAX, + activation=ActiMode.AC_MODE_NONE, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. + """Pooling operation for 2D spatial data. - :param input: the input Tensor. - :type input: Tensor + The size of input tensor is :math:`(N, C_{in}, H, W)` and the size of output tensor + is :math:`(N, C_{out}, H_{out}, W_{out})`, which can be calculated by: - :param embed_dim: total dimension of the model - :type embed_dim: int + .. math:: + C_{out} = out\_channels - :param num_heads: Number of attention heads. - :type num_heads: int + .. math:: + K_{H} = kernel\_h - :param kdim: total number of features in key. Default is 0 - :type kdim: int + .. math:: + K_{W} = kernel\_w - :param vdim: total number of features in value. Default is 0 - :type vdim: int + .. math:: + S_{H} = stride\_h - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + .. math:: + S_{W} = stride\_w - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + .. math:: + P_{H} = padding\_h - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + .. math:: + P_{S} = padding\_s - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + .. 
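One detail worth noting in the embedding binding above: when no kernel_initializer is supplied, the Python layer creates a GlorotUniformInitializer(42) itself and keeps a reference in self.initializers so the CFFI object is not destructed while the C side still uses it. A usage sketch (sizes chosen for illustration only):

    token_ids = ffmodel.create_tensor([ffconfig.batch_size, 128], DataType.DT_INT32)

    embedded = ffmodel.embedding(
        token_ids,
        num_embeddings=32000,          # vocabulary size (illustrative)
        embedding_dim=512,
        aggr=AggrMode.AGGR_MODE_NONE,
        dtype=DataType.DT_FLOAT,
        name="token_embedding",
    )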
math:: + H_{out} = (H - K_{H} + 2 * P_{H}) / S_{H} + 1 - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + .. math:: + W_{out} = (W - K_{W} + 2 * P_{W}) / S_{W} + 1 - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: the input Tensor. + :type input: Tensor - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param kernel_h: the height of the 2D pooling window: :math:`K_{H}`. + :type kernel_h: int - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param kernel_w: the width of the 2D pooling window: :math:`K_{W}`. + :type kernel_w: int - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :param stride_h: the stride of the pooling along the height: :math:`S_{H}`. + :type stride_h: int - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param stride_w: the stride of the pooling along the width: :math:`S_{W}`. + :type stride_w: int - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param padding_h: the amount of implicit zero-paddings along the height: :math:`P_{H}`. + :type padding_h: int + + :param padding_w: the amount of implicit zero-paddings along the width: :math:`P_{W}`. + :type padding_w: int + + :param activation: Tyoe of pooling function to use. If you don't specify anything, PoolType.POOL_MAX is applied. + :type activation: PoolType + + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode :param name: the name of the layer. Default is None. :type name: string @@ -2741,102 +2687,34 @@ def inc_multihead_self_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention( + c_pool_type = enum_to_int(PoolType, pool_type) + c_activation = enum_to_int(ActiMode, activation) + handle = ffc().flexflow_model_add_pool2d( self.handle, input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + kernel_h, + kernel_w, + stride_h, + stride_w, + padding_h, + padding_w, + c_pool_type, + c_activation, c_name, ) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) - - def spec_inc_multihead_self_attention( - self, - input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. 
- This operator only supports computing the attention in inference (beam search) mode. - - :param input: the input Tensor. - :type input: Tensor - - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType - - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer - - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + self.add_layer(OpType.POOL2D, name) + return Tensor(handle, owner_op_type=OpType.POOL2D) - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + def batch_norm(self, input, relu=True, name=None): + """Layer that normalizes its inputs. - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1. - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param input: the list of input Tensors. + :type input: Tensor - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param relu: whether a ReLU function is applied. Default is True. + :type relu: bool :param name: the name of the layer. Default is None. :type name: string @@ -2844,209 +2722,255 @@ def spec_inc_multihead_self_attention( :returns: Tensor -- the output tensor. 
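pool2d uses the same layout and stride conventions as conv2d, and batch_norm optionally fuses a ReLU. Continuing the CNN fragment from the conv2d sketch:

    pool1 = ffmodel.pool2d(
        conv1,
        kernel_h=3, kernel_w=3,
        stride_h=2, stride_w=2,
        padding_h=1, padding_w=1,
        pool_type=PoolType.POOL_MAX,
        name="pool_stem",
    )

    bn1 = ffmodel.batch_norm(pool1, relu=True, name="bn_stem")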
""" c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( + handle = ffc().flexflow_model_add_batch_norm( + self.handle, input.handle, relu, c_name + ) + self.add_layer(OpType.BATCH_NORM, name) + return Tensor(handle, owner_op_type=OpType.BATCH_NORM) + + def layer_norm( + self, input, axes, elementwise_affine=True, eps=1e-5, use_bias=True, name=None + ): + """Add a LayerNorm layer + + :param input: The input tensor + :type input: Tensor + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: Union[int, List[int]] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: _type_, optional + :return: The LayerNorm output tensor + :rtype: Tensor + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handle = ffc().flexflow_model_add_layer_norm( self.handle, input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, c_name, ) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.LAYER_NORM, name) + return Tensor(handle, owner_op_type=OpType.LAYER_NORM) - def inc_multihead_self_attention_verify( + def residual_layer_norm( self, input, - embed_dim, - num_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + residual1, + residual2, + use_two_residuals, + axes, + elementwise_affine=True, + eps=1e-5, + use_bias=True, + inplace_residual=False, name=None, ): - """Defines the MultiHead Attention operation as described in Attention Is All You Need - which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. + """Add a fused LayerNorm + Residual layer. This operator uses a single kernel, resulting in + better efficiency compared to using separate element-wise add and LayerNorm operators. - :param input: the input Tensor. 
+ :param input: The input tensor :type input: Tensor + :param residual1: The residual tensor to add to the input before computing the LayerNorm + :type residual1: Tensor + :param residual2: An optional second residual tensor to add to the input (in addition to residual1) before computing the LayerNorm + :type residual2: Tensor + :param use_two_residuals: A boolean that should be set to True if using the second optional residual, False otherwise + :type use_two_residuals: bool + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: List[int] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param inplace_residual: Whether to perform the residual computation inplace in the input tensor, defaults to False + :type inplace_residual: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: str, optional + :return: A tensor with the sum of the input and residual(s), and the LayerNorm output + :rtype: (Tensor, Tensor) + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + residual2_handle = ( + residual1.handle + ) # This is intentional. Data will be ignored, and we cannot pass None + if use_two_residuals: + assert residual2 is not None + residual2_handle = residual2.handle + handles_array = ffc().flexflow_model_add_residual_layer_norm( + self.handle, + input.handle, + residual1.handle, + residual2_handle, + use_two_residuals, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + inplace_residual, + c_name, + ) + self.add_layer(OpType.RESIDUAL_LAYERNORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_LAYERNORM), + Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_LAYERNORM), + ) - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_heads: Number of attention heads. - :type num_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool - - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool - - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool - - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + def add_bias_residual_layer_norm( + self, + input, + residual, + axes, + elementwise_affine=True, + eps=1e-5, + use_bias=True, + inplace_residual=False, + name=None, + ): + """Add a Attention Bias + Residual + LayerNorm layer. This operator uses a single kernel, + resulting in better efficiency compared to using separate attention bias addition + + element-wise residual addition + LayerNorm operators. - :param kernel_initializer: Initializer for dense layer kernels. 
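Two points in the residual_layer_norm binding above are easy to miss: the return value is a pair (the pre-norm sum and the normalized output), and when use_two_residuals is False the code still passes residual1.handle in the residual2 slot because None cannot cross the FFI boundary. A single-residual sketch; the axes value follows the docstring's "counted from the end" convention:

    seq, hidden = 128, 512
    attn_in = ffmodel.create_tensor([ffconfig.batch_size, seq, hidden], DataType.DT_FLOAT)
    residual = ffmodel.create_tensor([ffconfig.batch_size, seq, hidden], DataType.DT_FLOAT)

    summed, normed = ffmodel.residual_layer_norm(
        attn_in, residual, None,       # residual2 is ignored when use_two_residuals=False
        use_two_residuals=False,
        axes=[0],                      # normalize over the last (hidden) dimension
        eps=1e-6,
        name="post_attn_norm",
    )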
If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: The input tensor + :type input: Tensor + :param residual: The residual tensor + :type residual: Tensor + :param axes: Indicate which axes (starting from the end) the LayerNorm should normalize over + :type axes: Union[int, List[int]] + :param elementwise_affine: Whether the LayerNorm should use the gamma weight for scaling, defaults to True + :type elementwise_affine: bool, optional + :param eps: A small float value added to the LayerNorm denominator for numerical stability, defaults to 1e-5 + :type eps: float, optional + :param use_bias: Whether to add a beta bias to the LayerNorm result, defaults to True + :type use_bias: bool, optional + :param inplace_residual: Whether to perform the residual computation inplace in the input tensor, defaults to False + :type inplace_residual: bool, optional + :param name: Name of the operator, also used for loading weights in inference mode, defaults to None + :type name: _type_, optional + :return: A tensor with the sum of the attention bias, input and residual(s), and the LayerNorm output + :rtype: (Tensor, Tensor) + """ + c_name = get_c_name(name) + c_axes = ffi.new("int[]", axes) + handles_array = ffc().flexflow_model_add_add_bias_residual_layer_norm( + self.handle, + input.handle, + residual.handle, + len(axes), + c_axes, + elementwise_affine, + eps, + use_bias, + inplace_residual, + c_name, + ) + self.add_layer(OpType.ADD_BIAS_RESIDUAL_LAYERNORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), + Tensor(handles_array[1], owner_op_type=OpType.ADD_BIAS_RESIDUAL_LAYERNORM), + ) - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + def sigmoid_silu_multi(self, input1, input2, name=None): + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid_silu_multi( + self.handle, input1.handle, input2.handle, c_name + ) + self.add_layer(OpType.SIGMOID_SILU_MULTI, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID_SILU_MULTI) - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + def batch_matmul( + self, A, B, a_seq_length_dim=None, b_seq_length_dim=None, name=None + ): + """Layer that applied batched matrix multiplication onto two input Tensors, :attr:`output = x * y`. - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :param A: the first input Tensor. + :type A: Tensor - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param B: the second input Tensor. + :type B: Tensor - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param a_seq_length_dim: an int when set indicating the a_seq_length_dim dimention of A is a sequence_length dimension + :type a_seq_length_dim: int + + :param b_seq_length_dim: an int when set indicating the b_seq_length_dim dimention of B is a sequence_length dimension + :type b_seq_length_dim: int :param name: the name of the layer. Default is None. :type name: string + :param name: Whether to add use bias in layer normalization + :type name: bool + :returns: Tensor -- the output tensor. 
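add_bias_residual_layer_norm follows the same two-output convention, and sigmoid_silu_multi (undocumented in this hunk) is the fused gate used in SwiGLU-style MLPs, combining a SiLU-activated branch with a second branch elementwise; that gating semantics is an assumption here. A sketch reusing the tensors from the previous example:

    added, ln_out = ffmodel.add_bias_residual_layer_norm(
        attn_in, residual, axes=[0], eps=1e-6, name="post_attention_layernorm"
    )

    gate = ffmodel.create_tensor([ffconfig.batch_size, 128, 2048], DataType.DT_FLOAT)
    up = ffmodel.create_tensor([ffconfig.batch_size, 128, 2048], DataType.DT_FLOAT)
    gated = ffmodel.sigmoid_silu_multi(gate, up, name="swiglu_gate")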
""" - c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( - self.handle, - input.handle, - embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + if a_seq_length_dim is None: + a_seq_length_dim = -1 + if b_seq_length_dim is None: + b_seq_length_dim = -1 + handle = ffc().flexflow_model_add_batch_matmul( + self.handle, A.handle, B.handle, a_seq_length_dim, b_seq_length_dim ) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.BATCH_MATMUL, name) + return Tensor(handle, owner_op_type=OpType.BATCH_MATMUL) - def inc_multiquery_self_attention( + def dense( self, input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, + out_dim, + activation=ActiMode.AC_MODE_NONE, + use_bias=True, + datatype=DataType.DT_NONE, + shared_op=None, kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, + bias_initializer=None, + kernel_regularizer=None, name=None, ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - In inference mode, the attention is computed using incremental decoding. + """Dense implements the operation: :attr:`output = activation(dot(input, kernel) + bias)` where + :attr:`activation` is the element-wise activation function passed as the activation argument, + :attr:`kernel` is a weights matrix created by the layer, and + :attr:`bias` is a bias vector created by the layer (only applicable if :attr:`use_bias` is True). + + The size of input tensor is :math:`(N, C_{in})` and the size of output tensor + is :math:`(N, C_{out})`, where :math:`C_{out} = out\_dim` :param input: the input Tensor. :type input: Tensor - :param embed_dim: total dimension of the model - :type embed_dim: int - - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int - - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int - - :param kdim: total number of features in key. Default is 0 - :type kdim: int - - :param vdim: total number of features in value. Default is 0 - :type vdim: int - - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) - - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param out\_dim: dimensionality of the output space. + :type out\_dim: int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param activation: Activation function to use. Default is ActiMode.AC_MODE_NONE. + :type activation: ActiMode - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param use_bias: whether the layer uses a bias vector. Default is True. + :type use_bias: bool - :param data_type: the data type of the tensors. 
Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + :param shared_op: the layer whose parameters are shared with. Default is None. + :type shared_op: Op - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :param kernel_initializer: Initializer for the kernel weights matrix. If it is set to None, the GlorotUniformInitializer is applied. :type kernel_initializer: Initializer - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool - - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool - - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float - - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + :param bias_initializer: Initializer for the bias vector. If it is set to None, the ZeroInitializer is applied. + :type bias_initializer: Initializer - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param kernel_regularizer: Regularizer for the kernel weights matrix + :type bias_initializer: Regularizer :param name: the name of the layer. Default is None. :type name: string @@ -3054,107 +2978,128 @@ def inc_multiquery_self_attention( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) + shared_op_handle = self.__get_op_handle(shared_op) + c_activation = enum_to_int(ActiMode, activation) + c_datatype = enum_to_int(DataType, datatype) kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, + bias_init_handle = self.__get_initializer_handle(bias_initializer) + if kernel_regularizer: + c_kernel_reg_type = enum_to_int(RegularizerMode, kernel_regularizer.type) + kernel_reg_lambda = kernel_regularizer._lambda + else: + c_kernel_reg_type = enum_to_int( + RegularizerMode, RegularizerMode.REG_MODE_NONE + ) + kernel_reg_lambda = 0.0 + handle = ffc().flexflow_model_add_dense( + self.handle, + input.handle, + out_dim, + c_activation, + use_bias, + c_datatype, + shared_op_handle, kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, + bias_init_handle, + c_kernel_reg_type, + kernel_reg_lambda, c_name, ) - self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + self.add_layer(OpType.LINEAR, name) + return Tensor(handle, owner_op_type=OpType.LINEAR) - def spec_inc_multiquery_self_attention( - self, - input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. 
- This operator only supports computing the attention in inference (beam search) mode. + def concat(self, tensors, axis, name=None): + """Layer that concatenates a list of inputs. - :param input: the input Tensor. - :type input: Tensor + It takes as input a list of tensors, all of the same shape except for the concatenation axis, and returns a single tensor that is the concatenation of all inputs. - :param embed_dim: total dimension of the model - :type embed_dim: int + :param input: the list of input Tensors. + :type input: List of Tensors - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int + :param axis: the dimension along which to concatenate. + :type axis: int - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int + :param name: the name of the layer. Default is None. + :type name: string - :param kdim: total number of features in key. Default is 0 - :type kdim: int + :returns: Tensor -- the output tensor. + """ + assert type(tensors) is list, "tensors should be a list" + tensor_handle_list = [] + n = len(tensors) + assert n <= 256, "Please increase MAX_NUM_INPUTS" + for tensor in tensors: + tensor_handle_list.append(tensor.handle) + c_tensor_handle_list = ffi.new("flexflow_tensor_t[]", tensor_handle_list) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_concat( + self.handle, n, c_tensor_handle_list, axis, c_name + ) + self.add_layer(OpType.CONCAT, name) + return Tensor(handle, owner_op_type=OpType.CONCAT) - :param vdim: total number of features in value. Default is 0 - :type vdim: int + def split(self, input, sizes, axis, name=None): + """Layer that splits a :attr:`input` tensor into a list of tensors. - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param input: the input Tensor. + :type input: Tensor - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param sizes: either an int indicating the number of splits along axis or a Python list containing the sizes of each output tensor along axis. If a scalar, then it must evenly divide :attr:`input.dims[axis]`; otherwise the sum of sizes along the split axis must match that of the :attr:`input`. + :type sizes: int or list of int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param axis: the dimension along which to split. + :type axis: int - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. - :type add_zero_attn: bool + :param name: the name of the layer. Default is None. + :type name: string - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + :returns: list of Tensors -- the output tensors. 
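dense resolves the activation, data type, optional shared op, initializers and regularizer into their C equivalents before a single flexflow_model_add_dense call, and concat takes a Python list of tensors (bounded by MAX_NUM_INPUTS). A two-branch sketch, continuing the earlier setup:

    features = ffmodel.create_tensor([ffconfig.batch_size, 256], DataType.DT_FLOAT)

    branch_a = ffmodel.dense(features, 128, activation=ActiMode.AC_MODE_RELU, name="fc_a")
    branch_b = ffmodel.dense(features, 128, activation=ActiMode.AC_MODE_RELU, name="fc_b")

    # Concatenate the two branches along the feature axis
    merged = ffmodel.concat([branch_a, branch_b], axis=1, name="merged")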
+ """ + if type(sizes) is list: + split = sizes + else: + assert input.dims[axis] % sizes == 0, "Split dimension is not divisible" + split = [input.dims[axis] // sizes for i in range(sizes)] + n = len(split) + assert n <= 256, "Please increase MAX_NUM_OUTPUTS" + c_split = ffi.new("int[]", split) + c_outputs_handle_list = ffi.new("flexflow_tensor_t[256]") + c_name = get_c_name(name) + ffc().flexflow_model_add_split( + self.handle, input.handle, n, c_outputs_handle_list, c_split, axis, c_name + ) + output_tensor_list = [] + for i in range(n): + tensor_p_handle = ffi.new("flexflow_tensor_t*") + tensor_p_handle.impl = c_outputs_handle_list[i].impl + output_tensor_list.append( + Tensor(None, owner_op_type=OpType.SPLIT, p_handle=tensor_p_handle) + ) + self.add_layer(OpType.SPLIT, name) + del c_outputs_handle_list + return output_tensor_list - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + def flat(self, input, name=None): + """Flattens the input. Does not affect the batch size. - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param input: the input Tensor. + :type input: Tensor - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param name: the name of the layer. Default is None. + :type name: string - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_flat(self.handle, input.handle, c_name) + self.add_layer(OpType.FLAT, name) + return Tensor(handle, owner_op_type=OpType.FLAT) - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + def softmax(self, input, axis=-1, name=None): + """Softmax activation function. - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + :param input: the input Tensor. + :type input: Tensor :param name: the name of the layer. Default is None. :type name: string @@ -3162,107 +3107,93 @@ def spec_inc_multiquery_self_attention( :returns: Tensor -- the output tensor. 
""" c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + handle = ffc().flexflow_model_add_softmax( + self.handle, input.handle, axis, c_name ) - self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.SOFTMAX, name) + return Tensor(handle, owner_op_type=OpType.SOFTMAX) - def inc_multiquery_self_attention_verify( - self, - input, - embed_dim, - num_q_heads, - num_kv_heads, - kdim=0, - vdim=0, - dropout=0.0, - bias=True, - add_bias_kv=False, - add_zero_attn=False, - data_type=DataType.DT_NONE, - kernel_initializer=None, - apply_rotary_embedding=False, - scaling_query=False, - scaling_factor=1.0, - qk_prod_scaling=True, - position_bias=False, - name=None, - ): - """Defines the multi-query head attention, which allows a different number of Q and KV heads, - and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. - This operator only supports computing the attention in inference (tree verify) mode. + def reshape(self, input, shape, name=None): + """Layer that reshapes inputs into the given shape. + + Given a :attr:`input` tensor, this operation returns a output tensor that has the same values as tensor in the same order, + except with a new shape given by :attr:`shape`. :param input: the input Tensor. :type input: Tensor - :param embed_dim: total dimension of the model - :type embed_dim: int + :param shape: A list defining the shape of the output tensor. + :type shape: list of int - :param num_q_heads: Number of query attention heads. - :type num_q_heads: int + :param name: the name of the layer. Default is None. + :type name: string - :param num_kv_heads: Number of key/value attention heads. - :type num_kv_heads: int + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_shape = ffi.new("int[]", shape) + handle = ffc().flexflow_model_add_reshape( + self.handle, input.handle, len(shape), c_shape, c_name + ) + self.add_layer(OpType.RESHAPE, name) + return Tensor(handle, owner_op_type=OpType.RESHAPE) - :param kdim: total number of features in key. Default is 0 - :type kdim: int + def gather(self, input, index, dim, name=None): + """Layer that gathers values along the dim axis. - :param vdim: total number of features in value. Default is 0 - :type vdim: int + :param input: the input tensor + :type input: Tensor - :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 - :type dropout: float(0-1) + :param index: the index tensor, which specifies the indices of elements to gather + :type index: Tensor - :param bias: Whether the dense layers use bias vectors. Default is True. - :type bias: bool + :param dim: the axis along which to index + :type dim: int - :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. - :type add_bias_kv: bool + :param name: the name of the layer. Default is None + :type name: string - :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. 
- :type add_zero_attn: bool + :returns: Tensor -- the output tensor + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_gather( + self.handle, input.handle, index.handle, dim, c_name + ) + self.add_layer(OpType.GATHER, name) + return Tensor(handle, owner_op_type=OpType.GATHER) - :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. - :type data_type: DataType + def transpose(self, input, perm, name=None): + """Transposes the :attr:`input` tensor. Permutes the dimensions according to perm - :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. - :type kernel_initializer: Initializer + :param input: the input Tensor. + :type input: Tensor - :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. - :type apply_rotary_embedding: bool + :param perm: A permutation of the dimensions of a. + :type perm: List of int - :param scaling_query: Whether to apply scaling query. Default is False. - :type scaling_query: bool + :param name: the name of the layer. Default is None. + :type name: string - :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. - :type scaling_factor: float + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + c_perm = ffi.new("int[]", perm) + handle = ffc().flexflow_model_add_transpose( + self.handle, input.handle, len(perm), c_perm, c_name + ) + self.add_layer(OpType.TRANSPOSE, name) + return Tensor(handle, owner_op_type=OpType.TRANSPOSE) - :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. - :type qk_prod_scaling: bool + def reverse(self, input, axis, name=None): + """Layer that reverses specific dimensions of a tensor. - :param position_bias: Whether to add position bias to the QK product. Default is False. - :type position_bias: bool + Given a :attr:`input` tensor, this operation reverses the dimension :attr:`axis`. + + :param input: the input Tensor. + :type input: Tensor + + :param axis: the dimension to reverse. + :type axis: int :param name: the name of the layer. Default is None. :type name: string @@ -3270,43 +3201,20 @@ def inc_multiquery_self_attention_verify( :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - kernel_init_handle = self.__get_initializer_handle(kernel_initializer) - c_data_type = enum_to_int(DataType, data_type) - handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify( - self.handle, - input.handle, - embed_dim, - num_q_heads, - num_kv_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn, - c_data_type, - kernel_init_handle, - apply_rotary_embedding, - scaling_query, - scaling_factor, - qk_prod_scaling, - position_bias, - c_name, + handle = ffc().flexflow_model_add_reverse( + self.handle, input.handle, axis, c_name ) - self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) - return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + self.add_layer(OpType.REVERSE, name) + return Tensor(handle, owner_op_type=OpType.REVERSE) - def rms_norm(self, input, eps, dim, name=None): - """Defines the RMS Norm layer. + def scalar_multiply(self, input, scalar, inplace=True, name=None): + """Scalar multiplication of a tensor by an scalar. :param input: the input Tensor. 
:type input: Tensor - :param eps: a value added to the denominator for numerical stability - :type eps: float - - :param dim: The dimension with respect to which to take the norm - :type dim: int + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3314,26 +3222,20 @@ def rms_norm(self, input, eps, dim, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_rms_norm( - self.handle, input.handle, eps, dim, c_name + handle = ffc().flexflow_model_add_scalar_multiply( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.RMS_NORM, name) - return Tensor(handle, owner_op_type=OpType.RMS_NORM) - - def residual_rms_norm(self, input1, input2, eps, dim, name=None): - """Defines the Residual RMS Norm layer. + self.add_layer(OpType.SCALAR_MULTIPLY, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_MULTIPLY) - :param input: the input 1 Tensor. - :type input: Tensor + def scalar_add(self, input, scalar, inplace=True, name=None): + """Scalar addition of a scalar to each entry of a tensor. - :param input: the input 2 Tensor. + :param input: the input Tensor. :type input: Tensor - :param eps: a value added to the denominator for numerical stability - :type eps: float - - :param dim: The dimension with respect to which to take the norm - :type dim: int + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3341,28 +3243,20 @@ def residual_rms_norm(self, input1, input2, eps, dim, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handles_array = ffc().flexflow_model_add_residual_rms_norm( - self.handle, input1.handle, input2.handle, eps, dim, c_name - ) - self.add_layer(OpType.RESIDUAL_RMS_NORM, name) - return Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), Tensor( - handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM + handle = ffc().flexflow_model_add_scalar_add( + self.handle, input.handle, scalar, inplace, c_name ) + self.add_layer(OpType.SCALAR_ADD, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_ADD) - def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): - """Defines the Arg TopK layer. + def scalar_sub(self, input, scalar, inplace=True, name=None): + """Scalar subtraction of a scalar to each entry of a tensor. :param input: the input Tensor. :type input: Tensor - :param k: the top k indices to select - :type k: int - - :param sorted: Whether the entries should be sorted - :type sorted: bool - - :param speculative_decoding: Whether you need to perform beam search - :type speculative_decoding: bool + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3370,23 +3264,20 @@ def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_arg_top_k( - self.handle, input.handle, k, sorted, c_name + handle = ffc().flexflow_model_add_scalar_sub( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.ARG_TOPK, name) - return Tensor(handle, owner_op_type=OpType.ARG_TOPK) + self.add_layer(OpType.SCALAR_SUB, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_SUB) - def beam_top_k(self, input, max_beam_size, sorted, name=None): - """Defines the Beam TopK layer. 
+ def scalar_true_divide(self, input, scalar, inplace=True, name=None): + """Scalar regular division of a tensor by an scalar. :param input: the input Tensor. :type input: Tensor - :param max_beam_size: the top max_beam_size indices to select - :type max_beam_size: int - - :param sorted: Whether the entries should be sorted - :type sorted: bool + :param input: the scalar + :type scalar: float :param name: the name of the layer. Default is None. :type name: string @@ -3394,889 +3285,1498 @@ def beam_top_k(self, input, max_beam_size, sorted, name=None): :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_beam_top_k( - self.handle, input.handle, max_beam_size, sorted, c_name + handle = ffc().flexflow_model_add_scalar_truediv( + self.handle, input.handle, scalar, inplace, c_name ) - self.add_layer(OpType.BEAM_TOPK, name) - return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) + self.add_layer(OpType.SCALAR_TRUEDIV, name) + return Tensor(handle, owner_op_type=OpType.SCALAR_TRUEDIV) - def sampling(self, input, top_p, name=None): - """Defines the Sampling layer. + def gelu(self, input, inplace=True, name=None): + """Gaussian Error Linear Unit activation function. :param input: the input Tensor. :type input: Tensor - :param top_p: The top_p parameter of the sampling - :type top_p: float - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_sampling( - self.handle, input.handle, top_p, c_name - ) - self.add_layer(OpType.SAMPLING, name) - return Tensor(handle, owner_op_type=OpType.SAMPLING) + handle = ffc().flexflow_model_add_gelu(self.handle, input.handle, c_name) + self.add_layer(OpType.GELU, name) + return Tensor(handle, owner_op_type=OpType.GELU) - def argmax(self, input, beam_search, name=None): - """Defines the Sampling layer. + def relu(self, input, inplace=True, name=None): + """Rectified Linear Unit activation function. :param input: the input Tensor. :type input: Tensor - :param beam_search: Whether you need to perform beam search - :type beam_search: bool - :param name: the name of the layer. Default is None. :type name: string :returns: Tensor -- the output tensor. """ c_name = get_c_name(name) - handle = ffc().flexflow_model_add_argmax( - self.handle, input.handle, beam_search, c_name + handle = ffc().flexflow_model_add_relu( + self.handle, input.handle, inplace, c_name ) - self.add_layer(OpType.ARGMAX, name) - return Tensor(handle, owner_op_type=OpType.ARGMAX) + self.add_layer(OpType.RELU, name) + return Tensor(handle, owner_op_type=OpType.RELU) - def reset_metrics(self): - """Reset performance metrics. + def identity(self, input, name=None): + """Identity function. - :returns: None -- no returns. - """ - ffc().flexflow_model_reset_metrics(self.handle) + :param input: the input Tensor. + :type input: Tensor - def init_layers(self): - """Initialize layers. + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
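The reshape, gather, transpose, and reverse operators defined earlier in this hunk can be sketched as follows (assuming `model`, a [4, 16] Tensor `x`, and an integer index Tensor `idx`; the shapes are illustrative only):

    y = model.reshape(x, [8, 8])         # same elements, new shape
    t = model.transpose(y, [1, 0])       # permute the two dimensions
    r = model.reverse(t, axis=0)         # reverse entries along dimension 0
    g = model.gather(x, idx, dim=0)      # select rows of x using the indices in idx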
""" - ffc().flexflow_model_init_layers(self.handle) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_identity(self.handle, input.handle, c_name) + self.add_layer(OpType.IDENTITY, name) + return Tensor(handle, owner_op_type=OpType.IDENTITY) - def prefetch(self): - ffc().flexflow_model_prefetch(self.handle) + def sigmoid(self, input, name=None): + """Sigmoid activation function, :math:`sigmoid(x) = 1 / (1 + exp(-x))`. - def forward(self, seq_length=None): - """Forward propagation of all layers. + :param input: the input Tensor. + :type input: Tensor - :returns: None -- no returns. + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. """ - if seq_length is None: - seq_length = -1 - ffc().flexflow_model_forward(self.handle, seq_length) + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sigmoid(self.handle, input.handle, c_name) + self.add_layer(OpType.SIGMOID, name) + return Tensor(handle, owner_op_type=OpType.SIGMOID) - # TODO: seperate compute_metrics from backward - def backward(self, seq_length=None): - """Backward propagation of all layers. + def tanh(self, input, name=None): + """Hyperbolic tangent activation function. - :returns: None -- no returns. - """ - if seq_length is None: - seq_length = -1 - ffc().flexflow_model_backward(self.handle, seq_length) + :param input: the input Tensor. + :type input: Tensor - def compute_metrics(self): - """Compute performance metrics. + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. """ - ffc().flexflow_model_compute_metrics(self.handle) - - def update(self): - """Update weights and biases of all layers. - - :returns: None -- no returns. - """ - ffc().flexflow_model_update(self.handle) - - def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): - """Configure the model for trainting. FlexFlow uses lazy initialization, - so the actual creating of all operations (including creating and partitioning - of weight, bias and output tensors) happen during compile. - - :param optimizer: optimizer instance. - :type optimizer: Optimizer + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_tanh(self.handle, input.handle, c_name) + self.add_layer(OpType.TANH, name) + return Tensor(handle, owner_op_type=OpType.TANH) - :param loss_type: Enum of LossType. - Options are LOSS_CATEGORICAL_CROSSENTROPY, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, - LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE and LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE. - :type loss_type: LossType + def elu(self, input, inplace=True, name=None): + """Exponential Linear Unit. activation function. - :param metrics: List of metrics to be evaluated by the model during training and testing. - Each of this is a Enum of MetricsType. Options are METRICS_ACCURACY, - METRICS_CATEGORICAL_CROSSENTROPY, METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, - METRICS_MEAN_SQUARED_ERROR, METRICS_ROOT_MEAN_SQUARED_ERROR, METRICS_MEAN_ABSOLUTE_ERROR - :type metrics: MetricsType + :param input: the input Tensor. + :type input: Tensor - :param comp_mode: Enum of CompMode. - Options are COMP_MODE_TRAINING, COMP_MODE_INFERENCE - :type comp_mode: CompMode + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - self.optimizer = optimizer - - c_loss_type = enum_to_int(LossType, loss_type) - metrics_int = [] - for metric in metrics: - metrics_int.append(enum_to_int(MetricsType, metric)) - c_metrics = ffi.new("int[]", metrics_int) - if comp_mode == None: - comp_mode = CompMode.TRAINING - c_comp_mode = enum_to_int(CompMode, comp_mode) - ffc().flexflow_model_compile( - self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_elu( + self.handle, input.handle, inplace, c_name ) - for ff_tensor, np_tensor in self.attr_tensors.items(): - ff_tensor.set_tensor(self, np_tensor) - print("Compiled ffmodel!") - - def fit(self, x=None, y=None, batch_size=None, epochs=1): - """Trains the model for a fixed number of epochs (iterations on a dataset). - - :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. - :type x: Dataloader - - :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. - :type y: Dataloader - - :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` - or :attr:`--batch-size` from the command line. - :type batch_size: int - - :param epochs: Number of epochs to train the model. - An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. - The default value is 1. - :type epochs: int - - :returns: None -- no returns. - """ - if isinstance(x, list) == False: - dataloaders = [x] - else: - dataloaders = x - dataloaders.append(y) - - num_samples = y.num_samples - batch_size = self._ffconfig.batch_size - self._tracing_id += 1 # get a new tracing id - for epoch in range(0, epochs): - for d in dataloaders: - d.reset() - self.reset_metrics() - iterations = num_samples / batch_size - for iter in range(0, int(iterations)): - self._ffconfig.begin_trace(self._tracing_id) - for d in dataloaders: - d.next_batch(self) - self.forward() - self.zero_gradients() - self.backward() - self.update() - self._ffconfig.end_trace(self._tracing_id) + self.add_layer(OpType.ELU, name) + return Tensor(handle, owner_op_type=OpType.ELU) - def eval(self, x=None, y=None, batch_size=None): - """Returns the loss value & metrics values for the model in test mode. + def dropout(self, input, rate, seed, name=None): + """The Dropout layer randomly sets input units to 0 with + a frequency of :attr:`rate` at each step during training time, + which helps prevent overfitting. + Inputs not set to 0 are scaled up by 1/(1 - rate) such that the + sum over all inputs is unchanged. - :param x: Input data. It can be a Dataloader instance or a list of Dataloader instances. - :type x: Dataloader + :param input: the input Tensor. + :type input: Tensor - :param y: Target data (label). It can be a Dataloader instance or a list of Dataloader instances. - :type y: Dataloader + :param rate: Fraction of the input units to drop. + :type rate: float(0-1) - :param batch_size: Number of samples per gradient update. It must be identical with :attr:`-b` - or :attr:`--batch-size` from the command line. - :type batch_size: int + :param seed: random seed. + :type seed: int - :param epochs: Number of epochs to train the model. - An epoch is an iteration over the entire :attr:`x` and :attr:`y` data provided. - The default value is 1. - :type epochs: int + :param name: the name of the layer. Default is None. + :type name: string - :returns: None -- no returns. + :returns: Tensor -- the output tensor. 
""" - if isinstance(x, list) == False: - dataloaders = [x] - else: - dataloaders = x - dataloaders.append(y) - - num_samples = y.num_samples - batch_size = self._ffconfig.batch_size - for d in dataloaders: - d.reset() - self.reset_metrics() - iterations = num_samples / batch_size - self._tracing_id += 1 # get a new tracing id - for iter in range(0, int(iterations)): - for d in dataloaders: - d.next_batch(self) - self._ffconfig.begin_trace(self._tracing_id) - self.forward() - self.compute_metrics() - self._ffconfig.end_trace(self._tracing_id) - - def zero_gradients(self): - """Empty the gradients of all layers. + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_dropout( + self.handle, input.handle, rate, seed, c_name + ) + self.add_layer(OpType.DROPOUT, name) + return Tensor(handle, owner_op_type=OpType.DROPOUT) - :returns: None -- no returns. - """ - ffc().flexflow_model_zero_gradients(self.handle) + def multihead_attention( + self, + query, + key, + value, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + kernel_initializer=None, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`query`, :attr:`key`, and :attr:`value`, + and returns the dot-product attention between them:. - def set_optimizer(self, optimizer): - if isinstance(optimizer, SGDOptimizer) == True: - ffc().flexflow_model_set_sgd_optimizer(self.handle, optimizer.handle) - elif isinstance(optimizer, AdamOptimizer) == True: - ffc().flexflow_model_set_adam_optimizer(self.handle, optimizer.handle) - elif optimizer == None: - pass - else: - assert 0, "[Model]: unknown optimizer" + :param query: the query Tensor. + :type query: Tensor - optimizer = property(fset=set_optimizer) + :param key: the key Tensor. + :type key: Tensor - def print_layers(self, id=-1): - ffc().flexflow_model_print_layers(self.handle, id) + :param value: the value Tensor. + :type value: Tensor - def get_layer_by_id(self, layer_id): - return self._layers[layer_id] + :param embed_dim: total dimension of the model + :type embed_dim: int - def get_last_layer(self): - return self._layers[self._nb_layers - 1] + :param num_heads: Number of attention heads. + :type num_heads: int - def get_layer_by_name(self, layer_name): - for layer_id in self._layers: - layer = self._layers[layer_id] - if layer.name == layer_name: - return layer - assert 0, f"Cannot find the layer with name {layer_name}" - return None + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def get_tensor_by_id(self, id): - handle = ffc().flexflow_model_get_parameter_by_id(self.handle, id) - return Parameter(handle) + :param vdim: total number of features in value. Default is 0 + :type vdim: int - @property - def label_tensor(self): - handle = ffc().flexflow_model_get_label_tensor(self.handle) - return Tensor(handle, deallocate=False) + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) - def get_perf_metrics(self): - handle = ffc().flexflow_model_get_perf_metrics(self.handle) - return PerfMetrics(handle) + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool - def set_transformer_layer_id(self, id): - ffc().flexflow_model_set_transformer_layer_id(self.handle, id) + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. 
+ :type add_bias_kv: bool - def create_data_loader(self, batch_tensor, full_array): - """Create a SingleDataloader instance. + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - :param batch_tensor: a batch-sized tensor. Usually it is a input tensor of the model. - :type batch_tensor: Tensor + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer - :param full_array: the entire data. - :type full_array: Numpy Array + :param name: the name of the layer. Default is None. + :type name: string - :returns: SingleDataloader -- returns a dataloader instance. + :returns: Tensor -- the output tensor. """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + handle = ffc().flexflow_model_add_multihead_attention( + self.handle, + query.handle, + key.handle, + value.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + kernel_init_handle, + c_name, + ) + self.add_layer(OpType.MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.MULTIHEAD_ATTENTION) - if self._ffconfig.enable_control_replication: - assert ( - self._ffconfig.python_data_loader_type != 1 - ), "To enable control replication, please set --python-data-loader-type 2" - return self.__create_data_loader_ptr(batch_tensor, full_array) - else: - if self._ffconfig.python_data_loader_type == 1: - return self.__create_data_loader_attach(batch_tensor, full_array) - else: - return self.__create_data_loader_ptr(batch_tensor, full_array) - - def __create_data_loader_attach(self, batch_tensor, full_array): - full_array_shape = full_array.shape - num_samples = full_array_shape[0] - num_dim = len(full_array_shape) - if full_array.dtype == "float16": - datatype = DataType.DT_HALF - elif full_array.dtype == "float32": - datatype = DataType.DT_FLOAT - elif full_array.dtype == "int32": - datatype = DataType.DT_INT32 - elif full_array.dtype == "int64": - datatype = DataType.DT_INT64 - else: - assert 0, "unsupported datatype" + def inc_multihead_self_attention( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. 
+ :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multihead_self_attention( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multihead_self_attention( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. 
If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_spec_inc_multihead_self_attention( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multihead_self_attention_verify( + self, + input, + embed_dim, + num_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the MultiHead Attention operation as described in Attention Is All You Need + which takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_heads: Number of attention heads. + :type num_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. 
Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multihead_self_attention_verify( + self.handle, + input.handle, + embed_dim, + num_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + + def inc_multiquery_self_attention( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + In inference mode, the attention is computed using incremental decoding. + + :param input: the input Tensor. + :type input: Tensor + + :param embed_dim: total dimension of the model + :type embed_dim: int + + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int + + :param kdim: total number of features in key. Default is 0 + :type kdim: int + + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool + + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer + + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool + + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float + + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. 
+ :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, + ) + self.add_layer(OpType.INC_MULTIHEAD_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.INC_MULTIHEAD_ATTENTION) + + def spec_inc_multiquery_self_attention( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (beam search) mode. - if num_dim == 2: - full_tensor = self.create_tensor( - [num_samples, full_array_shape[1]], datatype - ) - self.map_tensor(full_tensor) - elif num_dim == 4: - full_tensor = self.create_tensor( - [ - num_samples, - full_array_shape[1], - full_array_shape[2], - full_array_shape[3], - ], - datatype, - ) - self.map_tensor(full_tensor) - else: - assert 0, "unsupported dims" + :param input: the input Tensor. + :type input: Tensor - full_tensor.attach_numpy_array(self._ffconfig, full_array) - dataloader = SingleDataLoader( - self, batch_tensor, full_tensor, num_samples, datatype - ) - full_tensor.detach_numpy_array(self._ffconfig) + :param embed_dim: total dimension of the model + :type embed_dim: int - return dataloader + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int - def __create_data_loader_ptr(self, batch_tensor, full_array): - full_array_shape = full_array.shape - num_samples = full_array_shape[0] - if full_array.dtype == "float16": - datatype = DataType.DT_HALF - elif full_array.dtype == "float32": - datatype = DataType.DT_FLOAT - elif full_array.dtype == "int32": - datatype = DataType.DT_INT32 - elif full_array.dtype == "int64": - datatype = DataType.DT_INT64 - else: - assert 0, "unsupported datatype" - np_raw_ptr = full_array.__array_interface__["data"] - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - print( - "numpy array: %s, %s, %s" - % (str(np_raw_ptr), str(raw_ptr), hex(np_raw_ptr[0])) - ) - dataloader = SingleDataLoader( - self, batch_tensor, raw_ptr, num_samples, datatype - ) + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int - return dataloader + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def __get_initializer_handle(self, initializer): - if initializer == None: - null_initializer = Initializer(None) - return null_initializer.handle - else: - return initializer.handle + :param vdim: total number of features in value. 
Default is 0 + :type vdim: int - def __get_op_handle(self, shared_op): - if shared_op == None: - op_handle = ffi.new("flexflow_op_t *") - op_handle.impl = ffi.NULL - op = Op(op_handle[0]) - else: - op = shared_op - return op.handle + :param dropout: a Dropout layer on attn_output_weights. Default is 0.0 + :type dropout: float(0-1) - def get_output_tensor(self, ffmodel, data_type): - shape = self.dims - if data_type == DataType.DT_HALF: - np_array = np.empty(shape, dtype=np.float16) - elif data_type == DataType.DT_FLOAT: - np_array = np.empty(shape, dtype=np.float32) - elif self.data_type == DataType.DT_INT32: - np_array = np.empty(shape, dtype=np.int32) - elif self.data_type == DataType.DT_INT64: - np_array = np.empty(shape, dtype=np.int64) - else: - assert 0, f"Unsupported datatype: {self.data_type}" - np_raw_ptr = np_array.__array_interface__["data"] - if np_array.dtype == np.float32: - raw_ptr = ffi.cast("float*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_float( - self.handle, ffmodel.handle, raw_ptr, False - ) - elif np_array.dtype == np.int32: - raw_ptr = ffi.cast("int*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int( - self.handle, ffmodel.handle, raw_ptr, False - ) - elif np_array.dtype == np.int64: - raw_ptr = ffi.cast("int64_t*", np_raw_ptr[0]) - ret_val = ffc().flexflow_tensor_get_tensor_int64( - self.handle, ffmodel.handle, raw_ptr, False - ) - fflogger.debug( - "get weights raw_ptr: %s, %s, %s, %s" - % (str(raw_ptr), str(np_raw_ptr[0]), hex(np_raw_ptr[0]), str(shape)) - ) - assert ret_val == True - return np_array + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool - def generate(self, prompt_list, max_sequence_length): - assert isinstance(prompt_list, list) - c_input_texts = [get_c_name(prompt) for prompt in prompt_list] - max_num_chars = 5 * (max_sequence_length + 100) - c_output_texts = [ffi.new("char[]", max_num_chars) for prompt in prompt_list] - c_output_length_and_tokens = [ffi.new("int[]", max_sequence_length + 100) for prompt in prompt_list] - ffc().flexflow_model_generate( - self.handle, - len(prompt_list), - c_input_texts, - max_num_chars, - c_output_texts, - max_sequence_length, - c_output_length_and_tokens, - ) - #output_length = c_output_length_and_tokens[0] - #output_tokens = [] - #for i in range(output_length): - # output_tokens.append(c_output_length_and_tokens[i + 1]) - from flexflow.serve import GenerationResult + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool - return [GenerationResult(ffi.string(c_output_text), []) for c_output_text in c_output_texts] + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - def set_position_offset(self, offset): - ffc().flexflow_model_set_position_offset(self.handle, offset) + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer -# ----------------------------------------------------------------------- -# SGDOptimizer -# ----------------------------------------------------------------------- + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. 
+ :type apply_rotary_embedding: bool + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool -class SGDOptimizer(object): - __slots__ = ["handle", "_handle"] + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float - def __init__( - self, ffmodel, lr=0.01, momentum=0.0, nesterov=False, weight_decay=0.0 - ): - self.handle = ffc().flexflow_sgd_optimizer_create( - ffmodel.handle, lr, momentum, nesterov, weight_decay + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool + + :param name: the name of the layer. Default is None. + :type name: string + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_spec_inc_multiquery_self_attention( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, ) - self._handle = ffi.gc(self.handle, ffc().flexflow_sgd_optimizer_destroy) + self.add_layer(OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.SPEC_INC_MULTIHEAD_SELF_ATTENTION) - def set_learning_rate(self, learning_rate): - ffc().flexflow_sgd_optimizer_set_lr(self.handle, learning_rate) + def inc_multiquery_self_attention_verify( + self, + input, + embed_dim, + num_q_heads, + num_kv_heads, + kdim=0, + vdim=0, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + data_type=DataType.DT_NONE, + kernel_initializer=None, + apply_rotary_embedding=False, + scaling_query=False, + scaling_factor=1.0, + qk_prod_scaling=True, + position_bias=False, + name=None, + ): + """Defines the multi-query head attention, which allows a different number of Q and KV heads, + and takes in the tensors :attr:`input`, and uses it for all three of query, key and values. + This operator only supports computing the attention in inference (tree verify) mode. + + :param input: the input Tensor. + :type input: Tensor + :param embed_dim: total dimension of the model + :type embed_dim: int -# ----------------------------------------------------------------------- -# AdamOptimizer -# ----------------------------------------------------------------------- + :param num_q_heads: Number of query attention heads. + :type num_q_heads: int + :param num_kv_heads: Number of key/value attention heads. + :type num_kv_heads: int -class AdamOptimizer(object): - __slots__ = ["handle", "_handle"] + :param kdim: total number of features in key. Default is 0 + :type kdim: int - def __init__( - self, - ffmodel, - alpha=0.001, - beta1=0.9, - beta2=0.999, - weight_decay=0.0, - epsilon=1e-8, - ): - self.handle = ffc().flexflow_adam_optimizer_create( - ffmodel.handle, alpha, beta1, beta2, weight_decay, epsilon - ) - self._handle = ffi.gc(self.handle, ffc().flexflow_adam_optimizer_destroy) + :param vdim: total number of features in value. Default is 0 + :type vdim: int + + :param dropout: a Dropout layer on attn_output_weights. 
Default is 0.0 + :type dropout: float(0-1) + + :param bias: Whether the dense layers use bias vectors. Default is True. + :type bias: bool + + :param add_bias_kv: add bias to the key and value sequences at dim=0. Default is False. + :type add_bias_kv: bool + + :param add_zero_attn: add a new batch of zeros to the key and value sequences at dim=1. Default is False. + :type add_zero_attn: bool - def set_learning_rate(self, learning_rate): - ffc().flexflow_adam_optimizer_set_lr(self.handle, learning_rate) + :param data_type: the data type of the tensors. Default is DataType.DT_NONE, which means using the data type of the input tensors. + :type data_type: DataType + :param kernel_initializer: Initializer for dense layer kernels. If it is set to None, the GlorotUniformInitializer is applied. + :type kernel_initializer: Initializer -# ----------------------------------------------------------------------- -# Initializer -# ----------------------------------------------------------------------- -class Initializer(object): - __slots__ = ["handle", "p_handle"] + :param apply_rotary_embedding: Whether to apply rotary embeddings. Default is False. + :type apply_rotary_embedding: bool - def __init__(self, handle, p_handle=0): - self.p_handle = ffi.new("flexflow_initializer_t *") - if handle == None: - self.p_handle.impl = ffi.NULL - else: - self.p_handle.impl = handle.impl - self.handle = self.p_handle[0] - assert ffi.typeof(self.handle) == ffi.typeof( - "flexflow_initializer_t" - ), "Initializer handle is wrong" + :param scaling_query: Whether to apply scaling query. Default is False. + :type scaling_query: bool + :param scaling_factor: The scaling factor to use for scaling. Default is 1.0. + :type scaling_factor: float -# ----------------------------------------------------------------------- -# GlorotUniform -# ----------------------------------------------------------------------- + :param qk_prod_scaling: Whether to apply scaling to the QK product. Default is True. + :type qk_prod_scaling: bool + :param position_bias: Whether to add position bias to the QK product. Default is False. + :type position_bias: bool -class GlorotUniformInitializer(Initializer): - __slots__ = ["glorot_handle", "_glorot_handle"] + :param name: the name of the layer. Default is None. + :type name: string - def __init__(self, seed): - self.glorot_handle = ffc().flexflow_glorot_uniform_initializer_create(seed) - self._glorot_handle = ffi.gc( - self.glorot_handle, ffc().flexflow_glorot_uniform_initializer_destroy + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + kernel_init_handle = self.__get_initializer_handle(kernel_initializer) + c_data_type = enum_to_int(DataType, data_type) + handle = ffc().flexflow_model_add_inc_multiquery_self_attention_verify( + self.handle, + input.handle, + embed_dim, + num_q_heads, + num_kv_heads, + kdim, + vdim, + dropout, + bias, + add_bias_kv, + add_zero_attn, + c_data_type, + kernel_init_handle, + apply_rotary_embedding, + scaling_query, + scaling_factor, + qk_prod_scaling, + position_bias, + c_name, ) - super(GlorotUniformInitializer, self).__init__(self.glorot_handle) + self.add_layer(OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION, name) + return Tensor(handle, owner_op_type=OpType.TREE_INC_MULTIHEAD_SELF_ATTENTION) + def rms_norm(self, input, eps, dim, name=None): + """Defines the RMS Norm layer. 
-# ----------------------------------------------------------------------- -# ZeroInitializer -# ----------------------------------------------------------------------- + :param input: the input Tensor. + :type input: Tensor + + :param eps: a value added to the denominator for numerical stability + :type eps: float + :param dim: The dimension with respect to which to take the norm + :type dim: int -class ZeroInitializer(Initializer): - __slots__ = ["zero_handle", "_zero_handle"] + :param name: the name of the layer. Default is None. + :type name: string - def __init__(self): - self.zero_handle = ffc().flexflow_zero_initializer_create() - self._zero_handle = ffi.gc( - self.zero_handle, ffc().flexflow_zero_initializer_destroy + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_rms_norm( + self.handle, input.handle, eps, dim, c_name ) - super(ZeroInitializer, self).__init__(self.zero_handle) + self.add_layer(OpType.RMS_NORM, name) + return Tensor(handle, owner_op_type=OpType.RMS_NORM) + def residual_rms_norm( + self, input1, input2, eps, dim, inplace_residual=False, name=None + ): + """Defines the Residual RMS Norm layer. -# ----------------------------------------------------------------------- -# UniformInitializer -# ----------------------------------------------------------------------- + :param input: the input 1 Tensor. + :type input: Tensor + :param input: the input 2 Tensor. + :type input: Tensor -class UniformInitializer(Initializer): - __slots__ = ["uniform_handle", "_uniform_handle"] + :param eps: a value added to the denominator for numerical stability + :type eps: float - def __init__(self, seed, minv, maxv): - self.uniform_handle = ffc().flexflow_uniform_initializer_create( - seed, minv, maxv + :param dim: The dimension with respect to which to take the norm + :type dim: int + + :param name: the name of the layer. Default is None. + :type name: string + + :param inplace_residual: whether to compute the residual inplace using the input tensor. Default is False. + :type inplace_residual: bool + + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handles_array = ffc().flexflow_model_add_residual_rms_norm( + self.handle, + input1.handle, + input2.handle, + eps, + dim, + inplace_residual, + c_name, ) - self._uniform_handle = ffi.gc( - self.uniform_handle, ffc().flexflow_uniform_initializer_destroy + self.add_layer(OpType.RESIDUAL_RMS_NORM, name) + return ( + Tensor(handles_array[0], owner_op_type=OpType.RESIDUAL_RMS_NORM), + Tensor(handles_array[1], owner_op_type=OpType.RESIDUAL_RMS_NORM), ) - super(UniformInitializer, self).__init__(self.uniform_handle) + def arg_top_k(self, input, k, sorted, speculative_decoding, name=None): + """Defines the Arg TopK layer. -# ----------------------------------------------------------------------- -# NormInitializer -# ----------------------------------------------------------------------- + :param input: the input Tensor. 
+ :type input: Tensor + :param k: the top k indices to select + :type k: int -class NormInitializer(Initializer): - __slots__ = ["norm_handle", "_norm_handle"] + :param sorted: Whether the entries should be sorted + :type sorted: bool - def __init__(self, seed, mean, stddev): - self.norm_handle = ffc().flexflow_norm_initializer_create(seed, mean, stddev) - self._norm_handle = ffi.gc( - self.norm_handle, ffc().flexflow_norm_initializer_destroy - ) - super(NormInitializer, self).__init__(self.norm_handle) + :param speculative_decoding: Whether you need to perform beam search + :type speculative_decoding: bool + :param name: the name of the layer. Default is None. + :type name: string -# ----------------------------------------------------------------------- -# PerfMetrics -# ----------------------------------------------------------------------- + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_arg_top_k( + self.handle, input.handle, k, sorted, c_name + ) + self.add_layer(OpType.ARG_TOPK, name) + return Tensor(handle, owner_op_type=OpType.ARG_TOPK) + def beam_top_k(self, input, max_beam_size, sorted, name=None): + """Defines the Beam TopK layer. -class PerfMetrics(object): - __slots__ = ["handle", "_handle"] + :param input: the input Tensor. + :type input: Tensor - def __init__(self, handle): - self.handle = handle - self._handle = ffi.gc(self.handle, ffc().flexflow_per_metrics_destroy) + :param max_beam_size: the top max_beam_size indices to select + :type max_beam_size: int - def get_accuracy(self): - return ffc().flexflow_per_metrics_get_accuracy(self.handle) + :param sorted: Whether the entries should be sorted + :type sorted: bool + :param name: the name of the layer. Default is None. + :type name: string -# ----------------------------------------------------------------------- -# NetConfig -# ----------------------------------------------------------------------- + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_beam_top_k( + self.handle, input.handle, max_beam_size, sorted, c_name + ) + self.add_layer(OpType.BEAM_TOPK, name) + return Tensor(handle, owner_op_type=OpType.BEAM_TOPK) + def sampling(self, input, top_p, name=None): + """Defines the Sampling layer. -class NetConfig(object): - def __init__(self): - self.handle = ffc().flexflow_net_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_net_config_destroy) - cpath = ffc().flexflow_net_config_get_dataset_path(self.handle) - self.dataset_path = ffi.string(cpath) + :param input: the input Tensor. + :type input: Tensor + :param top_p: The top_p parameter of the sampling + :type top_p: float -# ----------------------------------------------------------------------- -# DLRMConfig -# ----------------------------------------------------------------------- + :param name: the name of the layer. Default is None. + :type name: string + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_sampling( + self.handle, input.handle, top_p, c_name + ) + self.add_layer(OpType.SAMPLING, name) + return Tensor(handle, owner_op_type=OpType.SAMPLING) -class DLRMConfig(object): - def __init__(self): - self.handle = ffc().flexflow_dlrm_config_create() - self._handle = ffi.gc(self.handle, ffc().flexflow_dlrm_config_destroy) + def argmax(self, input, beam_search, name=None): + """Defines the Sampling layer. 
- cstr = ffc().flexflow_dlrm_config_get_dataset_path(self.handle) - self.dataset_path = ffi.string(cstr) + :param input: the input Tensor. + :type input: Tensor - cstr = ffc().flexflow_dlrm_config_get_arch_interaction_op(self.handle) - self.arch_interaction_op = ffi.string(cstr) + :param beam_search: Whether you need to perform beam search + :type beam_search: bool - self.sparse_feature_size = ffc().flexflow_dlrm_config_get_sparse_feature_size( - self.handle - ) - self.sigmoid_bot = ffc().flexflow_dlrm_config_get_sigmoid_bot(self.handle) - self.sigmoid_top = ffc().flexflow_dlrm_config_get_sigmoid_top(self.handle) - self.embedding_bag_size = ffc().flexflow_dlrm_config_get_embedding_bag_size( - self.handle - ) - self.loss_threshold = ffc().flexflow_dlrm_config_get_loss_threshold(self.handle) + :param name: the name of the layer. Default is None. + :type name: string - mlp_bot_c = ffc().flexflow_dlrm_config_get_mlp_bot(self.handle) - self.mlp_bot = [] - for i in range(0, mlp_bot_c[0]): - self.mlp_bot.append(mlp_bot_c[i + 1]) + :returns: Tensor -- the output tensor. + """ + c_name = get_c_name(name) + handle = ffc().flexflow_model_add_argmax( + self.handle, input.handle, beam_search, c_name + ) + self.add_layer(OpType.ARGMAX, name) + return Tensor(handle, owner_op_type=OpType.ARGMAX) - mlp_top_c = ffc().flexflow_dlrm_config_get_mlp_top(self.handle) - self.mlp_top = [] - for i in range(0, mlp_top_c[0]): - self.mlp_top.append(mlp_top_c[i + 1]) + def add_lora_layer(self, peft_config): + return ffc().flexflow_model_add_lora_layer(self.handle, peft_config.handle) - embedding_size_c = ffc().flexflow_dlrm_config_get_embedding_size(self.handle) - self.embedding_size = [] - for i in range(0, embedding_size_c[0]): - self.embedding_size.append(embedding_size_c[i + 1]) + def reset_metrics(self): + """Reset performance metrics. + :returns: None -- no returns. + """ + ffc().flexflow_model_reset_metrics(self.handle) -# ----------------------------------------------------------------------- -# Single DataLoader -# ----------------------------------------------------------------------- + def init_layers(self): + """Initialize layers. + :returns: None -- no returns. + """ + ffc().flexflow_model_init_layers(self.handle) -class SingleDataLoader(object): - __slots__ = ["handle", "_handle"] + def prefetch(self): + ffc().flexflow_model_prefetch(self.handle) - def __init__(self, ffmodel, input, full_input, num_samples, data_type): - assert type(ffmodel) is FFModel, "SingleDataLoader ffmodel is wrong" - assert type(input) is Tensor, "SingleDataLoader input is wrong" - if type(full_input) is Tensor: - self.init_from_tensor(ffmodel, input, full_input, num_samples, data_type) - else: - self.init_from_ptr(ffmodel, input, full_input, num_samples, data_type) - self._handle = ffi.gc(self.handle, ffc().flexflow_single_dataloader_destroy) + def forward(self, seq_length=None): + """Forward propagation of all layers. - def init_from_tensor(self, ffmodel, input, full_input, num_samples, data_type): - assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" - c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc().flexflow_single_dataloader_create( - ffmodel.handle, input.handle, full_input.handle, num_samples, c_data_type - ) + :returns: None -- no returns. 
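# A minimal sketch (not part of this patch) of how the inference-head builders
# documented above -- rms_norm, sampling, and argmax -- might be composed with
# the pre-existing dense and softmax builders on an FFModel graph. `ffmodel`
# and `hidden_states` are assumed to be an FFModel and a Tensor produced by
# earlier layers; the remaining arguments are illustrative values only.
from flexflow.core import *  # assumed wildcard import, as used by the serve model files

def build_decoder_head(ffmodel, hidden_states, vocab_size,
                       eps=1e-6, hidden_size=4096, use_sampling=False, top_p=0.9):
    # Normalize the final hidden states with the newly documented RMS norm op.
    normed = ffmodel.rms_norm(hidden_states, eps, hidden_size, name="final_norm")
    # Project to vocabulary logits and convert them to probabilities.
    logits = ffmodel.dense(normed, vocab_size, ActiMode.AC_MODE_NONE, False, name="lm_head")
    probs = ffmodel.softmax(logits, -1)
    # Pick output tokens stochastically (top-p sampling) or greedily (argmax).
    if use_sampling:
        return ffmodel.sampling(probs, top_p)
    return ffmodel.argmax(probs, False)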
+ """ + if seq_length is None: + seq_length = -1 + ffc().flexflow_model_forward(self.handle, seq_length) - def init_from_ptr(self, ffmodel, input, full_input, num_samples, data_type): - # assert type(full_input) is Tensor, "SingleDataLoader full_input is wrong" - c_data_type = enum_to_int(DataType, data_type) - self.handle = ffc().flexflow_single_dataloader_create2( - ffmodel.handle, input.handle, full_input, num_samples, c_data_type - ) + # TODO: seperate compute_metrics from backward + def backward(self, seq_length=None): + """Backward propagation of all layers. - @property - def num_samples(self): - return ffc().flexflow_single_dataloader_get_num_samples(self.handle) + :returns: None -- no returns. + """ + if seq_length is None: + seq_length = -1 + ffc().flexflow_model_backward(self.handle, seq_length) - @num_samples.setter - def num_samples(self, samples): - ffc().flexflow_single_dataloader_set_num_samples(self.handle, samples) + def compute_metrics(self): + """Compute performance metrics. - def next_batch(self, ffmodel): - """Ask the dataloder to load the next batch to the :attr:`batch_tensor`. + :returns: None -- no returns. + """ + ffc().flexflow_model_compute_metrics(self.handle) + + def update(self): + """Update weights and biases of all layers. :returns: None -- no returns. """ - ffc().flowflow_single_dataloader_next_batch(self.handle, ffmodel.handle) + ffc().flexflow_model_update(self.handle) - def reset(self): - """Reset the current position of the dataloder to 0. + def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None): + """Configure the model for trainting. FlexFlow uses lazy initialization, + so the actual creating of all operations (including creating and partitioning + of weight, bias and output tensors) happen during compile. + + :param optimizer: optimizer instance. + :type optimizer: Optimizer + + :param loss_type: Enum of LossType. + Options are LOSS_CATEGORICAL_CROSSENTROPY, LOSS_SPARSE_CATEGORICAL_CROSSENTROPY, + LOSS_MEAN_SQUARED_ERROR_AVG_REDUCE and LOSS_MEAN_SQUARED_ERROR_SUM_REDUCE. + :type loss_type: LossType + + :param metrics: List of metrics to be evaluated by the model during training and testing. + Each of this is a Enum of MetricsType. Options are METRICS_ACCURACY, + METRICS_CATEGORICAL_CROSSENTROPY, METRICS_SPARSE_CATEGORICAL_CROSSENTROPY, + METRICS_MEAN_SQUARED_ERROR, METRICS_ROOT_MEAN_SQUARED_ERROR, METRICS_MEAN_ABSOLUTE_ERROR + :type metrics: MetricsType + + :param comp_mode: Enum of CompMode. + Options are COMP_MODE_TRAINING, COMP_MODE_INFERENCE + :type comp_mode: CompMode :returns: None -- no returns. """ - ffc().flexflow_single_dataloader_reset(self.handle) + self.optimizer = optimizer + c_loss_type = enum_to_int(LossType, loss_type) + metrics_int = [] + for metric in metrics: + metrics_int.append(enum_to_int(MetricsType, metric)) + c_metrics = ffi.new("int[]", metrics_int) + if comp_mode == None: + comp_mode = CompMode.TRAINING + c_comp_mode = enum_to_int(CompMode, comp_mode) + ffc().flexflow_model_compile( + self.handle, c_loss_type, c_metrics, len(metrics), c_comp_mode + ) + for ff_tensor, np_tensor in self.attr_tensors.items(): + ff_tensor.set_tensor(self, np_tensor) + print("Compiled ffmodel!") -class RegionNdarray(object): - __slots__ = ["__array_interface__"] + def fit(self, x=None, y=None, batch_size=None, epochs=1): + """Trains the model for a fixed number of epochs (iterations on a dataset). 
- def __init__(self, shape, data_type, base_ptr, strides, read_only): - # See: https://docs.scipy.org/doc/numpy/reference/arrays.interface.html - if data_type == DataType.DT_HALF: - field_type = " 0: + finetuning_losses = [ + c_finetuning_losses[i] for i in range(num_finetuning_losses[0]) + ] + results = [] + for c_output_text in c_output_texts: + results.append( + GenerationResult( + text=( + ffi.string(c_output_text) if c_output_text != ffi.NULL else None + ), + tokens=[], + finetuning_losses=finetuning_losses, + ) + ) + return results + + def set_position_offset(self, offset): + ffc().flexflow_model_set_position_offset(self.handle, offset) diff --git a/python/flexflow/serve/__init__.py b/python/flexflow/serve/__init__.py index 5af077273d..fd29080a6a 100644 --- a/python/flexflow/serve/__init__.py +++ b/python/flexflow/serve/__init__.py @@ -15,7 +15,16 @@ from typing import Optional from ..type import * from flexflow.core import * -from .serve import LLM, SSM, GenerationConfig, GenerationResult +from .serve import ( + LLM, + SSM, + GenerationConfig, + GenerationResult, + LoraLinearConfig, + PEFTModelID, + Request, + RequestType, +) def __check_positive_int(configs_dict: dict, key: str): @@ -44,6 +53,9 @@ def init( offload_reserve_space_size: Optional[int] = None, use_4bit_quantization: Optional[bool] = None, use_8bit_quantization: Optional[bool] = None, + enable_peft: Optional[bool] = None, + peft_activation_reserve_space_size: Optional[int] = None, + peft_weight_reserve_space_size: Optional[int] = None, profiling: Optional[bool] = None, benchmarking: Optional[bool] = None, inference_debugging: Optional[bool] = None, @@ -69,9 +81,12 @@ def init( - tensor_parallelism_degree: the degree of parallelization in the tensor parallel dimension (using the Megatron technique), defaults to 1 - pipeline_parallelism_degree: the degree of parallelization in the pipeline parallel dimension, defaults to 1 - offload: whether to enable offloading of the weights to CPU, defaults to False - - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + - offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB - use_4bit_quantization: whether to use 4-bit quantization, defaults to False - use_8bit_quantization: whether to use 8-bit quantization, defaults to False + - enable_peft: whether to enable the use of PEFT, defaults to False + - peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + - peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB - profiling: whether to enable the FlexFlow profiling mode, defaults to False - benchmarking: whether to run benchmaking only, without loading real weights, defaults to False - inference_debugging: whether to run inference in debugging mode, saving all inputs/outputs/weights to file, defaults to False @@ -100,12 +115,18 @@ def init( :type pipeline_parallelism_degree: Optional[int], optional :param offload: whether to enable offloading of the weights to CPU, defaults to False :type offload: Optional[bool], optional - :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, default to 1024^2 + :param offload_reserve_space_size: the space (in MB) to reserve on CPU for offloading, defaults to 8 GB :type offload_reserve_space_size: Optional[int], optional :param use_4bit_quantization: whether to use 4-bit quantization, defaults to False :type 
use_4bit_quantization: Optional[bool], optional :param use_8bit_quantization: whether to use 8-bit quantization, defaults to False :type use_8bit_quantization: Optional[bool], optional + :param enable_peft: whether to enable the use of PEFT, defaults to False + :type enable_peft: Optional[bool], optional + :param peft_activation_reserve_space_size: the space (in MB) to reserve on GPU for PEFT activations, default to 1 GB + :type peft_activation_reserve_space_size: Optional[int], optional + :param peft_weight_reserve_space_size: the space (in MB) to reserve on GPU for PEFT weights, default to 1 GB + :type peft_weight_reserve_space_size: Optional[int], optional :param profiling: whether to enable the FlexFlow profiling mode, defaults to False :type profiling: Optional[bool], optional :param benchmarking: whether to run benchmaking only, without loading real weights, defaults to False @@ -135,6 +156,9 @@ def init( offload_reserve_space_size is not None, use_4bit_quantization is not None, use_8bit_quantization is not None, + enable_peft is not None, + peft_activation_reserve_space_size is not None, + peft_weight_reserve_space_size is not None, profiling is not None, benchmarking is not None, inference_debugging is not None, @@ -161,6 +185,9 @@ def init( "offload_reserve_space_size": offload_reserve_space_size, "use_4bit_quantization": use_4bit_quantization, "use_8bit_quantization": use_8bit_quantization, + "enable_peft": enable_peft, + "peft_activation_reserve_space_size": peft_activation_reserve_space_size, + "peft_weight_reserve_space_size": peft_weight_reserve_space_size, "profiling": profiling, "benchmarking": benchmarking, "inference_debugging": inference_debugging, @@ -182,6 +209,8 @@ def init( "tensor_parallelism_degree", "pipeline_parallelism_degree", "offload_reserve_space_size", + "peft_activation_reserve_space_size", + "peft_weight_reserve_space_size", ] for param in positive_int_params: __check_positive_int(configs_dict, param) @@ -200,11 +229,17 @@ def init( if configs_dict.get("offload", None) is None: configs_dict["offload"] = False if configs_dict.get("offload_reserve_space_size", None) is None: - configs_dict["offload_reserve_space_size"] = 1024**2 + configs_dict["offload_reserve_space_size"] = 8 * 1024**3 if configs_dict.get("use_4bit_quantization", None) is None: configs_dict["use_4bit_quantization"] = False if configs_dict.get("use_8bit_quantization", None) is None: configs_dict["use_8bit_quantization"] = False + if configs_dict.get("enable_peft", None) is None: + configs_dict["enable_peft"] = False + if configs_dict.get("peft_activation_reserve_space_size", None) is None: + configs_dict["peft_activation_reserve_space_size"] = 8 * 1024**3 + if configs_dict.get("peft_weight_reserve_space_size", None) is None: + configs_dict["peft_weight_reserve_space_size"] = 1024**3 if configs_dict.get("profiling", None) is None: configs_dict["profiling"] = False if configs_dict.get("benchmarking", None) is None: diff --git a/python/flexflow/serve/models/base.py b/python/flexflow/serve/models/base.py index e7f3914037..17bb894250 100644 --- a/python/flexflow/serve/models/base.py +++ b/python/flexflow/serve/models/base.py @@ -32,5 +32,8 @@ def __init__( def build_model(self): assert False, "Not implemented yet" + def convert_hf_weight_name(name): + assert False, "Not implemented yet" + def convert_hf_model(model, dst_folder): assert False, "Not implemented yet" diff --git a/python/flexflow/serve/models/falcon.py b/python/flexflow/serve/models/falcon.py index 7a55da26ef..0e8fbcbd7d 100644 --- 
a/python/flexflow/serve/models/falcon.py +++ b/python/flexflow/serve/models/falcon.py @@ -124,7 +124,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) else: token, att_norm = ffmodel.residual_layer_norm( @@ -135,7 +135,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.falcon_config.layer_norm_epsilon, - name=f"layers_{i}_input_layernorm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -153,7 +153,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -170,7 +170,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -187,7 +187,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attention", ) else: assert False @@ -197,7 +197,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size * 4, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_h_to_4h", + name=f"layers.{i}.mlp.dense_h_to_4h", ) dense_h_to_4h = ffmodel.gelu(dense_h_to_4h) mlp_output = ffmodel.dense( @@ -205,7 +205,7 @@ def build_model(self, max_tokens_per_batch): self.falcon_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_mlp_dense_4h_to_h", + name=f"layers.{i}.mlp.dense_4h_to_h", ) _, ln_f = ffmodel.residual_layer_norm( @@ -239,10 +239,18 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return (name.replace("transformer.h.", "layers.") + .replace("transformer.", "") + .replace("self_attention.dense", "self_attention.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) n_head = ( @@ -251,17 +259,12 @@ def convert_hf_model(model, dst_folder): else model.config.num_attention_heads ) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("transformer_h_", "layers_") - .replace("transformer_", "") - .replace("self_attention_dense", "attention_wo") - ) + name = FlexFlowFalcon.convert_hf_weight_name(name) # Split Q,K,V attention weights - if "self_attention_query_key_value" in name: - name_q = name.replace("self_attention_query_key_value", "attention_wq") - name_k = name.replace("self_attention_query_key_value", "attention_wk") - name_v = name.replace("self_attention_query_key_value", "attention_wv") + if "self_attention.query_key_value" in name: + name_q = name.replace("self_attention.query_key_value", "self_attention.q_proj") + name_k = name.replace("self_attention.query_key_value", "self_attention.k_proj") + name_v = 
name.replace("self_attention.query_key_value", "self_attention.v_proj") q, k, v = torch.split( params, [ @@ -278,5 +281,5 @@ def convert_hf_model(model, dst_folder): params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) # LM head weight model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/models/llama.py b/python/flexflow/serve/models/llama.py index 6b33030f62..96f0258572 100644 --- a/python/flexflow/serve/models/llama.py +++ b/python/flexflow/serve/models/llama.py @@ -62,7 +62,7 @@ def __init__( # self.llama_config.max_num_tokens = max_tokens_per_batch self.weights_filepath = weights_filepath self.tokenizer_filepath = tokenizer_filepath - self.maxint = 2**31 - 1 + self.maxint = 2 ** 31 - 1 max_verify_tokens_per_batch = ( max_tokens_per_batch + self.llama_config.max_spec_tree_token_num ) @@ -106,7 +106,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="tok_embeddings", + name="embed_tokens", ) for i in range(self.llama_config.num_hidden_layers): @@ -117,7 +117,7 @@ def build_model(self, max_tokens_per_batch): token, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) else: token, attn_norm = ffmodel.residual_rms_norm( @@ -125,7 +125,7 @@ def build_model(self, max_tokens_per_batch): w2, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_attention_norm", + name=f"layers.{i}.input_layernorm", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -145,7 +145,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multiquery_self_attention_verify( @@ -164,7 +164,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multiquery_self_attention( @@ -183,7 +183,7 @@ def build_model(self, max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer True, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -193,21 +193,21 @@ def build_model(self, max_tokens_per_batch): mha, self.llama_config.rms_norm_eps, self.llama_config.hidden_size, - name=f"layers_{i}_ffn_norm", + name=f"layers.{i}.post_attention_layernorm", ) w1 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w1", + name=f"layers.{i}.mlp.gate_proj", ) w3 = ffmodel.dense( ff_norm, self.llama_config.intermediate_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w3", + name=f"layers.{i}.mlp.up_proj", ) multi = ffmodel.sigmoid_silu_multi(w1, w3) w2 = ffmodel.dense( @@ -215,7 +215,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_feed_forward_w2", + name=f"layers.{i}.mlp.down_proj", ) _, token = ffmodel.residual_rms_norm( @@ -230,7 +230,7 @@ def build_model(self, max_tokens_per_batch): self.llama_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="output", + 
name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -246,28 +246,16 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(dense, 1, False) - output = ffmodel.argmax(dense, False) + softmax = ffmodel.softmax(dense, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return name.replace("model.", "") + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("o_proj", "wo") - .replace("mlp", "feed_forward") - .replace("gate_proj", "w1") - .replace("down_proj", "w2") - .replace("up_proj", "w3") - .replace("input_layernorm", "attention_norm") - .replace("post_attention_layernorm", "ffn_norm") - .replace("embed_tokens", "tok_embeddings") - .replace("lm_head", "output") - .replace("model_", "") - ) + name = FlexFlowLLAMA.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") diff --git a/python/flexflow/serve/models/mpt.py b/python/flexflow/serve/models/mpt.py index 92867fd498..b350ae106d 100644 --- a/python/flexflow/serve/models/mpt.py +++ b/python/flexflow/serve/models/mpt.py @@ -97,7 +97,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) axes = [ @@ -114,7 +114,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) else: hidden_states, layernorm_output = ffmodel.residual_layer_norm( @@ -126,7 +126,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_1", + name=f"layers.{i}.norm_1", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -148,7 +148,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: attn_outputs = ffmodel.inc_multihead_self_attention_verify( @@ -169,7 +169,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: attn_outputs = ffmodel.inc_multihead_self_attention( @@ -190,7 +190,7 @@ def build_model(self, max_tokens_per_batch): ** (-0.5), # scaling_factor False, # qk_prod_scaling True, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.attn", ) else: assert False @@ -204,7 +204,7 @@ def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"layers_{i}_norm_2", + name=f"layers.{i}.norm_2", ) # mlp layernorm_output = ffmodel.dense( @@ -212,7 +212,7 @@ def build_model(self, max_tokens_per_batch): 4 * self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_up_proj", + name=f"layers.{i}.ffn.up_proj", ) layernorm_output = ffmodel.gelu(layernorm_output) intermediate_output = ffmodel.dense( @@ -220,7 +220,7 @@ def build_model(self, max_tokens_per_batch): self.mpt_config.hidden_size, ActiMode.AC_MODE_NONE, False, - name=f"layers_{i}_ffn_down_proj", + name=f"layers.{i}.ffn.down_proj", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -232,7 +232,7 @@ 
def build_model(self, max_tokens_per_batch): True, 1e-05, False, - name=f"transformer_norm_f", + name=f"norm_f", ) lm_head = ffmodel.dense( all_final_norm, @@ -249,18 +249,27 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + # TODO: finish this + def convert_hf_weight_name(name): + return ( + name.replace("transformer.blocks.", "layers.") + .replace("transformer.", "") + .replace("attn.out_proj", "attn.o_proj") + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.blocks.", "layers.").replace(".", "_") + name = FlexFlowMPT.convert_hf_weight_name(name) if "Wqkv" in name: - name_q = name.replace("attn_Wqkv", "attention_wq") - name_k = name.replace("attn_Wqkv", "attention_wk") - name_v = name.replace("attn_Wqkv", "attention_wv") + name_q = name.replace("attn.Wqkv", "attn.q_proj") + name_k = name.replace("attn.Wqkv", "attn.k_proj") + name_v = name.replace("attn.Wqkv", "attn.v_proj") q, k, v = torch.split( params, [ @@ -273,13 +282,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "out_proj" in name: - name = name.replace("attn_out_proj", "attention_wo") - params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) shutil.copy( - os.path.join(dst_folder, "transformer_wte_weight"), - os.path.join(dst_folder, "lm_head_weight"), + os.path.join(dst_folder, "wte.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/opt.py b/python/flexflow/serve/models/opt.py index b715f5f35e..02668abf59 100644 --- a/python/flexflow/serve/models/opt.py +++ b/python/flexflow/serve/models/opt.py @@ -139,7 +139,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_attention_layer_norm", + name=f"layers.{i}.self_attn_layer_norm", ) else: hidden_states = ffmodel.add(token, positional_embedding) @@ -163,7 +163,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.TREE_VERIFY_MODE: mha = ffmodel.inc_multihead_self_attention_verify( @@ -183,7 +183,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) elif self.mode == InferenceMode.INC_DECODING_MODE: mha = ffmodel.inc_multihead_self_attention( @@ -203,7 +203,7 @@ def build_model(self, max_tokens_per_batch): (self.opt_config.hidden_size / self.opt_config.num_attention_heads) ** (-0.5), # scaling_factor False, # qk_prod_scaling - name=f"layers_{i}_attention", + name=f"layers.{i}.self_attn", ) else: assert False @@ -215,7 +215,7 @@ def build_model(self, max_tokens_per_batch): axes, 
self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_add_bias_residual_layer_norm", + name=f"layers.{i}.add_bias_residual_layer_norm", ) if not self.opt_config.do_layer_norm_before: @@ -226,14 +226,14 @@ def build_model(self, max_tokens_per_batch): self.opt_config.ffn_dim, ActiMode.AC_MODE_RELU, True, - name=f"layers_{i}_fc1", + name=f"layers.{i}.fc1", ) fc2 = ffmodel.dense( fc1, self.opt_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_fc2", + name=f"layers.{i}.fc2", ) if not self.opt_config.do_layer_norm_before: @@ -245,7 +245,7 @@ def build_model(self, max_tokens_per_batch): axes, self.opt_config.layer_norm_elementwise_affine, 1e-05, - name=f"layers_{i}_final_layer_norm", + name=f"layers.{i}.final_layer_norm", ) _, all_final_norm = ffmodel.residual_layer_norm( @@ -263,7 +263,7 @@ def build_model(self, max_tokens_per_batch): self.opt_config.vocab_size, ActiMode.AC_MODE_NONE, False, - name="embed_tokens_weight_lm_head", + name="lm_head", ) if self.mode == InferenceMode.BEAM_SEARCH_MODE: @@ -279,30 +279,29 @@ def build_model(self, max_tokens_per_batch): output = ffmodel.sampling(softmax, self.generation_config.topp) else: # output = ffmodel.arg_top_k(lm_head, 1, False) - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel + def convert_hf_weight_name(name): + return ( + name.replace("decoder.", "") + .replace("model.", "") + .replace("self_attn.out_proj", "self_attn.o_proj") + .replace("self_attn.o_proj.bias", "add_bias_residual_layer_norm.attn_bias") + .replace( + ".final_layer_norm", ".add_bias_residual_layer_norm" + ) # important to use the leading "_" to avoid matching the last LayerNorm + ) + def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = ( - name.replace(".", "_") - .replace("decoder_", "") - .replace("model_", "") - .replace("self_attn", "attention") - .replace("q_proj", "wq") - .replace("k_proj", "wk") - .replace("v_proj", "wv") - .replace("out_proj", "wo") - .replace("attention_wo_bias", "add_bias_residual_layer_norm_attn_bias") - .replace( - "_final_layer_norm", "_add_bias_residual_layer_norm" - ) # important to use the leading "_" to avoid matching the last LayerNorm - ) + name = FlexFlowOPT.convert_hf_weight_name(name) params.detach().cpu().numpy().tofile(f"{dst_folder}/{name}") # copy embedding weights shutil.copy( - os.path.join(dst_folder, "embed_tokens_weight"), - os.path.join(dst_folder, "embed_tokens_weight_lm_head"), + os.path.join(dst_folder, "embed_tokens.weight"), + os.path.join(dst_folder, "lm_head.weight"), ) diff --git a/python/flexflow/serve/models/starcoder.py b/python/flexflow/serve/models/starcoder.py index 37edaa4c40..2d4471201f 100644 --- a/python/flexflow/serve/models/starcoder.py +++ b/python/flexflow/serve/models/starcoder.py @@ -111,7 +111,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wte", + name="wte", ) positional_embedding = ffmodel.embedding( position_tensor, @@ -121,7 +121,7 @@ def build_model(self, max_tokens_per_batch): self.data_type, None, embed_init, - name="transformer_wpe", + name="wpe", ) axes = [ @@ -139,7 +139,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_1", + name=f"layers.{i}.ln_1", ) assert self.mode == InferenceMode.INC_DECODING_MODE @@ -159,7 +159,7 @@ def build_model(self, 
max_tokens_per_batch): DataType.DT_NONE, # data_type None, # kernel initializer False, # apply_rotary_embedding - name=f"layers_{i}_attention", + name=f"layers.{i}.attn.c_attn", ) residual, l2_norm = ffmodel.residual_layer_norm( @@ -171,7 +171,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"layers_{i}_ln_2", + name=f"layers.{i}.ln_2", ) # mlp @@ -181,7 +181,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.intermediate_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_fc", + name=f"layers.{i}.mlp.c_fc", ) activation = ffmodel.gelu(c_fc, False) c_proj = ffmodel.dense( @@ -189,7 +189,7 @@ def build_model(self, max_tokens_per_batch): self.starcoder_config.hidden_size, ActiMode.AC_MODE_NONE, True, - name=f"layers_{i}_mlp_c_proj", + name=f"layers.{i}.mlp.c_proj", ) _, ln_f = ffmodel.residual_layer_norm( @@ -200,7 +200,7 @@ def build_model(self, max_tokens_per_batch): axes, True, self.starcoder_config.layer_norm_epsilon, - name=f"transformer_ln_f", + name=f"ln_f", ) lm_head = ffmodel.dense( ln_f, @@ -217,18 +217,19 @@ def build_model(self, max_tokens_per_batch): softmax = ffmodel.softmax(dense, -1) output = ffmodel.sampling(softmax, self.generation_config.topp) else: - output = ffmodel.argmax(lm_head, False) + softmax = ffmodel.softmax(lm_head, -1) + output = ffmodel.argmax(softmax, False) self.ffmodel = ffmodel def convert_hf_model(model, dst_folder): os.makedirs(dst_folder, exist_ok=True) for name, params in model.named_parameters(): - name = name.replace("transformer.h", "layers").replace(".", "_") - if "c_attn_weight" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + name = name.replace("transformer.h", "layers").replace("transformer.", "") + if "attn.c_attn.weight" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -241,10 +242,10 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_attn_bias" in name: - name_q = name.replace("attn_c_attn", "attention_wq") - name_k = name.replace("attn_c_attn", "attention_wk") - name_v = name.replace("attn_c_attn", "attention_wv") + elif "attn.c_attn.bias" in name: + name_q = name.replace("attn.c_attn", "attn.c_attn.q_proj") + name_k = name.replace("attn.c_attn", "attn.c_attn.k_proj") + name_v = name.replace("attn.c_attn", "attn.c_attn.v_proj") q, k, v = torch.split( params, [ @@ -257,14 +258,14 @@ def convert_hf_model(model, dst_folder): q.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_q)) k.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_k)) v.detach().cpu().numpy().tofile(os.path.join(dst_folder, name_v)) - elif "c_proj_bias" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.bias" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) - elif "c_proj_weight" in name: - name = name.replace("attn_c_proj", "attention_wo") + elif "attn.c_proj.weight" in name: + name = name.replace("attn.c_proj", "attn.c_attn.o_proj") 
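# Illustrative check (not part of this patch): the renames above move converted
# checkpoint files from the old underscore-joined names to HuggingFace-style
# dotted names. For GPTBigCode/StarCoder the fused attention weight is renamed
# first and then split into per-projection files:
name = "transformer.h.0.attn.c_attn.weight"  # hypothetical HF parameter name
name = name.replace("transformer.h", "layers").replace("transformer.", "")
assert name == "layers.0.attn.c_attn.weight"
# The Q/K/V splits are then written as layers.0.attn.c_attn.{q,k,v}_proj.weight,
# and the output projection c_proj is saved as layers.0.attn.c_attn.o_proj.weight.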
params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) else: params.detach().cpu().numpy().tofile(os.path.join(dst_folder, name)) model.lm_head.weight.detach().cpu().numpy().tofile( - os.path.join(dst_folder, "lm_head_weight") + os.path.join(dst_folder, "lm_head.weight") ) diff --git a/python/flexflow/serve/serve.py b/python/flexflow/serve/serve.py index ac622b3337..132c50995b 100644 --- a/python/flexflow/serve/serve.py +++ b/python/flexflow/serve/serve.py @@ -28,44 +28,38 @@ ) from flexflow.core import * from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer +from peft import PeftModel, PeftConfig, LoraConfig from huggingface_hub import HfApi -import sys, torch, shutil, hashlib +import torch, shutil, hashlib, json, gc from typing import Union, List -class GenerationConfig: - """A class to store the sampling configs.""" - - def __init__( - self, - do_sample: bool = False, - temperature: float = 0.9, - topp: float = 0.8, - topk: int = 1, - ): - """Initialize the sampling configs - - :param do_sample: Whether to perform sampling, or use greedy decoding, defaults to False - :type do_sample: bool, optional - :param temperature: The temperature setting, defaults to 0.9 - :type temperature: float, optional - :param topp: The top probabilities (top-p) setting, defaults to 0.8 - :type topp: float, optional - :param topk: The top-k setting, defaults to 1 - :type topk: int, optional - """ - self.do_sample = do_sample - self.temperature = temperature - self.topp = topp - self.topk = topk - - -class GenerationResult: - """A class to store the output of a generation request.""" +class _SupportedModels: + def __init__(self,): + self.supported_models = { + "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), + "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), + "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), + "GPTBigCodeForCausalLM": ( + ModelType.STARCODER, + FlexFlowSTARCODER, + STARCODERConfig, + ), + "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), + } - def __init__(self, text: str = None, tokens: list = None): - self.output_text = text - self.output_tokens = tokens + def get_ff_model_type(self, hf_config): + architectures = getattr(hf_config, "architectures", []) + ff_arch = None + if next(iter(architectures), None) is not None: + ff_arch = self.supported_models.get(architectures[0]) + if ff_arch is None: + raise ValueError( + f"Huggingface model of type {architectures} is not yet supported by FlexFlow" + ) + return ff_arch class LLM: @@ -92,68 +86,117 @@ def __init__( :param output_file: Path to the output file. 
If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ - self.supported_models = { - "LlamaForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "LLaMAForCausalLM": (ModelType.LLAMA, FlexFlowLLAMA, LLAMAConfig), - "OPTForCausalLM": (ModelType.OPT, FlexFlowOPT, OPTConfig), - "RWForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "FalconForCausalLM": (ModelType.FALCON, FlexFlowFalcon, FalconConfig), - "GPTBigCodeForCausalLM": ( - ModelType.STARCODER, - FlexFlowSTARCODER, - STARCODERConfig, - ), - "MPTForCausalLM": (ModelType.MPT, FlexFlowMPT, MPTConfig), - } + self.supported_models = _SupportedModels() self.hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) self.model_name = self.hf_config._name_or_path ( self.model_type, self.model_class, self.config_class, - ) = self.__get_ff_model_type() + ) = self.supported_models.get_ff_model_type(self.hf_config) self.data_type = data_type assert self.data_type == DataType.DT_HALF or self.data_type == DataType.DT_FLOAT self.cache_path = cache_path if len(cache_path) > 0 else "~/.cache/flexflow" self.refresh_cache = refresh_cache self.output_file = output_file self.rm = None + self.pefts = {} def __del__(self): # Stop the background server before deleting the object if type(self) == LLM and self.rm is not None: self.rm.stop_server() - def __get_ff_model_type(self): - architectures = getattr(self.hf_config, "architectures", []) - ff_arch = None - if next(iter(architectures), None) is not None: - ff_arch = self.supported_models.get(architectures[0]) - if ff_arch is None: - print( - f"Huggingface model of type {architectures} is not yet supported by FlexFlow" + def add_peft(self, lora_config: LoraLinearConfig): + """Add a PEFT adapter to the LLM""" + if lora_config is None: + raise ValueError("lora_config cannot be None") + if len(lora_config.peft_model_id or "") == 0: + raise ValueError("PEFT model id cannot be empty") + # Inference (trainable=False): LoRA model should already exist in huggingface. 
Any changes of parameters from original model are ignored + # Training (trainable=True): Either an existing model (init_lora_weights=False) or a new one (init_lora_weights=True) + + if lora_config.trainable == False or not lora_config.init_lora_weights: + peft_config = PeftConfig.from_pretrained(lora_config.peft_model_id) + else: + peft_config = LoraConfig( + peft_type="LORA", + base_model_name_or_path=self.model_name, + r=lora_config.rank, + target_modules=lora_config.target_modules, + lora_alpha=lora_config.lora_alpha, + lora_dropout=lora_config.lora_dropout, + init_lora_weights=lora_config.init_lora_weights, ) - sys.exit(1) - return ff_arch + if peft_config.peft_type != "LORA": + raise RuntimeError( + f"PEFT type {peft_config.peft_type} not yet supported in FlexFlow" + ) + if "base_model_name_or_path" not in peft_config.to_dict(): + raise ValueError( + f"PEFT model {lora_config.peft_model_id} does not have an associated base model" + ) + if peft_config.base_model_name_or_path != self.model_name: + raise RuntimeError( + f"Attempting to add PEFT with base model name {peft_config.base_model_name_or_path} to LLM {self.model_name}" + ) + + self.pefts[lora_config] = { + "peft_config": peft_config, + "peft_type": peft_config.peft_type, + } + + def get_ff_peft_id(self, lora_config: LoraLinearConfig) -> PEFTModelID: + if lora_config is None: + raise ValueError("lora_config cannot be None") + if len(lora_config.peft_model_id or "") == 0: + raise ValueError("PEFT model id cannot be empty") + if lora_config not in self.pefts: + raise ValueError( + f"PEFT {lora_config} not registered with LLM {self.model_name}" + ) + if "ff_peft_model_id" not in self.pefts[lora_config]: + raise RuntimeError( + f"Attempting to run PEFT {lora_config} before compiling LLM {self.model_name}" + ) + + return self.pefts[lora_config]["ff_peft_model_id"] def download_hf_config(self): """Save the HuggingFace model configs to a json file. 
Useful mainly to run the C++ inference code.""" - self.config_dir = os.path.join( + config_dir = os.path.join( os.path.expanduser(self.cache_path), "configs", self.model_name.lower() ) - self.config_path = os.path.join(self.config_dir, "config.json") - os.makedirs(self.config_dir, exist_ok=True) - print(f"Creating directory {self.config_dir} (if it doesn't exist)...") - print(f"Saving {self.model_name} configs to file {self.config_path}...") - self.hf_config.to_json_file(self.config_path) + config_path = os.path.join(config_dir, "config.json") + os.makedirs(config_dir, exist_ok=True) + print(f"Creating directory {config_dir} (if it doesn't exist)...") + print(f"Saving {self.model_name} configs to file {config_path}...") + self.hf_config.to_json_file(config_path) + + # Save PEFT configs if the LLM has any registered PEFTs + for ff_peft_config, peft_dict in self.pefts.items(): + peft_config = peft_dict["peft_config"] + peft_model_id = ff_peft_config.peft_model_id + peft_config_dir = os.path.join( + os.path.expanduser(self.cache_path), "configs", peft_model_id.lower() + ) + os.makedirs(peft_config_dir, exist_ok=True) + peft_config_path = os.path.join(peft_config_dir, "config.json") + print(f"Saving {peft_model_id} configs to file {peft_config_path}...") + with open(peft_config_path, "w") as json_file: + + class SetEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, set): + return list(obj) + return super().default(obj) - def __get_revision_hashes(self, model_name: str, weights: bool): + json.dump(peft_config.to_dict(), json_file, indent=2, cls=SetEncoder) + + def __get_revision_hashes(self, model_name: str, folder: str): ff_revision = None - ff_revision_file = ( - os.path.join(self.weights_path, "rev_sha.txt") - if weights - else os.path.join(self.tokenizer_path, "rev_sha.txt") - ) + ff_revision_file = os.path.join(folder, "rev_sha.txt") + if os.path.exists(ff_revision_file): ff_revision = "".join(open(ff_revision_file).read().split()) @@ -173,65 +216,109 @@ def __get_revision_hashes(self, model_name: str, weights: bool): def download_hf_weights_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's model weights are available and up to date. If not, or if the refresh_cache parameter is set to True, download new weights. + + If any PEFT adapter is registered, perform the same operation for PEFT. """ - if self.data_type == DataType.DT_HALF: - torch.set_default_tensor_type(torch.HalfTensor) - elif self.data_type == DataType.DT_FLOAT: - torch.set_default_tensor_type(torch.FloatTensor) - else: - assert False, "Data type not yet supported -- cannot download weights!" - # Use local cache, or download new version - self.weights_path = os.path.join( - os.path.expanduser(self.cache_path), - "weights", - self.model_name.lower(), - ( - "full-precision" - if self.data_type == DataType.DT_FLOAT - else "half-precision" - ), - ) - if self.refresh_cache: - print( - f"Refreshing weights in cache for model {self.model_name} at path {self.weights_path} ..." 
+ def get_weights_path(model_name): + return os.path.join( + os.path.expanduser(self.cache_path), + "weights", + model_name.lower(), + ( + "full-precision" + if self.data_type == DataType.DT_FLOAT + else "half-precision" + ), ) - if os.path.exists(self.weights_path): - shutil.rmtree(self.weights_path) - os.makedirs(self.weights_path, exist_ok=True) - print(f"Creating directory {self.weights_path} (if it doesn't exist)...") - ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=True - ) - - # Download if needed - if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model + def refresh_cache_if_needed(model_name): + weights_path = get_weights_path(model_name) + if self.refresh_cache: print( - f"'{self.model_name}' model weights not found in cache or outdated. Downloading from huggingface.co ..." + f"Refreshing weights in cache for model {model_name} at path {weights_path} ..." ) - else: - # Remote model + if os.path.exists(weights_path): + shutil.rmtree(weights_path) + os.makedirs(weights_path, exist_ok=True) + + def get_hf_llm(model_name): + return AutoModelForCausalLM.from_pretrained( + model_name, + trust_remote_code=True, + torch_dtype=( + torch.float32 + if self.data_type == DataType.DT_FLOAT + else torch.float16 + ), + ) + + def download_llm_weights(): + refresh_cache_if_needed(self.model_name) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + self.model_name, self.weights_path + ) + if ff_revision != latest_revision: print( - f"'{self.model_name}' local model weights were updated! Converting new weights now..." + f"'{self.model_name}' local model weights need updating! Downloading/converting new weights now..." ) - # Download model from HuggingFace, or load it from the local folder - hf_model = AutoModelForCausalLM.from_pretrained( - self.model_name, trust_remote_code=True - ) - # Print log message to notify user download of model has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading HF weights. 
Converting them now...") - # Convert the model to FlexFlow format - self.model_class.convert_hf_model(hf_model, self.weights_path) - # Save new revision hash to file - with open(ff_revision_file, "w+") as f: - f.write(latest_revision) - print("Done converting the weights...") - else: - print(f"Loading '{self.model_name}' model weights from the cache...") + hf_model = get_hf_llm(self.model_name) + # Convert the model to FlexFlow format + self.model_class.convert_hf_model(hf_model, self.weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {self.model_name}") + # Deallocate hf model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + def convert_peft_model(hf_peft_model, peft_type, weights_path): + for name, params in hf_peft_model.named_parameters(): + if peft_type.lower() in name: + name = name.replace("base_model.model.model.", "").replace( + ".default", "" + ) + name = self.model_class.convert_hf_weight_name(name) + params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + + def download_peft_weights(): + for ff_peft_config, peft_dict in self.pefts.items(): + if not ff_peft_config.init_lora_weights: + peft_config = peft_dict["peft_config"] + peft_type = peft_dict["peft_type"] + peft_model_id = ff_peft_config.peft_model_id + + weights_path = get_weights_path(peft_model_id) + refresh_cache_if_needed(peft_model_id) + ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( + peft_model_id, weights_path + ) + + if ff_revision != latest_revision: + print( + f"'{peft_model_id}' local model weights need updating! Downloading/converting new weights now..." + ) + hf_model = get_hf_llm(peft_model_id) + hf_peft_model = PeftModel.from_pretrained( + hf_model, peft_model_id, config=peft_config + ) + # Convert the model to FlexFlow format + convert_peft_model(hf_peft_model, peft_type, weights_path) + # Save new revision hash to file + with open(ff_revision_file, "w+") as f: + f.write(latest_revision) + print(f"Done converting the weights for model {peft_model_id}") + # Deallocate hf model + del hf_peft_model + del hf_model + gc.collect() + torch.cuda.empty_cache() + + self.weights_path = get_weights_path(self.model_name) + download_llm_weights() + download_peft_weights() def download_hf_tokenizer_if_needed(self): """Check in the folder specified by the cache_path whether the LLM's tokenizer files are available and up to date. @@ -241,13 +328,11 @@ def download_hf_tokenizer_if_needed(self): # Use local cache, or download new version self.tokenizer_path = os.path.join( - os.path.expanduser(self.cache_path), - "tokenizers", - self.model_name.lower(), + os.path.expanduser(self.cache_path), "tokenizers", self.model_name.lower() ) if self.refresh_cache: print( - f"Discarding cached tokenizer files (if they exist) for model {self.model_name}..." + f"Refreshing cached tokenizer for model {self.model_name} at path {self.tokenizer_path} ..." 
) if os.path.exists(self.tokenizer_path): shutil.rmtree(self.tokenizer_path) @@ -257,46 +342,29 @@ def download_hf_tokenizer_if_needed(self): # Get local revision SHA, check if it matches latest one on huggingface ff_revision, ff_revision_file, latest_revision = self.__get_revision_hashes( - self.model_name, weights=False + self.model_name, self.tokenizer_path ) if ff_revision != latest_revision: - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - # Local model - print( - f"'{self.model_name}' tokenizer not found in cache or outdated. Downloading from huggingface.co ..." - ) - else: - # Remote model - print( - f"'{self.model_name}' local tokenizer was updated! Saving new tokenizer now..." - ) + print( + f"'{self.model_name}' tokenizer needs updating! Downloading tokenizer now..." + ) # Download tokenizer from HuggingFace, or load it from the local folder - if self.model_type == ModelType.LLAMA: - hf_tokenizer = LlamaTokenizer.from_pretrained( - self.model_name, use_fast=True - ) - else: - hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name) - # Print log message to notify user download of tokenizer has finished - if not os.path.exists(self.model_name) or os.path.isdir(self.model_name): - print("Done downloading tokenizer. Saving it now...") + hf_tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True) # Save tokenizer hf_tokenizer.save_pretrained(self.tokenizer_path) - print("Done saving HF tokenizer.") + print("Done updating HF tokenizer.") # Save new revision hash to file with open(ff_revision_file, "w+") as f: f.write(latest_revision) - else: - print(f"Loading '{self.model_name}' tokenizer from the cache...") - def compile( self, generation_config: GenerationConfig = GenerationConfig(), max_requests_per_batch: int = 1, max_seq_length: int = 256, max_tokens_per_batch: int = 64, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = None, model_specific_tensor_parallelism_degree: int = None, model_specific_pipeline_parallelism_degree: int = None, @@ -312,6 +380,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 64 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the LLM a different data parallelism degree than the one used to initialize the runtime, defaults to None :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the LLM a different tensor parallelism degree than the one used to initialize the runtime, defaults to None @@ -321,9 +391,6 @@ def compile( :param ssms: The SSMs to use when operating in speculative inference mode, defaults to [] :type ssms: list, optional """ - # self.max_requests_per_batch = max_requests_per_batch - # self.max_seq_length = max_seq_length - # self.max_tokens_per_batch = max_tokens_per_batch self.ssms = ssms self.generation_config = GenerationConfig() self.ffconfig = FFConfig() @@ -355,6 +422,7 @@ def compile( self.rm.set_max_requests_per_batch(max_requests_per_batch) self.rm.set_max_tokens_per_batch(max_tokens_per_batch) self.rm.set_max_sequence_length(max_seq_length) + self.rm.set_enable_peft_finetuning(enable_peft_finetuning) # Instantiate 
the relevant model self.model = self.model_class( @@ -366,16 +434,27 @@ def compile( max_tokens_per_batch, ) + # Download the config from huggingface + self.download_hf_config() + + # Download the tokenizer from huggingface (if needed) and load them + self.download_hf_tokenizer_if_needed() + # Download the weights from huggingface (if needed) self.download_hf_weights_if_needed() + # Add PEFT layer if registered + for ff_peft_config, peft_dict in self.pefts.items(): + ff_peft_config.ff_compile() + ff_peft_model_id = self.model.ffmodel.add_lora_layer(ff_peft_config) + peft_dict["ff_peft_model_id"] = ff_peft_model_id + # Create file data loader, load weights into tensors model_configs = self.config_class(self.hf_config) self.rm.set_max_spec_tree_token_num( model_configs.max_spec_tree_token_num - if "max_spec_tree_token_num" - in model_configs.__dict__ + if "max_spec_tree_token_num" in model_configs.__dict__ else 20 ) @@ -393,9 +472,6 @@ def compile( self.im = InferenceManager() self.im.register_model_weights_loader(self.model.ffmodel, self.fileloader) - # Download the tokenizer from huggingface (if needed) and load them - self.download_hf_tokenizer_if_needed() - # Create tokenizer (this must be done after we have downloaded the tokenizer bos_token_id = ( -1 if self.hf_config.bos_token_id is None else self.hf_config.bos_token_id @@ -419,22 +495,36 @@ def compile( atexit.register(self.rm.stop_server) - def generate(self, prompts: Union[str, List[str]], max_length: int = 128): + def generate( + self, + requests_or_prompts: Union[str, List[str], Request, List[Request]], + max_length: int = 128, + ): """Generate tokens based on the input prompt(s) - :param prompts: The generation prompt(s) in the form of a string, or list of strings - :type prompts: Union[str, List[str]] + :param requests_or_prompts: The generation prompt(s) in the form of a string, a list of strings, a Request, or list of Requests + :type requests_or_prompts: Union[str, List[str], Request, List[Request]] :return: the generation results :rtype: GenerationResult """ - if type(prompts) == str: - if len(prompts) == 0: + if type(requests_or_prompts) == str: + if len(requests_or_prompts) == 0: return None - return self.model.ffmodel.generate([prompts], max_length) - elif type(prompts) == list: - if len(prompts) == 0: + return self.model.ffmodel.generate_inf_only( + [requests_or_prompts], max_length + ) + elif type(requests_or_prompts) == Request: + return self.model.ffmodel.generate(requests_or_prompts) + elif type(requests_or_prompts) == list: + if len(requests_or_prompts) == 0: return [] - return self.model.ffmodel.generate(prompts, max_length) + if type(requests_or_prompts[0]) == str: + return self.model.ffmodel.generate_inf_only( + requests_or_prompts, max_length + ) + else: + print(requests_or_prompts) + return self.model.ffmodel.generate(requests_or_prompts) else: assert False, "Please pass a non-empty string or list of strings" @@ -446,17 +536,6 @@ def stop_server(self): self.rm.stop_server() print("Background server stopped.") - def __enter__(self): - # Start the server when entering the context - # self.rm.start_server(self.model.ffmodel) - return self - - def __exit__(self, exc_type, exc_value, traceback): - # Stop the server when exiting the context - # self.rm.stop_server() - if exc_type: - print(f"Exception occurred: {exc_value}") - class SSM(LLM): """This class creates a SSM (Small-Speculative Model) object based on a model from HuggingFace""" @@ -482,13 +561,7 @@ def __init__( :param output_file: Path to the output file. 
If left blank, the output will not be written to file, defaults to "" :type output_file: str, optional """ - super().__init__( - model_name, - data_type, - cache_path, - refresh_cache, - output_file, - ) + super().__init__(model_name, data_type, cache_path, refresh_cache, output_file) def compile( self, @@ -496,15 +569,13 @@ def compile( max_requests_per_batch: int = 16, max_seq_length: int = 256, max_tokens_per_batch: int = 128, + enable_peft_finetuning: bool = False, model_specific_data_parallelism_degree: int = 1, model_specific_tensor_parallelism_degree: int = 1, model_specific_pipeline_parallelism_degree: int = 1, ssms: list = [], ): """Compile the SSM for inference and load the weights into memory - - :param mode: The SSM inference mode (InferenceMode.INC_DECODING_MODE for incremental decoding, InferenceMode.BEAM_SEARCH_MODE for beam search, or InferenceMode.TREE_VERIFY_MODE for token tree verification), defaults to InferenceMode.INC_DECODING_MODE - :type mode: InferenceMode, optional :param generation_config: The GenerationConfig object with the configurations to use for sampling, defaults to GenerationConfig() :type generation_config: GenerationConfig, optional :param max_requests_per_batch: The maximum batch size to allow, defaults to 16 @@ -513,6 +584,8 @@ def compile( :type max_seq_length: int, optional :param max_tokens_per_batch: The maximum number of tokens (across requests) to allow per batch, defaults to 128 :type max_tokens_per_batch: int, optional + :param enable_peft_finetuning: Whether to enable support for PEFT fine-tuning, defaults to False + :type enable_peft_finetuning: bool, optional :param model_specific_data_parallelism_degree: Use this parameter if you want to give the SSM a different data parallelism degree than the default one, defaults to 1 :type model_specific_data_parallelism_degree: int, optional :param model_specific_tensor_parallelism_degree: Use this parameter if you want to give the SSM a different tensor parallelism degree than the default one, defaults to 1 @@ -527,6 +600,7 @@ def compile( max_requests_per_batch, max_seq_length, max_tokens_per_batch, + enable_peft_finetuning, model_specific_data_parallelism_degree, model_specific_tensor_parallelism_degree, model_specific_pipeline_parallelism_degree, diff --git a/python/flexflow/type.py b/python/flexflow/type.py index 994a85f57e..0f4726837c 100644 --- a/python/flexflow/type.py +++ b/python/flexflow/type.py @@ -46,6 +46,12 @@ class LossType(Enum): LOSS_IDENTITY = 54 +class OptimizerType(Enum): + OPTIMIZER_TYPE_NONE = 60 + OPTIMIZER_TYPE_SGD = 61 + OPTIMIZER_TYPE_ADAM = 62 + + class CompMode(Enum): TRAINING = 70 INFERENCE = 71 @@ -153,6 +159,11 @@ class OpType(Enum): RESIDUAL_LAYERNORM = 2306 +class RequestType(Enum): + REQ_INFERENCE = 4001 + REQ_FINETUNING = 4002 + + def enum_to_int(enum, enum_item): for item in enum: if enum_item == item: diff --git a/rdelacou/generate_trace.py b/rdelacou/generate_trace.py new file mode 100644 index 0000000000..986dab37df --- /dev/null +++ b/rdelacou/generate_trace.py @@ -0,0 +1,121 @@ +import pandas as pd +from math import ceil +from random import shuffle, uniform +import json, pickle, requests, os, argparse + +class TraceBuilder(object): + + # trace_type: either "conv" or "code" + def __init__(self, import_times=True, import_prompts=True): + self.req_times = None + self.imported_req_times = False + self.prompt_data = None + self.imported_prompt_data = False + if import_times: + self.import_trace_timestamps() + if import_prompts: + self.import_prompt_data() + + def 
import_trace_timestamps(self, trace_type="conv"): + if not self.imported_req_times: + # Import Microsoft LLM 1 hour trace + df_trace = pd.read_csv("https://raw.githubusercontent.com/Azure/AzurePublicDataset/master/data/AzureLLMInferenceTrace_"+trace_type+".csv", parse_dates=["TIMESTAMP"]) + req_times = (pd.to_datetime(df_trace["TIMESTAMP"]).astype(int)//1000) # Timestamps are in microseconds + req_times = req_times - req_times.min() + self.req_times = req_times.tolist() + self.imported_req_times = True + + def import_prompt_data(self, shuffle_=True): + if not self.imported_prompt_data: + sharegpt_filename = "sharegpt_opt_text_completion_length.pkl" + sharegpt_filepath = f"./{sharegpt_filename}" + if os.path.exists(sharegpt_filepath): + os.remove("sharegpt_opt_text_completion_length.pkl") + sharegpt_url = f"https://github.com/sosp-ae-39/sosp-ae-astra/raw/main/datasets/{sharegpt_filename}" + response = requests.get(sharegpt_url) + with open(sharegpt_filename, "wb") as file: + file.write(response.content) + with open(sharegpt_filepath, 'rb') as f: + data2 = pickle.load(f) + os.remove("sharegpt_opt_text_completion_length.pkl") + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + + for pair in data2: + assert(len(pair) == 2) + + prompt_lengths = [pair[0] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + generation_lengths = [pair[1] for pair in data2 if pair[0] <= 2048 and pair[0] >= 4 and pair[1] >= 4 and pair[1] <= 2048 and pair[0]+pair[1] <= 2048] + num_pairs = len(prompt_lengths) + assert(num_pairs == len(generation_lengths)) + print("Number of conversation pairs: ", num_pairs) + + print(f"Prompt lengths: min={min(prompt_lengths)}, max={max(prompt_lengths)}, avg={sum(prompt_lengths)/len(prompt_lengths)}") + print(f"Generation lengths: min={min(generation_lengths)}, max={max(generation_lengths)}, avg={sum(generation_lengths)/len(generation_lengths)}") + total_lengths = [prompt_lengths[i] + generation_lengths[i] for i in range(len(prompt_lengths))] + print(f"Total lengths: min={min(total_lengths)}, max={max(total_lengths)}, avg={sum(total_lengths)/len(total_lengths)}") + + self.prompt_data = [{"human": prompt_lengths[i], "gpt": generation_lengths[i]} for i in range(num_pairs)] + + if shuffle_: + shuffle(self.prompt_data) + self.imported_prompt_data = True + + # Delta is in seconds + # Rate is in req per second + def generate_trace(self, target_arrival_rate=10, debug_verbose=False): + self.import_trace_timestamps() + self.import_prompt_data() + + microsec = 1000000 + avg_arrival_rate = len(self.req_times) / (self.req_times[-1]/float(microsec)) # Request per second. 
Computed that way to enforce working with numbers of reasonable orders of magnitude + if debug_verbose: + print("Avg arrival rate of original trace (req/s): ", avg_arrival_rate) + scale_factor = float(target_arrival_rate) / avg_arrival_rate + if debug_verbose: + print("Scale factor to obtain target arrival rate: ", scale_factor) + + # Buckets are 1 second timeframes + nb_buckets = ceil(self.req_times[-1] / microsec) + buckets = [] + j = 0 + k = 0 + for i in range(nb_buckets): + bucket_size = 0 + while(j < len(self.req_times) and self.req_times[j] >= i*microsec and self.req_times[j] < (i+1)*microsec): + bucket_size += 1 + j += 1 + bucket_size = bucket_size*scale_factor + prob = bucket_size - int(bucket_size) + bucket_size = int(bucket_size) + int(uniform(0, 1) <= prob) + + # If used all of the prompt data, loop back at the beggining and reuse some prompts + if k+bucket_size > len(self.prompt_data): + bucket = self.prompt_data[k:] + self.prompt_data[:(k+bucket_size)%len(self.prompt_data)] + else: + bucket = self.prompt_data[k:k+bucket_size] + k = (k+bucket_size) % len(self.prompt_data) + buckets.append(bucket) + + if debug_verbose: + print("Avg arrival rate obtained (req/s): ", sum([len(b) for b in buckets])/len(buckets)) + return buckets + +def generate_and_save_trace(arrival_rate, output_file): + builder = TraceBuilder() + trace = builder.generate_trace(target_arrival_rate=arrival_rate, debug_verbose=True) + with open(output_file, 'w+') as f: + json.dump(trace, f, indent=2) + +if __name__ == '__main__': + # Set up the argument parser + parser = argparse.ArgumentParser(description='Generate and save a trace.') + parser.add_argument('--arrival-rate', type=float, default=10.0, help='The target arrival rate for the trace.') + parser.add_argument('--output-file', type=str, default='sharegpt.json', help='The path to the output file to save the trace.') + + # Parse the command-line arguments + args = parser.parse_args() + + # Call the function with the user-provided arrival rate + generate_and_save_trace(args.arrival_rate, args.output_file) diff --git a/requirements.txt b/requirements.txt index ad65622367..64f1808934 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,11 @@ transformers>=4.31.0 sentencepiece einops pip +# peft-related +scipy +bitsandbytes +datasets +accelerate +loralib +triton +peft diff --git a/src/c/flexflow_c.cc b/src/c/flexflow_c.cc index 5714c8fe3d..e39cb29037 100644 --- a/src/c/flexflow_c.cc +++ b/src/c/flexflow_c.cc @@ -67,6 +67,13 @@ class FFCObjectWrapper { FF_NEW_OPAQUE_WRAPPER(flexflow_request_manager_t, RequestManager *); FF_NEW_OPAQUE_WRAPPER(flexflow_file_data_loader_t, FileDataLoader *); FF_NEW_OPAQUE_WRAPPER(flexflow_generation_result_t, GenerationResult *); + // FF_NEW_OPAQUE_WRAPPER(flexflow_lora_optimizer_config_t, LoraOptimizerConfig + // *); FF_NEW_OPAQUE_WRAPPER(flexflow_lora_sgd_optimizer_config_t, + // LoraSGDOptimizerConfig *); + // FF_NEW_OPAQUE_WRAPPER(flexflow_lora_adam_optimizer_config_t, + // LoraAdamOptimizerConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_lora_linear_config_t, LoraLinearConfig *); + FF_NEW_OPAQUE_WRAPPER(flexflow_peft_model_id_t, PEFTModelID *); }; Logger ffc_log("flexflow_c"); @@ -649,6 +656,7 @@ flexflow_tensor_t * bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -672,6 +680,7 @@ flexflow_tensor_t * elementwise_affine, eps, use_bias, + inplace_residual, 
input->data_type, name); assert(tensor_outputs[0] != nullptr); @@ -679,7 +688,7 @@ flexflow_tensor_t * DEBUG_PRINT("[ResidualLayerNorm] input %p, residual1 %p, residual2 " "%p, output0: %p, " "output1: %p, use_two_residuals: %d, elementwise_affine %d, eps " - "%f, use_bias: %d, name %s", + "%f, use_bias: %d, inplace_residual: %d, name %s", input, residual1, residual2, @@ -689,6 +698,7 @@ flexflow_tensor_t * elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); @@ -706,6 +716,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); const Tensor input = FFCObjectWrapper::unwrap(input_); @@ -722,13 +733,14 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, input->data_type, name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); DEBUG_PRINT("[AddBiasResidualLayerNorm] input %p, residual %p, output0: %p, " "output1: %p, elementwise_affine %d, eps " - "%f, use_bias %d, name %s", + "%f, use_bias %d, inplace_residual: %d, name %s", input, residual, tensor_outputs[0], @@ -736,6 +748,7 @@ flexflow_tensor_t *flexflow_model_add_add_bias_residual_layer_norm( elementwise_affine, eps, use_bias, + inplace_residual, name); flexflow_tensor_t *tensor_outputs_wrapped = (flexflow_tensor_t *)calloc(2, sizeof(flexflow_tensor_t)); @@ -1469,13 +1482,20 @@ flexflow_tensor_t * const flexflow_tensor_t input2_, float eps, int dim, + bool inplace_residual, char const *name) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); Tensor input1 = FFCObjectWrapper::unwrap(input1_); Tensor input2 = FFCObjectWrapper::unwrap(input2_); Tensor tensor_outputs[2]; - handle->residual_rms_norm( - input1, input2, tensor_outputs, eps, dim, input1->data_type, name); + handle->residual_rms_norm(input1, + input2, + tensor_outputs, + eps, + dim, + inplace_residual, + input1->data_type, + name); assert(tensor_outputs[0] != nullptr); assert(tensor_outputs[1] != nullptr); flexflow_tensor_t *tensor_outputs_wrapped = @@ -1529,6 +1549,21 @@ flexflow_tensor_t flexflow_model_add_argmax(flexflow_model_t handle_, return FFCObjectWrapper::wrap(tensor); } +flexflow_peft_model_id_t flexflow_model_add_lora_layer( + flexflow_model_t handle_, + const flexflow_lora_linear_config_t peft_config_) { + FFModel *handle = FFCObjectWrapper::unwrap(handle_); + LoraLinearConfig const *peft_config = FFCObjectWrapper::unwrap(peft_config_); + PEFTModelID *peft_model_id = handle->add_lora_layer(*peft_config); + + DEBUG_PRINT("[Add Lora Layer] model handle: %p, peft_config handle %p, " + "peft_model_id: %p", + handle, + peft_config, + peft_model_id); + return FFCObjectWrapper::wrap(peft_model_id); +} + void flexflow_model_set_sgd_optimizer(flexflow_model_t handle_, flexflow_sgd_optimizer_t optimizer_) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); @@ -1584,39 +1619,83 @@ void flexflow_model_set_transformer_layer_id(flexflow_model_t handle_, int id) { void flexflow_model_generate(flexflow_model_t handle_, int num_requests, + enum RequestType *request_types, char const **input_texts, - int max_num_chars, char **output_texts, - int max_seq_length, - int **output_length_and_tokens) { + int *max_seq_lengths, + flexflow_peft_model_id_t *peft_model_ids, + char const **dataset_filepaths, + int 
*training_steps, + int **output_length_and_tokens, + int *num_finetuning_losses, + float *finetuning_losses) { FFModel *handle = FFCObjectWrapper::unwrap(handle_); - std::vector prompts; + std::vector requests; + for (int i = 0; i < num_requests; i++) { - std::string const text_str(input_texts[i]); - prompts.push_back(text_str); - DEBUG_PRINT("[Model] generate[%d] %p %s %i", - i, - handle, - text_str.c_str(), - max_seq_length); + if (request_types[i] == RequestType::REQ_INFERENCE) { + std::string const text_str(input_texts[i]); + Request inference_req; + inference_req.prompt = text_str; + inference_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + inference_req.peft_model_id = *peft_model_id; + } + requests.push_back(inference_req); + DEBUG_PRINT("[Model] generate[%d] %p %s %i", + i, + handle, + text_str.c_str(), + max_seq_lengths[i]); + } else if (request_types[i] == RequestType::REQ_FINETUNING) { + Request fine_tuning_req; + fine_tuning_req.req_type = RequestType::REQ_FINETUNING; + fine_tuning_req.max_sequence_length = max_seq_lengths[i]; + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(peft_model_ids[i]); + if (peft_model_id != nullptr) { + fine_tuning_req.peft_model_id = *peft_model_id; + } + std::string const dataset_fp(dataset_filepaths[i]); + fine_tuning_req.dataset_filepath = dataset_fp; + fine_tuning_req.max_training_steps = training_steps[i]; + requests.push_back(fine_tuning_req); + DEBUG_PRINT("[Model] finetune[%d] %p %s %i %i", + i, + handle, + dataset_fp.c_str(), + max_seq_lengths[i], + training_steps[i]); + } else { + assert(false && "Unknown request type"); + } } - std::vector results = - handle->generate(prompts, max_seq_length); - // If the prompt exceeds max seq len, check that we return the prompt with no - // additional token. Otherwise, check that the output does not exceed the max - // sequence length. + + std::vector results = handle->generate(requests); + for (int i = 0; i < num_requests; i++) { - assert(results[i].output_tokens.size() <= max_seq_length || - results[i].output_tokens.size() == results[i].input_tokens.size()); - output_length_and_tokens[i][0] = results[i].output_tokens.size(); - std::copy(results[i].output_tokens.begin(), - results[i].output_tokens.end(), - output_length_and_tokens[i] + 1); - std::memcpy(output_texts[i], - results[i].output_text.c_str(), - results[i].output_text.length()); + if (request_types[i] == RequestType::REQ_INFERENCE) { + // If the prompt exceeds max seq len, check that we return the prompt with + // no additional token. Otherwise, check that the output does not exceed + // the max sequence length. 
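+      // Inference result handling (descriptive note, not part of the original patch): the token
+      // count is written to output_length_and_tokens[i][0], the token ids follow at offset 1, and
+      // the generated text is copied into output_texts[i]; these buffers appear to be caller-provided.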
+ assert(results[i].output_tokens.size() <= max_seq_lengths[i] || + results[i].output_tokens.size() == results[i].input_tokens.size()); + output_length_and_tokens[i][0] = results[i].output_tokens.size(); + std::copy(results[i].output_tokens.begin(), + results[i].output_tokens.end(), + output_length_and_tokens[i] + 1); + std::memcpy(output_texts[i], + results[i].output_text.c_str(), + results[i].output_text.length()); + } else if (request_types[i] == RequestType::REQ_FINETUNING) { + assert(results[i].finetuning_losses.size() > 0); + *num_finetuning_losses = results[i].finetuning_losses.size(); + // *finetuning_losses = results[i].finetuning_losses.data(); + std::memcpy(finetuning_losses, + results[i].finetuning_losses.data(), + results[i].finetuning_losses.size() * sizeof(float)); + } } - // return FFCObjectWrapper::wrap(&results[0]); } void flexflow_model_set_position_offset(flexflow_model_t handle_, @@ -2597,6 +2676,14 @@ void flexflow_request_manager_set_max_sequence_length( DEBUG_PRINT("[RequestManager] set max_sequence_length %d", max_seq_length); } +void flexflow_request_manager_set_enable_peft_finetuning( + flexflow_request_manager_t handle_, bool enable_peft_finetuning_) { + RequestManager *handle = FFCObjectWrapper::unwrap(handle_); + handle->set_enable_peft_finetuning(enable_peft_finetuning_); + DEBUG_PRINT("[RequestManager] set_enable_peft_finetuning %d", + enable_peft_finetuning_); +} + void flexflow_request_manager_register_tokenizer( flexflow_request_manager_t handle_, enum ModelType model_type, @@ -2730,3 +2817,238 @@ void flexflow_file_data_loader_load_weights(flexflow_file_data_loader_t handle_, FFModel *model = FFCObjectWrapper::unwrap(model_handle_); handle->load_weights(model); } + +// // ----------------------------------------------------------------------- +// // LoraSGDOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_sgd_optimizer_config_t +// flexflow_lora_sgd_optimizer_config_create( +// double lr, double momentum, bool nesterov, bool weight_decay) { +// LoraSGDOptimizerConfig *handle = +// new LoraSGDOptimizerConfig(lr, momentum, nesterov, weight_decay); +// DEBUG_PRINT("[LoraSGDOptimizerConfig] new %p", handle); +// return FFCObjectWrapper::wrap(handle); +// } + +// void flexflow_lora_sgd_optimizer_config_destroy( +// flexflow_lora_sgd_optimizer_config_t handle_) { +// LoraSGDOptimizerConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[LoraSGDOptimizerConfig] delete %p", handle); +// delete handle; +// } + +// // ----------------------------------------------------------------------- +// // LoraAdamOptimizerConfig +// // ----------------------------------------------------------------------- + +// flexflow_lora_adam_optimizer_config_t +// flexflow_lora_adam_optimizer_config_create(double alpha, +// double beta1, +// double beta2, +// double weight_decay, +// double epsilon) { +// LoraAdamOptimizerConfig *handle = +// new LoraAdamOptimizerConfig(alpha, beta1, beta2, weight_decay, +// epsilon); +// DEBUG_PRINT("[LoraAdamOptimizerConfig] new %p", handle); +// return FFCObjectWrapper::wrap(handle); +// } + +// void flexflow_lora_adam_optimizer_config_destroy( +// flexflow_lora_adam_optimizer_config_t handle_) { +// LoraAdamOptimizerConfig *handle = FFCObjectWrapper::unwrap(handle_); +// DEBUG_PRINT("[LoraAdamOptimizerConfig] delete %p", handle); +// delete handle; +// } + +// ----------------------------------------------------------------------- +// LoraLinearConfig +// 
----------------------------------------------------------------------- + +flexflow_lora_linear_config_t + flexflow_lora_linear_config_create(char const *cache_folder_, + char const *peft_model_id_, + bool trainable, + bool init_lora_weights, + char const *base_model_name_or_path_, + char const *precision_, + int rank, + float lora_alpha, + float lora_dropout, + int num_target_modules, + char const **target_modules_, + enum OptimizerType optimizer_type, + float sgd_learning_rate, + float sgd_momentum, + bool sgd_nesterov, + float sgd_weight_decay, + float adam_alpha, + float adam_beta1, + float adam_beta2, + float adam_weight_decay, + float adam_epsilon) { + assert(cache_folder_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(peft_model_id_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(base_model_name_or_path_ != nullptr && + "Cannot convert nullptr char * to std::string"); + assert(precision_ != nullptr && + "Cannot convert nullptr char * to std::string"); + std::string const cache_folder(cache_folder_); + std::string const peft_model_id(peft_model_id_); + LoraOptimizerConfig *optim_config = nullptr; + if (optimizer_type == OptimizerType::OPTIMIZER_TYPE_SGD) { + optim_config = new LoraSGDOptimizerConfig( + sgd_learning_rate, sgd_momentum, sgd_nesterov, sgd_weight_decay); + } else if (optimizer_type == OptimizerType::OPTIMIZER_TYPE_ADAM) { + optim_config = new LoraAdamOptimizerConfig( + adam_alpha, adam_beta1, adam_beta2, adam_weight_decay, adam_epsilon); + } + std::vector target_modules; + for (int i = 0; i < num_target_modules; i++) { + std::string const target_module(target_modules_[i]); + target_modules.push_back(target_module); + } + std::string const base_model_name_or_path(base_model_name_or_path_); + std::string const precision(precision_); + LoraLinearConfig *handle = new LoraLinearConfig(cache_folder, + peft_model_id, + trainable, + optim_config, + init_lora_weights, + base_model_name_or_path, + precision, + rank, + lora_alpha, + lora_dropout, + target_modules); + DEBUG_PRINT("[LoraLinearConfig] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_lora_linear_config_destroy( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *peft_config = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[LoraLinearConfig] delete %p", peft_config); + delete peft_config; +} + +char const *flexflow_lora_linear_config_get_cache_folder( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->cache_folder.c_str(); +} + +char const *flexflow_lora_linear_config_get_peft_model_id( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->peft_model_id.c_str(); +} + +int flexflow_lora_linear_config_get_rank( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->rank; +} + +float flexflow_lora_linear_config_get_lora_alpha( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->lora_alpha; +} + +float flexflow_lora_linear_config_get_lora_dropout( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->lora_dropout; +} + +bool flexflow_lora_linear_config_get_trainable( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return 
handle->trainable; +} + +bool flexflow_lora_linear_config_get_init_lora_weights( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->init_lora_weights; +} + +char const **flexflow_lora_linear_config_get_target_modules( + flexflow_lora_linear_config_t handle_, int *num_target_modules) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + *num_target_modules = handle->target_modules.size(); + static std::vector target_modules_; + target_modules_.clear(); + for (auto const &target_module : handle->target_modules) { + target_modules_.push_back(target_module.c_str()); + } + return target_modules_.data(); +} + +char const *flexflow_lora_linear_config_get_base_model_name_or_path( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->base_model_name_or_path.c_str(); +} + +char const *flexflow_lora_linear_config_get_precision( + flexflow_lora_linear_config_t handle_) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + return handle->precision.c_str(); +} + +void flexflow_lora_linear_config_set_lora_alpha( + flexflow_lora_linear_config_t handle_, float value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->lora_alpha = value; +} + +void flexflow_lora_linear_config_set_lora_dropout( + flexflow_lora_linear_config_t handle_, float value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->lora_dropout = value; +} + +void flexflow_lora_linear_config_set_trainable( + flexflow_lora_linear_config_t handle_, bool value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->trainable = value; +} + +void flexflow_lora_linear_config_set_init_lora_weights( + flexflow_lora_linear_config_t handle_, bool value) { + LoraLinearConfig *handle = FFCObjectWrapper::unwrap(handle_); + handle->init_lora_weights = value; +} + +// ----------------------------------------------------------------------- +// PEFTModelID +// ----------------------------------------------------------------------- + +flexflow_peft_model_id_t flexflow_peft_model_id_create() { + PEFTModelID *handle = new PEFTModelID(); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +flexflow_peft_model_id_t flexflow_peft_model_id_create_id(size_t id) { + PEFTModelID *handle = new PEFTModelID(id); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +flexflow_peft_model_id_t flexflow_peft_model_id_no_id() { + PEFTModelID *handle = const_cast(&PEFTModelID::NO_ID); + DEBUG_PRINT("[PEFTModelID] new %p", handle); + return FFCObjectWrapper::wrap(handle); +} + +void flexflow_peft_model_id_destroy(flexflow_peft_model_id_t handle_) { + PEFTModelID *peft_model_id = FFCObjectWrapper::unwrap(handle_); + DEBUG_PRINT("[PEFTModelID] delete %p", peft_model_id); + delete peft_model_id; +} diff --git a/src/loss_functions/loss_functions.cpp b/src/loss_functions/loss_functions.cpp index a87aaade84..99c13f5a67 100644 --- a/src/loss_functions/loss_functions.cpp +++ b/src/loss_functions/loss_functions.cpp @@ -86,7 +86,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( num_classes, k); // Scale logit gradients by op->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -116,7 +116,7 @@ void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( 
label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -146,7 +146,7 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(logit_grad_volume), CUDA_NUM_THREADS, 0, @@ -173,7 +173,7 @@ void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor - hipLaunchKernelGGL(scale_kernel, + hipLaunchKernelGGL(scale_kernel, GET_BLOCKS(loss_grad_volume), CUDA_NUM_THREADS, 0, diff --git a/src/loss_functions/loss_functions.cu b/src/loss_functions/loss_functions.cu index f78311980c..636ef9c4c3 100644 --- a/src/loss_functions/loss_functions.cu +++ b/src/loss_functions/loss_functions.cu @@ -81,7 +81,7 @@ void Loss::sparse_categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, label_ptr, num_samples, num_classes, k); // Scale logit gradients by op->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor * k); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor * k); } void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( @@ -100,7 +100,7 @@ void Loss::categorical_crossentropy_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( @@ -119,7 +119,7 @@ void Loss::mean_squared_error_avg_loss_backward_kernel_wrapper( logit_grad_ptr, logit_ptr, label_ptr, logit_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - logit_grad_ptr, logit_grad_volume, 0, scale_factor); + logit_grad_ptr, logit_grad_volume, 0.0f, scale_factor); } void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, @@ -135,7 +135,7 @@ void Loss::identity_loss_backward_kernel_wrapper(float *loss_grad_ptr, stream>>>(loss_grad_ptr, loss_ptr, loss_volume); // Scale logit gradients by loss->scale_factor scale_kernel<<>>( - loss_grad_ptr, loss_grad_volume, 0, scale_factor); + loss_grad_ptr, loss_grad_volume, 0.0f, scale_factor); } }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cc b/src/ops/add_bias_residual_layer_norm.cc index a17e156f18..7a1da2e974 100644 --- a/src/ops/add_bias_residual_layer_norm.cc +++ b/src/ops/add_bias_residual_layer_norm.cc @@ -43,7 +43,8 @@ bool operator==(AddBiasResidualLayerNormParams const &lhs, AddBiasResidualLayerNormParams const &rhs) { return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && - lhs.use_bias == rhs.use_bias; + lhs.use_bias == rhs.use_bias && + lhs.inplace_residual == rhs.inplace_residual; } bool AddBiasResidualLayerNormParams::is_valid( @@ -58,7 +59,8 @@ AddBiasResidualLayerNormParams AddBiasResidualLayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -71,6 +73,7 @@ void 
FFModel::add_bias_residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -171,6 +174,7 @@ void FFModel::add_bias_residual_layer_norm(const Tensor input, ln->add_int_property("use_bias", use_bias); ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -189,6 +193,8 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( layer->get_int_vector_property("axes", axes); float eps; layer->get_float_property("eps", eps); + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new AddBiasResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -197,6 +203,7 @@ Op *AddBiasResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // allocate_weights layer->name); } @@ -215,6 +222,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -227,6 +235,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -239,7 +248,7 @@ AddBiasResidualLayerNorm::AddBiasResidualLayerNorm( _input, _residual), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias) { + use_bias(_use_bias), inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -348,48 +357,57 @@ void AddBiasResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } // attn output - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + // added: attn_output + attn final bias + residual + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -397,7 +415,7 @@ void AddBiasResidualLayerNorm::init_inference( READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -420,48 +438,56 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - // attn output - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } + // input: attn output + // added: attn_output + attn final bias + residual + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); // attn final bias launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (use_bias) { launcher.add_region_requirement(RegionRequirement(weights[2]->part, @@ -469,7 +495,7 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } FutureMap fm = runtime->execute_index_space(ctx, launcher); @@ -478,13 +504,11 @@ void AddBiasResidualLayerNorm::init(FFModel const &ff) { } /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final attn bias - regions[5](I): gamma - regions[6](I): beta + regions[0](I/O): attn output AND added output (attn output + final attn bias + + residual) regions[1](I): residual regions[2](O): layer norm output + regions[3](I): final attn bias + regions[4](I): gamma + regions[5](I): beta */ OpMeta *AddBiasResidualLayerNorm::init_task( Task const *task, @@ -517,10 +541,6 @@ void AddBiasResidualLayerNorm::forward(FFModel const &ff) { assert(false); } -void AddBiasResidualLayerNorm::backward(FFModel const &ff) { - assert(false); -} - FutureMap AddBiasResidualLayerNorm::inference( FFModel const &ff, BatchConfigFuture const &bc, @@ -546,69 +566,94 @@ FutureMap AddBiasResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - // attn output - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + // input + // added_output: input + attn bias + residual + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + weights[0]->region)); + launcher.add_field(fid++, FID_DATA); // residual launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - // added: attn_output + attn final bias + residual - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); - // layer norm output + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } + // output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); - // attn final bias - launcher.add_region_requirement(RegionRequirement(weights[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); if (elementwise_affine) { + // gamma launcher.add_region_requirement(RegionRequirement(weights[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[1]->region)); - launcher.add_field(5, FID_DATA); - + launcher.add_field(fid++, FID_DATA); if (use_bias) { + // beta launcher.add_region_requirement(RegionRequirement(weights[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[2]->region)); - launcher.add_field(6, FID_DATA); + launcher.add_field(fid++, FID_DATA); } } return runtime->execute_index_space(ctx, launcher); } +void AddBiasResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + /* - regions[0](I): attn output - regions[1](I): residual - regions[2](O): added output (attn output + final attn bias + residual) - regions[3](O): layer norm output - regions[4](I): final attn bias - regions[5](I): gamma - regions[6](I): beta + regions[0](I): input / added output + regions[1](I): attn bias + regions[2](I): residual + regions[3](O): output + regions[4](I): gamma + regions[5](I): beta */ void AddBiasResidualLayerNorm::inference_task( Task const *task, @@ -626,30 +671,72 @@ void AddBiasResidualLayerNorm::inference_task( *((AddBiasResidualLayerNormMeta **)task->local_args); assert(regions.size() == - 5 + (m->elementwise_affine ? (m->use_bias ? 
2 : 1) : 0)); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR residual = helperGetGenericTensorAccessorRO( - m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW added_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorR attn_bias = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + 4 + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); + + int rid = 0, tid = 0, did = 0; + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(m->input_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR attn_bias = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR residual = + helperGetGenericTensorAccessorRO(m->input_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorW output = + helperGetGenericTensorAccessorWO(m->output_type[1], + regions[rid++], + task->regions[tid++], + FID_DATA, + ctx, + runtime); GenericTensorAccessorR gamma, beta; Domain in_domain = runtime->get_index_space_domain( - ctx, task->regions[0].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain attn_bias_domain = runtime->get_index_space_domain( + ctx, task->regions[did++].region.get_index_space()); Domain residual_domain = runtime->get_index_space_domain( - ctx, task->regions[1].region.get_index_space()); - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[2].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[did++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( - ctx, task->regions[3].region.get_index_space()); - Domain attn_bias_domain = runtime->get_index_space_domain( - ctx, task->regions[4].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); + Domain gamma_domain, beta_domain; assert(in_domain.get_volume() == out_domain.get_volume()); @@ -673,23 +760,23 @@ void AddBiasResidualLayerNorm::inference_task( if (m->elementwise_affine) { gamma = helperGetGenericTensorAccessorRO(m->weight_type[1], - regions[5], - task->regions[5], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); gamma_domain = runtime->get_index_space_domain( - ctx, task->regions[5].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); if (m->use_bias) { beta = 
helperGetGenericTensorAccessorRO(m->weight_type[2], - regions[6], - task->regions[6], + regions[rid++], + task->regions[tid++], FID_DATA, ctx, runtime); beta_domain = runtime->get_index_space_domain( - ctx, task->regions[6].region.get_index_space()); + ctx, task->regions[did++].region.get_index_space()); assert(gamma_domain == beta_domain); } @@ -707,16 +794,7 @@ void AddBiasResidualLayerNorm::inference_task( } AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - (int)attn_bias_dim, - (int)residual_domain.get_volume(), - input, - added_output, - output, - residual, - attn_bias, - gamma, - beta); + m, bc, input, attn_bias, residual, added_output, output, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -729,13 +807,299 @@ void AddBiasResidualLayerNorm::inference_task( weights_accessors.push_back(beta); } } + AddBiasResidualLayerNorm::save_inference_tensors_to_file( + m, shard_id, bc, {residual}, weights_accessors, {added_output, output}); + } +} + +void AddBiasResidualLayerNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // attn bias + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::backward_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 5 + 
(m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW attn_bias_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + AddBiasResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual_grad, + attn_bias_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap AddBiasResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int field_id = 0; + // output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void AddBiasResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(task->regions.size() == regions.size()); + AddBiasResidualLayerNormMeta *m = + *((AddBiasResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 3 + m->elementwise_affine); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } AddBiasResidualLayerNorm::save_inference_tensors_to_file( m, shard_id, bc, - {input, residual}, + {input_grad, residual_grad}, weights_accessors, - {added_output, output}); + {output_grad}, + false /*fwd_pass*/); } } @@ -755,6 +1119,7 @@ void AddBiasResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->elementwise_affine); sez.serialize(this->eps); sez.serialize(this->use_bias); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -771,6 +1136,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; float eps; + bool inplace_residual; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); dez.deserialize(transformer_layer_id); @@ -785,6 +1151,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(elementwise_affine); dez.deserialize(eps); dez.deserialize(use_bias); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -796,6 +1163,7 @@ Node AddBiasResidualLayerNorm::deserialize(FFModel &ff, params.elementwise_affine = elementwise_affine; params.eps = eps; params.use_bias = use_bias; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); @@ -816,6 +1184,7 @@ size_t hash::operator()( } hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); + 
hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/add_bias_residual_layer_norm.cpp b/src/ops/add_bias_residual_layer_norm.cpp index 1add43ecd9..681f55c998 100644 --- a/src/ops/add_bias_residual_layer_norm.cpp +++ b/src/ops/add_bias_residual_layer_norm.cpp @@ -23,12 +23,13 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, AddBiasResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -45,6 +46,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { @@ -75,7 +77,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -84,9 +86,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -94,53 +94,36 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { } template -__global__ void LayerNormFusedForwardKernel(int attn_bias_dim, - int residual_volume, - int64_t effective_num_elements, - int64_t effective_batch_size, +__global__ void LayerNormFusedForwardKernel(int64_t N, + int64_t attn_bias_dim, float eps, T const *input_ptr, T const *attn_bias_ptr, T const *residual_ptr, - T *added_output_ptr, - T *output_ptr, - T const *gamma_ptr, - T const *beta_ptr, + T *X, T *mean, - T *rstd) { - // Add attention bias and residual - CUDA_KERNEL_LOOP(i, residual_volume) { - int bias_idx = i % attn_bias_dim; - added_output_ptr[i] = - input_ptr[i] + attn_bias_ptr[bias_idx] + residual_ptr[i]; - } - - __syncthreads(); - - // LayerNorm + T *rstd, + T const *gamma, + T const *beta, + T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; - if (i >= effective_batch_size) { - return; - } float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < effective_num_elements; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { - const int64_t index = i * effective_num_elements + j; - sum1 += static_cast(added_output_ptr[index]); - sum2 += static_cast(added_output_ptr[index]) * - static_cast(added_output_ptr[index]); - } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const int64_t bias_idx = index % attn_bias_dim; 
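+    // Fused residual add (descriptive note, not part of the original patch): the attention bias is
+    // broadcast along each row (bias_idx wraps modulo attn_bias_dim); the sum is stored in X, which
+    // doubles as the added_output buffer and feeds the mean/rstd reduction below.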
+ X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; + sum1 += static_cast(X[index]); + sum2 += static_cast(X[index]) * static_cast(X[index]); } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(effective_num_elements); + float const scale = float(1) / static_cast(N); sum1 *= scale; sum2 = max(sum2 * scale - sum1 * sum1, float(0)); mean[i] = static_cast(sum1); @@ -150,17 +133,15 @@ __global__ void LayerNormFusedForwardKernel(int attn_bias_dim, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < effective_num_elements; - j += min(blockDim.x, kCUDANumThreads)) { - const int64_t index = i * effective_num_elements + j; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; const T_ACC gamma_v = - gamma_ptr == nullptr ? T_ACC(1) : static_cast(gamma_ptr[j]); + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); const T_ACC beta_v = - beta_ptr == nullptr ? T_ACC(0) : static_cast(beta_ptr[j]); - output_ptr[index] = (static_cast(added_output_ptr[index]) - - static_cast(mean[i])) * - static_cast(rstd[i]) * gamma_v + - beta_v; + beta == nullptr ? T_ACC(0) : static_cast(beta[j]); + Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * + static_cast(rstd[i]) * gamma_v + + beta_v; } } @@ -178,57 +159,108 @@ void AddBiasResidualLayerNorm::inference_kernel( T const *gamma_ptr, T const *beta_ptr, hipStream_t stream) { - - std::pair kernel1_parallelism = std::make_pair( - GET_BLOCKS(residual_volume), std::min(residual_volume, CUDA_NUM_THREADS)); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel3_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = std::max({kernel1_parallelism.first, - kernel2_parallelism.first, - kernel3_parallelism.first}); - int num_threads = std::max({kernel1_parallelism.second, - kernel2_parallelism.second, - kernel3_parallelism.second}); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormFusedForwardKernel), - num_blocks, - num_threads, + m->effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), 0, stream, - attn_bias_dim, - residual_volume, m->effective_num_elements, - m->effective_batch_size, + attn_bias_dim, m->eps, input_ptr, attn_bias_ptr, residual_ptr, added_output_ptr, - output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), gamma_ptr, beta_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); + output_ptr); } /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that 
requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { AddBiasResidualLayerNorm::inference_kernel( m, @@ -239,8 +271,8 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( residual.get_float_ptr(), added_output.get_float_ptr(), output.get_float_ptr(), - gamma.get_float_ptr(), - m->use_bias ? beta.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, stream); } else if (m->input_type[0] == DT_HALF) { AddBiasResidualLayerNorm::inference_kernel( @@ -252,12 +284,566 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( residual.get_half_ptr(), added_output.get_half_ptr(), output.get_half_ptr(), - gamma.get_half_ptr(), - m->use_bias ? beta.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, stream); } else { assert(false && "unsupport datatype in layernorm"); } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] forward time (CF) = %.9fms\n", elapsed); + // if (m->input_type[0] == DT_FLOAT) { + // print_tensor(input.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(attn_bias.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:attn_bias]"); + // print_tensor(residual.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:residual]"); + // print_tensor(added_output.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:added_output]"); + // print_tensor(output.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + // print_tensor(gamma.get_float_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:gamma]"); + // print_tensor( + // beta.get_float_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:beta]"); + // } else { + // print_tensor( + // input.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(attn_bias.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:attn_bias]"); + // print_tensor(residual.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:residual]"); + // print_tensor(added_output.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:added_output]"); + // print_tensor(output.get_half_ptr(), + // 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + // print_tensor( + // gamma.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:gamma]"); + // print_tensor( + // beta.get_half_ptr(), 32, + // "[AddBiasResidualLayerNorm:forward:beta]"); + // } + // print_tensor(in_ptr, 32, "[AddBiasResidualLayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, + // "[AddBiasResidualLayerNorm:forward:output]"); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual_i = dX_residual + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual, + reset_input_grad, + reset_residual_grad, + N, + buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + 
GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[AddBiasResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } } }; // namespace FlexFlow diff --git a/src/ops/add_bias_residual_layer_norm.cu b/src/ops/add_bias_residual_layer_norm.cu index ceb1a6514e..bcca1ba2c6 100644 --- a/src/ops/add_bias_residual_layer_norm.cu +++ b/src/ops/add_bias_residual_layer_norm.cu @@ -22,12 +22,13 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( FFHandler handle, AddBiasResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -44,6 +45,7 @@ AddBiasResidualLayerNormMeta::AddBiasResidualLayerNormMeta( data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } AddBiasResidualLayerNormMeta::~AddBiasResidualLayerNormMeta(void) { @@ -74,7 +76,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -83,9 +85,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -110,20 +110,17 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const int64_t bias_idx = index % attn_bias_dim; X[index] = input_ptr[index] + attn_bias_ptr[bias_idx] + residual_ptr[index]; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -135,7 +132,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -161,42 +158,33 @@ void AddBiasResidualLayerNorm::inference_kernel( T const *gamma_ptr, T const *beta_ptr, cudaStream_t stream) { - - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - attn_bias_dim, - m->eps, - input_ptr, - attn_bias_ptr, - residual_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + attn_bias_dim, + m->eps, + input_ptr, + attn_bias_ptr, + residual_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); } /*static*/ void AddBiasResidualLayerNorm::inference_kernel_wrapper( - AddBiasResidualLayerNormMeta const *m, - int attn_bias_dim, - int residual_volume, + AddBiasResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, + GenericTensorAccessorR const &attn_bias, + GenericTensorAccessorR const &residual, GenericTensorAccessorW &added_output, GenericTensorAccessorW &output, - GenericTensorAccessorR const &residual, - GenericTensorAccessorR const &attn_bias, GenericTensorAccessorR const &gamma, GenericTensorAccessorR const &beta) { cudaStream_t stream; @@ -208,6 +196,69 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { 
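// The DT_HALF branch below mirrors the DT_FLOAT case above: for the single
// request with peft_bwd set, the freshly computed added_output rows in
// [first_token_offset, first_token_offset + num_peft_tokens) are copied into
// m->input_activation, which peft_bwd_kernel later reads back as the saved X
// (together with the cached mean/rstd) when producing input gradients.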
+ checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + // inference kernel + int attn_bias_dim = attn_bias.domain.hi()[0] - attn_bias.domain.lo()[0] + 1; + int residual_volume = residual.domain.get_volume(); if (m->input_type[0] == DT_FLOAT) { AddBiasResidualLayerNorm::inference_kernel( m, @@ -297,4 +348,478 @@ void AddBiasResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual_i = dX_residual + i1 * N; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad) { + dX_residual_i[l] = f_grad_input; + } else { + dX_residual_i[l] += f_grad_input; + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual, + bool reset_input_grad, + bool reset_residual_grad, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual, + reset_input_grad, + reset_residual_grad, + N, + buf); +} + +/*static*/ +template +void AddBiasResidualLayerNorm::backward_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T *attn_bias_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void AddBiasResidualLayerNorm::backward_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR &added_output, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorW const &attn_bias_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + 
cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + attn_bias_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + AddBiasResidualLayerNorm::backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + attn_bias_grad.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void AddBiasResidualLayerNorm::peft_bwd_kernel( + AddBiasResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + N); +} + +/*static*/ +void AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + AddBiasResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW &input_grad, + GenericTensorAccessorW const &residual_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual_grad.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual_grad.get_half_ptr(), + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[AddBiasResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/aggregate.cc b/src/ops/aggregate.cc index 5f05458e34..c83b738a0e 100644 --- a/src/ops/aggregate.cc +++ b/src/ops/aggregate.cc @@ -85,7 +85,7 @@ AggregateParams Aggregate::get_params() const { AggregateParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -242,7 +242,7 @@ OpMeta *Aggregate::init_task(Task const *task, Runtime *runtime) { Aggregate *agg = (Aggregate *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateMeta *m = new AggregateMeta(handle, agg->n); + AggregateMeta *m = new AggregateMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -603,7 +603,7 @@ bool Aggregate::measure_operator_cost(Simulator *sim, return false; } - AggregateMeta *m = new AggregateMeta(sim->handler, n); + AggregateMeta *m = new AggregateMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate.cpp b/src/ops/aggregate.cpp index d5ebdb0c22..5a508cfac4 100644 --- a/src/ops/aggregate.cpp +++ b/src/ops/aggregate.cpp @@ -281,13 +281,14 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, out_dim); } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(hipMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(hipMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(hipFree(&dev_exp_preds)); checkCUDA(hipFree(&dev_exp_grads)); } -}; // namespace FlexFlow \ No newline at end of file +}; // namespace FlexFlow diff --git a/src/ops/aggregate.cu b/src/ops/aggregate.cu index 38e141b252..9704302092 100644 --- a/src/ops/aggregate.cu +++ b/src/ops/aggregate.cu @@ -307,9 +307,10 @@ void Aggregate::backward_kernel_wrapper(AggregateMeta const *m, } } -AggregateMeta::AggregateMeta(FFHandler handler, int n) : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_exp_preds, n * sizeof(float *))); - checkCUDA(cudaMalloc(&dev_exp_grads, n * sizeof(float *))); +AggregateMeta::AggregateMeta(FFHandler handler, Aggregate const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_exp_preds, aggr->n * sizeof(float *))); + checkCUDA(cudaMalloc(&dev_exp_grads, aggr->n * sizeof(float *))); } AggregateMeta::~AggregateMeta(void) { checkCUDA(cudaFree(&dev_exp_preds)); diff --git a/src/ops/aggregate_spec.cc b/src/ops/aggregate_spec.cc index 1edd430881..6ea3ff3747 100644 --- a/src/ops/aggregate_spec.cc +++ b/src/ops/aggregate_spec.cc @@ -84,7 +84,7 @@ AggregateSpecParams AggregateSpec::get_params() const { AggregateSpecParams params; params.n = this->n; params.lambda_bal = this->lambda_bal; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { 
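// As in the other get_params() changes in this patch, the nullptr check is
// replaced by a bounds check: the copy into params.name only happens when the
// stored operator name fits within MAX_OPNAME.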
strcpy(params.name, this->name); } return params; @@ -210,7 +210,7 @@ OpMeta *AggregateSpec::init_task(Task const *task, Runtime *runtime) { AggregateSpec *agg = (AggregateSpec *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg->n); + AggregateSpecMeta *m = new AggregateSpecMeta(handle, agg); m->profiling = agg->profiling; m->inference_debugging = agg->inference_debugging; std::strcpy(m->op_name, agg->name); @@ -543,7 +543,7 @@ bool AggregateSpec::measure_operator_cost(Simulator *sim, return false; } - AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, n); + AggregateSpecMeta *m = new AggregateSpecMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/aggregate_spec.cpp b/src/ops/aggregate_spec.cpp index 314e20a59c..a676fa81c3 100644 --- a/src/ops/aggregate_spec.cpp +++ b/src/ops/aggregate_spec.cpp @@ -290,9 +290,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(hipMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/aggregate_spec.cu b/src/ops/aggregate_spec.cu index 8d50d45d21..ac5a372efc 100644 --- a/src/ops/aggregate_spec.cu +++ b/src/ops/aggregate_spec.cu @@ -287,9 +287,10 @@ void AggregateSpec::backward_kernel_wrapper(AggregateSpecMeta const *m, out_dim); } -AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, int n) - : OpMeta(handler) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +AggregateSpecMeta::AggregateSpecMeta(FFHandler handler, + AggregateSpec const *aggr) + : OpMeta(handler, aggr) { + checkCUDA(cudaMalloc(&dev_region_ptrs, aggr->n * sizeof(float *))); } AggregateSpecMeta::~AggregateSpecMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/arg_topk.cc b/src/ops/arg_topk.cc index 780a77450e..534bac2419 100644 --- a/src/ops/arg_topk.cc +++ b/src/ops/arg_topk.cc @@ -112,7 +112,7 @@ ArgTopKParams ArgTopK::get_params() const { params.k = this->k; params.sorted = this->sorted; params.speculative_decoding = this->speculative_decoding; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -387,7 +387,7 @@ InferenceResult DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW probs; - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); ArgTopK::forward_kernel_wrapper( m, input, probs, indices, batch_size, nullptr); @@ -399,7 +399,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } @@ -431,9 +431,10 @@ BeamInferenceResult ArgTopK::inference_speculative_task( ArgTopK::forward_kernel_wrapper(m, input, probs, indices, batch_size, &bc); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size * m->k); - download_tensor(probs.get_float_ptr(), ir.probs, batch_size * m->k); + copy_tensor_dev_to_host( + probs.get_float_ptr(), ir.probs, batch_size * m->k); return ir; } diff --git a/src/ops/argmax.cc b/src/ops/argmax.cc index 
1892ac2353..4123e50e7e 100644 --- a/src/ops/argmax.cc +++ b/src/ops/argmax.cc @@ -91,7 +91,7 @@ Op *ArgMax::create_operator_from_layer( ArgMaxParams ArgMax::get_params() const { ArgMaxParams params; params.beam_search = this->beam_search; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -314,7 +314,7 @@ FutureMap ArgMax::inference(FFModel const &ff, launcher.add_future(bc); launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, batch_inputs[0]->region)); launcher.add_field(0, FID_DATA); @@ -348,15 +348,18 @@ BeamInferenceResult m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); GenericTensorAccessorW parent = helperGetGenericTensorAccessorWO( DT_INT32, regions[2], task->regions[2], FID_DATA, ctx, runtime); - ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + float loss = 0.0f; + ArgMax::forward_kernel_wrapper( + m, bc, input, indices, parent, batch_size, &loss); BeamInferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); - download_tensor(m->probs, ir.probs, batch_size); - download_tensor(parent.get_int32_ptr(), ir.parent_id, batch_size); + copy_tensor_dev_to_host(m->probs, ir.probs, batch_size); + copy_tensor_dev_to_host( + parent.get_int32_ptr(), ir.parent_id, batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -383,23 +386,36 @@ InferenceResult return ir; } - GenericTensorAccessorW input = helperGetGenericTensorAccessorRW( + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorW parent; - int batch_size = bc->num_active_tokens(); - ArgMax::forward_kernel_wrapper(m, input, indices, parent, batch_size); + int batch_size = bc->num_active_infr_tokens(); + float loss = 0.0f; + + ArgMax::forward_kernel_wrapper( + m, bc, input, indices, parent, batch_size, &loss); + InferenceResult ir; + ir.finetuning_loss = loss; + + if (bc->num_active_peft_tokens() > 0) { + printf("Loss: %.4f\n", loss); + } + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; ArgMax::save_inference_tensors_to_file( - m, shard_id, bc, {}, {}, {input, indices}); + m, shard_id, bc, {input}, {}, {indices}); + } else { + m->decoding_step++; } - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); + return ir; } @@ -453,4 +469,4 @@ size_t hash::operator()( hash_combine(key, params.beam_search); return key; } -}; // namespace std \ No newline at end of file +}; // namespace std diff --git a/src/ops/argmax.cpp b/src/ops/argmax.cpp index 8a1cf0b3b0..60d44cdf2b 100644 --- a/src/ops/argmax.cpp +++ b/src/ops/argmax.cpp @@ -334,6 +334,21 @@ __device__ void mergeShards(int num_shards, } } +template +__global__ void compute_sparse_categorical_crossentropy_loss( + DT const *logits, + BatchConfig::TokenId const *labels, + float *loss, + int num_tokens, + int num_classes) { + float const 
LOG_MIN_VALUE = 0.00000001f; + CUDA_KERNEL_LOOP(b, num_tokens) { + float my_logit = + max((float)logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(loss, -log(my_logit)); + } +} + template __global__ void argmax_forward_kernel(T const *__restrict__ input, size_t shared_memory_size, @@ -381,14 +396,16 @@ __global__ void copy_result(hipcub::KeyValuePair *d_out, /*static*/ template void ArgMax::forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent, int const length, int const batch_size, + float *loss, hipStream_t stream) { - checkCUDA(get_legion_stream(&stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); if (m->beam_search) { @@ -425,28 +442,77 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, k, prob_ptr, indices_ptr); + + // compute cross-entropy loss if there is a finetuning request + assert(loss != nullptr); + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int num_finetuning_requests = 0, num_bwd_tokens = 0; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_bwd) { + assert(num_finetuning_requests == 0 && num_bwd_tokens == 0); + num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = + bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + num_finetuning_requests += 1; + } else { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_finetuning_requests <= 1); + if (num_bwd_tokens > 0) { + checkCUDA(hipMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + hipMemcpyHostToDevice, + stream)); + // copy loss to d_loss + checkCUDA(hipMemsetAsync(m->d_loss, 0, sizeof(float), stream)); + compute_sparse_categorical_crossentropy_loss<<>>( + input_ptr, + static_cast(m->handle.workSpace), + m->d_loss, + num_bwd_tokens, + length); + // copy value from d_loss to loss + checkCUDA(hipMemcpyAsync( + loss, m->d_loss, sizeof(float), hipMemcpyDeviceToHost, stream)); + *loss = *loss / (float)num_bwd_tokens; + } } /*static*/ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size) { + int batch_size, + float *loss) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - int length = input.domain.hi()[0] - input.domain.lo()[0] + 1; if (input.data_type == DT_HALF) { ArgMax::forward_kernel(m, + bc, input.get_half_ptr(), indices.get_int32_ptr(), m->probs, @@ -454,10 +520,12 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else if (input.data_type == DT_FLOAT) { ArgMax::forward_kernel(m, + bc, input.get_float_ptr(), indices.get_int32_ptr(), m->probs, @@ -465,6 +533,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else { assert(false && "Unsupported data type"); diff --git a/src/ops/argmax.cu b/src/ops/argmax.cu index 
05c84719c1..8a2e2da2d0 100644 --- a/src/ops/argmax.cu +++ b/src/ops/argmax.cu @@ -44,19 +44,35 @@ __global__ void copy_result(cub::KeyValuePair *d_out, } } +template +__global__ void compute_sparse_categorical_crossentropy_loss( + DT const *logits, + BatchConfig::TokenId const *labels, + float *loss, + int num_tokens, + int num_classes) { + float const LOG_MIN_VALUE = 0.00000001f; + CUDA_KERNEL_LOOP(b, num_tokens) { + float my_logit = + max((float)logits[b * num_classes + labels[b]], LOG_MIN_VALUE); + atomicAdd(loss, -log(my_logit)); + } +} + /*static*/ template void ArgMax::forward_kernel(ArgMaxMeta const *m, - DT *input_ptr, + BatchConfig const *bc, + DT const *input_ptr, int *indices_ptr, float *prob_ptr, int *parent, int const length, int const batch_size, + float *loss, cudaStream_t stream) { - checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; + if (m->beam_search) { // set all parents id zero in arg top1 case. checkCUDA(cudaMemsetAsync(parent, 0, batch_size * sizeof(int), stream)); @@ -73,7 +89,7 @@ void ArgMax::forward_kernel(ArgMaxMeta const *m, m->d_offsets + 1, stream)); - // copy dout to incides + // copy dout to indices int parallelism = batch_size; copy_result<<beam_search); // print_tensor(indices_ptr, 32, "argmax op"); + + // compute cross-entropy loss if there is a finetuning request + assert(loss != nullptr); + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int num_finetuning_requests = 0, num_bwd_tokens = 0; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_bwd) { + assert(num_finetuning_requests == 0 && num_bwd_tokens == 0); + num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = + bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + num_finetuning_requests += 1; + } else { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + } + } + assert(num_finetuning_requests <= 1); + if (num_bwd_tokens > 0) { + checkCUDA(cudaMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + cudaMemcpyHostToDevice, + stream)); + // copy loss to d_loss + checkCUDA(cudaMemsetAsync(m->d_loss, 0, sizeof(float), stream)); + compute_sparse_categorical_crossentropy_loss<<>>( + input_ptr, + static_cast(m->handle.workSpace), + m->d_loss, + num_bwd_tokens, + length); + // copy value from d_loss to loss + checkCUDA(cudaMemcpyAsync( + loss, m->d_loss, sizeof(float), cudaMemcpyDeviceToHost, stream)); + *loss = *loss / (float)num_bwd_tokens; + } } /*static*/ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, - GenericTensorAccessorW const &input, + BatchConfig const *bc, + GenericTensorAccessorR const &input, GenericTensorAccessorW const &indices, GenericTensorAccessorW const &parent, - int batch_size) { + int batch_size, + float *loss) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -104,6 +170,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, if (input.data_type == DT_HALF) { ArgMax::forward_kernel(m, + bc, input.get_half_ptr(), indices.get_int32_ptr(), m->probs, @@ -111,10 +178,12 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else if (input.data_type == DT_FLOAT) { 
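// Same dispatch as the DT_HALF branch above: the wrapper now threads the
// BatchConfig and a host-side loss pointer through to forward_kernel, which
// fills in the loss only when the batch contains a finetuning (peft_bwd)
// request.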
ArgMax::forward_kernel(m, + bc, input.get_float_ptr(), indices.get_int32_ptr(), m->probs, @@ -122,6 +191,7 @@ void ArgMax::forward_kernel_wrapper(ArgMaxMeta const *m, : nullptr, length, batch_size, + loss, stream); } else { assert(false && "Unsupported data type"); @@ -202,6 +272,10 @@ ArgMaxMeta::ArgMaxMeta(FFHandler handler, gpu_mem_allocator.create_legion_instance(reserveInst, temp_storage_bytes); d_temp_storage = gpu_mem_allocator.allocate_instance_untyped(temp_storage_bytes); + + // allocate space for loss on device + gpu_mem_allocator.create_legion_instance(reserveInst, sizeof(float)); + d_loss = gpu_mem_allocator.allocate_instance(1); } ArgMaxMeta::~ArgMaxMeta(void) { diff --git a/src/ops/attention.cc b/src/ops/attention.cc index 203662d3ec..aef4f0a16a 100644 --- a/src/ops/attention.cc +++ b/src/ops/attention.cc @@ -1010,7 +1010,7 @@ MultiHeadAttentionParams MultiHeadAttention::get_params() const { params.bias = this->bias; params.add_bias_kv = this->add_bias_kv; params.add_zero_attn = this->add_zero_attn; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/attention.cpp b/src/ops/attention.cpp index ee7f87a7fb..10655a4a1a 100644 --- a/src/ops/attention.cpp +++ b/src/ops/attention.cpp @@ -156,7 +156,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); diff --git a/src/ops/attention.cu b/src/ops/attention.cu index 18fc810aed..4c460cdbbf 100644 --- a/src/ops/attention.cu +++ b/src/ops/attention.cu @@ -194,7 +194,7 @@ MultiHeadAttentionMeta::MultiHeadAttentionMeta(FFHandler handler, Memory gpu_mem, int num_samples, int num_heads) - : OpMeta(handler) { + : OpMeta(handler, attn) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); diff --git a/src/ops/batch_matmul.cc b/src/ops/batch_matmul.cc index e13169f6c1..e5f0611fb0 100644 --- a/src/ops/batch_matmul.cc +++ b/src/ops/batch_matmul.cc @@ -279,7 +279,7 @@ OpMeta *BatchMatmul::init_task(Task const *task, Runtime *runtime) { BatchMatmul const *bmm = (BatchMatmul *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - BatchMatmulMeta *m = new BatchMatmulMeta(handle); + BatchMatmulMeta *m = new BatchMatmulMeta(handle, bmm); m->profiling = bmm->profiling; m->inference_debugging = bmm->inference_debugging; m->a_seq_length_dim = bmm->a_seq_length_dim; @@ -616,7 +616,7 @@ bool BatchMatmul::measure_operator_cost(Simulator *sim, batch *= sub_input0.dims[i].size; } - BatchMatmulMeta *meta = sim->batch_matmul_meta; + BatchMatmulMeta *meta = new BatchMatmulMeta(sim->handler, this); // allocate tensors in simulator sim->free_all(); diff --git a/src/ops/batch_norm.cpp b/src/ops/batch_norm.cpp index 7dee6fdaaf..5856f1dddf 100644 --- a/src/ops/batch_norm.cpp +++ b/src/ops/batch_norm.cpp @@ -284,7 +284,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/batch_norm.cu b/src/ops/batch_norm.cu index 929ebf81f8..01e993067a 100644 --- a/src/ops/batch_norm.cu +++ b/src/ops/batch_norm.cu @@ -270,7 
+270,7 @@ BatchNormMeta::BatchNormMeta(FFHandler handler, int output_c, int output_h, int output_w) - : OpMeta(handler) { + : OpMeta(handler, bn) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/beam_topk.cc b/src/ops/beam_topk.cc index 5f4547ace5..36cc7fd8fa 100644 --- a/src/ops/beam_topk.cc +++ b/src/ops/beam_topk.cc @@ -375,7 +375,7 @@ BeamInferenceResult // embedding size: eg. 4096 int length = input_domain.hi()[0] - input_domain.lo()[0] + 1; // total token nums - size_t batch_size = bc.num_active_tokens(); + size_t batch_size = bc.num_active_infr_tokens(); // need meta for: how many sub requests in a main request BeamTopK::forward_kernel_wrapper(m, @@ -390,9 +390,11 @@ BeamInferenceResult BeamInferenceResult ir; - download_tensor(index_ptr, ir.token_ids, batch_size * m->max_beam_width); - download_tensor(value_ptr, ir.probs, batch_size * m->max_beam_width); - download_tensor( + copy_tensor_dev_to_host( + index_ptr, ir.token_ids, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( + value_ptr, ir.probs, batch_size * m->max_beam_width); + copy_tensor_dev_to_host( parent_ptr, ir.parent_id, batch_size * m->max_beam_width); if (m->inference_debugging) { diff --git a/src/ops/beam_topk.cpp b/src/ops/beam_topk.cpp index 8545bea7cb..5d80707ea7 100644 --- a/src/ops/beam_topk.cpp +++ b/src/ops/beam_topk.cpp @@ -681,7 +681,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/beam_topk.cu b/src/ops/beam_topk.cu index c24bdf7c74..bf4c23cad0 100644 --- a/src/ops/beam_topk.cu +++ b/src/ops/beam_topk.cu @@ -723,7 +723,7 @@ void BeamTopK::forward_kernel_wrapper(BeamTopKMeta const *m, BeamTopKMeta::BeamTopKMeta(FFHandler handler, Op const *op, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handler) { + : OpMeta(handler, op) { DataType data_type = op->inputs[0]->data_type; int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); int max_requests_per_batch = BatchConfig::max_requests_per_batch(); diff --git a/src/ops/cache.cc b/src/ops/cache.cc index 691e45b559..33b862ae85 100644 --- a/src/ops/cache.cc +++ b/src/ops/cache.cc @@ -165,7 +165,7 @@ OpMeta *Cache::init_task(Task const *task, Runtime *runtime) { Cache *c = (Cache *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - CacheMeta *m = new CacheMeta(handle); + CacheMeta *m = new CacheMeta(handle, c); m->cache_score = 0.0f; m->profiling = c->profiling; m->inference_debugging = c->inference_debugging; diff --git a/src/ops/cache.cpp b/src/ops/cache.cpp index 95c5995f9e..a9512c2c59 100644 --- a/src/ops/cache.cpp +++ b/src/ops/cache.cpp @@ -75,7 +75,7 @@ float Cache::cache_update(Task const *task, return cache_score; } -CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cache.cu b/src/ops/cache.cu index a113e57a1c..2f95e59669 100644 --- a/src/ops/cache.cu +++ b/src/ops/cache.cu @@ -74,7 +74,7 @@ float Cache::cache_update(Task const *task, return cache_score; } 
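// Illustrative aside (assumed, not the actual FlexFlow helper): the patch replaces
// download_tensor with copy_tensor_dev_to_host in beam_topk.cc and experts.cu. Judging
// only from the call sites visible in this diff, the helper is used in two shapes: one
// that copies into a caller-provided host buffer and reports success, and one that
// returns a freshly allocated host copy. A hypothetical sketch consistent with those
// call sites (the real signatures live elsewhere in the repository):

#include <cstdlib>
#include <cuda_runtime.h>

// Copy into a caller-provided host buffer; report success.
template <typename T>
bool copy_tensor_dev_to_host(T const *dev_ptr, T *host_ptr, size_t num_elements) {
  return cudaMemcpy(host_ptr, dev_ptr, num_elements * sizeof(T),
                    cudaMemcpyDeviceToHost) == cudaSuccess;
}

// Allocate a host buffer, copy into it, and return it (caller frees).
template <typename T>
T *copy_tensor_dev_to_host(T const *dev_ptr, size_t num_elements) {
  T *host_ptr = static_cast<T *>(std::malloc(num_elements * sizeof(T)));
  if (host_ptr == nullptr ||
      !copy_tensor_dev_to_host(dev_ptr, host_ptr, num_elements)) {
    std::free(host_ptr);
    return nullptr;
  }
  return host_ptr;
}

int main() {
  int *d_vals = nullptr;
  cudaMalloc(&d_vals, 4 * sizeof(int));
  cudaMemset(d_vals, 0, 4 * sizeof(int));
  int h_vals[4];
  bool ok = copy_tensor_dev_to_host(d_vals, h_vals, 4);
  cudaFree(d_vals);
  return ok ? 0 : 1;
}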
-CacheMeta::CacheMeta(FFHandler handler) : OpMeta(handler) {} +CacheMeta::CacheMeta(FFHandler handler, Cache const *c) : OpMeta(handler, c) {} template void Cache::cache_forward(Task const *task, diff --git a/src/ops/cast.cc b/src/ops/cast.cc index e514236a31..4a52bf874e 100644 --- a/src/ops/cast.cc +++ b/src/ops/cast.cc @@ -190,7 +190,7 @@ OpMeta *Cast::init_task(Task const *task, Runtime *runtime) { Cast *cast = (Cast *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - CastMeta *m = new CastMeta(handler); + CastMeta *m = new CastMeta(handler, cast); m->input_data_type = cast->inputs[0]->data_type; m->output_data_type = cast->outputs[0]->data_type; std::strcpy(m->op_name, cast->name); diff --git a/src/ops/concat.cc b/src/ops/concat.cc index d4d8e525fc..0a82779b6d 100644 --- a/src/ops/concat.cc +++ b/src/ops/concat.cc @@ -197,7 +197,7 @@ OpMeta *Concat::init_task(Task const *task, Runtime *runtime) { Concat *cc = (Concat *)task->args; FFHandler handler = *((FFHandler const *)task->local_args); - ConcatMeta *m = new ConcatMeta(handler); + ConcatMeta *m = new ConcatMeta(handler, cc); // Note that our internal axis index ordering is opposite to other frameworks init_meta(m, cc->legion_axis); m->profiling = cc->profiling; @@ -365,7 +365,7 @@ bool Concat::measure_operator_cost(Simulator *sim, } } - ConcatMeta *m = sim->concat_meta; + ConcatMeta *m = new ConcatMeta(sim->handler, this); init_meta(m, this->legion_axis); sim->free_all(); diff --git a/src/ops/conv_2d.cc b/src/ops/conv_2d.cc index 94850a178d..2428c9b99a 100644 --- a/src/ops/conv_2d.cc +++ b/src/ops/conv_2d.cc @@ -588,12 +588,13 @@ OpMeta *Conv2D::init_task(Task const *task, // regions[4], task->regions[4], FID_DATA, ctx, runtime, // false/*readOutput*/); - Conv2DMeta *m = new Conv2DMeta(handle); + Conv2DMeta *m = new Conv2DMeta(handle, conv); m->relu = conv->activation == AC_MODE_RELU; m->use_bias = conv->use_bias; m->profiling = conv->profiling; m->inference_debugging = conv->inference_debugging; - m->trainableInputs[0] = conv->trainableInputs[0]; + m->trainable_inputs[0] = conv->trainable_inputs[0]; + m->reset_input_grads[0] = conv->trainable_inputs[0]; std::strcpy(m->op_name, conv->name); m->layer_guid = conv->layer_guid; @@ -753,7 +754,7 @@ void Conv2D::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[1](I/O): input_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, READ_WRITE, @@ -803,7 +804,7 @@ void Conv2D::backward(FFModel const &ff) { /* region(I): input - region(I/O): input_grad (if trainableInputs[0]) + region(I/O): input_grad (if trainable_inputs[0]) region(I): output region(I/O): output_grad region(I): filter @@ -816,17 +817,17 @@ void Conv2D::backward_task(Task const *task, Runtime *runtime) { // Conv2D* conv = (Conv2D*) task->args; Conv2DMeta const *m = *((Conv2DMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; float *acc_input_grad_ptr = NULL; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { TensorAccessorW acc_input_grad( regions[rid], 
task->regions[rid], @@ -1119,7 +1120,7 @@ bool Conv2D::measure_operator_cost(Simulator *sim, int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Conv2DMeta *m = sim->conv2d_meta; + Conv2DMeta *m = new Conv2DMeta(sim->handler, this); m->relu = activation == AC_MODE_RELU; // require input_c is divisible by groups diff --git a/src/ops/element_binary.cc b/src/ops/element_binary.cc index 4352f459b9..cf8696182b 100644 --- a/src/ops/element_binary.cc +++ b/src/ops/element_binary.cc @@ -429,7 +429,7 @@ OpMeta *ElementBinary::init_task(Task const *task, FFHandler handle = *((FFHandler *)task->local_args); ElementBinaryMeta *m = new ElementBinaryMeta(handle, eb); for (int i = 0; i < eb->numInputs; i++) { - m->trainableInputs[i] = eb->trainableInputs[i]; + m->trainable_inputs[i] = eb->trainable_inputs[i]; } m->op_type = eb->op_type; m->profiling = eb->profiling; @@ -892,7 +892,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[0]->region)); launcher.add_field(rid++, FID_DATA); // regions[2](I/O): input0_grad - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -910,7 +910,7 @@ void ElementBinary::backward(FFModel const &ff) { inputs[1]->region)); launcher.add_field(rid++, FID_DATA); // regions[4](I/O): input1_grad - if (trainableInputs[1]) { + if (trainable_inputs[1]) { launcher.add_region_requirement( RegionRequirement(inputs[1]->part_grad, 0 /*projection id*/, @@ -980,7 +980,7 @@ void ElementBinary::backward_task(Task const *task, in0_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain in0_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); assert(in0_domain == in0_grad_domain); @@ -998,7 +998,7 @@ void ElementBinary::backward_task(Task const *task, in1_ptr = helperGetTensorPointerRO( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[1]) { + if (m->trainable_inputs[1]) { Domain in1_grad_domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); // assert(out_grad_domain == in1_domain); diff --git a/src/ops/element_unary.cc b/src/ops/element_unary.cc index 0e1d115557..09cf13c717 100644 --- a/src/ops/element_unary.cc +++ b/src/ops/element_unary.cc @@ -354,7 +354,7 @@ OpMeta *ElementUnary::init_task(Task const *task, Runtime *runtime) { ElementUnary *eu = (ElementUnary *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - ElementUnaryMeta *m = new ElementUnaryMeta(handle); + ElementUnaryMeta *m = new ElementUnaryMeta(handle, eu); m->op_type = eu->op_type; m->data_type = eu->outputs[0]->data_type; // Input and output should have the same data type @@ -737,7 +737,7 @@ bool ElementUnary::measure_operator_cost(Simulator *sim, if (!inputs[0]->get_sub_tensor(mv, sub_input)) { return false; } - ElementUnaryMeta *m = sim->ele_unary_meta; + ElementUnaryMeta *m = new ElementUnaryMeta(sim->handler, this); m->op_type = op_type; if (use_cudnn(m->op_type)) { Domain input_domain, output_domain; diff --git a/src/ops/element_unary.cpp b/src/ops/element_unary.cpp index e20200420f..435abdfe11 100644 --- a/src/ops/element_unary.cpp +++ b/src/ops/element_unary.cpp @@ -282,7 +282,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } 
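// Illustrative aside (not part of the patch): the rename from trainableInputs to
// trainable_inputs does not change the region-count contract asserted in the
// Conv2D::backward_task hunk above: five unconditional regions, plus one for the input
// gradient when the input is trainable, plus one more when a bias is used. A tiny
// standalone check of that arithmetic (names below are illustrative):

#include <cassert>
#include <cstdio>

int expected_conv2d_backward_regions(bool trainable_input0, bool use_bias) {
  // 5 unconditional regions, + input_grad if trainable, + a bias region if use_bias,
  // mirroring: regions.size() == 5 + trainable_inputs[0] + use_bias.
  return 5 + static_cast<int>(trainable_input0) + static_cast<int>(use_bias);
}

int main() {
  assert(expected_conv2d_backward_regions(true, true) == 7);
  assert(expected_conv2d_backward_regions(false, false) == 5);
  std::printf("trainable input + bias -> %d regions\n",
              expected_conv2d_backward_regions(true, true));
  return 0;
}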
-ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/element_unary.cu b/src/ops/element_unary.cu index c7f5e90f4c..15e6852388 100644 --- a/src/ops/element_unary.cu +++ b/src/ops/element_unary.cu @@ -291,7 +291,8 @@ void ElementUnary::backward_kernel_wrapper(ElementUnaryMeta const *m, stream); } -ElementUnaryMeta::ElementUnaryMeta(FFHandler handler) : OpMeta(handler) { +ElementUnaryMeta::ElementUnaryMeta(FFHandler handler, ElementUnary const *unary) + : OpMeta(handler, unary) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); diff --git a/src/ops/embedding.cc b/src/ops/embedding.cc index e630563b63..95b538bdb6 100644 --- a/src/ops/embedding.cc +++ b/src/ops/embedding.cc @@ -469,7 +469,7 @@ FutureMap Embedding::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); size_t machine_view_hash = view->hash(); - IndexLauncher launcher(EMBED_FWD_TASK_ID, + IndexLauncher launcher(EMBED_INF_TASK_ID, parallel_is, TaskArgument(NULL, 0), argmap, @@ -559,12 +559,6 @@ void Embedding::forward_task(Task const *task, } forward_kernel_wrapper( m, input, output, kernel, in_dim, out_dim, effective_batch_size); - if (m->inference_debugging) { - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - Embedding::save_inference_tensors_to_file( - m, shard_id, nullptr, {input}, {kernel}, {output}); - } } /* @@ -672,6 +666,16 @@ void Embedding::backward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +Legion::FutureMap + Embedding::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // nothing to do (backward function only updates weights) + return FutureMap(); +} + void Embedding::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/ops/experts.cc b/src/ops/experts.cc index 8c66f9c7bc..3acc68ed9b 100644 --- a/src/ops/experts.cc +++ b/src/ops/experts.cc @@ -589,18 +589,7 @@ OpMeta *Experts::init_task(Task const *task, Runtime *runtime) { Experts const *exp = (Experts *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ExpertsMeta *m = new ExpertsMeta(handle, - exp->num_experts, - exp->experts_start_idx, - exp->data_dim, - exp->out_dim, - exp->experts_num_layers, - exp->experts_internal_dim_size, - exp->effective_batch_size, - exp->num_chosen_experts, - exp->alpha, - exp->use_bias, - exp->activation); + ExpertsMeta *m = new ExpertsMeta(handle, exp); m->profiling = exp->profiling; m->inference_debugging = exp->inference_debugging; std::strcpy(m->op_name, exp->name); @@ -682,7 +671,7 @@ FutureMap Experts::inference(FFModel const &ff, size_t machine_view_hash = view->hash(); /* std::cout << "Experts op machine_view: " << *(MachineView const *)mv << std::endl; */ - // int num_active_tokens = bc->num_active_tokens(); + // int num_active_infr_tokens = bc->num_active_infr_tokens(); IndexLauncher launcher(EXPERTS_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -1075,7 +1064,7 @@ void Experts::inference_task(Task const *task, 
output_ptr, weights_ptr, bias_ptr, - bc->num_active_tokens(), + bc->num_active_infr_tokens(), chosen_experts, batch_size, out_dim); diff --git a/src/ops/experts.cpp b/src/ops/experts.cpp index c06f02a647..502be878a9 100644 --- a/src/ops/experts.cpp +++ b/src/ops/experts.cpp @@ -27,7 +27,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -35,25 +35,15 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, handle_unimplemented_hip_kernel(OP_EXPERTS); } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) {} +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) {} + ExpertsMeta::~ExpertsMeta(void) {} }; // namespace FlexFlow diff --git a/src/ops/experts.cu b/src/ops/experts.cu index ce15cdff55..f6f555d1ad 100644 --- a/src/ops/experts.cu +++ b/src/ops/experts.cu @@ -515,7 +515,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, float *output, float const *weights, float const *biases, - int num_active_tokens, + int num_active_infr_tokens, int chosen_experts, int batch_size, int out_dim) { @@ -529,8 +529,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, cudaEventRecord(t_start, stream); } - assert(num_active_tokens > 0); - assert(num_active_tokens <= m->effective_batch_size); + assert(num_active_infr_tokens > 0); + assert(num_active_infr_tokens <= m->effective_batch_size); assert(m->effective_batch_size == batch_size); int num_experts_per_block = m->num_experts; @@ -540,7 +540,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int data_dim = m->data_dim; int num_chosen_experts = m->num_chosen_experts; // int num_tokens = m->effective_batch_size; - int num_tokens = num_active_tokens; + int num_tokens = num_active_infr_tokens; int expert_capacity = m->expert_capacity; assert(chosen_experts == num_chosen_experts); @@ -579,14 +579,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, #ifdef INFERENCE_TESTS // Checking // 1. 
check that m->sorted_indices contains indices sorted - int *indices_cpu = download_tensor(indices, num_indices); + int *indices_cpu = copy_tensor_dev_to_host(indices, num_indices); // assert(indices_cpu != nullptr); std::vector indices_vec(indices_cpu, indices_cpu + num_indices); std::vector indices_vec_sorted(indices_vec.size()); std::copy(indices_vec.begin(), indices_vec.end(), indices_vec_sorted.begin()); std::stable_sort(indices_vec_sorted.begin(), indices_vec_sorted.end()); - int *thrust_sorted_indices_cpu = download_tensor( + int *thrust_sorted_indices_cpu = copy_tensor_dev_to_host( m->sorted_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_sorted_indices_cpu != nullptr); std::vector thrust_sorted_indices_vec( @@ -613,7 +613,7 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, assert(indices_vec_sorted[i] == thrust_sorted_indices_vec[i]); } // 2. check that indices[m->original_indices[i]] = i - int *thrust_original_indices_cpu = download_tensor( + int *thrust_original_indices_cpu = copy_tensor_dev_to_host( m->original_indices, m->num_chosen_experts * m->effective_batch_size); // assert(thrust_original_indices_cpu != nullptr); std::vector thrust_original_indices_vec( @@ -668,8 +668,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } assert(non_zero_experts_count == non_zero_experts_check.size()); // 7. check exp_local_label_to_index - int *non_zero_expert_labels_cpu = - download_tensor(m->non_zero_expert_labels, non_zero_experts_count); + int *non_zero_expert_labels_cpu = copy_tensor_dev_to_host( + m->non_zero_expert_labels, non_zero_experts_count); // assert(non_zero_expert_labels_cpu != nullptr); std::vector non_zero_expert_labels_vec(non_zero_expert_labels_cpu, non_zero_expert_labels_cpu + @@ -684,8 +684,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, non_zero_experts_check_vec.end())); assert(non_zero_expert_labels_vec == non_zero_experts_check_vec); - int *exp_local_label_to_index = - download_tensor(m->exp_local_label_to_index, non_zero_experts_count); + int *exp_local_label_to_index = copy_tensor_dev_to_host( + m->exp_local_label_to_index, non_zero_experts_count); // assert(exp_local_label_to_index != nullptr); std::vector exp_local_label_to_index_vec(exp_local_label_to_index, exp_local_label_to_index + @@ -699,8 +699,8 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } // 8. 
Check expert_start_indexes - int *expert_start_indices_thrust = - download_tensor(m->expert_start_indexes, non_zero_experts_count + 1); + int *expert_start_indices_thrust = copy_tensor_dev_to_host( + m->expert_start_indexes, non_zero_experts_count + 1); // assert(expert_start_indices_thrust != nullptr); std::vector expert_start_indices_thrust_vec( expert_start_indices_thrust, @@ -746,9 +746,9 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *num_assignments_per_expert_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(num_assignments_per_expert_thrust != nullptr); - assert(download_tensor(m->num_assignments_per_expert, - num_assignments_per_expert_thrust, - non_zero_experts_count)); + assert(copy_tensor_dev_to_host(m->num_assignments_per_expert, + num_assignments_per_expert_thrust, + non_zero_experts_count)); assert(num_assignments_per_expert_thrust != nullptr); std::vector num_assignments_per_expert_thrust_vec( num_assignments_per_expert_thrust, @@ -759,9 +759,9 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, int *destination_start_indices_thrust = (int *)calloc(non_zero_experts_count, sizeof(int)); assert(destination_start_indices_thrust != nullptr); - assert(download_tensor(m->destination_start_indices, - destination_start_indices_thrust, - non_zero_experts_count)); + assert(copy_tensor_dev_to_host(m->destination_start_indices, + destination_start_indices_thrust, + non_zero_experts_count)); assert(destination_start_indices_thrust != nullptr); std::vector destination_start_indices_thrust_vec( destination_start_indices_thrust, @@ -1233,25 +1233,14 @@ void Experts::forward_kernel_wrapper(ExpertsMeta const *m, } } -ExpertsMeta::ExpertsMeta(FFHandler handler, - int _num_experts, - int _experts_start_idx, - int _data_dim, - int _out_dim, - int _experts_num_layers, - int _experts_internal_dim_size, - int _effective_batch_size, - int _num_chosen_experts, - float _alpha, - bool _use_bias, - ActiMode _activation) - : OpMeta(handler), num_experts(_num_experts), - experts_start_idx(_experts_start_idx), data_dim(_data_dim), - out_dim(_out_dim), experts_num_layers(_experts_num_layers), - experts_internal_dim_size(_experts_internal_dim_size), - effective_batch_size(_effective_batch_size), - num_chosen_experts(_num_chosen_experts), alpha(_alpha), - use_bias(_use_bias), activation(_activation) { +ExpertsMeta::ExpertsMeta(FFHandler handler, Experts const *e) + : OpMeta(handler, e), num_experts(e->num_experts), + experts_start_idx(e->experts_start_idx), data_dim(e->data_dim), + out_dim(e->out_dim), experts_num_layers(e->experts_num_layers), + experts_internal_dim_size(e->experts_internal_dim_size), + effective_batch_size(e->effective_batch_size), + num_chosen_experts(e->num_chosen_experts), alpha(e->alpha), + use_bias(e->use_bias), activation(e->activation) { expert_capacity = ceil(alpha * num_chosen_experts / num_experts * effective_batch_size); diff --git a/src/ops/flat.cc b/src/ops/flat.cc index 80aedbbb31..e9f637294a 100644 --- a/src/ops/flat.cc +++ b/src/ops/flat.cc @@ -187,7 +187,8 @@ OpMeta *Flat::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const *)task->local_args); - FlatMeta *m = new FlatMeta(handler); + Flat *flat = (Flat *)task->args; + FlatMeta *m = new FlatMeta(handler, flat); return m; } diff --git a/src/ops/fused.cc b/src/ops/fused.cc index 9ad5c4dc9c..121139beb1 100644 --- a/src/ops/fused.cc +++ b/src/ops/fused.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/fused.h" +#include 
"flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/batch_norm.h" @@ -87,12 +88,32 @@ FusedOp::FusedOp(FFModel &model, Op *op) // weights[i]->owner_idx = i; weight_data_types[i] = op->weights[i]->data_type; } - numOutputs = op->numOutputs; - for (int i = 0; i < numOutputs; i++) { - outputs[i] = op->outputs[i]; - outputs[i]->owner_op = this; - outputs[i]->owner_idx = i; - output_data_types[i] = op->outputs[i]->data_type; + numOutputs = 0; + for (int i = 0; i < op->numOutputs; i++) { + bool found = false; + // Handle in-place outputs + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This output is one of the inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[i] = SOURCE_INPUT; + op_input_idx[i] = j; + found = true; + break; + } + } + if (found) { + // do nothing + } else { + outputs[numOutputs] = op->outputs[i]; + output_data_types[numOutputs] = op->outputs[i]->data_type; + op_output_source[i] = SOURCE_OUTPUT; + op_output_idx[i] = numOutputs; + outputs[numOutputs]->owner_op = this; + outputs[numOutputs]->owner_idx = numOutputs; + numOutputs++; + } } numOperators = 1; op_num_inputs[0] = op->numInputs; @@ -109,10 +130,53 @@ FusedOp::FusedOp(FFModel &model, Op *op) op_weight_source[i] = SOURCE_WEIGHT; op_weight_idx[i] = i; } - for (int i = 0; i < numOutputs; i++) { - op_output_source[i] = SOURCE_OUTPUT; - op_output_idx[i] = i; - } + // for (int i = 0; i < numOutputs; i++) { + // op_output_source[i] = SOURCE_OUTPUT; + // op_output_idx[i] = i; + // } +#if 0 + int input_offset = 0, weight_offset = 0, output_offset = 0; + printf("\nNew fused op: %s (%s), #input:%i, #output:%i, #weights:%i. Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif } bool FusedOp::use_same_regions( @@ -165,7 +229,8 @@ bool FusedOp::add_operator( // op->name, op_config)); // Cannot fuse parallel operators (except allreduce) since they have different // paralel_is in forward and backward - assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE); + assert(!op->is_parallel_op() || op->op_type == OP_ALLREDUCE || + op->op_type == OP_PARALLEL_IDENTITY); // Currently don't consider nested fusion assert(op->op_type != OP_FUSED); MachineView my_view = outputs[0]->machine_view; @@ -271,6 +336,18 @@ bool FusedOp::add_operator( found = true; op_output_source[output_offset + i] = SOURCE_OUTPUT; op_output_idx[output_offset + i] = j; + 
break; + } + } + for (int j = 0; j < numInputs; j++) { + if (inputs[j]->region == op->outputs[i]->region) { + // This input is one of my inputs + assert(!found); + assert(inputs[j]->region != LogicalRegion::NO_REGION); + op_output_source[output_offset + i] = SOURCE_INPUT; + op_output_idx[output_offset + i] = j; + found = true; + break; } } if (found) { @@ -311,6 +388,50 @@ bool FusedOp::add_operator( "Reach to the #outputs limit during fusion.\n" "Consider increase MAX_NUM_OUTPUTS to allow more fusions.\n"); } + +#if 0 + printf("\nAdd op: %s (%s), #input:%i, #output:%i, #weights:%i. Fused: " + "#inputs=%i, #outputs=%i, #weights=%i\n", + op->name, + get_operator_type_name(op->op_type).c_str(), + op->numInputs, + op->numOutputs, + op->numWeights, + numInputs, + numOutputs, + numWeights); + printf("op_input_idx:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_idx[i]); + } + printf("\n"); + printf("op_input_source:\t"); + for (int i = 0; i < input_offset + op->numInputs; i++) { + printf("%i\t", op_input_source[i]); + } + printf("\n"); + printf("op_output_idx:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_idx[i]); + } + printf("\n"); + printf("op_output_source:\t"); + for (int i = 0; i < output_offset + op->numOutputs; i++) { + printf("%i\t", op_output_source[i]); + } + printf("\n"); + printf("op_weight_idx:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_idx[i]); + } + printf("\n"); + printf("op_weight_source:\t"); + for (int i = 0; i < weight_offset + op->numWeights; i++) { + printf("%i\t", op_weight_source[i]); + } + printf("\n"); +#endif + return true; } @@ -404,9 +525,13 @@ void FusedOp::init_inference(FFModel const &ff, } for (int i = 0; i < op_num_outputs[op]; i++) { int my_off = op_output_idx[i + ooff]; - assert(op_output_source[i + ooff] == SOURCE_OUTPUT); - assert(my_off < batch_outputs.size()); - my_batch_outputs.push_back(batch_outputs[my_off]); + if (op_output_source[i + ooff] == SOURCE_OUTPUT) { + my_batch_outputs.push_back(batch_outputs[my_off]); + } else if (op_output_source[i + ooff] == SOURCE_INPUT) { + my_batch_outputs.push_back(batch_inputs[my_off]); + } else { + assert(false); + } } ioff += op_num_inputs[op]; ooff += op_num_outputs[op]; @@ -526,10 +651,6 @@ FutureMap FusedOp::inference(FFModel const &ff, set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; size_t machine_view_hash = view->hash(); - // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig - // so we transfer the maximum of them - // size_t batch_config_size = - // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); IndexLauncher launcher(FUSEDOP_INF_TASK_ID, parallel_is, TaskArgument(nullptr, 0), @@ -571,6 +692,83 @@ FutureMap FusedOp::inference(FFModel const &ff, batch_outputs[i]->region)); launcher.add_field(offset + i, FID_DATA); } + offset += numOutputs; + // add softmax output grad + if (operators[numOperators - 1]->op_type == OP_SOFTMAX) { + // printf("operator %i is last SOFTMAX! 
adding grad for output %i\n", + // numOperators - 1, + // numOutputs - 1); + assert(outputs[numOutputs - 1]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[numOutputs - 1]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[numOutputs - 1]->region_grad)); + launcher.add_field(offset, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +FutureMap FusedOp::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + // Set iter_config + iter_config = ff.iter_config; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + size_t machine_view_hash = view->hash(); + // bc is one of BatchConfig, TreeVerifyBatchConfig, and BeamSearchBatchConfig + // so we transfer the maximum of them + // size_t batch_config_size = + // std::max(sizeof(TreeVerifyBatchConfig), sizeof(BeamSearchBatchConfig)); + IndexLauncher launcher(FUSEDOP_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int offset = 0; + for (int i = 0; i < numInputs; i++) { + assert(inputs[i]->part != LogicalPartition::NO_PART); + assert(inputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[i]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[i]->region_grad)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numInputs; + for (int i = 0; i < numWeights; i++) { + assert(weights[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement(RegionRequirement(weights[i]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[i]->region)); + launcher.add_field(offset + i, FID_DATA); + } + offset += numWeights; + for (int i = 0; i < numOutputs; i++) { + assert(outputs[i]->region != LogicalRegion::NO_REGION); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[i]->part_grad, + 0 /*projection id*/, + i == numOutputs - 1 ? 
READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_outputs[i]->region_grad)); + launcher.add_field(offset + i, FID_DATA); + } return runtime->execute_index_space(ctx, launcher); } diff --git a/src/ops/fused.cpp b/src/ops/fused.cpp index 3282bc57d9..9f826cd611 100644 --- a/src/ops/fused.cpp +++ b/src/ops/fused.cpp @@ -15,6 +15,7 @@ #include "flexflow/ops/fused.h" #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -30,6 +31,7 @@ #include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" @@ -42,6 +44,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" #include "flexflow/utils/hip_helper.h" #include @@ -78,17 +81,27 @@ OpMeta *FusedOp::init_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void FusedOp::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); + bool softmax_grad_additional_region = + (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); + assert((int)regions.size() == fused->numInputs + fused->numWeights + + fused->numOutputs + + softmax_grad_additional_region); GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; @@ -124,6 +137,7 @@ __host__ void FusedOp::forward_task(Task const *task, ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -138,11 +152,6 @@ __host__ void FusedOp::forward_task(Task const *task, } } - hipStream_t stream; - if (start < fused->numOperators) { - checkCUDA(get_legion_stream(&stream)); - } - int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; @@ -163,8 +172,9 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -179,21 
+189,6 @@ __host__ void FusedOp::forward_task(Task const *task, m->legion_axis); break; } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; - } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -209,16 +204,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[1].get_float_ptr()); break; } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -229,25 +214,48 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float const *bias_ptr = nullptr; + void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].get_float_ptr(); + bias_ptr = my_weight_accessor[1].ptr; } } else { assert(fused->op_num_weights[op] == 1); } - Kernels::Linear::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_infr_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case OP_BATCHMATMUL: { @@ 
-375,87 +383,127 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } - case OP_POOL2D: { + case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } - case OP_SOFTMAX: { + case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; } + 
IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_RESHAPE: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + &tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_TRANSPOSE: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + &beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } case OP_LAYERNORM: { @@ -477,23 +525,127 @@ __host__ void FusedOp::forward_task(Task const *task, break; } case OP_RESIDUAL_LAYERNORM: { - assert(false && "Operator ResidualLayerNorm does not support " - "the forward() task"); + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta *m = (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + bc, + 
my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(false && "Operator AddBiasResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(false && "Operator ResidualRMSNorm does not support " - "the forward() task"); + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); break; } case OP_SIGMOID_SILU_MULTI: { - assert(false && "Operator SigmoidSiluMulti does not support " - "the forward() task"); + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators - 1) { // if this is the final operator + output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( + fused->output_data_types[fused->numOutputs - 1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::inference_kernel_wrapper( + m, + bc, + (op == fused->numOperators - 1), + my_input_accessor[0], + my_output_accessor[0], + output_accessor[fused->numOutputs]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } default: { @@ -503,6 +655,33 @@ __host__ void FusedOp::forward_task(Task const *task, assert(false && "Fusion currently does not support type"); } } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < 
fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); + } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; @@ -517,18 +696,525 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void - FusedOp::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void FusedOp::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + // BatchConfig const *bc = (BatchConfig *)task->args; BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - if (bc->num_tokens == 0) { + // Return if no active PEFT bwd tokens + if (bc->num_active_peft_tokens() == 0) { return; } + + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] = + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + 
assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int ioff = 0, woff = 0, ooff = 0; + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + + // Do backpropagation in the reverse ordering + for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // TODO: implement this + assert(false); + // ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + // int num_inputs = fused->op_num_inputs[op]; + // Kernels::Concat::peft_bwd_kernel_wrapper(m, + // my_output_accessor[0], + // my_input_accessor, + // num_inputs, + // m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_dim() == 5); + assert(my_output_grad_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + // TODO: implement this + assert(false); + // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + // BatchNorm::peft_bwd_kernel_kernel( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_weight_accessor[0].get_float_ptr(), + // my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * 
batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LinearMeta *m = (LinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->input_type[0] == my_output_grad_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + Kernels::Linear::peft_bwd_kernel_wrapper(m, + my_input_grad_accessor[0].ptr, + my_output_grad_accessor[0].ptr, + my_weight_accessor[0].ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_grad_accessor[0].domain; + Domain output_domain = my_output_grad_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->output_type[0] == my_output_grad_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + Kernels::LoraLinear::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_grad_accessor[0].domain; + Domain a_domain = my_input_grad_accessor[0].domain; + Domain b_domain = my_input_grad_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + // TODO: implement me + assert(false); + // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + // Kernels::BatchMatmul::backward_kernel_wrapper( + // meta, + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].get_float_ptr(), + // my_input_accessor[1].get_float_ptr(), + // (float const *)nullptr, + // m, + // n, + // k, + // batch, + // meta->a_seq_length_dim, + // meta->b_seq_length_dim, + // fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_input_grad_accessor[1].domain); + 
assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + // Kernels::ElementBinary::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + // Currently assume the Embedding layer cannot be finetuned + // so we do nothing for embedding + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // TODO: implement me + assert(false); + // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + // if (m->data_type == DT_HALF) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_half_ptr(), + // my_output_accessor[0].get_half_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else if (m->data_type == DT_FLOAT) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else { + // assert(false && "Unsupported data type in ElementUnary forward"); + // } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_weight_accessor[0]); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( + m, + bc, + my_input_grad_accessor[0], + my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], + my_weight_accessor[0]); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_grad_accessor[0], + my_weight_accessor[0], + my_output_grad_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + // TODO: implement me + assert(false); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::peft_bwd_kernel_wrapper( + m, my_output_grad_accessor[0], my_input_grad_accessor[0], 
gamma); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorW residual2; + if (m->use_two_residuals) { + residual2 = my_input_grad_accessor[2]; + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + } + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + } + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector 
input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } + } +} + +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void FusedOp::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == @@ -582,11 +1268,6 @@ __host__ void } } - hipStream_t stream; - if (start < fused->numOperators) { - checkCUDA(get_legion_stream(&stream)); - } - int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; @@ -595,8 +1276,10 @@ __host__ void for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + assert(my_off < fused->numInputs); my_input_accessor[i] = input_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + assert(my_off < fused->numOutputs); my_input_accessor[i] = output_accessor[my_off]; } else { assert(false); @@ -604,11 +1287,14 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + assert(fused->op_weight_idx[i + woff] < fused->numWeights); my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[i + ooff]; + assert(my_off < fused->numOutputs); + my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -623,6 +1309,21 @@ __host__ void m->legion_axis); break; } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -638,6 +1339,16 @@ __host__ void my_weight_accessor[1].get_float_ptr()); break; } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta 
*)metas->meta[op]; + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -648,27 +1359,25 @@ __host__ void assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - void const *bias_ptr = nullptr; + float const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].ptr; + bias_ptr = my_weight_accessor[1].get_float_ptr(); } } else { assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); + Kernels::Linear::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + bias_ptr, + in_dim, + out_dim, + batch_size); break; } case OP_BATCHMATMUL: { @@ -796,124 +1505,78 @@ __host__ void case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { + case OP_ELU: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_RMS_NORM: { + case OP_POOL2D: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); break; } - case OP_RESIDUAL_RMS_NORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - my_output_accessor[0], - my_output_accessor[1]); + case OP_FLAT: { + 
assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = - (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); break; } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); break; } case OP_LAYERNORM: { @@ -935,119 +1598,23 @@ __host__ void break; } case OP_RESIDUAL_LAYERNORM: { - assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; - if (m->use_two_residuals) { - assert(fused->op_num_inputs[op] == 3); - } else { - assert(fused->op_num_inputs[op] == 2); - } - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 0); - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 1); // weight - } else { - assert(fused->op_num_weights[op] == 2); // weight + bias - } - } - GenericTensorAccessorR residual2; - if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } - } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = - (AddBiasResidualLayerNormMeta *)metas->meta[op]; - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1); // attn bias - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 2); // attn bias + weight - } else { - assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias - } - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } - } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); break; } case OP_SIGMOID_SILU_MULTI: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } + assert(false && "Operator SigmoidSiluMulti does not support " + "the 
forward() task"); break; } - case OP_ALLREDUCE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); break; } default: { @@ -1176,9 +1743,6 @@ __host__ void FusedOp::backward_task(Task const *task, } } - hipStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int ioff = 0, woff = 0, ooff = 0; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; @@ -1202,6 +1766,7 @@ __host__ void FusedOp::backward_task(Task const *task, if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { my_input_accessor[i] = input_accessor[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; + assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { my_input_accessor[i] = output_accessor[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; @@ -1220,9 +1785,9 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - my_output_accessor[i] = output_accessor[fused->op_output_idx[i + ooff]]; - my_output_grad_accessor[i] = - output_grad_accessor[fused->op_output_idx[i + ooff]]; + int my_off = fused->op_output_idx[i + ooff]; + my_output_accessor[i] = output_accessor[my_off]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } switch (fused->op_op_type[op]) { diff --git a/src/ops/fused.cu b/src/ops/fused.cu index 483028599e..cab28181da 100644 --- a/src/ops/fused.cu +++ b/src/ops/fused.cu @@ -14,6 +14,7 @@ */ #include "flexflow/accessor.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/ops/add_bias_residual_layer_norm.h" #include "flexflow/ops/batch_norm.h" @@ -30,6 +31,7 @@ #include "flexflow/ops/kernels/embedding_kernels.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" #include "flexflow/ops/kernels/pool_2d_kernels.h" #include "flexflow/ops/kernels/reshape_kernels.h" #include "flexflow/ops/kernels/residual_rms_norm_kernels.h" @@ -42,6 +44,7 @@ #include "flexflow/ops/spec_inc_multihead_self_attention.h" #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -77,27 +80,32 @@ OpMeta *FusedOp::init_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void FusedOp::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void + FusedOp::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active tokens + if (bc->num_tokens == 0) { + return; + } + 
assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); - assert((int)regions.size() == - fused->numInputs + fused->numWeights + fused->numOutputs); - // Domain input_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS]; + bool softmax_grad_additional_region = + (fused->op_op_type[fused->numOperators - 1] == OP_SOFTMAX); + assert((int)regions.size() == fused->numInputs + fused->numWeights + + fused->numOutputs + + softmax_grad_additional_region); GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], @@ -109,8 +117,6 @@ __host__ void FusedOp::forward_task(Task const *task, int roff = fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -122,8 +128,6 @@ __host__ void FusedOp::forward_task(Task const *task, roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = helperGetGenericTensorAccessorWO(fused->output_data_types[i], regions[i + roff], @@ -132,6 +136,7 @@ __host__ void FusedOp::forward_task(Task const *task, ctx, runtime); } + roff += fused->numOutputs; // Assert that all meta share the same dnn/blas handler int start = 0; for (start = 0; start < fused->numOperators; start++) { @@ -148,36 +153,39 @@ __host__ void FusedOp::forward_task(Task const *task, int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS]; +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; my_input_accessor[i] = input_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = input_accessor[%i]\n", i, my_off); +#endif } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; my_input_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_input_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } else { assert(false); } } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } for (int i = 0; i < 
fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); - // my_od[i] = output_domain[my_off]; - // my_op[i] = output_ptr[my_off]; my_output_accessor[i] = output_accessor[my_off]; +#if 0 + printf("\tmy_output_accessor[%i] = output_accessor[%i]\n", i, my_off); +#endif } switch (fused->op_op_type[op]) { case OP_CONCAT: { @@ -192,21 +200,6 @@ __host__ void FusedOp::forward_task(Task const *task, m->legion_axis); break; } - case OP_CONV2D: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_dim() == 5); - assert(my_weight_accessor[0].domain.get_dim() == 5); - assert(my_output_accessor[0].domain.get_dim() == 5); - Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; - Kernels::Conv2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - my_weight_accessor[1].get_float_ptr()); - break; - } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -222,16 +215,6 @@ __host__ void FusedOp::forward_task(Task const *task, my_weight_accessor[1].get_float_ptr()); break; } - case OP_DROPOUT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - DropoutMeta *m = (DropoutMeta *)metas->meta[op]; - Kernels::Dropout::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - break; - } case OP_LINEAR: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -242,25 +225,48 @@ __host__ void FusedOp::forward_task(Task const *task, assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - float const *bias_ptr = nullptr; + void const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].get_float_ptr(); + bias_ptr = my_weight_accessor[1].ptr; } } else { assert(fused->op_num_weights[op] == 1); } - Kernels::Linear::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_weight_accessor[0].get_float_ptr(), - bias_ptr, - in_dim, - out_dim, - batch_size); + assert(m->input_type[0] == my_input_accessor[0].data_type); + assert(m->input_type[0] == my_output_accessor[0].data_type); + batch_size = bc->num_active_infr_tokens(); + Kernels::Linear::forward_kernel_wrapper(m, + my_input_accessor[0].ptr, + my_output_accessor[0].ptr, + my_weight_accessor[0].ptr, + bias_ptr, + in_dim, + out_dim, + batch_size); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_accessor[0].domain; + Domain output_domain = my_output_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_accessor[0].domain.get_volume() / in_dim; + assert(my_output_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == 
my_input_accessor[0].data_type); + assert(m->output_type[0] == my_output_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_accessor[1].ptr == my_output_accessor[0].ptr); + Kernels::LoraLinear::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); break; } case OP_BATCHMATMUL: { @@ -388,88 +394,127 @@ __host__ void FusedOp::forward_task(Task const *task, case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: { + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); assert(my_input_accessor[0].domain == my_output_accessor[0].domain); ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + if (m->data_type == DT_HALF) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_half_ptr(), + my_output_accessor[0].get_half_ptr(), + my_input_accessor[0].domain.get_volume()); + } else if (m->data_type == DT_FLOAT) { + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + } else { + assert(false && "Unsupported data type in ElementUnary forward"); + } break; } - case OP_POOL2D: { + case OP_RMS_NORM: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_weights[op] == 1); assert(fused->op_num_outputs[op] == 1); - // assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; - Kernels::Pool2D::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + RMSNormMeta *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0]); break; } - case OP_FLAT: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Flat::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_weight_accessor[0], + my_output_accessor[0], + my_output_accessor[1]); break; } - case OP_SOFTMAX: { + case OP_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - 
Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; } + IncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_RESHAPE: { + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - Kernels::Reshape::forward_kernel_wrapper( - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); + TreeIncMultiHeadSelfAttentionMeta *m = + (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + TreeVerifyBatchConfig const &tree_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( + m, + &tree_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } - case OP_TRANSPOSE: { + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - TransposeMeta *m = (TransposeMeta *)metas->meta[op]; - Kernels::Transpose::forward_kernel_wrapper( + SpecIncMultiHeadSelfAttentionMeta const *m = + (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; + // BeamSearchBatchConfig const *beam_bc = + // (BeamSearchBatchConfig *)task->args; + BeamSearchBatchConfig const &beam_bc = + Future(task->futures[0]).get_result(); + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain, - my_output_accessor[0].domain); + &beam_bc, + task->index_point.point_data[0], + my_input_accessor[0], + my_weight_accessor[0], + my_output_accessor[0], + biases); break; } case OP_LAYERNORM: { @@ -491,39 +536,694 @@ __host__ void FusedOp::forward_task(Task const *task, break; } case OP_RESIDUAL_LAYERNORM: { - assert(false && "Operator ResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(false && "Operator AddBiasResidualLayerNorm does not support " - "the forward() task"); - break; - } - case OP_SIGMOID_SILU_MULTI: { - assert(false && "Operator SigmoidSiluMulti does not support " - "the forward() task"); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(false && 
"Operator ResidualRMSNorm does not support " - "the forward() task"); - break; - } - default: { - fprintf(stderr, - "Fusion currently does not support type = %d\n", - fused->op_op_type[op]); - assert(false && "Fusion currently does not support type"); - } - } - ioff += fused->op_num_inputs[op]; + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta *m = (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorR residual2; + if (m->use_two_residuals) { + residual2 = my_input_accessor[2]; + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + ResidualLayerNorm::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + residual2, + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + if (m->use_bias) { + beta = my_weight_accessor[2]; + } + } + AddBiasResidualLayerNorm::inference_kernel_wrapper( + m, + bc, + my_input_accessor[0], + my_weight_accessor[0], + my_input_accessor[1], + my_output_accessor[0], + my_output_accessor[1], + gamma, + beta); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::inference_kernel_wrapper(m, + bc, + my_input_accessor[0], + my_input_accessor[1], + my_output_accessor[0]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + if (op == fused->numOperators - 1) { // if this is the final operator + output_accessor[fused->numOutputs] = helperGetGenericTensorAccessorWO( + fused->output_data_types[fused->numOutputs - 1], + regions[roff], + task->regions[roff], + FID_DATA, + ctx, + runtime); + } + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::inference_kernel_wrapper( + m, + bc, + (op == fused->numOperators - 1), + my_input_accessor[0], + my_output_accessor[0], + output_accessor[fused->numOutputs]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + 
assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::inference_kernel_wrapper( + m, bc, my_input_accessor[0], my_output_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save); + } + ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; } - // for (int i = 0; i < fused->numOutputs; i++) - // print_tensor(output_ptr[i], output_domain[i].get_volume(), - // "[Fused:forward:output]"); + // for (int i = 0; i < fused->numOutputs; i++) + // print_tensor(output_ptr[i], output_domain[i].get_volume(), + // "[Fused:forward:output]"); +} + +/* + regions[...](I): inputs + regions[...](I): weights + regions[...](O): outputs +*/ +__host__ void FusedOp::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + // const FusedOp* fused = (FusedOp*) task->args; + FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); + FusedOp const *fused = metas->fused_op; + // BatchConfig const *bc = (BatchConfig *)task->args; + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + // Return if no active PEFT bwd tokens + if (bc->num_active_peft_tokens() == 0) { + return; + } + + assert(metas->numOperators == fused->numOperators); + assert(regions.size() == task->regions.size()); + assert((int)regions.size() == + fused->numInputs + fused->numWeights + fused->numOutputs); + // Domain input_domain[MAX_NUM_INPUTS]; + // Domain weight_domain[MAX_NUM_WEIGHTS]; + // Domain output_domain[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW output_grad_accessor[MAX_NUM_OUTPUTS]; + assert(fused->numInputs <= MAX_NUM_INPUTS); + for (int i = 0; i < fused->numInputs; i++) { + // input_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i].region.get_index_space()); + input_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->input_data_types[i], + regions[i], + task->regions[i], + FID_DATA, + ctx, + runtime); + } + int roff = fused->numInputs; + assert(fused->numWeights <= MAX_NUM_WEIGHTS); + for (int i = 0; i < fused->numWeights; i++) { + // weight_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + weight_accessor[i] 
= + helperGetGenericTensorAccessorRO(fused->weight_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + roff += fused->numWeights; + assert(fused->numOutputs <= MAX_NUM_OUTPUTS); + for (int i = 0; i < fused->numOutputs; i++) { + // output_domain[i] = runtime->get_index_space_domain( + // ctx, task->regions[i + roff].region.get_index_space()); + output_grad_accessor[i] = + helperGetGenericTensorAccessorRW(fused->output_data_types[i], + regions[i + roff], + task->regions[i + roff], + FID_DATA, + ctx, + runtime); + } + // Assert that all meta share the same dnn/blas handler + int start = 0; + for (start = 0; start < fused->numOperators; start++) { + if (metas->meta[start] != NULL) { + break; + } + } + for (int op = start + 1; op < fused->numOperators; op++) { + if (metas->meta[op] != NULL) { + assert(metas->meta[start]->handle.blas == metas->meta[op]->handle.blas); + assert(metas->meta[start]->handle.dnn == metas->meta[op]->handle.dnn); + } + } + + int ioff = 0, woff = 0, ooff = 0; + // Domain my_id[MAX_NUM_INPUTS]; + // Domain my_wd[MAX_NUM_WEIGHTS]; + // Domain my_od[MAX_NUM_OUTPUTS]; + GenericTensorAccessorW my_input_grad_accessor[MAX_NUM_INPUTS]; + GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; + GenericTensorAccessorW my_output_grad_accessor[MAX_NUM_OUTPUTS]; + + // Do backpropagation in the reverse ordering + for (int op = 0; op < fused->numOperators; op++) { + ioff += fused->op_num_inputs[op]; + woff += fused->op_num_weights[op]; + ooff += fused->op_num_outputs[op]; + } + + for (int op = fused->numOperators - 1; op >= 0; op--) { +#if 0 + std::cout << get_operator_type_name(fused->op_op_type[op]) << std::endl; +#endif + ioff -= fused->op_num_inputs[op]; + woff -= fused->op_num_weights[op]; + ooff -= fused->op_num_outputs[op]; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + int my_off = fused->op_input_idx[i + ioff]; + if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { + // my_id[i] = input_domain[my_off]; + my_input_grad_accessor[i] = input_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = input_grad_accessor[%i]\n", i, my_off); +#endif + } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { + // my_id[i] = output_domain[my_off]; + my_input_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_input_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif + } else { + assert(false); + } + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); + // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; + // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; + my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + int my_off = fused->op_output_idx[i + ooff]; + assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); + // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; + // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; + my_output_grad_accessor[i] = output_grad_accessor[my_off]; +#if 0 + printf("\tmy_output_grad_accessor[%i] = output_grad_accessor[%i]\n", i, my_off); +#endif + } + switch (fused->op_op_type[op]) { + case OP_CONCAT: { + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + // TODO: implement this + assert(false); + // ConcatMeta *m = (ConcatMeta *)metas->meta[op]; + // int num_inputs = fused->op_num_inputs[op]; + // 
Kernels::Concat::peft_bwd_kernel_wrapper(m, + // my_output_accessor[0], + // my_input_accessor, + // num_inputs, + // m->legion_axis); + break; + } + case OP_BATCHNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_dim() == 5); + assert(my_output_grad_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 2); + assert(my_weight_accessor[1].domain.get_dim() == 2); + // TODO: implement this + assert(false); + // BatchNormMeta *m = (BatchNormMeta *)metas->meta[op]; + // BatchNorm::peft_bwd_kernel_kernel( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_weight_accessor[0].get_float_ptr(), + // my_weight_accessor[1].get_float_ptr()); + break; + } + case OP_LINEAR: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + Domain kernel_domain = my_weight_accessor[0].domain; + int in_dim = kernel_domain.hi()[0] - kernel_domain.lo()[0] + 1; + int out_dim = kernel_domain.hi()[1] - kernel_domain.lo()[1] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LinearMeta *m = (LinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->input_type[0] == my_output_grad_accessor[0].data_type); + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + Kernels::Linear::peft_bwd_kernel_wrapper(m, + my_input_grad_accessor[0].ptr, + my_output_grad_accessor[0].ptr, + my_weight_accessor[0].ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + break; + } + case OP_LORA: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + Domain input_domain = my_input_grad_accessor[0].domain; + Domain output_domain = my_output_grad_accessor[0].domain; + int in_dim = input_domain.hi()[0] - input_domain.lo()[0] + 1; + int out_dim = output_domain.hi()[0] - output_domain.lo()[0] + 1; + int batch_size = my_input_grad_accessor[0].domain.get_volume() / in_dim; + assert(my_output_grad_accessor[0].domain.get_volume() == + out_dim * batch_size); + assert(my_input_grad_accessor[0].domain.get_volume() == + in_dim * batch_size); + LoraLinearMeta *m = (LoraLinearMeta *)metas->meta[op]; + assert(m->input_type[0] == my_input_grad_accessor[0].data_type); + assert(m->output_type[0] == my_output_grad_accessor[0].data_type); + // Assert that the output and the second input are at the same place + // since we ``inplace'' the output for LoRA + assert(my_input_grad_accessor[1].ptr == my_output_grad_accessor[0].ptr); + Kernels::LoraLinear::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_BATCHMATMUL: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + Domain out_domain = my_output_grad_accessor[0].domain; + Domain a_domain = my_input_grad_accessor[0].domain; + Domain b_domain = my_input_grad_accessor[1].domain; + int m = b_domain.hi()[0] - b_domain.lo()[0] + 1; + assert(m == out_domain.hi()[0] - out_domain.lo()[0] + 1); + int n = a_domain.hi()[1] - a_domain.lo()[1] + 1; + assert(n == out_domain.hi()[1] - out_domain.lo()[1] + 1); + int k = a_domain.hi()[0] - a_domain.lo()[0] + 1; + 
assert(k == b_domain.hi()[1] - b_domain.lo()[1] + 1); + assert(a_domain.get_dim() == b_domain.get_dim()); + assert(a_domain.get_dim() == out_domain.get_dim()); + int batch = 1; + for (int i = 2; i < a_domain.get_dim(); i++) { + int dim_size = a_domain.hi()[i] - a_domain.lo()[i] + 1; + assert(dim_size == b_domain.hi()[i] - b_domain.lo()[i] + 1); + assert(dim_size == out_domain.hi()[i] - out_domain.lo()[i] + 1); + batch *= dim_size; + } + // TODO: implement me + assert(false); + // BatchMatmulMeta *meta = (BatchMatmulMeta *)metas->meta[op]; + // Kernels::BatchMatmul::backward_kernel_wrapper( + // meta, + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].get_float_ptr(), + // my_input_accessor[1].get_float_ptr(), + // (float const *)nullptr, + // m, + // n, + // k, + // batch, + // meta->a_seq_length_dim, + // meta->b_seq_length_dim, + // fused->iter_config.seq_length); + break; + } + case OP_EW_ADD: + case OP_EW_SUB: + case OP_EW_MUL: + case OP_EW_DIV: + case OP_EW_MAX: + case OP_EW_MIN: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_input_grad_accessor[1].domain); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // ElementBinaryMeta *m = (ElementBinaryMeta *)metas->meta[op]; + // Kernels::ElementBinary::forward_kernel_wrapper(m, + // my_input_accessor[0], + // my_input_accessor[1], + // my_output_accessor[0]); + break; + } + case OP_EMBEDDING: { + // Currently assume the Embedding layer cannot be finetuned + // so we do nothing for embedding + break; + } + case OP_GELU: + case OP_RELU: + case OP_SIGMOID: + case OP_TANH: + case OP_ELU: + case OP_SCALAR_TRUE_DIV: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain == + my_output_grad_accessor[0].domain); + // TODO: implement me + assert(false); + // ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + // if (m->data_type == DT_HALF) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_half_ptr(), + // my_output_accessor[0].get_half_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else if (m->data_type == DT_FLOAT) { + // ElementUnary::forward_kernel_wrapper( + // m, + // my_input_accessor[0].get_float_ptr(), + // my_output_accessor[0].get_float_ptr(), + // my_input_accessor[0].domain.get_volume()); + // } else { + // assert(false && "Unsupported data type in ElementUnary forward"); + // } + break; + } + case OP_RMS_NORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 1); + RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; + Kernels::RMSNorm::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_weight_accessor[0]); + break; + } + case OP_RESIDUAL_RMS_NORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_outputs[op] == 2); + ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; + Kernels::ResidualRMSNorm::peft_bwd_kernel_wrapper( + m, + bc, + my_input_grad_accessor[0], + my_input_grad_accessor[1], + my_output_grad_accessor[0], + my_output_grad_accessor[1], + my_weight_accessor[0]); + break; + } + case OP_INC_MULTIHEAD_SELF_ATTENTION: { + assert(fused->op_num_inputs[op] == 1); + 
assert(fused->op_num_outputs[op] == 1); + IncMultiHeadSelfAttentionMeta *m = + (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; + assert(fused->op_num_weights[op] == + (1 + (int)(*m->qkv_bias || *m->final_bias))); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + assert(fused->op_num_weights[op] == 2); + biases = my_weight_accessor[1]; + } + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + my_input_grad_accessor[0], + my_weight_accessor[0], + my_output_grad_accessor[0], + biases); + break; + } + case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: + case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + // TODO: implement me + assert(false); + break; + } + case OP_LAYERNORM: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + LayerNormMeta const *m = (LayerNormMeta *)metas->meta[op]; + if (m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1 + (int)(m->use_bias)); + } + GenericTensorAccessorR gamma, beta; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + if (m->use_bias) { + beta = my_weight_accessor[1]; + } + } + LayerNorm::peft_bwd_kernel_wrapper( + m, my_output_grad_accessor[0], my_input_grad_accessor[0], gamma); + break; + } + case OP_RESIDUAL_LAYERNORM: { + assert(fused->op_num_outputs[op] == 2); + ResidualLayerNormMeta const *m = + (ResidualLayerNormMeta *)metas->meta[op]; + if (m->use_two_residuals) { + assert(fused->op_num_inputs[op] == 3); + } else { + assert(fused->op_num_inputs[op] == 2); + } + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 0); + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 1); // weight + } else { + assert(fused->op_num_weights[op] == 2); // weight + bias + } + } + GenericTensorAccessorW residual2; + if (m->use_two_residuals) { + residual2 = my_input_grad_accessor[2]; + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[0]; + } + ResidualLayerNorm::peft_bwd_kernel_wrapper(m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + residual2, + gamma); + break; + } + case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 2); + AddBiasResidualLayerNormMeta const *m = + (AddBiasResidualLayerNormMeta *)metas->meta[op]; + if (!m->elementwise_affine) { + assert(fused->op_num_weights[op] == 1); // attn bias + } else { + if (!m->use_bias) { + assert(fused->op_num_weights[op] == 2); // attn bias + weight + } else { + assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias + } + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = my_weight_accessor[1]; + } + + AddBiasResidualLayerNorm::peft_bwd_kernel_wrapper( + m, + my_output_grad_accessor[1], + my_input_grad_accessor[0], + my_input_grad_accessor[1], + gamma); + break; + } + case OP_SIGMOID_SILU_MULTI: { + assert(fused->op_num_inputs[op] == 2); + assert(fused->op_num_outputs[op] == 1); + SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; + SigmoidSiluMulti::peft_bwd_kernel_wrapper(m, + bc, + my_output_grad_accessor[0], + my_input_grad_accessor[0], + my_input_grad_accessor[1]); + break; + } + case OP_SOFTMAX: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_grad_accessor[0].domain.get_volume() == + my_output_grad_accessor[0].domain.get_volume()); + SoftmaxMeta *m = 
(SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_ALLREDUCE: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; + Kernels::AllReduce::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + ParallelIdentityMeta const *m = (ParallelIdentityMeta *)metas->meta[op]; + Kernels::ParallelIdentity::peft_bwd_kernel_wrapper( + m, bc, my_input_grad_accessor[0], my_output_grad_accessor[0]); + break; + } + default: { + fprintf(stderr, + "Fusion currently does not support type = %d\n", + fused->op_op_type[op]); + assert(false && "Fusion currently does not support type"); + } + } + if (metas->meta[op]->inference_debugging && + !(fused->op_op_type[op] == OP_ALLREDUCE || + fused->op_op_type[op] == OP_PARALLEL_IDENTITY || + fused->op_op_type[op] == OP_REPLICATE || + fused->op_op_type[op] == OP_REPARTITION || + fused->op_op_type[op] == OP_COMBINE)) { + std::vector input_accessors_to_save; + std::vector weight_accessors_to_save; + std::vector output_accessors_to_save; + for (int i = 0; i < fused->op_num_inputs[op]; i++) { + input_accessors_to_save.push_back(my_input_grad_accessor[i]); + } + for (int i = 0; i < fused->op_num_weights[op]; i++) { + weight_accessors_to_save.push_back(my_weight_accessor[i]); + } + for (int i = 0; i < fused->op_num_outputs[op]; i++) { + output_accessors_to_save.push_back(my_output_grad_accessor[i]); + } + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + FusedOp::save_inference_tensors_to_file(metas->meta[op], + shard_id, + bc, + input_accessors_to_save, + weight_accessors_to_save, + output_accessors_to_save, + false); + } + } } /* @@ -531,35 +1231,22 @@ __host__ void FusedOp::forward_task(Task const *task, regions[...](I): weights regions[...](O): outputs */ -__host__ void - FusedOp::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { +__host__ void FusedOp::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { // const FusedOp* fused = (FusedOp*) task->args; - FusedOpMeta *metas = *((FusedOpMeta **)task->local_args); + FusedOpMeta const *metas = *((FusedOpMeta **)task->local_args); FusedOp const *fused = metas->fused_op; - // BatchConfig const *bc = (BatchConfig *)task->args; - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - // Return if no active tokens - if (bc->num_tokens == 0) { - return; - } - assert(metas->numOperators == fused->numOperators); assert(regions.size() == task->regions.size()); assert((int)regions.size() == fused->numInputs + fused->numWeights + fused->numOutputs); - // Domain input_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS]; GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW output_accessor[MAX_NUM_OUTPUTS]; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], 
@@ -571,8 +1258,6 @@ __host__ void int roff = fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -584,8 +1269,6 @@ __host__ void roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = helperGetGenericTensorAccessorWO(fused->output_data_types[i], regions[i + roff], @@ -610,20 +1293,15 @@ __host__ void int ioff = 0, woff = 0, ooff = 0; for (int op = 0; op < fused->numOperators; op++) { - // Domain my_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS]; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorW my_output_accessor[MAX_NUM_OUTPUTS]; for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; assert(my_off < fused->numInputs); my_input_accessor[i] = input_accessor[my_off]; } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; assert(my_off < fused->numOutputs); my_input_accessor[i] = output_accessor[my_off]; } else { @@ -632,8 +1310,6 @@ __host__ void } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; assert(fused->op_weight_idx[i + woff] < fused->numWeights); my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; } @@ -641,8 +1317,6 @@ __host__ void int my_off = fused->op_output_idx[i + ooff]; assert(fused->op_output_source[i + ooff] == SOURCE_OUTPUT); assert(my_off < fused->numOutputs); - // my_od[i] = output_domain[fused->op_output_idx[i + ooff]]; - // my_op[i] = output_ptr[fused->op_output_idx[i + ooff]]; my_output_accessor[i] = output_accessor[my_off]; } switch (fused->op_op_type[op]) { @@ -658,6 +1332,21 @@ __host__ void m->legion_axis); break; } + case OP_CONV2D: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain.get_dim() == 5); + assert(my_weight_accessor[0].domain.get_dim() == 5); + assert(my_output_accessor[0].domain.get_dim() == 5); + Conv2DMeta *m = (Conv2DMeta *)metas->meta[op]; + Kernels::Conv2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + my_weight_accessor[1].get_float_ptr()); + break; + } case OP_BATCHNORM: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_outputs[op] == 1); @@ -673,6 +1362,16 @@ __host__ void my_weight_accessor[1].get_float_ptr()); break; } + case OP_DROPOUT: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_outputs[op] == 1); + DropoutMeta *m = (DropoutMeta *)metas->meta[op]; + Kernels::Dropout::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); + break; + } case OP_LINEAR: { assert(fused->op_num_inputs[op] 
== 1); assert(fused->op_num_outputs[op] == 1); @@ -683,27 +1382,25 @@ __host__ void assert(my_output_accessor[0].domain.get_volume() == out_dim * batch_size); assert(my_input_accessor[0].domain.get_volume() == in_dim * batch_size); - void const *bias_ptr = nullptr; + float const *bias_ptr = nullptr; LinearMeta *m = (LinearMeta *)metas->meta[op]; if (fused->op_num_weights[op] == 2) { assert(my_weight_accessor[1].domain.get_volume() == out_dim); if (!m->add_bias_only_once || task->index_point.point_data[0] == 0) { - bias_ptr = my_weight_accessor[1].ptr; + bias_ptr = my_weight_accessor[1].get_float_ptr(); } } else { assert(fused->op_num_weights[op] == 1); } - assert(m->input_type[0] == my_input_accessor[0].data_type); - assert(m->input_type[0] == my_output_accessor[0].data_type); - batch_size = bc->num_active_tokens(); - Kernels::Linear::forward_kernel_wrapper(m, - my_input_accessor[0].ptr, - my_output_accessor[0].ptr, - my_weight_accessor[0].ptr, - bias_ptr, - in_dim, - out_dim, - batch_size); + Kernels::Linear::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_weight_accessor[0].get_float_ptr(), + bias_ptr, + in_dim, + out_dim, + batch_size); break; } case OP_BATCHMATMUL: { @@ -831,126 +1528,78 @@ __host__ void case OP_RELU: case OP_SIGMOID: case OP_TANH: - case OP_ELU: - case OP_SCALAR_TRUE_DIV: { + case OP_ELU: { + assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); + assert(fused->op_num_outputs[op] == 1); + assert(my_input_accessor[0].domain == my_output_accessor[0].domain); + ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; + ElementUnary::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); + break; + } + case OP_POOL2D: { assert(fused->op_num_inputs[op] == 1); assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain == my_output_accessor[0].domain); - ElementUnaryMeta *m = (ElementUnaryMeta *)metas->meta[op]; - if (m->data_type == DT_HALF) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr(), - my_input_accessor[0].domain.get_volume()); - } else if (m->data_type == DT_FLOAT) { - ElementUnary::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr(), - my_input_accessor[0].domain.get_volume()); - } else { - assert(false && "Unsupported data type in ElementUnary forward"); - } + Pool2DMeta *m = (Pool2DMeta *)metas->meta[op]; + Kernels::Pool2D::forward_kernel_wrapper( + m, + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr()); break; } - case OP_RMS_NORM: { + case OP_FLAT: { assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - RMSNormMeta const *m = (RMSNormMeta *)metas->meta[op]; - Kernels::RMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0]); - break; - } - case OP_RESIDUAL_RMS_NORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_weights[op] == 1); - assert(fused->op_num_outputs[op] == 2); - ResidualRMSNormMeta const *m = (ResidualRMSNormMeta *)metas->meta[op]; - Kernels::ResidualRMSNorm::forward_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_weight_accessor[0], - 
my_output_accessor[0], - my_output_accessor[1]); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Flat::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_SOFTMAX: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - IncMultiHeadSelfAttentionMeta const *m = - (IncMultiHeadSelfAttentionMeta *)metas->meta[op]; - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - IncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; + Kernels::Softmax::forward_kernel_wrapper( + m, my_input_accessor[0], my_output_accessor[0]); break; } - case OP_TREE_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_RESHAPE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - TreeIncMultiHeadSelfAttentionMeta *m = - (TreeIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // TreeVerifyBatchConfig const *tree_bc = - // (TreeVerifyBatchConfig *)task->args; - TreeVerifyBatchConfig const &tree_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - TreeIncMultiHeadSelfAttention::inference_kernel_wrapper( - m, - &tree_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + Kernels::Reshape::forward_kernel_wrapper( + my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain.get_volume()); break; } - case OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION: { + case OP_TRANSPOSE: { assert(fused->op_num_inputs[op] == 1); + assert(fused->op_num_weights[op] == 0); assert(fused->op_num_outputs[op] == 1); - SpecIncMultiHeadSelfAttentionMeta const *m = - (SpecIncMultiHeadSelfAttentionMeta *)metas->meta[op]; - // BeamSearchBatchConfig const *beam_bc = - // (BeamSearchBatchConfig *)task->args; - BeamSearchBatchConfig const &beam_bc = - Future(task->futures[0]).get_result(); - assert(fused->op_num_weights[op] == - (1 + (int)(*m->qkv_bias || *m->final_bias))); - GenericTensorAccessorR biases; - if (*m->qkv_bias || *m->final_bias) { - assert(fused->op_num_weights[op] == 2); - biases = my_weight_accessor[1]; - } - SpecIncMultiHeadSelfAttention::inference_kernel_wrapper( + assert(my_input_accessor[0].domain.get_volume() == + my_output_accessor[0].domain.get_volume()); + TransposeMeta *m = (TransposeMeta *)metas->meta[op]; + Kernels::Transpose::forward_kernel_wrapper( m, - &beam_bc, - task->index_point.point_data[0], - my_input_accessor[0], - my_weight_accessor[0], - my_output_accessor[0], - biases); + 
my_input_accessor[0].get_float_ptr(), + my_output_accessor[0].get_float_ptr(), + my_input_accessor[0].domain, + my_output_accessor[0].domain); break; } case OP_LAYERNORM: { @@ -972,119 +1621,23 @@ __host__ void break; } case OP_RESIDUAL_LAYERNORM: { - assert(fused->op_num_outputs[op] == 2); - ResidualLayerNormMeta const *m = - (ResidualLayerNormMeta *)metas->meta[op]; - if (m->use_two_residuals) { - assert(fused->op_num_inputs[op] == 3); - } else { - assert(fused->op_num_inputs[op] == 2); - } - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 0); - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 1); // weight - } else { - assert(fused->op_num_weights[op] == 2); // weight + bias - } - } - GenericTensorAccessorR residual2; - if (m->use_two_residuals) { - residual2 = my_input_accessor[2]; - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[0]; - if (m->use_bias) { - beta = my_weight_accessor[1]; - } - } - ResidualLayerNorm::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - residual2, - my_output_accessor[0], - my_output_accessor[1], - gamma, - beta); + assert(false && "Operator ResidualLayerNorm does not support " + "the forward() task"); break; } case OP_ADD_BIAS_RESIDUAL_LAYERNORM: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 2); - AddBiasResidualLayerNormMeta const *m = - (AddBiasResidualLayerNormMeta *)metas->meta[op]; - if (!m->elementwise_affine) { - assert(fused->op_num_weights[op] == 1); // attn bias - } else { - if (!m->use_bias) { - assert(fused->op_num_weights[op] == 2); // attn bias + weight - } else { - assert(fused->op_num_weights[op] == 3); // attn bias + weight + bias - } - } - GenericTensorAccessorR gamma, beta; - if (m->elementwise_affine) { - gamma = my_weight_accessor[1]; - if (m->use_bias) { - beta = my_weight_accessor[2]; - } - } - Domain attn_bias_domain = my_weight_accessor[0].domain; - Domain residual_domain = my_input_accessor[1].domain; - int attn_bias_dim = - attn_bias_domain.hi()[0] - attn_bias_domain.lo()[0] + 1; - int residual_volume = residual_domain.get_volume(); - AddBiasResidualLayerNorm::inference_kernel_wrapper( - m, - attn_bias_dim, - residual_volume, - my_input_accessor[0], - my_output_accessor[0], - my_output_accessor[1], - my_input_accessor[1], - my_weight_accessor[0], - gamma, - beta); + assert(false && "Operator AddBiasResidualLayerNorm does not support " + "the forward() task"); break; } case OP_SIGMOID_SILU_MULTI: { - assert(fused->op_num_inputs[op] == 2); - assert(fused->op_num_outputs[op] == 1); - SigmoidSiluMultiMeta const *m = (SigmoidSiluMultiMeta *)metas->meta[op]; - SigmoidSiluMulti::inference_kernel_wrapper(m, - my_input_accessor[0], - my_input_accessor[1], - my_output_accessor[0]); - break; - } - case OP_SOFTMAX: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_weights[op] == 0); - assert(fused->op_num_outputs[op] == 1); - assert(my_input_accessor[0].domain.get_volume() == - my_output_accessor[0].domain.get_volume()); - SoftmaxMeta *m = (SoftmaxMeta *)metas->meta[op]; - if (m->input_type == DT_HALF) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_half_ptr(), - my_output_accessor[0].get_half_ptr()); - } else if (m->input_type == DT_FLOAT) { - Kernels::Softmax::forward_kernel_wrapper( - m, - my_input_accessor[0].get_float_ptr(), - my_output_accessor[0].get_float_ptr()); - } + assert(false && "Operator SigmoidSiluMulti does not support " + "the 
forward() task"); break; } - case OP_ALLREDUCE: { - assert(fused->op_num_inputs[op] == 1); - assert(fused->op_num_outputs[op] == 1); - AllReduceMeta const *m = (AllReduceMeta *)metas->meta[op]; - Kernels::AllReduce::inference_kernel_wrapper( - m, bc, my_input_accessor[0], my_output_accessor[0]); + case OP_RESIDUAL_RMS_NORM: { + assert(false && "Operator ResidualRMSNorm does not support " + "the forward() task"); break; } default: { @@ -1094,37 +1647,6 @@ __host__ void assert(false && "Fusion currently does not support type"); } } - if (metas->meta[op]->inference_debugging) { - std::vector input_accessors_to_save; - std::vector weight_accessors_to_save; - std::vector output_accessors_to_save; - for (int i = 0; i < fused->op_num_inputs[op]; i++) { - int my_off = fused->op_input_idx[i + ioff]; - if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - input_accessors_to_save.push_back(input_accessor[my_off]); - } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - input_accessors_to_save.push_back(output_accessor[my_off]); - } else { - assert(false); - } - } - for (int i = 0; i < fused->op_num_weights[op]; i++) { - assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - weight_accessors_to_save.push_back( - weight_accessor[fused->op_weight_idx[i + woff]]); - } - for (int i = 0; i < fused->op_num_outputs[op]; i++) { - output_accessors_to_save.push_back(output_accessor[i + ooff]); - } - assert(task->index_point.get_dim() == 1); - int shard_id = task->index_point.point_data[0]; - FusedOp::save_inference_tensors_to_file(metas->meta[op], - shard_id, - bc, - input_accessors_to_save, - weight_accessors_to_save, - output_accessors_to_save); - } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; ooff += fused->op_num_outputs[op]; @@ -1156,9 +1678,6 @@ __host__ void FusedOp::backward_task(Task const *task, int sum = fused->numInputs + fused->numWeights + fused->numOutputs; assert(sum * 2 == (int)regions.size()); } - // Domain input_domain[MAX_NUM_INPUTS], input_grad_domain[MAX_NUM_INPUTS]; - // Domain weight_domain[MAX_NUM_WEIGHTS], weight_grad_domain[MAX_NUM_WEIGHTS]; - // Domain output_domain[MAX_NUM_OUTPUTS], output_grad_domain[MAX_NUM_OUTPUTS]; GenericTensorAccessorR input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorW input_grad_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR weight_accessor[MAX_NUM_WEIGHTS]; @@ -1168,8 +1687,6 @@ __host__ void FusedOp::backward_task(Task const *task, int roff = 0; assert(fused->numInputs <= MAX_NUM_INPUTS); for (int i = 0; i < fused->numInputs; i++) { - // input_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i].region.get_index_space()); input_accessor[i] = helperGetGenericTensorAccessorRO(fused->input_data_types[i], regions[i], @@ -1181,8 +1698,6 @@ __host__ void FusedOp::backward_task(Task const *task, roff += fused->numInputs; assert(fused->numWeights <= MAX_NUM_WEIGHTS); for (int i = 0; i < fused->numWeights; i++) { - // weight_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_accessor[i] = helperGetGenericTensorAccessorRO(fused->weight_data_types[i], regions[i + roff], @@ -1194,8 +1709,6 @@ __host__ void FusedOp::backward_task(Task const *task, roff += fused->numWeights; assert(fused->numOutputs <= MAX_NUM_OUTPUTS); for (int i = 0; i < fused->numOutputs; i++) { - // output_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_accessor[i] = 
helperGetGenericTensorAccessorRO(fused->output_data_types[i], regions[i + roff], @@ -1206,8 +1719,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numOutputs; for (int i = 0; i < fused->numInputs; i++) { - // input_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); input_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->input_data_types[i], regions[i + roff], @@ -1219,8 +1730,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numInputs; for (int i = 0; i < fused->numWeights; i++) { - // weight_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); weight_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->weight_data_types[i], regions[i + roff], @@ -1233,8 +1742,6 @@ __host__ void FusedOp::backward_task(Task const *task, } roff += fused->numWeights; for (int i = 0; i < fused->numOutputs; i++) { - // output_grad_domain[i] = runtime->get_index_space_domain( - // ctx, task->regions[i + roff].region.get_index_space()); output_grad_accessor[i] = helperGetGenericTensorAccessorRW(fused->output_data_types[i], regions[i + roff], @@ -1260,9 +1767,6 @@ __host__ void FusedOp::backward_task(Task const *task, } int ioff = 0, woff = 0, ooff = 0; - // Domain my_id[MAX_NUM_INPUTS], my_grad_id[MAX_NUM_INPUTS]; - // Domain my_wd[MAX_NUM_WEIGHTS], my_grad_wd[MAX_NUM_WEIGHTS]; - // Domain my_od[MAX_NUM_OUTPUTS], my_grad_od[MAX_NUM_OUTPUTS]; GenericTensorAccessorR my_input_accessor[MAX_NUM_INPUTS]; GenericTensorAccessorR my_weight_accessor[MAX_NUM_WEIGHTS]; GenericTensorAccessorR my_output_accessor[MAX_NUM_OUTPUTS]; @@ -1283,19 +1787,11 @@ __host__ void FusedOp::backward_task(Task const *task, for (int i = 0; i < fused->op_num_inputs[op]; i++) { int my_off = fused->op_input_idx[i + ioff]; if (fused->op_input_source[i + ioff] == SOURCE_INPUT) { - // my_id[i] = input_domain[my_off]; - // my_ip[i] = input_ptr[my_off]; my_input_accessor[i] = input_accessor[my_off]; - // my_grad_id[i] = input_grad_domain[my_off]; - // my_grad_ip[i] = input_grad_ptr[my_off]; my_input_grad_accessor[i] = input_grad_accessor[my_off]; assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else if (fused->op_input_source[i + ioff] == SOURCE_OUTPUT) { - // my_id[i] = output_domain[my_off]; - // my_ip[i] = output_ptr[my_off]; my_input_accessor[i] = output_accessor[my_off]; - // my_grad_id[i] = output_grad_domain[my_off]; - // my_grad_ip[i] = output_grad_ptr[my_off]; my_input_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_input_grad_accessor[i].domain == my_input_accessor[i].domain); } else { @@ -1304,11 +1800,7 @@ __host__ void FusedOp::backward_task(Task const *task, } for (int i = 0; i < fused->op_num_weights[op]; i++) { assert(fused->op_weight_source[i + woff] == SOURCE_WEIGHT); - // my_wd[i] = weight_domain[fused->op_weight_idx[i + woff]]; - // my_wp[i] = weight_ptr[fused->op_weight_idx[i + woff]]; my_weight_accessor[i] = weight_accessor[fused->op_weight_idx[i + woff]]; - // my_grad_wd[i] = weight_grad_domain[fused->op_weight_idx[i + woff]]; - // my_grad_wp[i] = weight_grad_ptr[fused->op_weight_idx[i + woff]]; my_weight_grad_accessor[i] = weight_grad_accessor[fused->op_weight_idx[i + woff]]; assert(my_weight_grad_accessor[i].domain.get_volume() == @@ -1317,11 +1809,7 @@ __host__ void FusedOp::backward_task(Task const *task, for (int i = 0; i < fused->op_num_outputs[op]; i++) { assert(fused->op_output_source[i + ooff] == 
SOURCE_OUTPUT); int my_off = fused->op_output_idx[i + ooff]; - // my_od[i] = output_domain[my_off]; - // my_op[i] = output_ptr[my_off]; my_output_accessor[i] = output_accessor[my_off]; - // my_grad_od[i] = output_grad_domain[my_off]; - // my_grad_op[i] = output_grad_ptr[my_off]; my_output_grad_accessor[i] = output_grad_accessor[my_off]; assert(my_output_grad_accessor[i].domain == my_output_accessor[i].domain); } diff --git a/src/ops/group_by.cc b/src/ops/group_by.cc index f2f402737c..03b9a5199b 100644 --- a/src/ops/group_by.cc +++ b/src/ops/group_by.cc @@ -99,7 +99,7 @@ Group_byParams Group_by::get_params() const { Group_byParams params; params.n = this->n; params.alpha = this->alpha; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -271,7 +271,7 @@ OpMeta *Group_by::init_task(Task const *task, Runtime *runtime) { Group_by *gb = (Group_by *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - GroupByMeta *m = new GroupByMeta(handle, gb->n, gb->alpha); + GroupByMeta *m = new GroupByMeta(handle, gb); m->profiling = gb->profiling; m->inference_debugging = gb->inference_debugging; std::strcpy(m->op_name, gb->name); @@ -579,7 +579,7 @@ bool Group_by::measure_operator_cost(Simulator *sim, } } - GroupByMeta *m = new GroupByMeta(sim->handler, n, alpha); + GroupByMeta *m = new GroupByMeta(sim->handler, this); // allocate sim->free_all(); diff --git a/src/ops/group_by.cpp b/src/ops/group_by.cpp index 761c35f182..9ca6f77898 100644 --- a/src/ops/group_by.cpp +++ b/src/ops/group_by.cpp @@ -188,9 +188,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, data_dim); } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(hipMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(hipMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(hipFree(&dev_region_ptrs)); diff --git a/src/ops/group_by.cu b/src/ops/group_by.cu index 0ed09e20b3..43bcb900df 100644 --- a/src/ops/group_by.cu +++ b/src/ops/group_by.cu @@ -198,9 +198,9 @@ void Group_by::backward_kernel_wrapper(GroupByMeta const *m, } } -GroupByMeta::GroupByMeta(FFHandler handler, int n, float _alpha) - : OpMeta(handler), alpha(_alpha) { - checkCUDA(cudaMalloc(&dev_region_ptrs, n * sizeof(float *))); +GroupByMeta::GroupByMeta(FFHandler handler, Group_by const *gb) + : OpMeta(handler, gb), alpha(gb->alpha) { + checkCUDA(cudaMalloc(&dev_region_ptrs, gb->n * sizeof(float *))); } GroupByMeta::~GroupByMeta(void) { checkCUDA(cudaFree(&dev_region_ptrs)); diff --git a/src/ops/inc_multihead_self_attention.cc b/src/ops/inc_multihead_self_attention.cc index aa60d0f19c..8219cf9e1f 100644 --- a/src/ops/inc_multihead_self_attention.cc +++ b/src/ops/inc_multihead_self_attention.cc @@ -363,7 +363,9 @@ IncMultiHeadSelfAttention::IncMultiHeadSelfAttention( dims, quantization_type == DT_NONE ? this->data_type : quantization_type, nullptr /*owner_op*/, - true /*create_grad*/, + model.config.computationMode == COMP_MODE_INFERENCE + ? 
false + : true /*create_grad*/, initializer, CHOSEN_SYNC_TYPE); if (qkv_bias || final_bias) { @@ -871,6 +873,139 @@ void IncMultiHeadSelfAttention::inference_task( } } +FutureMap IncMultiHeadSelfAttention::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + int idx = 0; + IndexLauncher launcher(INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(idx++, FID_DATA); + if (qkv_bias || final_bias) { + launcher.add_region_requirement( + RegionRequirement(weights[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[1]->region, + ff.cpu_offload ? MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(idx++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): input + regions[3](I): weight + regions[4](O): output +*/ +void IncMultiHeadSelfAttention::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + log_inc_mha.debug("BatchConfig, num_tokens: %d, num_requests: %d", + bc->num_tokens, + bc->num_active_requests()); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + IncMultiHeadSelfAttentionMeta *m = + *((IncMultiHeadSelfAttentionMeta **)task->local_args); + + assert(((*m->qkv_bias || *m->final_bias) ? 
regions.size() == 4 + : regions.size() == 3)); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR biases; + if (*m->qkv_bias || *m->final_bias) { + biases = helperGetGenericTensorAccessorRO(m->weight_type[1], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + Domain bias_domain = runtime->get_index_space_domain( + ctx, task->regions[3].region.get_index_space()); + assert(bias_domain.get_dim() == 4); + } + + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain weight_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(input_grad_domain.get_dim() == 4); + assert(weight_domain.get_dim() == 2); + assert(output_grad_domain.get_dim() == 4); + + assert(task->index_point.get_dim() == 1); + + IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + m, + bc, + task->index_point.point_data[0], + input_grad, + weight, + output_grad, + biases); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + IncMultiHeadSelfAttention::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } +} + void IncMultiHeadSelfAttention::backward(FFModel const &ff) { // IncMultiHeadSelfAttention does not support backward assert(false); @@ -926,7 +1061,7 @@ IncMultiHeadSelfAttentionParams IncMultiHeadSelfAttention::get_params() const { params.quantization_type = this->quantization_type; params.offload = this->offload; params.num_kv_heads = this->num_kv_heads; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/inc_multihead_self_attention.cpp b/src/ops/inc_multihead_self_attention.cpp index d60386f927..826fea4347 100644 --- a/src/ops/inc_multihead_self_attention.cpp +++ b/src/ops/inc_multihead_self_attention.cpp @@ -12,13 +12,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" #include "flexflow/utils/hip_helper.h" -#include +#include "hip/hip_complex.h" #include namespace FlexFlow { @@ -27,9 +27,288 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + namespace Kernels { namespace IncMultiHeadAttention { +template +__device__ __forceinline__ T + WARP_SHFL(unsigned mask, T var, int srcLane, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T + WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + +// gridDim = num_heads +// blockDim = num_tokens/num_request * head_size +// QKV tensor layout: |QKV| * num_new_tokens. |Q=K=V=head_size * num_heads| +// one thread process one head_size +template +__global__ void compute_attention_kernel_generation_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int max_seq_length, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + // eg. if head_size = 128, thread_per_key = 4, with float32 precision + // then K_VEC_SIZE = 1, QK_VEC_SIZE = 4 + // K_ELTS_PER_THREAD = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 / 1 = 32 + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + // constexpr int QK_VEC_SIZE = 16 / sizeof(DT); + // // constexpr int QK_VEC_SIZE = sizeof(Qk_vec_k) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + request_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + // DT const *q_ptr = + // query + request_idx * Dh * QKV_WEIGHT_NUM + head_idx * per_head_size; + + // q tensor in this thread + // if THREADS_PER_KEY is 4, first thread load 0, 4, 8, 12..., total + // K_VECS_PER_THREAD elements + // QK_vec_k: 32->1, 64->2, 128->4... head_size + // K_vec_k: 4->1, 2->2, 1->4 threads_per_key + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + ki + ii * THREADS_PER_KEY * K_VEC_SIZE); + } + __syncthreads(); + // first iter = 128 / 4 = 32 + // K_VECS_PER_THREAD = 32 + // K_PER_ITER how many keys in this loop + // The number of timesteps loaded per iteration. + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + // get k, perform qk proj + + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast(k_cache_batch + + ti_circ * hidden_size + + head_idx * per_head_size + jj); + } + // Compute dot product. + // This includes a reduction across the threads in the same thread group. 
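The tiling comment in compute_attention_kernel_generation_kernel above works out to the following constants for its own example (head size 128, four threads per key, single-float key vectors). This plain C++ sketch just redoes that arithmetic and shows which key offsets one thread touches; the numbers are the example's, not a general rule:

#include <cstdio>

int main() {
  constexpr int Dh = 128;             // per-head size from the comment's example
  constexpr int THREADS_PER_KEY = 4;  // threads cooperating on one key
  constexpr int K_VEC_SIZE = 1;       // one float per vector load in that example
  constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY;           // 32
  constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; // 32
  // a given thread starts at ki and strides by THREADS_PER_KEY * K_VEC_SIZE
  int tidx = 5;
  int ki = (tidx % THREADS_PER_KEY) * K_VEC_SIZE; // 1
  int first_three[3];
  for (int ii = 0; ii < 3; ii++) {
    first_three[ii] = ki + ii * THREADS_PER_KEY * K_VEC_SIZE; // 1, 5, 9
  }
  std::printf("%d %d: %d %d %d\n",
              K_ELTS_PER_THREAD, K_VECS_PER_THREAD,
              first_three[0], first_three[1], first_three[2]);
  return 0;
}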
+ } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + // // todo add positional embedding to the qk production + // // Store the product to shared memory. There's one qk value per + // timestep. + // // Update the max. + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + // todo add alobi here + bool const mask = ti_circ >= tlength; + if (mask) { + assert(false); + } + qk_max = mask ? qk_max : fmaxf(qk_max, qk); + qk_smem[ti - first_step] = mask ? 0.f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = WARP_SHFL(uint32_t(-1), qk_max, 0); + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + float logit = __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0) { + // printf("softmax %.10f\n", qk_smem[0]); + // } + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + float logit = qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. 
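The qk_max reduction above is essentially a two-level block max: a butterfly shuffle inside each warp, then the per-warp leaders are combined through shared memory. A self-contained CUDA sketch of that general pattern (plain __shfl_xor_sync rather than the HIP/CUDA portability wrappers used in this file, and not the FlexFlow kernel itself):

#include <cuda_runtime.h>
#include <cfloat>
#include <cstdio>

__global__ void block_max(float const *in, float *out, int n) {
  __shared__ float red_smem[32];              // one slot per warp
  int tid = threadIdx.x;
  float v = (tid < n) ? in[tid] : -FLT_MAX;
  for (int mask = 16; mask >= 1; mask /= 2) { // intra-warp butterfly max
    v = fmaxf(v, __shfl_xor_sync(0xffffffffu, v, mask));
  }
  if (tid % 32 == 0) {
    red_smem[tid / 32] = v;                   // each warp leader publishes its max
  }
  __syncthreads();
  if (tid < 32) {                             // first warp reduces the leaders
    int num_warps = (blockDim.x + 31) / 32;
    v = (tid < num_warps) ? red_smem[tid] : -FLT_MAX;
    for (int mask = 16; mask >= 1; mask /= 2) {
      v = fmaxf(v, __shfl_xor_sync(0xffffffffu, v, mask));
    }
    if (tid == 0) {
      *out = v;
    }
  }
}

int main() {
  float h_in[256], h_out = 0.0f;
  for (int i = 0; i < 256; i++) {
    h_in[i] = (float)(i % 97);
  }
  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  block_max<<<1, 256>>>(d_in, d_out, 256);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("block max = %f\n", h_out); // expect 96
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}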
+ if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float( + *reinterpret_cast(output_ptr + request_idx * hidden_size + + head_idx * per_head_size + vi), + out); + } +} + // only used by MPT model. https://arxiv.org/abs/2108.12409 template __global__ void apply_position_bias_qkprd(DT *input_ptr, @@ -86,8 +365,10 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, // int qkv_index = i / (num_tokens * qProjSize) % 3; int token_idx = i / (hidden_size * QKV_WEIGHT_NUM); - size_t in_token_idx = i - token_idx * hidden_size * 3; + size_t in_token_idx = i - token_idx * hidden_size * QKV_WEIGHT_NUM; + int qkv_index = in_token_idx / hidden_size; + int proj_size = qkv_index == 0 ? qProjSize : kProjSize; int head_idx = @@ -109,6 +390,7 @@ __global__ void apply_proj_bias_qkv(DT *input_ptr, } } } + template __global__ void scaling_query_kernel(DT *input_ptr, int qProjSize, @@ -158,6 +440,10 @@ __global__ void int token_idx = (real_i - head_idx * (num_tokens * proj_size / 2)) / (proj_size / 2); size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + // float before_real = complex_input[i].x, before_complex = + // complex_input[i].y; + int pos_i = real_i % (proj_size / 2); float freq = pos * (1.0 / pow(10000.0, (float)2 * pos_i / proj_size)); hipFloatComplex complex_pos = {cos(freq), sin(freq)}; @@ -189,7 +475,7 @@ __global__ void int head_idx = (real_i - (token_idx * (hidden_size / 2))) / (proj_size / 2); int real_part_index = idx + head_idx * proj_size + - token_idx * hidden_size * 3 + + token_idx * hidden_size * QKV_WEIGHT_NUM + hidden_size * (q_tensor ? 0 : 1); int complex_part_index = real_part_index + (proj_size / 2); @@ -217,28 +503,59 @@ __global__ void } template -__global__ void store_kv_cache(DT const *devQKVProjArray, - DT *kCache_ptr, - DT *vCache_ptr, +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + hipFloatComplex *complex_input, BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, int num_tokens, - int max_seq_len, int hidden_size) { CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { - int token_idx = i / hidden_size; - int offset = i % hidden_size; - size_t val_idx = token_idx * 3 * hidden_size + hidden_size + offset; - DT kVal = devQKVProjArray[val_idx]; - DT vVal = devQKVProjArray[val_idx + hidden_size]; + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); - int const req_id = tokenInfos[token_idx].request_index; - int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + int complex_part_index = (q_tensor ? 
0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; - // key cache - kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = kVal; - vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + - offset] = vVal; + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + hipFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = hipCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; } } @@ -254,56 +571,68 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - DT alpha = 1.0f, beta = 0.0f; assert(m->qSize == m->vSize && m->qSize == m->kSize); - hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif - // Compute (W^T)x matmul: einsum(ijkl,im->jmkl) - // Weights: qSize x qProjSize x 3 x num_q_heads - // Input: qSize x num_tokens - // Output >>> qProjSize x num_tokens x 3 x num_q_heads - int m_q = m->qProjSize * m->num_q_heads; - int m_k = m->kProjSize * m->num_q_heads; - int m_v = m->vProjSize * m->num_q_heads; - assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); - int k = m->qSize; - int m_ = m_q * QKV_WEIGHT_NUM; - int lda = k, ldb = k, ldc = m_; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - weight_ptr, - hipblas_data_type, - lda, - input_ptr, - hipblas_data_type, - ldb, - &beta, - output_ptr, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - // apply rotary emmmbedding for q and k - // step1 change the k, v to complex tensor + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + // Step 1: Compute QKV projections + { + DT alpha = 1.0f, beta = 
0.0f; + // after transpositions + int m_q = m->qProjSize * m->num_q_heads; + int m_k = m->kProjSize * m->num_q_heads; + int m_v = m->vProjSize * m->num_q_heads; + assert(m_q == m_k && m_k == m_v); // keep things simple for now + int n = bc->num_active_infr_tokens(); + int k = m->qSize; + int m_ = m_q * QKV_WEIGHT_NUM; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: QKV weights + // matrix A's layout: [qSize (hidden_dim), qProjSize, num_heads, 3] + // matrix B: input + // matrix B's layout: [qSize (hidden_dim), num_new_tokens] + // matrix C: devQKVProjArray + // matrix B's layout: [qProjSize, num_heads, 3, num_new_tokens] + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + weight_ptr, + cublas_data_type, + lda, + input_ptr, + cublas_data_type, + ldb, + &beta, + output_ptr, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + int num_tokens = bc->num_active_tokens(); int parallelism = m->kProjSize * num_tokens * m->num_q_heads; size_t q_array_size = m->qProjSize * num_tokens * m->num_q_heads; - // apply bias for q, k, v + + // Step 2: apply bias for QKV, or scale the query if (*m->qkv_bias) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv
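The Step 1 GEMM above relies on column-major BLAS conventions: with op(A) = A^T, A is k x m_ (the packed QKV weights), B is k x n (the input activations) and C is m_ x n (devQKVProjArray), where k = qSize, m_ = qProjSize * num_heads * 3 and n is the token count. A small CPU-only sketch of that shape logic with hypothetical sizes, no BLAS library involved:

#include <cstdio>
#include <vector>

int main() {
  // hypothetical sizes, small enough to read
  int qSize = 8, qProjSize = 4, num_heads = 2, num_tokens = 3;
  int k = qSize;                       // reduction dimension
  int m_ = qProjSize * num_heads * 3;  // packed Q, K and V output rows
  int n = num_tokens;
  // column-major storage, matching lda = ldb = k and ldc = m_
  std::vector<float> A(k * m_, 0.5f);  // QKV weights
  std::vector<float> B(k * n, 1.0f);   // input activations
  std::vector<float> C(m_ * n, 0.0f);  // stands in for devQKVProjArray
  for (int col = 0; col < n; col++) {
    for (int row = 0; row < m_; row++) {
      float acc = 0.0f;
      for (int kk = 0; kk < k; kk++) {
        acc += A[row * k + kk] * B[col * k + kk]; // (A^T)(row,kk) * B(kk,col)
      }
      C[col * m_ + row] = acc;
    }
  }
  std::printf("C is %d x %d, C(0,0) = %f\n", m_, n, C[0]); // 24 x 3, 4.0
  return 0;
}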
<DT>), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_qkv<DT>), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -321,7 +650,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->scaling_factor, m->hidden_size); } else if (m->scaling_query) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel<DT>
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(scaling_query_kernel<DT>), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -333,10 +662,12 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, m->scaling_factor, m->hidden_size); } + + // Step 3: apply rotary embedding if needed if (*m->apply_rotary_embedding) { /*q&k*/ parallelism = num_tokens * m->hidden_size; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf<DT>
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_hf), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -352,14 +683,42 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +__global__ void store_kv_cache(DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + BatchConfig::PerTokenInfo const *tokenInfos, + int num_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + int const req_id = tokenInfos[token_idx].request_index; + int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + // key cache + kCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + tok_id * hidden_size + + offset] = vVal; + } +} + template void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = m->hidden_size * num_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_kv_cache), GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, @@ -374,6 +733,129 @@ void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, } } +template +void compute_o_prod_bias(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *output_ptr, + DT const *weight_ptr, + DT const *bias_ptr, + int num_tokens, + hipStream_t stream) { + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); +#if CUDA_VERSION >= 11000 + // TODO: currently set the default to CUBLAS_COMPUTE_16F for best performance + hipblasDatatype_t compute_type = HIPBLAS_R_16F; +#else + hipblasDatatype_t compute_type = cublas_data_type; +#endif + // Project to output, save result directly on output tensor + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = num_tokens; + // before transpositions + int lda = k, ldb = k, ldc = m_; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: attn heads + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
<DT *>(m->attn_heads); + // matrix C: output + // matrix C's layout: [oProjSize, num_new_tokens] + DT *C = static_cast<DT *>
(output_ptr); + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Add final output bias + if (*m->final_bias && shard_id == 0) { + int parallelism = m->oProjSize * num_tokens; + int qkv_weight_size = m->qProjSize * m->global_num_q_heads + + m->kProjSize * m->global_num_q_heads + + m->vProjSize * m->global_num_q_heads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + output_ptr, + bias_ptr, + num_tokens, + qkv_weight_size, + m->oProjSize); + } +} + +#define LAUNCH_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_sz = smem_size_in_bytes
<DT>(m->qProjSize, \ + BatchConfig::max_sequence_length(), \ + THREADS_PER_VALUE, \ + THDS_PER_BLOCK); \ + compute_attention_kernel_generation_kernel<DT, THDS_PER_BLOCK, Dh, Dh_MAX, THDS_PER_KEY, THREADS_PER_VALUE> \ + <<<grid, THDS_PER_BLOCK, smem_sz, stream>>>( \ + static_cast<DT *>
(m->devQKVProjArray), \ + static_cast<DT *>
(m->keyCache), \ + static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos) + +template +void compute_attention_kernel_generation(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + DT *output_ptr, + hipStream_t stream) { + dim3 grid(m->num_q_heads, bc->num_generation_tokens); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + size_t smem_sz; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } +} + template void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, GenericTensorAccessorR const weight, @@ -393,27 +875,29 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, if (m->quantization_type == DT_INT4) { int parallelism = m->qProjSize * m->qSize * m->num_q_heads / 2; - decompress_int4_attention_weights<<>>( - m->quantized_weight_ptr, - static_cast
<DT *>(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int4_attention_weights<DT>), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->quantized_weight_ptr, + static_cast<DT *>
(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_q_heads); } else { assert(m->quantization_type == DT_INT8); int parallelism = m->qProjSize * m->qSize * m->num_q_heads; - decompress_int8_attention_weights<DT><<<GET_BLOCKS(parallelism), min(CUDA_NUM_THREADS, parallelism), 0, stream>>>( - m->quantized_weight_ptr, - static_cast<DT *>
(m->weight_ptr), - m->qProjSize, - m->qSize, - m->num_q_heads); + hipLaunchKernelGGL(HIP_KERNEL_NAME(decompress_int8_attention_weights<DT>), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + m->quantized_weight_ptr, + static_cast<DT *>
(m->weight_ptr), + m->qProjSize, + m->qSize, + m->num_q_heads); } } else { if (data_type == DT_FLOAT) { @@ -435,7 +919,7 @@ void pre_build_weight_kernel(IncMultiHeadSelfAttentionMeta const *m, } template -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -443,19 +927,13 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, DT *output_ptr, DT const *bias_ptr, hipStream_t stream) { - // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { checkCUDA(hipMemcpyAsync( m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast
<DT *>(m->bias_ptr); } - checkCUDA(hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(BatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); + // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -465,14 +943,520 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, static_cast<DT *>
(m->devQKVProjArray), bias_ptr, stream); - - // phase 2: Update key/val cache update_kv_cache_kernel<DT>
(m, bc, stream); - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + if (bc->num_generation_tokens > 0) { + // phase 3: Compute attention score for generation tokens + compute_attention_kernel_generation
<DT>( + m, bc, static_cast<DT *>
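// The two attention paths dispatched here: generation (decode) tokens each
// carry a single query row and attend to the full KV cache, so they take the
// fused compute_attention_kernel_generation path, launched with one thread
// block per (head, generation token); prompt (prefill) tokens are handled
// afterwards by compute_attention_kernel_prompt, which materializes QK^T with
// batched GEMMs, applies the causal mask and softmax, and multiplies by V.
// Both paths write per-head results into m->attn_heads, and compute_o_prod_bias
// then applies the shared output projection (plus optional bias) for all
// tokens in the batch.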
(m->attn_heads), stream); + } + + if (bc->num_tokens > bc->num_generation_tokens) { + // phase 4: Compute attention score for prompt tokens; + compute_attention_kernel_prompt( + m, bc, shard_id, bias_ptr, weight_ptr, stream); + } + + // compute output production and bias together for all tokens + int num_tokens = bc->num_active_tokens(); + compute_o_prod_bias( + m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); +} + +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *weight_ptr, + DT const *output_grad_ptr, + DT const *bias_ptr, + hipStream_t stream) { + assert(!m->offload); + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + // Step 1: compute gradients before final projection + { + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + int k_ = m->oProjSize; + int lda = m_; + int ldb = k_; + int ldc = m_; + float alpha = 1.0f, beta = 0.0f; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: output gradients + // matrix B's layout: [oProjSize, num_new_tokens] + DT const *B = + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; + // 
matrix C: attn_heads gradients + // matrix C's layout: [vProjSize * num_heads, num_new_tokens] + DT *C = static_cast<DT *>
(m->handle.workSpace); + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; + save_tensor(C, m_ * n_, filename.c_str()); + } + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods_softmax); + // matrix B: attn_heads gradients + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->handle.workSpace); + // matrix C: gradients for value (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients + // after transpositions + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens + // before transpositions + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens + // N.B. strides are applied before transpose operations + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } + } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
<DT *>(m->handle.workSpace); + // matrix B: value cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + DT *C = static_cast<DT *>
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = num_tokens; + int k_ = m->vProjSize; + // before transposition and striding + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // num_new_tokens + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens + + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } + } + // Step 4: softmax backpropagation + { + float alpha = 1.0f, beta = 0.0f; + int n_param = m->num_q_heads; + int c_param = num_tokens; + int h_param = 1; + int w_param = num_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); + checkCUDNN(miopenSoftmaxBackward_V2(m->handle.dnn, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + + // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast
<DT *>(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } + if (m->inference_debugging) { + DT *C = static_cast<DT *>
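// How fill_entries_above_diagonal picks the entries to mask: the
// entries_above_diagonal = n*(n-1)/2 strictly-upper entries of an n x n score
// tile are enumerated by a flat entry_idx, which is decoded with the inverse
// triangular number
//   y = floor((-1 + sqrt(8*entry_idx + 1)) / 2),  x = entry_idx - y*(y+1)/2,
// so 0 <= x <= y; the subsequent y += (num_cols - num_rows) + 1 shifts the
// pattern so that the written cells sit strictly above the causal diagonal
// even when num_rows < num_cols. For example, entry_idx = 4 decodes to
// y = 2, x = 1 before the shift. In this backward-pass use the value written
// is 0, so masked positions contribute nothing to the softmax gradient.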
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + ".qk_prods.softmax_grad_in.masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + } + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods); + // matrix B: query activation (in query_activation_buffer) + // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens] + DT const *B = static_cast<DT *>
(m->query_activation_buffer); + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast<DT *>
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } + } + // Step 6: compute gradients w.r.t query + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
<DT *>(m->qk_prods); + // matrix B: key cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; + // matrix C: gradients for query (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast<DT *>
(m->devQKVProjArray); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = m->qProjSize; + int k_ = num_tokens; + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
<DT *>(m->devQKVProjArray); + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_rotary_embedding_bwd<DT>), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + A, + m->complex_input, + m->token_infos, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast<DT *>
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast<DT *>
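// RoPE backward above, in brief: each (first-half, second-half) channel pair
// of the Q and K gradients is treated as one complex number and multiplied by
// {cos(freq), sin(freq)}, the same angle used by the forward rotation. Because
// apply_rotary_embedding_bwd swaps which half it loads as the real and
// imaginary component relative to the forward kernel, and stores the result
// back into the same swapped slots, the net effect is to rotate the incoming
// gradient by -freq, i.e. to apply the transpose of the forward rotation. The
// kernel also indexes devQKVProjArray in the token-major
// [num_tokens, qProjSize, num_heads, 3] layout used in peft_bwd, unlike the
// forward layout.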
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); + } + } + + // Step 8: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix A: QKV projection weights + // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] + DT const *A = weight_ptr; + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int lda = m_; + int ldb = n_; + int ldc = m_; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } + } + } } } // namespace IncMultiHeadAttention @@ -481,42 +1465,47 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, using namespace Kernels::IncMultiHeadAttention; template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; } } template -void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, - BatchConfig const *bc, - int shard_id, - DT *output_ptr, - DT const *bias_ptr, - DT const *weight_ptr, - hipStream_t stream) { +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + DT const *bias_ptr, + DT const *weight_ptr, + hipStream_t stream) { checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); - hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); - miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); + hipblasDatatype_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + hipblasDatatype_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // hipblasDatatype_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = 
CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -530,64 +1519,102 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i]) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; - // bc->token_last_available_idx[i] + 1; - // Compute (QK^T/sqrt(d_k)) - // a flag of using this scaling alpha - int m_ = num_new_tokens; - int n = total_tokens; - int k = m->qProjSize; - int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, - ldc = m_; - int strideA = q_block_size; - int strideB = kt_block_size; - int strideC = num_new_tokens * total_tokens; - DT alpha = 1.0f, beta = 0.0f; - if (*m->qk_prod_scaling) { - alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } + int parallelism = m->hidden_size * num_tokens; + hipLaunchKernelGGL(HIP_KERNEL_NAME(store_query_cache), + GET_BLOCKS(parallelism), + min(CUDA_NUM_THREADS, parallelism), + 0, + stream, + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); } - // To get A, skip over Q entries from previous requests (same head) - DT const *A = static_cast
(m->devQKVProjArray) + - tokens_previous_requests * m->qProjSize * m->num_q_heads * - QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; - // To get C, skip over QK^T products from previous requests + // Step 1: compute query-key product QK.T/sqrt(d_k) + { + // Scale by sqrt(d_k) as per the original attention paper + DT alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = static_cast
(1.0f / sqrt(m->kProjSize)); + } + // after transpositions + int m_ = num_new_tokens; + int n = total_tokens; + int k = m->qProjSize; + // before transpositions + int lda = k * m->num_q_heads * QKV_WEIGHT_NUM, ldb = k * m->num_q_heads, + ldc = m_; + // N.B. strides are applied before transpose operations + int strideA = q_block_size; + int strideB = kt_block_size; + int strideC = num_new_tokens * total_tokens; + + // matrix A: devQKVProjArray + // matrix A's layout: [qProjSize, num_heads, 3, num_new_tokens] + // To get query projection, skip over Q entries from previous requests + DT const *A = static_cast
<DT *>(m->devQKVProjArray) + + bc->requestsInfo[i].first_token_offset_in_batch * + m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; + // matrix B: key cache + // matrix B's layout: [kProjSize * num_heads, total_tokens] + // To get B, skip over K entries from previous requests (all heads + + // padding) + DT const *B = static_cast<DT *>
(m->keyCache) + i * kt_req_block_size; + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests + DT *C = static_cast<DT *>
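// Shape bookkeeping for the strided-batched GEMM below (one batch entry per
// head): A points into devQKVProjArray, whose per-token stride is
// qProjSize * num_heads * QKV_WEIGHT_NUM because the Q, K and V projections
// of all heads are packed together for each token, hence the value of lda;
// B is this request's key cache with per-token stride kProjSize * num_heads;
// with opA = T and opB = N each batch entry computes a
// [num_new_tokens x total_tokens] tile C(i, j) = sum_d Q(d, i) * K(d, j),
// scaled by 1/sqrt(kProjSize) through alpha when qk_prod_scaling is set.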
(m->qk_prods); + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Step 2: Add alibi position bias to qk production + // matrix C: qk_prods + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + // To get C, skip over QK.T products from previous requests DT *C = static_cast
(m->qk_prods); - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_N, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - - // add alibi position bias to qk production if (*m->position_bias) { size_t parallelism = m->num_q_heads * total_tokens * num_new_tokens; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd
<DT>), + hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_position_bias_qkprd<DT>), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, @@ -599,13 +1626,14 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, m->global_num_q_heads, shard_id); } - // Fill all elements above diagonal in qk prods with -inf to force - // causal attention. + + // Step 3: Apply causal mask. Fill all elements above diagonal in qk prods + // with -inf to force causal attention. assert(num_new_tokens <= total_tokens); size_t entries_above_diagonal = num_new_tokens * (num_new_tokens - 1) / 2; if (entries_above_diagonal > 0) { size_t parallelism = m->num_q_heads * entries_above_diagonal; - hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal<DT>
), + hipLaunchKernelGGL(HIP_KERNEL_NAME(fill_entries_above_diagonal<DT>), + GET_BLOCKS(parallelism), + min((size_t)CUDA_NUM_THREADS, parallelism), + 0, @@ -617,137 +1645,129 @@ void compute_attention_kernel(IncMultiHeadSelfAttentionMeta const *m, entries_above_diagonal, static_cast<DT>
(-INFINITY)); } - // Compute Softmax(QK^T/sqrt(d_k)) - // Before modifying the parameters below, make sure to read the following - // description of the CUDNN_TENSOR_NCHW tensor layout, from - // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnTensorFormat_t: - // This tensor format specifies that the data is laid out in the following - // order: batch size, feature maps, rows, columns. The strides are - // implicitly defined in such a way that the data are contiguous in memory - // with no padding between images, feature maps, rows, and columns; the - // columns are the inner dimension and the images are the outermost - // dimension. - int n_param = m->num_q_heads; - int c_param = total_tokens; - int h_param = 1; - int w_param = num_new_tokens; - checkCUDNN(miopenSet4dTensorDescriptor( - m->qk_tensor, miopen_data_type, n_param, c_param, h_param, w_param)); - float softmax_alpha = 1.0f, softmax_beta = 0.0f; - DT *C_softmax = static_cast
(m->qk_prods_softmax); - // The softmax operation below is executed according to the - // CUDNN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The - // softmax operation is computed per spatial location (H,W) per image (N) - // across dimension C. - checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, - &softmax_alpha, - m->qk_tensor, - C, - &softmax_beta, - m->qk_tensor, - C_softmax, - MIOPEN_SOFTMAX_ACCURATE, - MIOPEN_SOFTMAX_MODE_CHANNEL)); - // Matmul softmax(QK^T/sqrt(d_k)) by V - alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; - k = total_tokens; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + - // padding) - B = static_cast
(m->valueCache) + i * vt_req_block_size; - // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous - // requests - C = static_cast
(m->attn_heads) + - tokens_previous_requests * m->num_q_heads * m->vProjSize; - - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, - HIPBLAS_OP_N, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - strideA, - B, - hipblas_data_type, - ldb, - strideB, - &beta, - C, - hipblas_data_type, - ldc, - strideC, - m->num_q_heads, - compute_type, - HIPBLAS_GEMM_DEFAULT)); - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + tokens_previous_requests * m->oProjSize; - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); + // Step 4: Compute Softmax(QK.T/sqrt(d_k)) + { + // Before modifying the parameters below, make sure to read the following + // description of the HIPDNN_TENSOR_NCHW tensor layout, from + // https://docs.nvidia.com/deeplearning/cudnn/api/index.html#hipdnnTensorFormat_t: + // This tensor format specifies that the data is laid out in the following + // order: batch size, feature maps, rows, columns. The strides are + // implicitly defined in such a way that the data are contiguous in memory + // with no padding between images, feature maps, rows, and columns; the + // columns are the inner dimension and the images are the outermost + // dimension. + int n_param = m->num_q_heads; + int c_param = total_tokens; + int h_param = 1; + int w_param = num_new_tokens; + checkCUDNN(miopenSet4dTensorDescriptor( + m->qk_tensor, cudnn_data_type, n_param, c_param, h_param, w_param)); + float softmax_alpha = 1.0f, softmax_beta = 0.0f; + DT *C_softmax = static_cast
(m->qk_prods_softmax); + // The softmax operation below is executed according to the + // MIOPEN_SOFTMAX_MODE_CHANNEL, which is also described in the docs: The + // softmax operation is computed per spatial location (H,W) per image (N) + // across dimension C. + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &softmax_alpha, + m->qk_tensor, + C, + &softmax_beta, + m->qk_tensor, + C_softmax, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); + } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } + checkCUDA(hipMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + hipMemcpyDeviceToDevice, + stream)); + } + // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ + // softmax(QK.T/sqrt(d_k)).T + { + DT alpha = 1.0f, beta = 0.0f; + // after transpositions + int m_ = m->vProjSize; + int n = num_new_tokens; + int k = total_tokens; + // before transpositions + int lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + // N.B. strides are applied before transpose operations + int strideA = vt_block_size; + int strideB = num_new_tokens * total_tokens; + int strideC = m->vProjSize; + // matrix A: value cache + // matrix A's layout: [vProjSize, num_heads, total_tokens] + // To get A, skip over V.T entries from previous requests (all heads + + // padding) + DT *A = static_cast
(m->valueCache) + i * vt_req_block_size; + // matrix B: qk_prods_softmax + // matrix B's layout: [num_new_tokens, total_tokens, num_heads] + // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous + // requests (all heads) + DT *B = static_cast
(m->qk_prods_softmax); + // matrix C: attn heads + // matrix C's layout: [vProjSize, num_heads, num_new_tokens] + // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous + // requests + // store the result attn heads, also skip the genration tokens + DT *C = static_cast
(m->attn_heads) + + (bc->requestsInfo[i].first_token_offset_in_batch) * + m->num_q_heads * m->vProjSize; + checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } tokens_previous_requests += num_new_tokens; } - - if (*m->final_bias && shard_id == 0) { - int parallelism = m->oProjSize * num_tokens; - int qkv_weight_size = m->qProjSize * m->global_num_q_heads + - m->kProjSize * m->global_num_q_heads + - m->vProjSize * m->global_num_q_heads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(apply_proj_bias_w
), - GET_BLOCKS(parallelism), - min(CUDA_NUM_THREADS, parallelism), - 0, - stream, - output_ptr, - bias_ptr, - num_tokens, - qkv_weight_size, - m->oProjSize); + if (tokens_previous_requests != (num_tokens - bc->num_generation_tokens)) { + bc->print(); + printf("tokens_previous_requests: %i\n", tokens_previous_requests); + printf("num_tokens: %i\n", num_tokens); + printf("bc->num_generation_tokens: %i\n", bc->num_generation_tokens); } - - assert(tokens_previous_requests == num_tokens); + assert(tokens_previous_requests == (num_tokens - bc->num_generation_tokens)); } /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -813,10 +1833,71 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); - printf("IncMultiHeadSelfAttention forward time = %.2fms\n", elapsed); - // print_tensor<3, float>(acc_query.ptr, acc_query.rect, - // "[Attention:forward:query]"); print_tensor<3, float>(acc_output.ptr, - // acc_output.rect, "[Attention:forward:output]"); + printf("IncMultiHeadSelfAttention forward time = %.9fms\n", elapsed); + } +} + +/*static*/ +void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + // assert(input.data_type == weight.data_type); + assert(input_grad.data_type == output_grad.data_type); + if (use_bias) { + assert(input_grad.data_type == bias.data_type); + } + + if (input_grad.data_type == DT_HALF) { + assert(!m->offload); + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_half_ptr(), + weight.get_half_ptr(), + output_grad.get_half_ptr(), + bias_ptr, + stream); + } else if (input_grad.data_type == DT_FLOAT) { + assert(!m->offload); + float const *bias_ptr = + use_bias ? 
bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_float_ptr(), + weight.get_float_ptr(), + output_grad.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("IncMultiHeadSelfAttention PEFT backward time = %.9fms\n", elapsed); } } @@ -895,7 +1976,7 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( assert(kSize == vSize); qProjSize = _qProjSize; kProjSize = _kProjSize; - assert(qProjSize == kProjSize); // required for attention QK^T matmul + assert(qProjSize == kProjSize); // required for attention QK.T matmul vProjSize = _vProjSize; oProjSize = _oProjSize; size_t size_of_dt = data_type_size(attn->data_type); @@ -949,14 +2030,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); + int max_tokens_per_batch = infer_mode == TREE_VERIFY_MODE + ? BatchConfig::max_verify_tokens_per_batch() + : BatchConfig::max_tokens_per_batch(); size_t qkv_max_proj_size = max_tokens_per_batch * (qProjSize * num_q_heads + kProjSize * num_q_heads + vProjSize * num_q_heads); size_t key_cache_size = 0, value_cache_size = 0; switch (infer_mode) { - case INC_DECODING_MODE: - case TREE_VERIFY_MODE: { + case INC_DECODING_MODE: { key_cache_size = num_q_heads * kProjSize * BatchConfig::max_requests_per_batch() * BatchConfig::max_sequence_length(); @@ -965,21 +2047,24 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( BatchConfig::max_sequence_length(); break; } - case BEAM_SEARCH_MODE: { + case BEAM_SEARCH_MODE: + case TREE_VERIFY_MODE: { + // a K-ary tree max node is (k^n - 1) / 2 key_cache_size = num_q_heads * kProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); value_cache_size = num_q_heads * vProjSize * BeamSearchBatchConfig::max_requests_per_batch() * - BatchConfig::max_sequence_length() * - BeamSearchBatchConfig::MAX_BEAM_WIDTH; + (BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num()); break; } default: assert(false && "Unkown inference mode"); } - size_t tokeninfo_size = max_tokens_per_batch; + size_t requestinfo_size = BatchConfig::max_requests_per_batch(); + // size_t tokeninfo_size = max_tokens_per_batch; size_t qk_prod_size = max_tokens_per_batch * BatchConfig::max_sequence_length() * num_q_heads; size_t attn_heads_size = max_tokens_per_batch * num_q_heads * vProjSize; @@ -990,7 +2075,6 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( (qkv_max_proj_size + key_cache_size + value_cache_size + 2 * qk_prod_size + attn_heads_size) * size_of_dt + - tokeninfo_size * sizeof(BatchConfig::PerTokenInfo) + complex_size * sizeof(hipFloatComplex); // more components will // be added here later if (offload) { @@ -1035,10 +2119,15 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + 
request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); + if (offload) { - token_infos = - gpu_mem_allocator.allocate_reserved( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_reserved( + // tokeninfo_size); // offset += sizeof(BatchConfig::PerTokenInfo) * tokeninfo_size; qk_prods = gpu_mem_allocator.allocate_reserved_untyped(qk_prod_size * size_of_dt); @@ -1052,10 +2141,13 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( complex_input = gpu_mem_allocator.allocate_reserved(complex_size); // offset += complex_size * sizeof(hipFloatComplex); + // request_infos = + // gpu_mem_allocator.allocate_reserved( + // requestinfo_size); } else { - token_infos = - gpu_mem_allocator.allocate_instance( - tokeninfo_size); + // token_infos = + // gpu_mem_allocator.allocate_instance( + // tokeninfo_size); qk_prods = gpu_mem_allocator.allocate_instance_untyped(qk_prod_size * size_of_dt); qk_prods_softmax = gpu_mem_allocator.allocate_instance_untyped( @@ -1064,6 +2156,9 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( size_of_dt); complex_input = gpu_mem_allocator.allocate_instance(complex_size); + // request_infos = + // gpu_mem_allocator.allocate_instance( + // requestinfo_size); } // allocate more size for quantization data @@ -1077,6 +2172,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; checkCUDA(hipStreamSynchronize(stream)); } @@ -1098,4 +2195,37 @@ template void Kernels::IncMultiHeadAttention::pre_build_weight_kernel( DataType data_type, hipStream_t stream); +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + float *output_ptr, + float const *weight_ptr, + float const *bias_ptr, + int num_tokens, + hipStream_t stream); + +template void Kernels::IncMultiHeadAttention::compute_o_prod_bias( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + half *output_ptr, + half const *weight_ptr, + half const *bias_ptr, + int num_tokens, + hipStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + float *output_ptr, + hipStream_t stream); + +template void + Kernels::IncMultiHeadAttention::compute_attention_kernel_generation( + IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + half *output_ptr, + hipStream_t stream); }; // namespace FlexFlow diff --git a/src/ops/inc_multihead_self_attention.cu b/src/ops/inc_multihead_self_attention.cu index a0d31bb6ef..b278611b60 100644 --- a/src/ops/inc_multihead_self_attention.cu +++ b/src/ops/inc_multihead_self_attention.cu @@ -12,9 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" -#endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/kernels/decompress_kernels.h" @@ -483,6 +481,63 @@ __global__ void } } +template +__global__ void + apply_rotary_embedding_bwd(DT *input_ptr, + cuFloatComplex *complex_input, + BatchConfig::PerTokenInfo const *tokenInfos, + int proj_size, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + // compute indexes to visit first half proj_size of each of q/k tensor. + // devQKVProj has shape [num_tokens, qProjSize, num_heads, 3] in peft_bwd + bool q_tensor = i < (num_tokens * hidden_size / 2); + int real_i = q_tensor ? i : i - num_tokens * hidden_size / 2; + assert(hidden_size % proj_size == 0); + int num_heads = hidden_size / proj_size; + + int token_idx = real_i % num_tokens; + int idx = (real_i / num_tokens) % (proj_size / 2); + int head_idx = real_i / (num_tokens * proj_size / 2); + assert(head_idx < num_heads); + + int complex_part_index = (q_tensor ? 0 : 1) * num_tokens * hidden_size + + head_idx * num_tokens * proj_size + + idx * num_tokens + token_idx; + int real_part_index = complex_part_index + (proj_size / 2) * num_tokens; + + complex_input[i] = {input_ptr[real_part_index], + input_ptr[complex_part_index]}; + + size_t pos = tokenInfos[token_idx].abs_depth_in_request; + + float freq = pos * (1.0 / pow(10000.0, (float)2 * idx / proj_size)); + cuFloatComplex complex_pos = {cos(freq), sin(freq)}; + + complex_input[i] = cuCmulf(complex_input[i], complex_pos); + input_ptr[real_part_index] = complex_input[i].x; + input_ptr[complex_part_index] = complex_input[i].y; + } +} + +template +__global__ void fill_entries_above_diagonal(DT *matrix, + size_t num_rows, + size_t num_cols, + size_t num_q_heads, + size_t entries_above_diagonal, + DT value) { + CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { + size_t head_idx = i / entries_above_diagonal; + size_t entry_idx = i % entries_above_diagonal; + size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; + size_t x = entry_idx - y * (y + 1) / 2; + y += (num_cols - num_rows) + 1; + matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; + } +} + template void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, @@ -497,17 +552,18 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); assert(m->qSize == m->vSize && m->qSize == m->kSize); cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // Step 1: 
Compute QKV projections { @@ -517,7 +573,7 @@ void compute_qkv_kernel(IncMultiHeadSelfAttentionMeta const *m, int m_k = m->kProjSize * m->num_q_heads; int m_v = m->vProjSize * m->num_q_heads; assert(m_q == m_k && m_k == m_v); // keep things simple for now - int n = bc->num_active_tokens(); + int n = bc->num_active_infr_tokens(); int k = m->qSize; int m_ = m_q * QKV_WEIGHT_NUM; // before transpositions @@ -604,7 +660,7 @@ template void update_kv_cache_kernel(IncMultiHeadSelfAttentionMeta const *m, BatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); if (num_tokens > 0) { int parallelism = m->hidden_size * num_tokens; store_kv_cache<< -void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, +void inference_kernel(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *input_ptr, @@ -843,6 +899,504 @@ void inference_kernel(IncMultiHeadSelfAttentionMeta const *m, m, bc, shard_id, output_ptr, weight_ptr, bias_ptr, num_tokens, stream); } +std::string get_peft_dbg_folder(IncMultiHeadSelfAttentionMeta const *m, + int shard_id) { + std::string op_name_without_uid = + IncMultiHeadSelfAttention::get_op_name_without_uid(m); + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." + op_name_without_uid; + dst_filepath /= layername; + return dst_filepath.string(); +} + +template +void peft_bwd_kernel(IncMultiHeadSelfAttentionMeta const *m, + BatchConfig const *bc, + int shard_id, + DT *input_grad_ptr, + DT const *weight_ptr, + DT const *output_grad_ptr, + DT const *bias_ptr, + cudaStream_t stream) { + assert(!m->offload); + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + assert(data_type_size(m->output_type[0]) == sizeof(DT)); + cudaDataType_t compute_type = cublas_data_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int num_total_tokens = bc->requestsInfo[i].first_token_depth_in_request + + bc->requestsInfo[i].num_tokens_in_batch; + // Currently assume we are calculating gradients for all tokens + // of a request + assert(num_tokens == num_total_tokens); + int kt_block_size = m->kProjSize; + int kt_req_block_size = + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + int vt_block_size = m->vProjSize; + int vt_req_block_size = + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + assert(m->qProjSize == m->kProjSize && m->kProjSize == m->vProjSize); + // Step 1: compute gradients before final 
projection + { + int m_ = m->vProjSize * m->num_q_heads; + int n_ = num_tokens; + int k_ = m->oProjSize; + int lda = m_; + int ldb = k_; + int ldc = m_; + float alpha = 1.0f, beta = 0.0f; + // matrix A: output projection weight + // matrix A's layout: [vProjSize * num_heads, oProjSize] + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + // matrix B: output gradients + // matrix B's layout: [oProjSize, num_new_tokens] + DT const *B = + output_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->oProjSize; + // matrix C: attn_heads gradients + // matrix C's layout: [vProjSize * num_heads, num_new_tokens] + DT *C = static_cast
(m->handle.workSpace); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + // save result to file for checking + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".o_proj.input_gradient_0"; + save_tensor(C, m_ * n_, filename.c_str()); + } + } + // Step 2: compute gradients w.r.t. value + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: qk_prods_softmax + // matrix A's layout: [num_new_tokens, total_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods_softmax); + // matrix B: attn_heads gradients + // matrix B's layout: [vProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
(m->handle.workSpace); + // matrix C: gradients for value (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast
(m->devQKVProjArray) + + 2 * num_tokens * + (m->qProjSize * m->num_q_heads); // skip over regions reserved + // for Q and K gradients + // after transpositions + int m_ = num_tokens; // total_tokens + int n_ = m->vProjSize; // num_new_tokens + int k_ = num_tokens; // num_new_tokens + // before transpositions + int lda = num_tokens; // num_new_tokens + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // total_tokens + // N.B. strides are applied before transpose operations + int strideA = num_tokens * num_tokens; // num_new_tokens * total_tokens + int strideB = m->vProjSize; + int strideC = num_tokens * m->vProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // save result to file for checking + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".v_proj.input_gradient_0"; + save_tensor(C, m_ * n_ * m->num_q_heads, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax"; + save_tensor(A, m_ * k_ * m->num_q_heads, filename2.c_str()); + } + } + // Step 3: compute gradients w.r.t. the qk_prods_softmax tensor + { + float alpha = 1.0f, beta = 0.0f; + // matrix A: attn_heads gradients + // matrix A's layout: [vProjSize * num_heads, num_new_tokens] + DT const *A = static_cast
(m->handle.workSpace); + // matrix B: value cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast
(m->valueCache) + i * vt_req_block_size; + // matrix C: qk_prods_softmax gradients + // matrix C's layout: [num_new_tokens, total_tokens, num_heads] + DT *C = static_cast
(m->qk_prods_softmax); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = num_tokens; + int k_ = m->vProjSize; + // before transposition and striding + int lda = m->vProjSize * m->num_q_heads; + int ldb = m->vProjSize * m->num_q_heads; + int ldc = num_tokens; // num_new_tokens + int strideA = m->vProjSize; + int strideB = m->vProjSize; + int strideC = num_tokens * num_tokens; // num_new_tokens * total_tokens + + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + std::string filename2 = get_peft_dbg_folder(m, shard_id) + ".vcache"; + save_tensor( + B, m->vProjSize * m->num_q_heads * num_tokens, filename2.c_str()); + } + } + // Step 4: softmax backpropagation + { + float alpha = 1.0f, beta = 0.0f; + int n_param = m->num_q_heads; + int c_param = num_tokens; + int h_param = 1; + int w_param = num_tokens; + checkCUDNN(cudnnSetTensor4dDescriptor(m->qk_tensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + n_param, + c_param, + h_param, + w_param)); + checkCUDNN(cudnnSoftmaxBackward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->qk_tensor, + m->softmax_activation_buffer, + m->qk_tensor, + m->qk_prods_softmax, + &beta, + m->qk_tensor, + m->qk_prods)); + + if (m->inference_debugging) { + DT *C = static_cast
(m->qk_prods); + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".qk_prods.softmax_grad_in"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + + // TODO: fill all elements above diagonal to force causal attention + size_t entries_above_diagonal = num_tokens * (num_tokens - 1) / 2; + if (entries_above_diagonal > 0) { + size_t parallelism = m->num_q_heads * entries_above_diagonal; + fill_entries_above_diagonal<<>>(static_cast
<DT *>(m->qk_prods), + num_tokens, + num_tokens, + m->num_q_heads, + entries_above_diagonal, + DT(0.0f)); + } + if (m->inference_debugging) { + DT *C = static_cast<DT *>
(m->qk_prods); + std::string filename = get_peft_dbg_folder(m, shard_id) + + ".qk_prods.softmax_grad_in.masked"; + save_tensor( + C, num_tokens * num_tokens * m->num_q_heads, filename.c_str()); + } + } + // Step 5: compute gradients w.r.t. key + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: query activation (in query_activation_buffer) + // matrix B's layout: [m->qProjSize * num_heads, num_new_tokens] + DT const *B = static_cast
(m->query_activation_buffer); + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + // after transposition & striding + int m_ = num_tokens; + int n_ = m->kProjSize; + int k_ = num_tokens; // num_new_tokens + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->kProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->kProjSize; + int strideC = num_tokens * m->kProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".query_activation"; + save_tensor( + B, m->qProjSize * m->num_q_heads * num_tokens, filename.c_str()); + std::string filename2 = + get_peft_dbg_folder(m, shard_id) + ".devkproj_pre"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename2.c_str()); + } + } + // Step 6: compute gradients w.r.t query + { + float alpha = 1.0f, beta = 0.0f; + if (*m->qk_prod_scaling) { + alpha = 1.0f / sqrt(m->kProjSize); + } + // matrix A: gradients w.r.t. qk_prods + // matrix A's layout: [num_new_tokens, num_tokens, num_heads] + DT const *A = static_cast
(m->qk_prods); + // matrix B: key cache + // matrix B's layout: [vProjSize * num_heads, max_num_tokens, num_req] + DT const *B = static_cast
(m->keyCache) + i * kt_req_block_size; + // matrix C: gradients for query (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = static_cast
(m->devQKVProjArray); + // after transposition & striding + int m_ = num_tokens; // num_new_tokens + int n_ = m->qProjSize; + int k_ = num_tokens; + // before transposition and striding + int lda = num_tokens; // num_new_tokens + int ldb = m->qProjSize * m->num_q_heads; + int ldc = num_tokens; + int strideA = num_tokens * num_tokens; + int strideB = m->qProjSize; + int strideC = num_tokens * m->qProjSize; + checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + strideA, + B, + cublas_data_type, + ldb, + strideB, + &beta, + C, + cublas_data_type, + ldc, + strideC, + m->num_q_heads, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray_pre"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // Step 7: perform rotary position embeddings (RoPE) bwd + { + if (*m->apply_rotary_embedding) { + assert(m->hidden_size == m->qProjSize * m->num_q_heads); + assert(m->qProjSize == m->kProjSize); + /*q&k*/ + int parallelism = num_tokens * m->hidden_size; + DT *A = static_cast
(m->devQKVProjArray); + apply_rotary_embedding_bwd<<>>(A, + m->complex_input, + m->token_infos, + m->qProjSize, + num_tokens, + m->hidden_size); + DT *C = static_cast
(m->devQKVProjArray); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".devQKVPRojArray"; + save_tensor(C, + num_tokens * m->qProjSize * m->num_q_heads * 3, + filename.c_str()); + } + } + + // matrix C: gradients for key (saved as part of m->devQKVProjArray) + // matrix C's layout: [num_tokens, qProjsize * num_heads, 3] + DT *C = + static_cast
(m->devQKVProjArray) + + num_tokens * + (m->qProjSize * + m->num_q_heads); // skip over regions reserved for Q gradients + if (m->inference_debugging) { + std::string filename = get_peft_dbg_folder(m, shard_id) + ".devkproj"; + save_tensor( + C, num_tokens * (m->qProjSize * m->num_q_heads), filename.c_str()); + } + } + + // Step 8: compute gradients w.r.t. input + { + float alpha = 1.0f, beta = 0.0f; + if (!m->reset_input_grads[0]) { + beta = 1.0f; + } + // matrix A: QKV projection weights + // matrix A's layout: [qSize, qProjSize * num_q_heads, 3] + DT const *A = weight_ptr; + // matrix B: gradients w.r.t. QKV (concatenated in devQKVArray) + // matrix B's layout: [num_tokens, qProjsize * num_heads, 3] + DT const *B = static_cast
(m->devQKVProjArray); + // matrix C: gradients w.r.t. input + // matrix C's layout: [m->qSize, num_tokens] + DT *C = input_grad_ptr + + bc->requestsInfo[i].first_token_offset_in_batch * m->qSize; + int m_ = m->qSize; + int n_ = num_tokens; + int k_ = m->num_q_heads * (m->qProjSize + m->kProjSize + m->vProjSize); + int lda = m_; + int ldb = n_; + int ldc = m_; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + m_, + n_, + k_, + &alpha, + A, + cublas_data_type, + lda, + B, + cublas_data_type, + ldb, + &beta, + C, + cublas_data_type, + ldc, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (m->inference_debugging) { + std::string filename = + get_peft_dbg_folder(m, shard_id) + ".self_attn.input_gradient_0"; + save_tensor(C, num_tokens * m->qSize, filename.c_str()); + } + } + } +} + } // namespace IncMultiHeadAttention } // namespace Kernels @@ -877,24 +1431,25 @@ __global__ void store_kv_cache(DT const *devQKVProjArray, } template -__global__ void fill_entries_above_diagonal(DT *matrix, - size_t num_rows, - size_t num_cols, - size_t num_q_heads, - size_t entries_above_diagonal, - DT value) { - CUDA_KERNEL_LOOP(i, entries_above_diagonal * num_q_heads) { - size_t head_idx = i / entries_above_diagonal; - size_t entry_idx = i % entries_above_diagonal; - size_t y = (-1 + sqrt(8 * (float)entry_idx + 1)) / 2; - size_t x = entry_idx - y * (y + 1) / 2; - y += (num_cols - num_rows) + 1; - matrix[head_idx * num_rows * num_cols + num_cols * y + x] = value; +__global__ void store_query_cache(DT const *devQKVProjArray, + DT *qCache_ptr, + int num_tokens, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_tokens * hidden_size) { + int token_idx = i / hidden_size; + int offset = i % hidden_size; + + size_t val_idx = token_idx * QKV_WEIGHT_NUM * hidden_size + offset; + + DT qVal = devQKVProjArray[val_idx]; + + // query cache + qCache_ptr[i] = qVal; } } template -void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, +void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, DT const *bias_ptr, @@ -905,17 +1460,18 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -929,12 +1485,35 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, assert(m->qProjSize 
== m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { - if (bc->request_completed[i] || (!bc->requestsInfo[i].prompt_phase)) { + if (bc->request_completed[i] || + (!bc->requestsInfo[i].prompt_phase && !bc->requestsInfo[i].peft_bwd)) { continue; } int num_new_tokens = bc->requestsInfo[i].num_tokens_in_batch; int total_tokens = bc->requestsInfo[i].first_token_depth_in_request + bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + // Copy query to m->query_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * m->num_q_heads * m->qProjSize; + if (activation_size_needed > m->allocated_peft_buffer_size1) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->query_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size1 = activation_size_needed; + } + int parallelism = m->hidden_size * num_tokens; + store_query_cache<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast<DT *>
(m->query_activation_buffer), + num_tokens, + m->hidden_size); + } // Step 1: compute query-key product QK.T/sqrt(d_k) { // Scale by sqrt(d_k) as per the original attention paper @@ -1066,6 +1645,25 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, m->qk_tensor, C_softmax)); } + // Copy C_softmax to m->softmax_activation_buffer if we need to compute + // PEFT backward + if (bc->requestsInfo[i].peft_bwd) { + DT *C_softmax = static_cast
(m->qk_prods_softmax); + size_t activation_size_needed = + sizeof(DT) * max_peft_tokens * max_peft_tokens * m->num_q_heads; + if (activation_size_needed > m->allocated_peft_buffer_size2) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->softmax_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size2 = activation_size_needed; + } + checkCUDA(cudaMemcpyAsync(m->softmax_activation_buffer, + C_softmax, + sizeof(DT) * total_tokens * num_new_tokens * + m->num_q_heads, + cudaMemcpyDeviceToDevice, + stream)); + } // Step 5: Matmul softmax(QK.T/sqrt(d_k)) by V. Implemented as V @ // softmax(QK.T/sqrt(d_k)).T { @@ -1090,7 +1688,6 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, // To get B, skip over softmax(QK.T/sqrt(d_k)) entries from previous // requests (all heads) DT *B = static_cast
(m->qk_prods_softmax); - ; // matrix C: attn heads // matrix C's layout: [vProjSize, num_heads, num_new_tokens] // To get C, skip over softmax(QK.T/sqrt(d_k))V products from previous @@ -1136,7 +1733,7 @@ void compute_attention_kernel_prompt(IncMultiHeadSelfAttentionMeta const *m, /*static*/ void IncMultiHeadSelfAttention::inference_kernel_wrapper( - IncMultiHeadSelfAttentionMeta const *m, + IncMultiHeadSelfAttentionMeta *m, BatchConfig const *bc, int shard_id, GenericTensorAccessorR const &input, @@ -1206,6 +1803,70 @@ void IncMultiHeadSelfAttention::inference_kernel_wrapper( } } +/*static*/ +void IncMultiHeadSelfAttention::peft_bwd_kernel_wrapper( + IncMultiHeadSelfAttentionMeta *m, + BatchConfig const *bc, + int shard_id, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &bias) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + bool use_bias = *m->qkv_bias || *m->final_bias; + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // assert(input.data_type == weight.data_type); + assert(input_grad.data_type == output_grad.data_type); + if (use_bias) { + assert(input_grad.data_type == bias.data_type); + } + + if (input_grad.data_type == DT_HALF) { + assert(!m->offload); + half const *bias_ptr = + use_bias ? bias.get_half_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_half_ptr(), + weight.get_half_ptr(), + output_grad.get_half_ptr(), + bias_ptr, + stream); + } else if (input_grad.data_type == DT_FLOAT) { + assert(!m->offload); + float const *bias_ptr = + use_bias ? bias.get_float_ptr() : static_cast(nullptr); + Kernels::IncMultiHeadAttention::peft_bwd_kernel(m, + bc, + shard_id, + input_grad.get_float_ptr(), + weight.get_float_ptr(), + output_grad.get_float_ptr(), + bias_ptr, + stream); + } else { + assert(false && "Unspported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("IncMultiHeadSelfAttention PEFT backward time = %.9fms\n", elapsed); + } +} + IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( FFHandler handler, IncMultiHeadSelfAttention const *attn, @@ -1424,11 +2085,10 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( valueCache = gpu_mem_allocator.allocate_instance_untyped(value_cache_size * size_of_dt); - token_infos = - static_cast(handler.batch_config_metadata); - request_infos = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo)); + token_infos = static_cast( + handler.batch_config_metadata->tokens_info); + request_infos = static_cast( + handler.batch_config_metadata->requestsInfo); if (offload) { // token_infos = @@ -1478,6 +2138,8 @@ IncMultiHeadSelfAttentionMeta::IncMultiHeadSelfAttentionMeta( gpu_mem_allocator.reserved_allocated_size); } } + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; cudaStreamSynchronize(stream); } diff --git a/src/ops/kernels/batch_matmul.cpp b/src/ops/kernels/batch_matmul.cpp index 7145af2108..8eeede65c7 100644 --- a/src/ops/kernels/batch_matmul.cpp +++ b/src/ops/kernels/batch_matmul.cpp @@ -13,13 +13,15 @@ * limitations under the License. 
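For reference, the gradient chain that Steps 1 through 8 of the new peft_bwd_kernel implement can be summarized as follows. The notation is ours rather than the source's: one token per row, per-head matrices Q_h, K_h (d_k columns) and V_h (d_v columns), and S_h = softmax(Q_h K_h^T / sqrt(d_k)). S_h and Q_h are read back from softmax_activation_buffer and query_activation_buffer, which the prompt-phase forward pass now saves for PEFT requests, while K_h and V_h come from the KV cache; the stored weight layouts and the transpose flags on each cuBLAS call absorb the remaining transpositions.

\[
\begin{aligned}
&\text{Step 1:} && dH = dY\,W_O \\
&\text{Steps 2--3:} && dV_h = S_h^{\top} dH_h, \qquad dS_h = dH_h V_h^{\top} \\
&\text{Step 4:} && dZ_h = S_h \odot \bigl(dS_h - \operatorname{rowsum}(S_h \odot dS_h)\bigr), \qquad dZ_h[i,j] = 0 \ \text{for } j > i \ \text{(causal mask)} \\
&\text{Steps 5--6:} && dK_h = \tfrac{1}{\sqrt{d_k}}\, dZ_h^{\top} Q_h, \qquad dQ_h = \tfrac{1}{\sqrt{d_k}}\, dZ_h K_h \\
&\text{Step 7:} && (dQ, dK) \ \text{run through the inverse rotary embedding} \\
&\text{Step 8:} && dX \mathrel{+}= d[Q;K;V]\,W_{QKV}
\end{aligned}
\]

The 1/sqrt(d_k) factor in Steps 5 and 6 is applied only when qk_prod_scaling is set, and Step 8 accumulates into input_grad (beta = 1) unless reset_input_grads[0] asks for an overwrite.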
*/ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/batch_matmul.cu b/src/ops/kernels/batch_matmul.cu index ac280db1a4..97f13fa5a8 100644 --- a/src/ops/kernels/batch_matmul.cu +++ b/src/ops/kernels/batch_matmul.cu @@ -13,12 +13,14 @@ * limitations under the License. */ +#include "flexflow/ops/batch_matmul.h" #include "flexflow/ops/kernels/batch_matmul_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -BatchMatmulMeta::BatchMatmulMeta(FFHandler handler) : OpMeta(handler) {} +BatchMatmulMeta::BatchMatmulMeta(FFHandler handler, BatchMatmul const *bmm) + : OpMeta(handler, bmm) {} namespace Kernels { namespace BatchMatmul { diff --git a/src/ops/kernels/cast_kernels.cpp b/src/ops/kernels/cast_kernels.cpp index 16b9b4cec0..1e561959f1 100644 --- a/src/ops/kernels/cast_kernels.cpp +++ b/src/ops/kernels/cast_kernels.cpp @@ -14,12 +14,13 @@ */ #include "flexflow/ops/kernels/cast_kernels.h" +#include "flexflow/ops/cast.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/cast_kernels.cu b/src/ops/kernels/cast_kernels.cu index a96f37dbbd..fdce63b9f1 100644 --- a/src/ops/kernels/cast_kernels.cu +++ b/src/ops/kernels/cast_kernels.cu @@ -13,12 +13,13 @@ * limitations under the License. */ +#include "flexflow/ops/cast.h" #include "flexflow/ops/kernels/cast_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CastMeta::CastMeta(FFHandler handle) : OpMeta(handle) {} +CastMeta::CastMeta(FFHandler handle, Cast const *cast) : OpMeta(handle, cast) {} namespace Kernels { namespace Cast { diff --git a/src/ops/kernels/concat_kernels.cpp b/src/ops/kernels/concat_kernels.cpp index bf5d46b9cc..6c05e0143c 100644 --- a/src/ops/kernels/concat_kernels.cpp +++ b/src/ops/kernels/concat_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/concat_kernels.h" +#include "flexflow/ops/concat.h" #include "flexflow/utils/hip_helper.h" #include @@ -23,6 +24,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/concat_kernels.cu b/src/ops/kernels/concat_kernels.cu index f625560625..2569c36b21 100644 --- a/src/ops/kernels/concat_kernels.cu +++ b/src/ops/kernels/concat_kernels.cu @@ -13,6 +13,7 @@ * limitations under the License. 
*/ +#include "flexflow/ops/concat.h" #include "flexflow/ops/kernels/concat_kernels.h" #include "flexflow/utils/cuda_helper.h" @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Rect; +ConcatMeta::ConcatMeta(FFHandler handler, Concat const *cc) + : OpMeta(handler, cc) {} + namespace Kernels { namespace Concat { diff --git a/src/ops/kernels/conv_2d_kernels.cpp b/src/ops/kernels/conv_2d_kernels.cpp index 7d2fa20c49..85a94ad6be 100644 --- a/src/ops/kernels/conv_2d_kernels.cpp +++ b/src/ops/kernels/conv_2d_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/conv_2d_kernels.h" +#include "flexflow/ops/conv_2d.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&biasTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); @@ -326,7 +328,7 @@ void backward_kernel(Conv2DMeta const *m, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(miopenConvolutionBackwardWeights(m->handle.dnn, &alpha, @@ -341,7 +343,7 @@ void backward_kernel(Conv2DMeta const *m, kernel_grad_ptr, m->handle.workSpace, m->handle.workSpaceSize)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardBias(m->handle.dnn, @@ -352,7 +354,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(miopenConvolutionBackwardData(m->handle.dnn, diff --git a/src/ops/kernels/conv_2d_kernels.cu b/src/ops/kernels/conv_2d_kernels.cu index 6c0fd85496..661acdf732 100644 --- a/src/ops/kernels/conv_2d_kernels.cu +++ b/src/ops/kernels/conv_2d_kernels.cu @@ -1,9 +1,11 @@ +#include "flexflow/ops/conv_2d.h" #include "flexflow/ops/kernels/conv_2d_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Conv2DMeta::Conv2DMeta(FFHandler handler) : OpMeta(handler) { +Conv2DMeta::Conv2DMeta(FFHandler handler, Conv2D const *conv) + : OpMeta(handler, conv) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); @@ -309,7 +311,7 @@ void backward_kernel(Conv2DMeta const *m, reluBackward<<>>( output_grad_ptr, output_ptr, n * c * h * w); } - // Compute filter gradiant + // Compute filter gradient // NOTE: we use alpha for kernel_grad to accumulate gradients checkCUDNN(cudnnConvolutionBackwardFilter(m->handle.dnn, &alpha, @@ -324,7 +326,7 @@ void backward_kernel(Conv2DMeta const *m, &alpha, m->filterDesc, kernel_grad_ptr)); - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha for bias_grad to accumulate gradients if (bias_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardBias(m->handle.dnn, @@ -335,7 +337,7 @@ void backward_kernel(Conv2DMeta const *m, m->biasTensor, bias_grad_ptr)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDNN(cudnnConvolutionBackwardData(m->handle.dnn, diff --git 
a/src/ops/kernels/dropout_kernels.cpp b/src/ops/kernels/dropout_kernels.cpp index 14225f0bce..c8b1887fd4 100644 --- a/src/ops/kernels/dropout_kernels.cpp +++ b/src/ops/kernels/dropout_kernels.cpp @@ -28,7 +28,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/dropout_kernels.cu b/src/ops/kernels/dropout_kernels.cu index e142bba83b..d65b951f51 100644 --- a/src/ops/kernels/dropout_kernels.cu +++ b/src/ops/kernels/dropout_kernels.cu @@ -27,7 +27,7 @@ DropoutMeta::DropoutMeta(FFHandler handler, Dropout const *dropout, Memory gpu_mem, Domain const &output_domain) - : OpMeta(handler) { + : OpMeta(handler, dropout) { profiling = dropout->profiling; inference_debugging = dropout->inference_debugging; checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); diff --git a/src/ops/kernels/flat_kernels.cpp b/src/ops/kernels/flat_kernels.cpp index be48854fc0..6815ce7492 100644 --- a/src/ops/kernels/flat_kernels.cpp +++ b/src/ops/kernels/flat_kernels.cpp @@ -14,11 +14,15 @@ */ #include "flexflow/ops/kernels/flat_kernels.h" +#include "flexflow/ops/flat.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/flat_kernels.cu b/src/ops/kernels/flat_kernels.cu index 3836c02c94..fc0c0270c1 100644 --- a/src/ops/kernels/flat_kernels.cu +++ b/src/ops/kernels/flat_kernels.cu @@ -13,11 +13,15 @@ * limitations under the License. 
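A convention shared by the backward kernels touched in these files (the Conv2D calls above, the Linear kernels below) deserves a note: cuDNN/MIOpen and cuBLAS/hipBLAS blend each result into its destination as dst = alpha * result + beta * dst, so passing a scale of 1 in the slot that multiplies the existing destination turns the call into an accumulation. That is what the repeated "NOTE: we use alpha ... to accumulate gradients" comments mean. In our notation:

\[ \nabla W \;\leftarrow\; 1\cdot \nabla W_{\text{this call}} \;+\; 1\cdot \nabla W_{\text{already stored}} . \]

The same blending idea drives the L2-regularization branch in the Linear backward pass further below, where a geam call folds kernel_reg_lambda times the weights into the weight gradient.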
*/ +#include "flexflow/ops/flat.h" #include "flexflow/ops/kernels/flat_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { +FlatMeta::FlatMeta(FFHandler handler, Flat const *flat) + : OpMeta(handler, flat) {} + namespace Kernels { namespace Flat { diff --git a/src/ops/kernels/linear_kernels.cpp b/src/ops/kernels/linear_kernels.cpp index 072eb5e96b..a36d6719c9 100644 --- a/src/ops/kernels/linear_kernels.cpp +++ b/src/ops/kernels/linear_kernels.cpp @@ -14,6 +14,8 @@ */ #include "flexflow/ops/kernels/linear_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" #include "flexflow/utils/hip_helper.h" #include @@ -24,24 +26,53 @@ LinearMeta::LinearMeta(FFHandler handler, Linear const *li, MemoryAllocator gpu_mem_allocator, int weightSize) - : OpMeta(handler, li) { + : OpMeta(handler, li), weight_ptr(nullptr) { + DataType data_type = li->data_type; + // allocate weight and bias in the reserve space for cpu offloading + if (li->offload) { + weight_ptr = gpu_mem_allocator.allocate_reserved_untyped( + weightSize * data_type_size(data_type)); + if (li->quantization_type != DT_NONE) { + quantized_weightSize = get_quantization_to_byte_size( + data_type, li->quantization_type, weightSize); + quantized_weight_ptr = + gpu_mem_allocator.allocate_reserved(quantized_weightSize); + } + } // Allocate an all-one's vector - float *dram_one_ptr = (float *)malloc(sizeof(float) * batch_size); - for (int i = 0; i < batch_size; i++) { - dram_one_ptr[i] = 1.0f; + gpu_mem_allocator.create_legion_instance( + reserveInst, data_type_size(data_type) * batch_size); + one_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * batch_size); + int parallelism = batch_size; + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (data_type == DT_FLOAT) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((float *)one_ptr, batch_size); + } else if (data_type == DT_HALF) { + Kernels::Linear::Internal:: + build_one_ptr<<>>((half *)one_ptr, batch_size); } - float *fb_one_ptr; - checkCUDA(hipMalloc(&fb_one_ptr, sizeof(float) * batch_size)); - checkCUDA(hipMemcpy(fb_one_ptr, - dram_one_ptr, - sizeof(float) * batch_size, - hipMemcpyHostToDevice)); - one_ptr = (void *)fb_one_ptr; + // Allocate descriptors checkCUDNN(miopenCreateActivationDescriptor(&actiDesc)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); + + allocated_peft_buffer_size = 0; +} + +LinearMeta::~LinearMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } } -LinearMeta::~LinearMeta(void) {} namespace Kernels { namespace Linear { @@ -96,7 +127,61 @@ void forward_kernel_wrapper(LinearMeta const *m, int batch_size) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + 
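The all-ones vector that the LinearMeta constructor above now builds on the device (through the build_one_ptr kernel defined near the end of this file, in the layer's own data type) exists so that the bias gradient in backward_kernel reduces to a single GEMM against the output gradient instead of a custom reduction kernel. For a batch of B tokens,

\[ \nabla b_j = \sum_{i=1}^{B} dY_{j,i}, \qquad \text{that is,} \quad \nabla b = dY\,\mathbf{1}_B , \]

which, up to layout, is what the GemmEx call that takes one_ptr as an operand computes, again with the blending scale set so the result accumulates.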
checkCUDA(hipEventDestroy(t_end)); + printf("%s [Linear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); + } +} +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *weight_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); @@ -126,6 +211,67 @@ void forward_kernel_wrapper(LinearMeta const *m, stream); } + if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->output_type[0]) * max_peft_tokens * out_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy output activation + if (m->output_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->output_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -134,12 +280,60 @@ void forward_kernel_wrapper(LinearMeta const *m, checkCUDA(hipEventDestroy(t_start)); checkCUDA(hipEventDestroy(t_end)); printf("%s [Linear] forward time = %.2lfms\n", m->op_name, elapsed); - // print_tensor(acc_input.ptr, acc_input.rect.volume(), - // "[Linear:forward:input]"); print_tensor(acc_kernel.ptr, - // acc_kernel.rect.volume(), "[Linear:forward:kernel]"); - // print_tensor(acc_bias.ptr, acc_bias.rect.volume(), - // "[Linear:forward:bias]"); print_tensor(acc_output.ptr, - // acc_output.rect.volume(), "[Linear:forward:output]"); + } +} + +void 
peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [Linear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); } } @@ -223,8 +417,20 @@ Parameter* Linear::get_parameter(int index) } } */ - namespace Internal { + +template +__global__ void AddBiasWithReLU(DT *output_ptr, + DT const *bias_ptr, + int out_dim, + int batch_size) { + CUDA_KERNEL_LOOP(i, out_dim * batch_size) { + int bias_idx = i % out_dim; + DT value = output_ptr[i] + bias_ptr[bias_idx]; + output_ptr[i] = ((float)value > 0.0f) ? value : (DT)0.0f; + } +} + template void forward_kernel(LinearMeta const *m, void const *input_ptr, @@ -234,20 +440,57 @@ void forward_kernel(LinearMeta const *m, int in_dim, int out_dim, int batch_size, - hipStream_t stream) { + ffStream_t stream) { + // additional processing for uploading weights + if (m->offload) { + // Note that we update weight_ptr when uploading weight + if (m->quantization_type != DT_NONE) { + checkCUDA(hipMemcpyAsync(m->quantized_weight_ptr, + weight_ptr, + m->quantized_weightSize, + hipMemcpyHostToDevice, + stream)); + if (m->quantization_type == DT_INT4) { + int parallelism = in_dim * out_dim / 2; + decompress_int4_general_weights
<DT> + <<>>(m->quantized_weight_ptr, + static_cast<DT *>
(m->weight_ptr), + in_dim, + in_dim * out_dim); + } else { + assert(m->quantization_type == DT_INT8); + int parallelism = in_dim * out_dim; + decompress_int8_general_weights
<DT> + <<>>(m->quantized_weight_ptr, + static_cast<DT *>
(m->weight_ptr), + in_dim, + in_dim * out_dim); + } + + } else { + checkCUDA(hipMemcpyAsync(m->weight_ptr, + weight_ptr, + in_dim * out_dim * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + } + } checkCUDA(hipblasSetStream(m->handle.blas, stream)); checkCUDNN(miopenSetStream(m->handle.dnn, stream)); DT alpha = 1.0f, beta = 0.0f; hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); - hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipblasDatatype_t weight_type = m->offload + ? ff_to_cuda_datatype(m->weight_ptr_type) + : ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use the output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + assert(input_type == weight_type && weight_type == output_type); hipblasDatatype_t compute_type = output_type; -#endif checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, @@ -255,7 +498,7 @@ void forward_kernel(LinearMeta const *m, batch_size, in_dim, &alpha, - weight_ptr, + m->offload ? m->weight_ptr : weight_ptr, weight_type, in_dim, input_ptr, @@ -269,6 +512,16 @@ void forward_kernel(LinearMeta const *m, HIPBLAS_GEMM_DEFAULT)); // use_bias = True if (bias_ptr != NULL) { + // fuse bias and relu + if (m->activation == AC_MODE_RELU) { + int parallelism = out_dim * batch_size; + AddBiasWithReLU<<>>( + static_cast
<DT *>(output_ptr), + static_cast<DT const *>
(bias_ptr), + out_dim, + batch_size); + return; + } checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_T, HIPBLAS_OP_N, @@ -306,7 +559,7 @@ void forward_kernel(LinearMeta const *m, GET_BLOCKS(elements), CUDA_NUM_THREADS, 0, - 0, + stream, elements, B, C, @@ -318,6 +571,74 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update input_grad_ptr and output_grad_ptr offset + int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; + input_grad_ptr = + static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; + output_grad_ptr = + static_cast<DT *>
(output_grad_ptr) + num_infr_only_tokens * out_dim; + hipblasDatatype_t compute_type = output_type; + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + if (input_grad_ptr != NULL) { + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } +} + template void backward_kernel(LinearMeta const *m, void const *input_ptr, @@ -335,16 +656,11 @@ void backward_kernel(LinearMeta const *m, checkCUDNN(miopenSetStream(m->handle.dnn, stream)); DT alpha = 1.0f; + float sgeam_alpha = 1.0f; hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); hipblasDatatype_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = output_type; -#else - // TODO: currently use output_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = output_type; -#endif int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( @@ -356,7 +672,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(hipblasGemmEx(m->handle.blas, HIPBLAS_OP_N, @@ -377,7 +693,27 @@ void backward_kernel(LinearMeta const *m, in_dim, compute_type, HIPBLAS_GEMM_DEFAULT)); - // Compute bias gradiant + if (m->kernel_reg_type == REG_MODE_NONE) { + // do nothing + } else if (m->kernel_reg_type == REG_MODE_L2) { + checkCUDA(hipblasSgeam(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + out_dim, + &sgeam_alpha, + (float *)kernel_grad_ptr, + in_dim, + &(m->kernel_reg_lambda), + (float *)kernel_ptr, + in_dim, + (float *)kernel_grad_ptr, + in_dim)); + } else { + assert(false && "Only L2 regularization is supported"); + } + + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -388,7 +724,7 @@ void backward_kernel(LinearMeta const *m, out_dim, batch_size, &alpha, - m->one_ptr, + static_cast
(m->one_ptr), HIPBLAS_R_32F, 1, output_grad_ptr, @@ -401,7 +737,7 @@ void backward_kernel(LinearMeta const *m, compute_type, HIPBLAS_GEMM_DEFAULT)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(hipblasGemmEx(m->handle.blas, @@ -426,7 +762,14 @@ void backward_kernel(LinearMeta const *m, } } +template +__global__ void build_one_ptr(DT *one_ptr, int batch_size) { + CUDA_KERNEL_LOOP(i, batch_size) { + one_ptr[i] = static_cast
(1.0f); + } +} + } // namespace Internal } // namespace Linear } // namespace Kernels -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/kernels/linear_kernels.cu b/src/ops/kernels/linear_kernels.cu index c30c9f71c1..d4f930db6c 100644 --- a/src/ops/kernels/linear_kernels.cu +++ b/src/ops/kernels/linear_kernels.cu @@ -63,6 +63,8 @@ LinearMeta::LinearMeta(FFHandler handler, // Allocate descriptors checkCUDNN(cudnnCreateActivationDescriptor(&actiDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); + + allocated_peft_buffer_size = 0; } LinearMeta::~LinearMeta(void) { @@ -170,6 +172,172 @@ void forward_kernel_wrapper(LinearMeta const *m, } } +void inference_kernel_wrapper(LinearMeta *m, + BatchConfig const *bc, + void const *input_ptr, + void *output_ptr, + void const *weight_ptr, + void const *bias_ptr, + int in_dim, + int out_dim, + int batch_size) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->input_type[0] == DT_FLOAT) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::forward_kernel(m, + input_ptr, + output_ptr, + weight_ptr, + bias_ptr, + in_dim, + out_dim, + batch_size, + stream); + } + + if (m->activation == AC_MODE_RELU || m->activation == AC_MODE_SIGMOID) { + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].num_tokens_in_batch; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->output_type[0]) * max_peft_tokens * out_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->output_activation_buffer = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy output activation + if (m->output_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->output_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->output_activation_buffer, + static_cast(output_ptr) + first_token_offset * out_dim, + data_type_size(m->output_type[0]) * num_peft_tokens * out_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + 
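The PEFT backward path for Linear, implemented by peft_bwd_kernel in both the HIP and CUDA files, produces only a data gradient; no weight or bias gradients are formed, consistent with the base weights staying frozen while a PEFT adapter trains. With the weight stored as [in_dim, out_dim] and one token per column, the forward pass computes Y = W^T X, so the backward GEMM is

\[ dX \;\leftarrow\; W\,dY + \beta\,dX, \qquad \beta = 0 \ \text{if reset\_input\_grads[0], else } 1 \ \text{(accumulate)}, \]

applied only to the PEFT request's slice of the batch: both gradient pointers are first advanced past the num_infr_tokens - num_peft_tokens inference-only tokens.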
checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] inference time = %.2lfms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *weight_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + input_grad_ptr, + output_grad_ptr, + weight_ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [Linear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[Linear:forward:input]"); print_tensor((float*)weight_ptr, in_dim + // * out_dim, "[Linear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[Linear:forward:output]"); + } +} + void backward_kernel_wrapper(LinearMeta const *m, void const *input_ptr, void *input_grad_ptr, @@ -323,17 +491,7 @@ void forward_kernel(LinearMeta const *m, : ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); assert(input_type == weight_type && weight_type == output_type); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -398,7 +556,7 @@ void forward_kernel(LinearMeta const *m, size_t elements = (size_t)out_dim * (size_t)batch_size; constexpr float B = 0.7978845608028654f; // sqrt(2.0/M_PI) constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0/M_PI) - gelu_forward_kernel<<>>( + gelu_forward_kernel<<>>( elements, B, C, (float *)output_ptr); } else if (m->activation == AC_MODE_NONE) { // Do nothing @@ -407,6 +565,74 @@ void forward_kernel(LinearMeta const *m, } } +template +void peft_bwd_kernel(LinearMeta const *m, + void *input_grad_ptr, + void *output_grad_ptr, + void const *kernel_ptr, + int in_dim, + int out_dim, + int num_infr_tokens, + int num_peft_tokens, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + // update 
input_grad_ptr and output_grad_ptr offset + int num_infr_only_tokens = num_infr_tokens - num_peft_tokens; + input_grad_ptr = + static_cast
<DT *>(input_grad_ptr) + num_infr_only_tokens * in_dim; + output_grad_ptr = + static_cast<DT *>
(output_grad_ptr) + num_infr_only_tokens * out_dim; + cudaDataType_t compute_type = output_type; + int output_size = out_dim * num_peft_tokens; + if (m->activation == AC_MODE_RELU) { + relu_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else if (m->activation == AC_MODE_SIGMOID) { + sigmoid_backward_kernel(m->output_type[0], + output_grad_ptr, + m->output_activation_buffer, + output_size, + stream); + } else { + // TODO: only support relu and sigmoid for now + assert(m->activation == AC_MODE_NONE); + } + + // Compute data gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + if (input_grad_ptr != NULL) { + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + out_dim, + &alpha, + kernel_ptr, + weight_type, + in_dim, + output_grad_ptr, + output_type, + out_dim, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + template void backward_kernel(LinearMeta const *m, void const *input_ptr, @@ -428,17 +654,7 @@ void backward_kernel(LinearMeta const *m, cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); cudaDataType_t weight_type = ff_to_cuda_datatype(m->weight_type[0]); cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) - cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + cudaDataType_t compute_type = output_type; int output_size = out_dim * batch_size; if (m->activation == AC_MODE_RELU) { relu_backward_kernel( @@ -450,7 +666,7 @@ void backward_kernel(LinearMeta const *m, // TODO: only support relu and sigmoid for now assert(m->activation == AC_MODE_NONE); } - // Compute weight gradiant + // Compute weight gradient // NOTE: we use alpha=1 for kernel_grad to accumulate gradients checkCUDA(cublasGemmEx(m->handle.blas, CUBLAS_OP_N, @@ -491,7 +707,7 @@ void backward_kernel(LinearMeta const *m, assert(false && "Only L2 regularization is supported"); } - // Compute bias gradiant + // Compute bias gradient // NOTE: we use alpha=1 for bias_grad to accumulate gradients // use_bias = True if (bias_grad_ptr != NULL) { @@ -515,7 +731,7 @@ void backward_kernel(LinearMeta const *m, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); } - // Compute data gradiant + // Compute data gradient // NOTE: we use alpha=1 for input_grad to accumulate gradients if (input_grad_ptr != NULL) { checkCUDA(cublasGemmEx(m->handle.blas, diff --git a/src/ops/kernels/lora_linear_kernels.cpp b/src/ops/kernels/lora_linear_kernels.cpp new file mode 100644 index 0000000000..c3c2cce3cf --- /dev/null +++ b/src/ops/kernels/lora_linear_kernels.cpp @@ -0,0 +1,576 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
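Only ReLU and sigmoid epilogues are handled on this backward path (anything else trips the assert on AC_MODE_NONE), and for both of them the derivative can be written purely in terms of the saved output, which is why inference_kernel_wrapper caches output_activation_buffer after the activation rather than the pre-activation values. Below is a minimal sketch of that elementwise step, operating in place on the output gradient; these kernels are illustrative stand-ins, not the actual relu_backward_kernel / sigmoid_backward_kernel helpers from the FlexFlow CUDA utilities:

    #include <cuda_fp16.h>

    // ReLU backward: the pre-activation gradient passes through only where the
    // saved output is positive (output > 0 exactly when pre-activation > 0).
    template <typename DT>
    __global__ void relu_bwd_sketch(DT *out_grad, DT const *saved_out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        out_grad[i] = (float)saved_out[i] > 0.0f ? out_grad[i] : (DT)0.0f;
      }
    }

    // Sigmoid backward: sigma'(z) = y * (1 - y) with y = sigma(z), so the saved
    // output y is all the state that is needed.
    template <typename DT>
    __global__ void sigmoid_bwd_sketch(DT *out_grad, DT const *saved_out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        float g = (float)out_grad[i];
        float y = (float)saved_out[i];
        out_grad[i] = (DT)(g * y * (1.0f - y));
      }
    }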
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/utils/hip_helper.h" +#include +#include +#include + +namespace FlexFlow { + +LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) + : OpMeta(handler, li) { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; +} + +LoraLinearMeta::~LoraLinearMeta(void) {} + +namespace Kernels { +namespace LoraLinear { + +void init_kernel_wrapper(LoraLinearMeta *m, int seed) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (m->input_type[0] == DT_FLOAT) { + Internal::init_kernel(m, seed, stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::init_kernel(m, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [LoraLinear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + 
Internal::peft_bwd_kernel<half>(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("%s [LoraLinear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +namespace Internal { + +template <typename DT> +void init_kernel(LoraLinearMeta *m, int seed, hipStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + for (auto &model_state : m->model_state) { + LoraLinearWeight weight = model_state.second.weights; + int w0_num_elements = weight.rank * weight.in_dim; + int w1_num_elements = weight.rank * weight.out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(weight.in_dim); + std::uniform_real_distribution<float> dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector<DT> lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(hipMemcpyAsync(static_cast<DT *>(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(weight.rank); + std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector<DT> lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(hipMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + hipMemcpyHostToDevice, + stream)); + } +} + +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->input_type[1]); + hipblasDatatype_t lr_actv_type = output_type; + assert(input_type == output_type); + hipblasDatatype_t weight_type = output_type; + hipblasDatatype_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipDataType compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->input_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + // Assert that we have at most one request that requires peft_bwd + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + void *intermediate_result_ptr = nullptr; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed1 = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + size_t activation_size_needed2 = + data_type_size(m->input_type[1]) * max_peft_tokens * rank; + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + if (activation_size_needed1 > m->allocated_peft_buffer_size1) { + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed1); + m->allocated_peft_buffer_size1 = activation_size_needed1; + } + if (activation_size_needed2 > m->allocated_peft_buffer_size2) { + m->low_rank_activation = + allocator->allocate_instance_untyped(activation_size_needed2); + m->allocated_peft_buffer_size2 = activation_size_needed2; + } + // copy input activation + checkCUDA(hipMemcpyAsync(m->input_activation, + input_ptr + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + intermediate_result_ptr = m->low_rank_activation; + } else { + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + intermediate_result_ptr = m->handle.workSpace; + } + // buffer = weight_first * input + // [rank, 
num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + first_token_offset * in_dim, + input_type, + in_dim, + &beta, + intermediate_result_ptr, + lr_actv_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + // output = weight_second * buffer + // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + intermediate_result_ptr, + lr_actv_type, + rank, + &alpha, + output_ptr + first_token_offset * out_dim, + output_type, + out_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } +} + +template +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + DT const *WGrad, + DT *V, + DT *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + DT gt = WGrad[i] + (DT)weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * (DT)momentum + gt; + if (nesterov) { + gt = gt + (DT)momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= (DT)lr * gt; + } +} + +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(hipblasSetStream(m->handle.blas, stream)); + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + hipblasDatatype_t input_type = ff_to_cuda_datatype(m->input_type[0]); + hipblasDatatype_t output_type = ff_to_cuda_datatype(m->output_type[0]); + assert(input_type == output_type); + hipblasDatatype_t weight_type = output_type; + hipblasDatatype_t lr_actv_type = output_type; + hipblasDatatype_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipDataType compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + + // Compute LORA_B weight's gradient + if 
(bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + rank, + out_dim, + num_peft_tokens, + &scaling_constant, + m->low_rank_activation, + lr_actv_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + weight.w1_grad_ptr, + weight_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + // Compute LORA_B input's (and LORA_A output's) gradient inplace in + // low_rank_activation + { + DT alpha = 1.0f, beta = 0.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + // Compute LORA_A weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, + &alpha, + m->input_activation, + input_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + weight.w0_grad_ptr, + weight_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + // Compute input gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + if (input_grad_ptr != nullptr) { + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_N, + HIPBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + } + + if (bc->requestsInfo[i].optimizer_tasks.update_weights) { + LoraOptimizerConfig const *optimizer_config = + m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; + assert(optimizer_config != nullptr); + assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // Get optimizer config + if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + (LoraSGDOptimizerConfig const *)optimizer_config; + // LoRA_A weight is split in tensor parallelism, so no need to apply + // all-reduce + sgd_update<<>>(w0_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w0_grad_ptr), + static_cast<DT *>(weight.w0_v_values_ptr), + static_cast<DT *>(weight.w0_ptr)); + // LoRA_B weight is replicated with tensor parallelism, so we need to sync + // and sum first + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(m->output_type[0]); + checkCUDA(ncclAllReduce(static_cast<DT const *>(weight.w1_grad_ptr), + static_cast<DT *>(weight.w1_grad_ptr), + w1_num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); + sgd_update<<<GET_BLOCKS(w1_num_elements), CUDA_NUM_THREADS, 0, stream>>>(w1_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast<DT const *>(weight.w1_grad_ptr), + static_cast<DT *>(weight.w1_v_values_ptr), + static_cast<DT *>
(weight.w1_ptr)); + } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optimizer type not implemented yet"); + } else { + assert(false && "Unsupported optimizer type"); + } + } + } +} + +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/lora_linear_kernels.cu b/src/ops/kernels/lora_linear_kernels.cu new file mode 100644 index 0000000000..5f130782aa --- /dev/null +++ b/src/ops/kernels/lora_linear_kernels.cu @@ -0,0 +1,579 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/ops/kernels/decompress_kernels.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/cuda_helper.h" +#include +#include + +namespace FlexFlow { + +LoraLinearMeta::LoraLinearMeta(FFHandler handler, LoraLinear const *li) + : OpMeta(handler, li) { + allocated_peft_buffer_size1 = 0; + allocated_peft_buffer_size2 = 0; +} + +LoraLinearMeta::~LoraLinearMeta(void) {} + +namespace Kernels { +namespace LoraLinear { + +void init_kernel_wrapper(LoraLinearMeta *m, int seed) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + if (m->input_type[0] == DT_FLOAT) { + Internal::init_kernel(m, seed, stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::init_kernel(m, seed, stream); + } else { + assert(false && "Unsupported data type"); + } +} + +void inference_kernel_wrapper(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] forward time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +void peft_bwd_kernel_wrapper(LoraLinearMeta *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->input_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + in_dim, + out_dim, + stream); + } else if (m->input_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + in_dim, + out_dim, + stream); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("%s [LoraLinear] PEFT Bwd time = %.2lfms\n", m->op_name, elapsed); + // print_tensor((float*)input_ptr, in_dim * batch_size, + // "[LoraLinear:forward:input]"); print_tensor((float*)weight_ptr, + // in_dim + // * out_dim, "[LoraLinear:forward:kernel]"); + // print_tensor((float*)output_ptr, out_dim * batch_size, + // "[LoraLinear:forward:output]"); + } +} + +namespace Internal { + +template +void init_kernel(LoraLinearMeta *m, int seed, cudaStream_t stream) { + // Initialize generator + std::mt19937 gen(seed); + + // Get handle to weights by iterating over m->model_state to get each + // LoraLinearWeight object + for (auto &model_state : m->model_state) { + LoraLinearWeight weight = model_state.second.weights; + int w0_num_elements = weight.rank * weight.in_dim; + int w1_num_elements = weight.rank * weight.out_dim; + + // LoRA_A weight: [in_dim, rank] + float stdv_lora_a = 1.0f / sqrt(weight.in_dim); + std::uniform_real_distribution dis_lora_a(-stdv_lora_a, stdv_lora_a); + std::vector
<DT> lora_a_random_init(w0_num_elements); + for (auto &num : lora_a_random_init) { + float num_float = dis_lora_a(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast<DT *>(weight.w0_ptr), + lora_a_random_init.data(), + w0_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); + + // LoRA_B weight: [rank, out_dim] + float stdv_lora_b = 1.0f / sqrt(weight.rank); + std::uniform_real_distribution<float> dis_lora_b(-stdv_lora_b, stdv_lora_b); + std::vector<DT> lora_b_random_init(w1_num_elements); + for (auto &num : lora_b_random_init) { + float num_float = dis_lora_b(gen); + if (std::is_same<DT, half>::value) { + num = __float2half(num_float); + } else { + num = num_float; + } + } + checkCUDA(cudaMemcpyAsync(static_cast<DT *>
(weight.w1_ptr), + lora_b_random_init.data(), + w1_num_elements * sizeof(DT), + cudaMemcpyHostToDevice, + stream)); + } +} + +template +void inference_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + DT alpha = 1.0f, beta = 0.0f; + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->input_type[1]); + cudaDataType_t lr_actv_type = output_type; + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; + cudaDataType_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->input_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + // Assert that we have at most one request that requires peft_bwd + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + void *intermediate_result_ptr = nullptr; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed1 = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + size_t activation_size_needed2 = + data_type_size(m->input_type[1]) * max_peft_tokens * rank; + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + if (activation_size_needed1 > m->allocated_peft_buffer_size1) { + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed1); + m->allocated_peft_buffer_size1 = activation_size_needed1; + } + if (activation_size_needed2 > m->allocated_peft_buffer_size2) { + m->low_rank_activation = + allocator->allocate_instance_untyped(activation_size_needed2); + m->allocated_peft_buffer_size2 = activation_size_needed2; + } + // copy input activation + checkCUDA(cudaMemcpyAsync(m->input_activation, + input_ptr + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * + num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + intermediate_result_ptr = m->low_rank_activation; + } else { + // use workspace to save intermediate result + assert(m->handle.workSpaceSize >= + data_type_size(m->input_type[1]) * num_peft_tokens * rank); + intermediate_result_ptr = m->handle.workSpace; + } + // buffer = weight_first * input + // [rank, 
num_peft_tokens] = [in_dim, rank].T * [in_dim, num_peft_tokens] + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + rank, + num_peft_tokens, + in_dim, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + input_ptr + first_token_offset * in_dim, + input_type, + in_dim, + &beta, + intermediate_result_ptr, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + // output = weight_second * buffer + // [out_dim, num_peft_tokens] = [rank, out_dim].T * [rank, num_peft_tokens] + // Note that we use alpha in both places since we do + // an in-place update for LoraLinear + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_T, + CUBLAS_OP_N, + out_dim, + num_peft_tokens, + rank, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + intermediate_result_ptr, + lr_actv_type, + rank, + &alpha, + output_ptr + first_token_offset * out_dim, + output_type, + out_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } +} + +template +__global__ void sgd_update(size_t count, + float lr, + float weight_decay, + float momentum, + bool nesterov, + DT const *WGrad, + DT *V, + DT *W) { + // Refernce https://pytorch.org/docs/stable/_modules/torch/optim/sgd.html#SGD + CUDA_KERNEL_LOOP(i, count) { + DT gt = WGrad[i] + (DT)weight_decay * W[i]; + if (momentum > 0.0f) { + V[i] = V[i] * (DT)momentum + gt; + if (nesterov) { + gt = gt + (DT)momentum * V[i]; + } else { + gt = V[i]; + } + } + W[i] -= (DT)lr * gt; + } +} + +template +void peft_bwd_kernel(LoraLinearMeta *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int in_dim, + int out_dim, + ffStream_t stream) { + checkCUDA(cublasSetStream(m->handle.blas, stream)); + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + cudaDataType_t input_type = ff_to_cuda_datatype(m->input_type[0]); + cudaDataType_t output_type = ff_to_cuda_datatype(m->output_type[0]); + assert(input_type == output_type); + cudaDataType_t weight_type = output_type; + cudaDataType_t lr_actv_type = output_type; + cudaDataType_t compute_type = output_type; + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = output_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + // int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + LoraLinearWeight weight = + m->model_state[bc->requestsInfo[i].peft_model_id].weights; + int rank = weight.rank; + float lora_alpha = + m->model_state[bc->requestsInfo[i].peft_model_id].lora_alpha; + DT scaling_constant = (DT)(lora_alpha / rank); + + // Compute LORA_B weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) 
{ + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + rank, + out_dim, + num_peft_tokens, + &scaling_constant, + m->low_rank_activation, + lr_actv_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + weight.w1_grad_ptr, + weight_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + // Compute LORA_B input's (and LORA_A output's) gradient inplace in + // low_rank_activation + { + DT alpha = 1.0f, beta = 0.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + rank, + num_peft_tokens, + out_dim, + &scaling_constant, + weight.w1_ptr, + weight_type, + rank, + output_grad_ptr, + output_type, + out_dim, + &beta, + m->low_rank_activation, + lr_actv_type, + rank, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + // Compute LORA_A weight's gradient + if (bc->requestsInfo[i].optimizer_tasks.compute_gradients) { + DT alpha = 1.0f; + DT beta = (bc->requestsInfo[i].optimizer_tasks.reset_gradients_to_zero) + ? 0.0f + : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_T, + in_dim, + rank, + num_peft_tokens, + &alpha, + m->input_activation, + input_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + weight.w0_grad_ptr, + weight_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + // Compute input gradient + // NOTE: we use beta=1 for input_grad to accumulate gradients when needed + if (input_grad_ptr != nullptr) { + DT alpha = 1.0f; + DT beta = m->reset_input_grads[0] ? 0.0f : 1.0f; + checkCUDA(cublasGemmEx(m->handle.blas, + CUBLAS_OP_N, + CUBLAS_OP_N, + in_dim, + num_peft_tokens, + rank, + &alpha, + weight.w0_ptr, + weight_type, + in_dim, + m->low_rank_activation, + lr_actv_type, + rank, + &beta, + input_grad_ptr, + input_type, + in_dim, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + + if (bc->requestsInfo[i].optimizer_tasks.update_weights) { + LoraOptimizerConfig const *optimizer_config = + m->model_state[bc->requestsInfo[i].peft_model_id].optimizer_config; + assert(optimizer_config != nullptr); + assert(typeid(*optimizer_config) != typeid(LoraOptimizerConfig)); + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + + // Get optimizer config + if (typeid(*optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + (LoraSGDOptimizerConfig const *)optimizer_config; + // LoRA_A weight is split in tensor parallelism, so no need to apply + // all-reduce + sgd_update<<>>(w0_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast
<DT const *>(weight.w0_grad_ptr), + static_cast<DT *>(weight.w0_v_values_ptr), + static_cast<DT *>(weight.w0_ptr)); + // LoRA_B weight is replicated with tensor parallelism, so we need to sync + // and sum first +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(m->output_type[0]); + checkCUDA(ncclAllReduce(static_cast<DT const *>(weight.w1_grad_ptr), + static_cast<DT *>(weight.w1_grad_ptr), + w1_num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif + sgd_update<<<GET_BLOCKS(w1_num_elements), CUDA_NUM_THREADS, 0, stream>>>(w1_num_elements, + sgd_config->lr, + sgd_config->weight_decay, + sgd_config->momentum, + sgd_config->nesterov, + static_cast<DT const *>(weight.w1_grad_ptr), + static_cast<DT *>(weight.w1_v_values_ptr), + static_cast<DT *>
(weight.w1_ptr)); + } else if (typeid(*optimizer_config) == typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optimizer type not implemented yet"); + } else { + assert(false && "Unsupported optimizer type"); + } + } + } +} + +} // namespace Internal +} // namespace LoraLinear +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/ops/kernels/pool_2d_kernels.cpp b/src/ops/kernels/pool_2d_kernels.cpp index 8af85612ca..b3f20a35dd 100644 --- a/src/ops/kernels/pool_2d_kernels.cpp +++ b/src/ops/kernels/pool_2d_kernels.cpp @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/hip_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); checkCUDNN(miopenCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/pool_2d_kernels.cu b/src/ops/kernels/pool_2d_kernels.cu index b418d20cd3..c236f049ba 100644 --- a/src/ops/kernels/pool_2d_kernels.cu +++ b/src/ops/kernels/pool_2d_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/pool_2d_kernels.h" +#include "flexflow/ops/pool_2d.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -Pool2DMeta::Pool2DMeta(FFHandler handler) : OpMeta(handler) { +Pool2DMeta::Pool2DMeta(FFHandler handler, Pool2D const *pool) + : OpMeta(handler, pool) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); checkCUDNN(cudnnCreatePoolingDescriptor(&poolDesc)); diff --git a/src/ops/kernels/reshape_kernels.cpp b/src/ops/kernels/reshape_kernels.cpp index b17d95bfea..47f407fd82 100644 --- a/src/ops/kernels/reshape_kernels.cpp +++ b/src/ops/kernels/reshape_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/reshape_kernels.cu b/src/ops/kernels/reshape_kernels.cu index 9786f63815..0a2b01ae52 100644 --- a/src/ops/kernels/reshape_kernels.cu +++ b/src/ops/kernels/reshape_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/ops/kernels/reshape_kernels.h" +#include "flexflow/ops/reshape.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -ReshapeMeta::ReshapeMeta(FFHandler handler) : OpMeta(handler) {} +ReshapeMeta::ReshapeMeta(FFHandler handler, Reshape const *reshape) + : OpMeta(handler, reshape) {} namespace Kernels { namespace Reshape { diff --git a/src/ops/kernels/residual_rms_norm_kernels.cpp b/src/ops/kernels/residual_rms_norm_kernels.cpp index 6906556452..016364edfd 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cpp +++ b/src/ops/kernels/residual_rms_norm_kernels.cpp @@ -22,18 +22,16 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; + #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, ResidualRMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; + inplace_residual 
= rms->inplace_residual; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; @@ -47,12 +45,14 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } } + namespace Kernels { namespace ResidualRMSNorm { @@ -78,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -87,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -109,18 +107,13 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; X_out[index] = X1[index] + X2[index]; sum += (static_cast(X_out[index]) * static_cast(X_out[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -128,11 +121,12 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; + Y[index] = static_cast(static_cast(X_out[index]) * + static_cast(rms[i])); + output[index] = static_cast(static_cast(Y[index]) * + static_cast(weights[index % N])); } } @@ -144,19 +138,10 @@ void forward_kernel(ResidualRMSNormMeta const *m, T *residual_output_ptr, T *output_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); hipLaunchKernelGGL(HIP_KERNEL_NAME(ResidualRMSNormFusedForwardKernel), - num_blocks, - num_threads, + m->batch_size, + std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream, m->in_dim, @@ -178,7 +163,57 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, 
stream)); + } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); @@ -211,6 +246,67 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, assert(false && "Unsupported data type"); } + // save input activation if needed for PEFT. This must be done after the + // forward kernel since that's where we add the residual + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + residual_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + residual_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * 
num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -222,6 +318,288 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ float ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX1, + T *dX2, + bool reset_input_grad1, + bool reset_input_grad2) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad1) { + dX1[index] = static_cast(dX_val); + } else { + dX1[index] = dX1_residual[index] + static_cast(dX_val); + } + if (reset_input_grad2) { + dX2[index] = static_cast(dX1[index]); + } else { + dX2[index] += static_cast(dX1[index]); + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. 
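+// Each thread handles one weight column j and accumulates +// dg[j] = sum_i dY[i * N + j] * X[i * N + j] * rrms[i] over the M rows, +// i.e. the gradient of the per-channel scale given the per-row reciprocal +// RMS computed in the forward pass.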
+template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T const *residual_output_rms_input_ptr, + T *residual_input0_grad_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + hipStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + N, + nullptr, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +template +void peft_bwd_kernel(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, + T const *weight_ptr, + hipStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->in_dim; + + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>( + N, + output_grad_0_ptr, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_0_ptr, + input_grad_1_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + } +} + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(output_grad.data_type == residual_output_rms_input.data_type); + assert(residual_output_rms_input.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); 
+ assert(weight.data_type == weight_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + residual_output_rms_input.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + residual_output_rms_input.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); + + if (output_grad_1.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad_1.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? 
nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/residual_rms_norm_kernels.cu b/src/ops/kernels/residual_rms_norm_kernels.cu index 17ac14449b..0d44f0260a 100644 --- a/src/ops/kernels/residual_rms_norm_kernels.cu +++ b/src/ops/kernels/residual_rms_norm_kernels.cu @@ -24,17 +24,14 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, ResidualRMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; + inplace_residual = rms->inplace_residual; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; num_elements = in_dim * batch_size; @@ -48,6 +45,7 @@ ResidualRMSNormMeta::ResidualRMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } ResidualRMSNormMeta::~ResidualRMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -80,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -89,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -111,18 +107,13 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; X_out[index] = X1[index] + X2[index]; sum += (static_cast(X_out[index]) * static_cast(X_out[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -130,11 +121,12 @@ __global__ void ResidualRMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X_out[index]) * static_cast(rms[i]); - output[index] = Y[index] * weights[index % N]; + Y[index] = static_cast(static_cast(X_out[index]) * + static_cast(rms[i])); + output[index] = static_cast(static_cast(Y[index]) * + static_cast(weights[index % N])); } } @@ -147,26 +139,17 @@ void forward_kernel(ResidualRMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualRMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input1_ptr, - input2_ptr, - residual_output_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input1_ptr, + input2_ptr, + residual_output_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(ResidualRMSNormMeta const *m, @@ -219,6 +202,401 @@ void forward_kernel_wrapper(ResidualRMSNormMeta const *m, } } +void inference_kernel_wrapper(ResidualRMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &residual_output, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(input1.data_type == input2.data_type); + assert(output.data_type == input1.data_type); + assert(weight.data_type == output.data_type); + assert(residual_output.data_type == output.data_type); + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input1.get_half_ptr(), + input2.get_half_ptr(), + weight.get_half_ptr(), + residual_output.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input1.get_float_ptr(), + input2.get_float_ptr(), + weight.get_float_ptr(), + residual_output.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported 
data type"); + } + + // save input activation if needed for PEFT. This must be done after the + // forward kernel since that's where we add the residual + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + residual_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ float ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dX1_residual, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX1, + T *dX2, + bool reset_input_grad1, + bool reset_input_grad2) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + 
static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad1) { + dX1[index] = static_cast(dX_val); + } else { + dX1[index] = dX1_residual[index] + static_cast(dX_val); + } + if (reset_input_grad2) { + dX2[index] = static_cast(dX1[index]); + } else { + dX2[index] += static_cast(dX1[index]); + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. +template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(ResidualRMSNormMeta const *m, + T const *output_grad_ptr, + T const *residual_output_rms_input_ptr, + T *residual_input0_grad_ptr, + T *residual_input1_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + N, + nullptr, + output_grad_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + residual_input0_grad_ptr, + residual_input1_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + residual_output_rms_input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +template +void peft_bwd_kernel(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_0_ptr, + T const *output_grad_1_ptr, + T *input_grad_0_ptr, + T *input_grad_1_ptr, + T const *weight_ptr, + cudaStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->in_dim; + + T const *residual_output_rms_input_ptr = + static_cast(m->input_activation); + + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel + <<>>( + N, + output_grad_0_ptr, + output_grad_1_ptr, + residual_output_rms_input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_0_ptr, + input_grad_1_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1]); + } +} + +/* + regions[0](I): RMS output_grad + regions[1](I): Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void backward_kernel_wrapper( + ResidualRMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &residual_output_rms_input, + GenericTensorAccessorW const &residual_input0_grad, + GenericTensorAccessorW const &residual_input1_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) 
{ + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad.data_type == residual_output_rms_input.data_type); + assert(residual_output_rms_input.data_type == residual_input0_grad.data_type); + assert(residual_input0_grad.data_type == residual_input1_grad.data_type); + assert(residual_input1_grad.data_type == weight.data_type); + assert(weight.data_type == weight_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + residual_output_rms_input.get_half_ptr(), + residual_input0_grad.get_half_ptr(), + residual_input1_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + residual_output_rms_input.get_float_ptr(), + residual_input0_grad.get_float_ptr(), + residual_input1_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +void peft_bwd_kernel_wrapper(ResidualRMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad_0, + GenericTensorAccessorR const &output_grad_1, + GenericTensorAccessorW const &input_grad_0, + GenericTensorAccessorW const &input_grad_1, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(output_grad_1.data_type == input_grad_0.data_type); + assert(input_grad_0.data_type == input_grad_1.data_type); + assert(input_grad_1.data_type == weight.data_type); + + if (output_grad_1.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? nullptr + : output_grad_0.get_half_ptr(), + output_grad_1.get_half_ptr(), + input_grad_0.get_half_ptr(), + input_grad_1.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad_1.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + m->reset_input_grads[0] ? 
nullptr + : output_grad_0.get_float_ptr(), + output_grad_1.get_float_ptr(), + input_grad_0.get_float_ptr(), + input_grad_1.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualRMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + } // namespace ResidualRMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/rms_norm_kernels.cpp b/src/ops/kernels/rms_norm_kernels.cpp index 24ab7051e6..4158628005 100644 --- a/src/ops/kernels/rms_norm_kernels.cpp +++ b/src/ops/kernels/rms_norm_kernels.cpp @@ -23,16 +23,12 @@ namespace FlexFlow { // declare Legion names using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; @@ -47,12 +43,14 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } RMSNormMeta::~RMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { reserveInst.destroy(); } } + namespace Kernels { namespace RMSNorm { @@ -78,7 +76,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -87,9 +85,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -107,16 +103,11 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -124,10 +115,9 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rms[i]); + Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } } @@ -138,19 +128,10 @@ void forward_kernel(RMSNormMeta const *m, T const *weight_ptr, T *output_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormFusedForwardKernel), - num_blocks, - num_threads, + m->batch_size, + std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream, m->in_dim, @@ -204,6 +185,363 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if 
(activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + + if (input.data_type == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX, + bool reset_input_grad) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad) { + dX[index] = dX_val; + } else { + dX[index] += dX_val; + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. 
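For reference, the backward decomposition implemented by ComputeInternalGradientsCUDAKernel and RMSNormBackwardCUDAKernel above is dX = rrms * dY * gamma + c2 * X, where rrms is the cached reciprocal RMS from the forward pass and c2 = -(sum_j dY_j * X_j * gamma_j) * rrms^3 / N; the kernel that follows accumulates gamma's gradient column-wise. A minimal single-row host sketch of the same math (illustrative only; the function name is not part of this patch, and float accumulation is assumed):

#include <cmath>

// Single-row RMSNorm backward, matching the c1/c2 split used by the kernels
// above: c1 is the cached reciprocal RMS, c2 is the data-dependent correction.
void rms_norm_backward_row_ref(int N, float eps,
                               float const *x, float const *dy,
                               float const *gamma, float *dx) {
  float sumsq = 0.0f, ds = 0.0f;
  for (int j = 0; j < N; ++j) {
    sumsq += x[j] * x[j];          // recomputed here; the kernels cache it
    ds += dy[j] * x[j] * gamma[j]; // reduction done by ComputeInternalGradients
  }
  float const rrms = 1.0f / std::sqrt(sumsq / N + eps); // c1 (m->rms_ptr)
  float const c2 = -ds * rrms * rrms * rrms / N;        // stored in m->norm_ptr
  for (int j = 0; j < N; ++j) {
    dx[j] = rrms * dy[j] * gamma[j] + c2 * x[j];
  }
}
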
+template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T const *input_ptr, + T *input_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + hipStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + m->in_dim, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(input_grad.data_type == input.data_type); + assert(weight_grad.data_type == weight.data_type); + assert(output_grad.data_type == input.data_type); + assert(weight.data_type == output_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +template +void peft_bwd_kernel(RMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *weight_ptr, + hipStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + 
stream, + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(RMSNormBackwardCUDAKernel), + M, + std::min(N, CUDA_NUM_THREADS), + 0, + stream, + m->in_dim, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + } +} + +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + assert(input_grad.data_type == output_grad.data_type); + assert(output_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[RMSNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/rms_norm_kernels.cu b/src/ops/kernels/rms_norm_kernels.cu index 7c9f4a9f98..dd6ada864d 100644 --- a/src/ops/kernels/rms_norm_kernels.cu +++ b/src/ops/kernels/rms_norm_kernels.cu @@ -24,16 +24,12 @@ namespace FlexFlow { using Legion::coord_t; #define C10_WARP_SIZE 32 -constexpr int kCUDABlockReduceNumThreads = 512; -constexpr int kCUDANumThreads = 256; RMSNormMeta::RMSNormMeta(FFHandler handler, RMSNorm const *rms, MemoryAllocator &gpu_mem_allocator) : OpMeta(handler, rms) { eps = rms->eps; - alpha = 1.0f; - beta = 0.0f; in_dim = rms->data_dim; batch_size = rms->effective_batch_size; @@ -48,6 +44,7 @@ RMSNormMeta::RMSNormMeta(FFHandler handler, rms_ptr_size * data_type_size(data_type)); norm_ptr = gpu_mem_allocator.allocate_instance_untyped( norm_ptr_size * data_type_size(data_type)); + allocated_peft_buffer_size = 0; } RMSNormMeta::~RMSNormMeta(void) { if (reserveInst != Realm::RegionInstance::NO_INST) { @@ -96,66 +93,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? 
shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - -#ifdef DEADCODE -template -__global__ void - RowwiseRootMeanSquareKernel(long long N, float eps, T const *X, T *rms) { - __shared__ float v_shared[C10_WARP_SIZE]; - long long const i = blockIdx.x; - float sum = 0.0f; - for (long long j = threadIdx.x; j < N; j += blockDim.x) { - long long const index = i * N + j; - sum += (static_cast(X[index]) * static_cast(X[index])); - } - sum = BlockReduceSum(sum, - v_shared); // use BlockReduceSum() to sum X_ij^2 - - if (threadIdx.x == 0) { - rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); - } -} - -template -__global__ void NormKernel(int64_t N, T const *X, T const *rstd, T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rstd[i]); - } -} - -template -__global__ void elewise_apply_weights(int64_t batch_size, - int64_t in_dim, - T const *norm, - T const *weights, - T *output) { - CUDA_KERNEL_LOOP(i, batch_size * in_dim) { - output[i] = norm[i] * weights[i % in_dim]; - } -} -#endif - template __global__ void RMSNormFusedForwardKernel(int64_t N, float eps, @@ -167,16 +104,11 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __shared__ float v_shared[C10_WARP_SIZE]; int64_t const i = blockIdx.x; float sum = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { int64_t const index = i * N + j; sum += (static_cast(X[index]) * static_cast(X[index])); } - sum = BlockReduceSum( - sum, - v_shared, - min(blockDim.x, - kCUDABlockReduceNumThreads)); // use BlockReduceSum() to sum X_ij^2 + sum = BlockReduceSum(sum, v_shared); if (threadIdx.x == 0) { rms[i] = static_cast(rsqrt((sum / static_cast(N)) + eps)); @@ -184,10 +116,9 @@ __global__ void RMSNormFusedForwardKernel(int64_t N, __syncthreads(); - using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; - Y[index] = static_cast(X[index]) * static_cast(rms[i]); + Y[index] = static_cast(X[index]) * static_cast(rms[i]); output[index] = Y[index] * weights[index % N]; } } @@ -199,24 +130,15 @@ void forward_kernel(RMSNormMeta const *m, T *output_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - RMSNormFusedForwardKernel - <<>>(m->in_dim, - m->eps, - input_ptr, - static_cast(m->rms_ptr), - static_cast(m->norm_ptr), - weight_ptr, - output_ptr); + <<batch_size, std::min(CUDA_NUM_THREADS, m->in_dim), 0, stream>>>( + m->in_dim, + m->eps, + input_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + weight_ptr, + output_ptr); } void forward_kernel_wrapper(RMSNormMeta const *m, @@ -261,6 +183,346 @@ void forward_kernel_wrapper(RMSNormMeta const *m, } } +void inference_kernel_wrapper(RMSNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + 
cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + assert(output.data_type == input.data_type); + assert(weight.data_type == output.data_type); + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + + if (input.data_type == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (input.data_type == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(input.data_type) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (output.data_type == DT_HALF) { + forward_kernel(m, + input.get_half_ptr(), + weight.get_half_ptr(), + output.get_half_ptr(), + stream); + } else if (output.data_type == DT_FLOAT) { + forward_kernel(m, + input.get_float_ptr(), + weight.get_float_ptr(), + output.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] forward time (CF) = %.2fms\n", elapsed); + } +} + +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T const *rrms, T *c2) { + __shared__ T ds_storage[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + float ds = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + int const index = i * N + j; + ds += static_cast(dY[index]) * static_cast(X[index]) * + static_cast(gamma[j]); + } + ds = BlockReduceSum(ds, ds_storage); + if (threadIdx.x == 0) { + float const c2_val = + -ds * + (static_cast(rrms[i]) * static_cast(rrms[i]) * + static_cast(rrms[i])) / + static_cast((int)N); + c2[i] = static_cast(c2_val); + } +} + +template +__global__ void 
RMSNormBackwardCUDAKernel(int64_t N, + T const *dY, + T const *X, + T const *gamma, + T const *c1, + T const *c2, + T *dX, + bool reset_input_grad) { + const int64_t i = blockIdx.x; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + float const dX_val = + static_cast(c1[i]) * static_cast(dY[index]) * + static_cast(gamma[j]) + + static_cast(c2[i]) * static_cast(X[index]); + if (reset_input_grad) { + dX[index] = dX_val; + } else { + dX[index] += dX_val; + } + } +} + +// Assume the batch size will not be very large, direct implementation is the +// most efficient one. +template +__global__ void GammaBackwardCUDAKernel( + int64_t M, int64_t N, T const *dY, T const *X, T const *rrms, T *dg) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T sum1 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dY[index] * X[index] * rrms[i]; + } + dg[j] = sum1; + } +} + +template +void backward_kernel(RMSNormMeta const *m, + T const *output_grad_ptr, + T const *input_ptr, + T *input_grad_ptr, + T const *weight_ptr, + T *weight_grad_ptr, + cudaStream_t stream) { + int M = m->batch_size; + int N = m->in_dim; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + + RMSNormBackwardCUDAKernel<<>>( + m->in_dim, + output_grad_ptr, + input_ptr, + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + GammaBackwardCUDAKernel<<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->rms_ptr), + weight_grad_ptr); +} + +void backward_kernel_wrapper(RMSNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight, + GenericTensorAccessorW const &weight_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == input.data_type); + assert(weight_grad.data_type == weight.data_type); + assert(output_grad.data_type == input.data_type); + assert(weight.data_type == output_grad.data_type); + + if (output_grad.data_type == DT_HALF) { + backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + weight_grad.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + weight_grad.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +template +void peft_bwd_kernel(RMSNormMeta const *m, + BatchConfig const *bc, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *weight_ptr, + cudaStream_t stream) { + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) 
{ + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + + int M = bc->requestsInfo[i].num_tokens_in_batch; + int N = m->num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr)); + RMSNormBackwardCUDAKernel + <<>>( + m->in_dim, + output_grad_ptr, + static_cast(m->input_activation), + weight_ptr, + static_cast(m->rms_ptr), + static_cast(m->norm_ptr), + input_grad_ptr, + m->reset_input_grads[0]); + } +} + +void peft_bwd_kernel_wrapper(RMSNormMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &weight) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + assert(input_grad.data_type == output_grad.data_type); + assert(output_grad.data_type == weight.data_type); + + if (output_grad.data_type == DT_HALF) { + peft_bwd_kernel(m, + bc, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + weight.get_half_ptr(), + stream); + } else if (output_grad.data_type == DT_FLOAT) { + peft_bwd_kernel(m, + bc, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + weight.get_float_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[RMSNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + } // namespace RMSNorm } // namespace Kernels } // namespace FlexFlow diff --git a/src/ops/kernels/softmax.cpp b/src/ops/kernels/softmax.cpp index 89c9f14a01..fa31c5adff 100644 --- a/src/ops/kernels/softmax.cpp +++ b/src/ops/kernels/softmax.cpp @@ -25,13 +25,13 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, softmax) { checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain4SoftMax(inputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + inputTensor, input_domain, softmax->data_type)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); - checkCUDNN( - cudnnSetTensorDescriptorFromDomain4SoftMax(outputTensor, input_domain)); + checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( + outputTensor, input_domain, softmax->data_type)); dim = softmax->dim; profiling = softmax->profiling; inference_debugging = softmax->inference_debugging; @@ -41,20 +41,26 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { -template void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - hipEvent_t t_start, t_end; if (m->profiling) { checkCUDA(hipEventCreate(&t_start)); checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + if (m->output_type[0] == DT_FLOAT) { + Internal::forward_kernel( + m, input.get_float_ptr(), 
output.get_float_ptr(), stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::forward_kernel( + m, input.get_half_ptr(), output.get_half_ptr(), stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -70,11 +76,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, } } -template void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements) { + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -84,8 +88,22 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } - Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + assert(input_grad.domain == output_grad.domain); + if (m->output_type[0] == DT_FLOAT) { + Internal::backward_kernel(m, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::backward_kernel(m, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -101,21 +119,112 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); -template void forward_kernel_wrapper(SoftmaxMeta const *m, - half const *input_ptr, - half *output_ptr); - -template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); -template void backward_kernel_wrapper(SoftmaxMeta const *m, - half *input_grad_ptr, - half const *output_grad_ptr, - size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(hipMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + hipMemcpyDeviceToDevice, + stream)); + } + } else if (m->output_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(hipMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + hipMemcpyDeviceToDevice, + stream)); + } + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); 
print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + int num_classes = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + num_classes, + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + num_classes, + stream); + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} namespace Internal { template @@ -138,7 +247,8 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, hipStream_t stream) { @@ -149,6 +259,116 @@ void backward_kernel(DT *input_grad_ptr, stream)); } +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + hipStream_t stream) { + checkCUDNN(miopenSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + checkCUDNN(miopenSet4dTensorDescriptor(m->outputTensor, + cudnn_data_type, + bc->num_active_tokens(), + num_classes, + 1, + 1)); + checkCUDNN(miopenSoftmaxForward_V2(m->handle.dnn, + &alpha, + m->outputTensor, + input_ptr, + &beta, + m->outputTensor, + output_ptr, + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_CHANNEL)); +} + +template +__global__ void sparse_categorical_crossentropy_loss_peft_backward( + DT *input_grad, + DT const *output_grad, + BatchConfig::TokenId const *token_ids, + int num_tokens, + int num_classes) { + CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; + input_grad[i] = output_grad[i]; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; + } + } +} + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + hipStream_t stream) { + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int tokens_previous_requests = 0; + for (int i = 0; i < 
bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(hipMemsetAsync(input_grad_ptr + + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); + checkCUDA(hipMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + hipMemcpyHostToDevice, + stream)); + hipLaunchKernelGGL( + HIP_KERNEL_NAME(sparse_categorical_crossentropy_loss_peft_backward
), + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream, + input_grad_ptr + tokens_previous_requests * num_classes, + output_grad_ptr + tokens_previous_requests * num_classes, + static_cast(m->handle.workSpace), + num_bwd_tokens, + num_classes); + // scale + hipLaunchKernelGGL(HIP_KERNEL_NAME(scale_kernel
), + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream, + input_grad_ptr + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); + + tokens_previous_requests += num_bwd_tokens + 1; + } + assert(tokens_previous_requests == bc->num_active_tokens()); +} + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/src/ops/kernels/softmax.cu b/src/ops/kernels/softmax.cu index e47006cc9d..16f1219bf6 100644 --- a/src/ops/kernels/softmax.cu +++ b/src/ops/kernels/softmax.cu @@ -24,7 +24,7 @@ using Legion::Domain; SoftmaxMeta::SoftmaxMeta(FFHandler handler, Softmax const *softmax, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, softmax) { checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnSetTensorDescriptorFromDomain4SoftMax( inputTensor, input_domain, softmax->data_type)); @@ -40,10 +40,9 @@ SoftmaxMeta::SoftmaxMeta(FFHandler handler, namespace Kernels { namespace Softmax { -template void forward_kernel_wrapper(SoftmaxMeta const *m, - DT const *input_ptr, - DT *output_ptr) { + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); cudaEvent_t t_start, t_end; @@ -52,7 +51,15 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::forward_kernel(m, input_ptr, output_ptr, stream); + if (m->output_type[0] == DT_FLOAT) { + Internal::forward_kernel( + m, input.get_float_ptr(), output.get_float_ptr(), stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::forward_kernel( + m, input.get_half_ptr(), output.get_half_ptr(), stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -68,11 +75,9 @@ void forward_kernel_wrapper(SoftmaxMeta const *m, } } -template void backward_kernel_wrapper(SoftmaxMeta const *m, - DT *input_grad_ptr, - DT const *output_grad_ptr, - size_t num_elements) { + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); @@ -82,8 +87,22 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } - Internal::backward_kernel( - input_grad_ptr, output_grad_ptr, num_elements, stream); + assert(input_grad.domain == output_grad.domain); + if (m->output_type[0] == DT_FLOAT) { + Internal::backward_kernel(m, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::backward_kernel(m, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + stream); + } else { + assert(false && "Unsupported data type"); + } if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -99,21 +118,113 @@ void backward_kernel_wrapper(SoftmaxMeta const *m, } } -template void forward_kernel_wrapper(SoftmaxMeta const *m, - float const *input_ptr, - float *output_ptr); -template void forward_kernel_wrapper(SoftmaxMeta const *m, - half const *input_ptr, - half *output_ptr); - -template void backward_kernel_wrapper(SoftmaxMeta const *m, - float *input_grad_ptr, - float const *output_grad_ptr, - size_t num_elements); -template void backward_kernel_wrapper(SoftmaxMeta const *m, - half *input_grad_ptr, - 
half const *output_grad_ptr, - size_t num_elements); +void inference_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + bool is_last_op, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output, + GenericTensorAccessorW const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + int num_classes = output.domain.hi()[0] - output.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::inference_kernel(m, + bc, + input.get_float_ptr(), + output.get_float_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_float_ptr(), + output.get_float_ptr(), + output.domain.get_volume() * sizeof(float), + cudaMemcpyDeviceToDevice, + stream)); + } + } else if (m->output_type[0] == DT_HALF) { + Internal::inference_kernel(m, + bc, + input.get_half_ptr(), + output.get_half_ptr(), + num_classes, + stream); + if (is_last_op) { + checkCUDA(cudaMemcpyAsync(output_grad.get_half_ptr(), + output.get_half_ptr(), + output.domain.get_volume() * sizeof(half), + cudaMemcpyDeviceToDevice, + stream)); + } + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + +void peft_bwd_kernel_wrapper(SoftmaxMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + int num_classes = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + if (m->output_type[0] == DT_FLOAT) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_float_ptr(), + output_grad.get_float_ptr(), + num_classes, + stream); + } else if (m->output_type[0] == DT_HALF) { + Internal::peft_bwd_kernel(m, + bc, + input_grad.get_half_ptr(), + output_grad.get_half_ptr(), + num_classes, + stream); + } else { + assert(false && "Unsupported data type"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + // print_tensor(acc_input.ptr, acc_input.rect.volume(), + // "[Softmax:forward:input]"); print_tensor(acc_output.ptr, + // acc_output.rect.volume(), "[Softmax:forward:output]"); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + log_measure.debug( + "%s [Softmax] inference time = %.2fms\n", m->op_name, elapsed); + } +} + namespace Internal { template void forward_kernel(SoftmaxMeta const *m, @@ -135,7 +246,8 @@ void forward_kernel(SoftmaxMeta const *m, } template -void backward_kernel(DT *input_grad_ptr, +void backward_kernel(SoftmaxMeta const *m, + DT *input_grad_ptr, DT const *output_grad_ptr, size_t num_elements, cudaStream_t stream) { @@ -146,6 +258,115 @@ void 
backward_kernel(DT *input_grad_ptr, stream)); } +template +void inference_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT const *input_ptr, + DT *output_ptr, + int num_classes, + cudaStream_t stream) { + checkCUDNN(cudnnSetStream(m->handle.dnn, stream)); + + float alpha = 1.0f, beta = 0.0f; + cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); + checkCUDNN(cudnnSetTensor4dDescriptor(m->outputTensor, + CUDNN_TENSOR_NCHW, + cudnn_data_type, + bc->num_active_tokens(), + num_classes, + 1, + 1)); + checkCUDNN(cudnnSoftmaxForward(m->handle.dnn, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + m->outputTensor, + input_ptr, + &beta, + m->outputTensor, + output_ptr)); +} + +template +__global__ void sparse_categorical_crossentropy_loss_peft_backward( + DT *input_grad, + DT const *output_grad, + BatchConfig::TokenId const *token_ids, + int num_tokens, + int num_classes) { + CUDA_KERNEL_LOOP(i, num_tokens * num_classes) { + int class_idx = i % num_classes; + int token_idx = i / num_classes; + input_grad[i] = output_grad[i]; + if (class_idx == token_ids[token_idx]) { + input_grad[i] = input_grad[i] - (DT)1.0f; + } + } +} + +template +void peft_bwd_kernel(SoftmaxMeta const *m, + BatchConfig const *bc, + DT *input_grad_ptr, + DT const *output_grad_ptr, + int num_classes, + cudaStream_t stream) { + BatchConfig::TokenId token_ids[BatchConfig::MAX_NUM_TOKENS]; + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (!bc->requestsInfo[i].peft_bwd) { + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_bwd_tokens = bc->requestsInfo[i].num_tokens_in_batch - 1; + // shift labels by 1 position to the left (ignore first token label) + for (int j = 0; j < num_bwd_tokens; j++) { + token_ids[j] = bc->tokensInfo[j + tokens_previous_requests + 1].token_id; + } + + DT scale_factor = 1.0 / (bc->requestsInfo[i].num_tokens_in_batch - 1); + // ignore last token + checkCUDA(cudaMemsetAsync( + input_grad_ptr + (tokens_previous_requests + + bc->requestsInfo[i].num_tokens_in_batch - 1) * + num_classes, + 0, + num_classes * sizeof(DT), + stream)); + checkCUDA(cudaMemcpyAsync(m->handle.workSpace, + token_ids, + sizeof(BatchConfig::TokenId) * num_bwd_tokens, + cudaMemcpyHostToDevice, + stream)); + sparse_categorical_crossentropy_loss_peft_backward<<< + GET_BLOCKS(num_bwd_tokens * num_classes), + CUDA_NUM_THREADS, + 0, + stream>>>( + input_grad_ptr + tokens_previous_requests * num_classes, + output_grad_ptr + tokens_previous_requests * num_classes, + static_cast(m->handle.workSpace), + num_bwd_tokens, + num_classes); + // scale + scale_kernel<<>>(input_grad_ptr + + tokens_previous_requests * num_classes, + num_bwd_tokens * num_classes, + DT(0.0), + scale_factor); + + tokens_previous_requests += num_bwd_tokens + 1; + } + assert(tokens_previous_requests == bc->num_active_tokens()); +} + } // namespace Internal } // namespace Softmax } // namespace Kernels diff --git a/src/ops/kernels/transpose_kernels.cpp b/src/ops/kernels/transpose_kernels.cpp index 49a7d827f5..199e1cd0c1 100644 --- a/src/ops/kernels/transpose_kernels.cpp +++ b/src/ops/kernels/transpose_kernels.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/hip_helper.h" #include @@ -22,6 +23,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; 
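A note on the sparse_categorical_crossentropy_loss_peft_backward / peft_bwd_kernel pair added in the softmax hunks above: for each PEFT request the targets are the request's own token ids shifted left by one position, the last token (which has no target) gets a zeroed gradient row, and the usual softmax-minus-one-hot gradient is scaled by 1/(num_tokens - 1). A rough CPU equivalent for a single request is sketched below; the names and the row-major layout are illustrative and not part of the patch (probs stands for the softmax output that inference_kernel_wrapper copies into output_grad when is_last_op is set):

// probs:     [num_tokens x num_classes] softmax probabilities for the request
// token_ids: the request's token ids (labels are token_ids shifted by one)
// grad:      [num_tokens x num_classes] gradient produced for peft_bwd
void softmax_ce_peft_bwd_ref(int num_tokens, int num_classes,
                             float const *probs, int const *token_ids,
                             float *grad) {
  int const num_bwd_tokens = num_tokens - 1;
  float const scale = 1.0f / num_bwd_tokens;
  for (int t = 0; t < num_bwd_tokens; ++t) {
    int const label = token_ids[t + 1]; // next-token prediction target
    for (int c = 0; c < num_classes; ++c) {
      float g = probs[t * num_classes + c];
      if (c == label) {
        g -= 1.0f; // softmax output minus one-hot label
      }
      grad[t * num_classes + c] = g * scale;
    }
  }
  // last position has no target; its gradient row is zeroed (memset in the kernel)
  for (int c = 0; c < num_classes; ++c) {
    grad[num_bwd_tokens * num_classes + c] = 0.0f;
  }
}
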
+TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/kernels/transpose_kernels.cu b/src/ops/kernels/transpose_kernels.cu index b401ff0ba1..18a6e405af 100644 --- a/src/ops/kernels/transpose_kernels.cu +++ b/src/ops/kernels/transpose_kernels.cu @@ -14,6 +14,7 @@ */ #include "flexflow/ops/kernels/transpose_kernels.h" +#include "flexflow/ops/transpose.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { @@ -21,6 +22,9 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Domain; +TransposeMeta::TransposeMeta(FFHandler handler, Transpose const *transpose) + : OpMeta(handler, transpose) {} + struct TransposeStrides { int num_dim; int in_strides[MAX_TENSOR_DIM], out_strides[MAX_TENSOR_DIM], diff --git a/src/ops/layer_norm.cc b/src/ops/layer_norm.cc index b19f400eb2..3161987d60 100644 --- a/src/ops/layer_norm.cc +++ b/src/ops/layer_norm.cc @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/model.h" #include "flexflow/utils/hash_utils.h" #include "legion/legion_utilities.h" @@ -56,7 +57,7 @@ LayerNormParams LayerNorm::get_params() const { params.elementwise_affine = this->elementwise_affine; params.eps = this->eps; params.use_bias = this->use_bias; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -561,7 +562,7 @@ void LayerNorm::inference_task(Task const *task, assert(regions.size() == 2); } - LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); + LayerNorm::inference_kernel_wrapper(m, bc, in, out, gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); @@ -645,6 +646,104 @@ void LayerNorm::forward_task(Task const *task, LayerNorm::forward_kernel_wrapper(m, in, out, gamma, beta); } +Legion::FutureMap + LayerNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "LayerNorm op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // regions[0](I): output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + if (elementwise_affine) { + // regions[2](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I/O): input_grad + regions[2](I): gamma +*/ +void LayerNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); + assert(task->regions.size() == regions.size()); + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + + Domain out_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain in_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 3)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + Domain gamma_domain = runtime->get_index_space_domain( + ctx, task->regions[2].region.get_index_space()); + + assert(gamma_domain.get_volume() == m->effective_num_elements); + } else { + assert(regions.size() == 2); + } + LayerNorm::peft_bwd_kernel_wrapper(m, output_grad, input_grad, gamma); +} + void LayerNorm::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -722,55 +821,60 @@ void LayerNorm::backward_task(Task const *task, Runtime *runtime) { LayerNormMeta const *m = *((LayerNormMeta **)task->local_args); assert(task->regions.size() == regions.size()); - float const *in_ptr = NULL, *out_grad_ptr = NULL, *gamma_ptr = NULL; - float *in_grad_ptr = NULL, *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = 
helperGetGenericTensorAccessorRW( + m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; Domain out_grad_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); - out_grad_ptr = helperGetTensorPointerRO( - regions[0], task->regions[0], FID_DATA, ctx, runtime); Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - in_ptr = helperGetTensorPointerRO( - regions[1], task->regions[1], FID_DATA, ctx, runtime); Domain in_grad_domain = runtime->get_index_space_domain( ctx, task->regions[2].region.get_index_space()); - in_grad_ptr = helperGetTensorPointerRW( - regions[2], task->regions[2], FID_DATA, ctx, runtime); assert(in_domain == out_grad_domain); assert(in_domain.get_volume() == m->effective_num_elements * m->effective_batch_size); + if (m->elementwise_affine) { assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + gamma_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); Domain gamma_domain = runtime->get_index_space_domain( ctx, task->regions[3].region.get_index_space()); - gamma_ptr = helperGetTensorPointerRO( - regions[3], task->regions[3], FID_DATA, ctx, runtime); Domain gamma_grad_domain = runtime->get_index_space_domain( ctx, task->regions[4].region.get_index_space()); - gamma_grad_ptr = helperGetTensorPointerRW( - regions[4], task->regions[4], FID_DATA, ctx, runtime); if (m->use_bias) { Domain beta_grad_domain = runtime->get_index_space_domain( ctx, task->regions[5].region.get_index_space()); - beta_grad_ptr = helperGetTensorPointerRW( - regions[5], task->regions[5], FID_DATA, ctx, runtime); + beta_grad = helperGetGenericTensorAccessorRW(m->output_type[0], + regions[5], + task->regions[5], + FID_DATA, + ctx, + runtime); assert(gamma_domain == beta_grad_domain); } - assert(gamma_domain == gamma_grad_domain); - assert(gamma_domain.get_volume() == m->effective_num_elements); } else { assert(regions.size() == 3); } - - LayerNorm::backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + LayerNorm::backward_kernel_wrapper( + m, output_grad, input, input_grad, gamma, gamma_grad, beta_grad); } bool LayerNorm::measure_operator_cost(Simulator *sim, @@ -785,7 +889,8 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } Domain input_domain = sub_input.get_domain(); Domain output_domain = sub_output.get_domain(); - LayerNormMeta *m = sim->layernorm_meta; + MemoryAllocator gpu_mem_allocator(sim->memory); + LayerNormMeta *m = new LayerNormMeta(sim->handler, this, gpu_mem_allocator); sim->free_all(); float *in_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); @@ -821,16 +926,24 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, if (sim->computationMode == COMP_MODE_TRAINING) { float *in_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorW in_grad_acc( + inputs[0]->data_type, input_domain, in_grad_ptr); assert(in_grad_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *out_grad_ptr = NULL; out_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorR out_grad_acc( + outputs[0]->data_type, output_domain, out_grad_ptr); 
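
The backward and peft_bwd tasks in this file hand the accessors straight to backward_kernel_wrapper / peft_bwd_kernel_wrapper; numerically, the input gradient those kernels are launched to produce is the standard LayerNorm formula for y = gamma * (x - mean) * rstd + beta using the saved mean/rstd statistics. A small host-side sketch of that formula for a single row (assumed shapes, a reference for the math rather than the actual FlexFlow kernels):

```cpp
#include <vector>

// dL/dx for one LayerNorm row of length N, given dy, gamma, and the saved
// statistics mean and rstd = 1/sqrt(var + eps). This is the quantity
// layer_norm_grad_input_kernel is launched to compute on the GPU, and the
// only gradient the PEFT backward path needs (gamma/beta grads are skipped).
std::vector<float> layernorm_input_grad(std::vector<float> const &dy,
                                        std::vector<float> const &x,
                                        std::vector<float> const &gamma,
                                        float mean,
                                        float rstd) {
  int const N = (int)x.size();
  float sum_g = 0.0f, sum_gx = 0.0f;
  for (int j = 0; j < N; j++) {
    float xhat = (x[j] - mean) * rstd;
    float g = dy[j] * gamma[j];
    sum_g += g;
    sum_gx += g * xhat;
  }
  std::vector<float> dx(N);
  for (int j = 0; j < N; j++) {
    float xhat = (x[j] - mean) * rstd;
    float g = dy[j] * gamma[j];
    dx[j] = rstd * (g - sum_g / N - xhat * (sum_gx / N));
  }
  return dx;
}
```
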
assert(out_grad_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *gamma_grad_ptr = NULL, *beta_grad_ptr = NULL; + GenericTensorAccessorW gamma_grad_acc( + outputs[0]->data_type, output_domain, gamma_grad_ptr); + GenericTensorAccessorW beta_grad_acc( + outputs[0]->data_type, output_domain, beta_grad_ptr); out_of_memory = (in_grad_ptr == NULL) || (out_grad_ptr == NULL) || (((gamma_grad_ptr == NULL) || (beta_grad_ptr == NULL)) && @@ -842,13 +955,13 @@ bool LayerNorm::measure_operator_cost(Simulator *sim, } backward = [=] { - backward_kernel_wrapper(m, - out_grad_ptr, - in_ptr, - in_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr); + backward_kernel_wrapper(m, + out_grad_acc, + input1_acc, + in_grad_acc, + gamma_acc, + gamma_grad_acc, + beta_grad_acc); }; } diff --git a/src/ops/layer_norm.cpp b/src/ops/layer_norm.cpp index 07dbdb3dfb..27d314e21e 100644 --- a/src/ops/layer_norm.cpp +++ b/src/ops/layer_norm.cpp @@ -14,6 +14,7 @@ */ #include "flexflow/ops/layer_norm.h" +#include "flexflow/ffconst_utils.h" #include "flexflow/utils/hip_helper.h" #include @@ -27,21 +28,37 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; + use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; effective_num_elements = ln->effective_num_elements; - use_bias = ln->use_bias; + profiling = ln->profiling; + inference_debugging = ln->inference_debugging; eps = ln->eps; - checkCUDA(hipMalloc(&mean_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&rstd_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&ds_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&db_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&scale_ptr, sizeof(float) * effective_batch_size)); - checkCUDA(hipMalloc(&bias_ptr, sizeof(float) * effective_batch_size)); + DataType data_type = ln->data_type; + size_t totalSize = effective_batch_size * data_type_size(data_type) * 6; + gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); + mean_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + rstd_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + ds_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + db_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + scale_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + bias_ptr = gpu_mem_allocator.allocate_instance_untyped( + data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } -LayerNormMeta::~LayerNormMeta(void) {} +LayerNormMeta::~LayerNormMeta(void) { + if (reserveInst != Realm::RegionInstance::NO_INST) { + reserveInst.destroy(); + } +} template __device__ __forceinline__ T WARP_SHFL_DOWN(T value, @@ -74,7 +91,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < blockDim.x / C10_WARP_SIZE) ? shared[lid] : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? 
shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -82,8 +99,14 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { } template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { +__global__ void LayerNormFusedForwardKernel(int64_t N, + float eps, + T const *X, + T *mean, + T *rstd, + T const *gamma, + T const *beta, + T *Y) { __shared__ float m_shared[C10_WARP_SIZE]; __shared__ float v_shared[C10_WARP_SIZE]; const int64_t i = blockIdx.x; @@ -103,18 +126,10 @@ __global__ void RowwiseMomentsCUDAKernel( mean[i] = static_cast(sum1); rstd[i] = static_cast(rsqrt(sum2 + eps)); } -} -template -__global__ void LayerNormForwardCUDAKernel(int64_t N, - T const *X, - T const *mean, - T const *rstd, - T const *gamma, - T const *beta, - T *Y) { + __syncthreads(); + using T_ACC = T; - const int64_t i = blockIdx.x; for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = @@ -135,28 +150,19 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *gamma_ptr, T const *beta_ptr, hipStream_t stream) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(RowwiseMomentsCUDAKernel), - m->effective_batch_size, - kCUDABlockReduceNumThreads, - 0, - stream, - m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(LayerNormForwardCUDAKernel), - m->effective_batch_size, - kCUDANumThreads, - 0, - stream, - m->effective_num_elements, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + + LayerNormFusedForwardKernel + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -167,24 +173,154 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, GenericTensorAccessorR const &beta) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } if (m->input_type[0] == DT_FLOAT) { - LayerNorm::forward_kernel(m, - input.get_float_ptr(), - output.get_float_ptr(), - gamma.get_float_ptr(), - m->use_bias ? beta.get_float_ptr() - : nullptr, - stream); + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); } else if (m->input_type[0] == DT_HALF) { - LayerNorm::forward_kernel(m, - input.get_half_ptr(), - output.get_half_ptr(), - gamma.get_half_ptr(), - m->use_bias ? beta.get_half_ptr() : nullptr, - stream); + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); } else { assert(false && "unsupport datatype in layernorm"); } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } +} + +/*static*/ +void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } } template @@ -224,7 +360,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, using T_ACC = T; const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { - const T_ACC s = T_ACC(1) / static_cast(N); + const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * static_cast(rstd[index]) * static_cast(rstd[index]) * @@ -235,27 +371,6 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } } -template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, - T const *dY, - T const *X, - T const *gamma, - T const *a, - T const *b, - T const *c, - T *dX) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; - } -} - template __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, int64_t N, @@ -452,116 +567,148 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, hipStream_t stream) { const int64_t M = m->effective_batch_size; const int64_t N = m->effective_num_elements; - hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), - M, - kCUDABlockReduceNumThreads, - 0, - stream, - N, - output_grad_ptr, - input_ptr, - gamma_ptr, - static_cast(m->ds_ptr), - static_cast(m->db_ptr)); + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + input_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; - hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), - B, - kCUDANumThreads, - 0, - stream, - M, - N, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - static_cast(m->ds_ptr), - static_cast(m->db_ptr), - static_cast(m->scale_ptr), - static_cast(m->bias_ptr)); - + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); int const warp_size = C10_WARP_SIZE; int const num_threads = 128; const dim3 blocks(M); int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); - hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), - blocks, - num_threads, - nshared, - stream, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - input_grad_ptr, - N); if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { if (M < 512) { // For small batch size, do colwise reduce directly const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; - 
hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), - B, - kCUDANumThreads, - 0, - stream, - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } else { const int64_t B = (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; constexpr int kThreadX = kColwiseReduceTileSize; constexpr int kThreadY = kColwiseReduceTileSize / 2; - hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), - B, - dim3(kThreadX, kThreadY), - 0, - stream, - M, - N, - output_grad_ptr, - input_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_grad_ptr, - beta_grad_ptr); + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + input_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); } } } /*static*/ template -void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr) { +void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); +} + +/*static*/ +void LayerNorm::peft_bwd_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + stream); + } else { + assert(m->output_type[0] == DT_HALF); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + stream); + } } -template void - LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - float const *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *gamma_ptr, - float *gamma_grad_ptr, - float *beta_grad_ptr); +/*static*/ +void LayerNorm::backward_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr(), + stream); + } else if (m->output_type[0] == DT_HALF) { + 
LayerNorm::backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } +} -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/layer_norm.cu b/src/ops/layer_norm.cu index 44979c48fe..0801d11617 100644 --- a/src/ops/layer_norm.cu +++ b/src/ops/layer_norm.cu @@ -27,7 +27,7 @@ constexpr int kColwiseReduceTileSize = 32; LayerNormMeta::LayerNormMeta(FFHandler handle, LayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; effective_batch_size = ln->effective_batch_size; @@ -50,6 +50,7 @@ LayerNormMeta::LayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } LayerNormMeta::~LayerNormMeta(void) { @@ -96,73 +97,6 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared) { return val; } -template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { - int const lid = threadIdx.x % C10_WARP_SIZE; - int const wid = threadIdx.x / C10_WARP_SIZE; - val = WarpReduceSum(val); - __syncthreads(); - if (lid == 0) { - shared[wid] = val; - } - __syncthreads(); - val = (threadIdx.x < (min(blockDim.x, max_num_threads) / C10_WARP_SIZE)) - ? shared[lid] - : T(0); - if (wid == 0) { - val = WarpReduceSum(val); - } - return val; -} - -#ifdef DEADCODE -template -__global__ void RowwiseMomentsCUDAKernel( - int64_t N, float eps, T const *X, T *mean, T *rstd) { - __shared__ float m_shared[C10_WARP_SIZE]; - __shared__ float v_shared[C10_WARP_SIZE]; - const int64_t i = blockIdx.x; - float sum1 = 0.0f; - float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - sum1 += static_cast(X[index]); - sum2 += static_cast(X[index]) * static_cast(X[index]); - } - sum1 = BlockReduceSum(sum1, m_shared); - sum2 = BlockReduceSum(sum2, v_shared); - if (threadIdx.x == 0) { - float const scale = float(1) / static_cast(N); - sum1 *= scale; - sum2 = max(sum2 * scale - sum1 * sum1, float(0)); - mean[i] = static_cast(sum1); - rstd[i] = static_cast(rsqrt(sum2 + eps)); - } -} - -template -__global__ void LayerNormForwardCUDAKernel(int64_t N, - T const *X, - T const *mean, - T const *rstd, - T const *gamma, - T const *beta, - T *Y) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); - const T_ACC beta_v = - beta == nullptr ? 
T_ACC(0) : static_cast(beta[j]); - Y[index] = (static_cast(X[index]) - static_cast(mean[i])) * - static_cast(rstd[i]) * gamma_v + - beta_v; - } -} -#endif - template __global__ void LayerNormFusedForwardKernel(int64_t N, float eps, @@ -177,18 +111,13 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -200,7 +129,7 @@ __global__ void LayerNormFusedForwardKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); @@ -221,25 +150,18 @@ void LayerNorm::forward_kernel(LayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - LayerNormFusedForwardKernel - <<>>(m->effective_num_elements, - m->eps, - in_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - out_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + in_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + out_ptr); } /*static*/ @@ -290,6 +212,116 @@ void LayerNorm::forward_kernel_wrapper(LayerNormMeta const *m, } } +/*static*/ +void LayerNorm::inference_kernel_wrapper(LayerNormMeta *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW &output, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorR const &beta) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == 
PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + input.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->input_type[0] == DT_FLOAT) { + LayerNorm::forward_kernel( + m, + input.get_float_ptr(), + output.get_float_ptr(), + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_float_ptr() : nullptr, + stream); + } else if (m->input_type[0] == DT_HALF) { + LayerNorm::forward_kernel( + m, + input.get_half_ptr(), + output.get_half_ptr(), + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "unsupport datatype in layernorm"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[LayerNorm] forward time (CF) = %.9fms\n", elapsed); + // print_tensor(in_ptr, 32, "[LayerNorm:forward:input]"); + // print_tensor(out_ptr, 32, "[LayerNorm:forward:output]"); + } +} + template __global__ void ComputeInternalGradientsCUDAKernel( int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { @@ -327,7 +359,7 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, using T_ACC = T; const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; if (index < M) { - const T_ACC s = T_ACC(1) / static_cast(N); + const T_ACC s = T_ACC(1) / static_cast((int)N); const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * static_cast(rstd[index]) * static_cast(rstd[index]) * @@ -338,27 +370,6 @@ __global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, } } -template -__global__ void LayerNormBackwardCUDAKenrel(int64_t N, - T const *dY, - T const *X, - T const *gamma, - T const *a, - T const *b, - T const *c, - T *dX) { - using T_ACC = T; - const int64_t i = blockIdx.x; - for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { - const int64_t index = i * N + j; - const T_ACC gamma_v = - gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); - dX[index] = - static_cast(a[i]) * static_cast(dY[index]) * gamma_v + - b[i] * static_cast(X[index]) + c[i]; - } -} - template __global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, int64_t N, @@ -620,44 +631,83 @@ void LayerNorm::backward_kernel(LayerNormMeta const *m, /*static*/ template -void LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - T const *output_grad_ptr, - T const *input_ptr, - T *input_grad_ptr, - T const *gamma_ptr, - T *gamma_grad_ptr, - T *beta_grad_ptr) { +void LayerNorm::peft_bwd_kernel(LayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + N); +} + +/*static*/ +void LayerNorm::peft_bwd_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); if (m->output_type[0] == DT_FLOAT) { - LayerNorm::backward_kernel(m, - output_grad_ptr, - input_ptr, - input_grad_ptr, - gamma_ptr, - gamma_grad_ptr, - beta_grad_ptr, - stream); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + stream); + } else { + assert(m->output_type[0] == DT_HALF); + LayerNorm::peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + stream); } - // }else if(m->output_type[0] == DT_HALF){ - // LayerNorm::backward_kernel(m, - // output_grad_ptr, - // input_ptr, - // input_grad_ptr, - // gamma_ptr, - // gamma_grad_ptr, - // beta_grad_ptr, - // stream); - // } } -template void - LayerNorm::backward_kernel_wrapper(LayerNormMeta const *m, - float const *output_grad_ptr, - float const *input_ptr, - float *input_grad_ptr, - float const *gamma_ptr, - float *gamma_grad_ptr, - float *beta_grad_ptr); +/*static*/ +void LayerNorm::backward_kernel_wrapper( + LayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + if (m->output_type[0] == DT_FLOAT) { + LayerNorm::backward_kernel(m, + output_grad.get_float_ptr(), + input.get_float_ptr(), + input_grad.get_float_ptr(), + gamma.get_float_ptr(), + gamma_grad.get_float_ptr(), + beta_grad.get_float_ptr(), + stream); + } else if (m->output_type[0] == DT_HALF) { + LayerNorm::backward_kernel(m, + output_grad.get_half_ptr(), + input.get_half_ptr(), + input_grad.get_half_ptr(), + gamma.get_half_ptr(), + gamma_grad.get_half_ptr(), + beta_grad.get_half_ptr(), + stream); + } else { + assert(false && "Unsupported data type"); + } +} -}; // namespace FlexFlow +} // namespace FlexFlow diff --git a/src/ops/linear.cc b/src/ops/linear.cc index 44b56d623e..20ad762b62 100644 --- a/src/ops/linear.cc +++ b/src/ops/linear.cc @@ -498,7 +498,7 @@ OpMeta *Linear::init_task_with_dim(Task const *task, 
m->add_bias_only_once = linear->add_bias_only_once; m->profiling = linear->profiling; m->inference_debugging = linear->inference_debugging; - m->trainableInputs[0] = linear->trainableInputs[0]; + m->trainable_inputs[0] = linear->trainable_inputs[0]; m->weight_ptr_type = m->input_type[0]; m->quantization_type = linear->quantization_type; m->offload = linear->offload; @@ -632,8 +632,11 @@ void Linear::inference_task(Task const *task, m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + assert((weight.domain.hi()[0] - weight.domain.lo()[0] + 1) == in_dim); + assert((weight.domain.hi()[1] - weight.domain.lo()[1] + 1) == out_dim); + assert(weight.domain.get_volume() == in_dim * out_dim); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); GenericTensorAccessorR bias; if (m->use_bias && !(m->add_bias_only_once && task->index_point.point_data[0] != 0)) { @@ -645,14 +648,15 @@ void Linear::inference_task(Task const *task, runtime); assert(bias.domain.get_volume() == static_cast(out_dim)); } - forward_kernel_wrapper(m, - input.ptr, - output.ptr, - weight.ptr, - bias.ptr, - in_dim, - out_dim, - batch_size); + inference_kernel_wrapper(m, + bc, + input.ptr, + output.ptr, + weight.ptr, + bias.ptr, + in_dim, + out_dim, + batch_size); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -664,6 +668,119 @@ void Linear::inference_task(Task const *task, } Linear::save_inference_tensors_to_file( m, shard_id, bc, {input}, weights_accessors, {output}); + printf("\tin=[%i,%i].T @ w=[%i,%i] -> out=[%i,%i]\n", + in_dim, + bc->num_tokens, + in_dim, + out_dim, + out_dim, + bc->num_tokens); + } +} + +FutureMap Linear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Linear op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region, + ff.cpu_offload ? 
MAP_TO_ZC_MEMORY : 0)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Linear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LinearMeta *m = *((LinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(regions.size() == 3); + assert(task->regions.size() == 3); + if (m->quantization_type == DT_NONE) { + assert(m->input_type[0] == m->weight_type[0]); + } + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad = helperGetGenericTensorAccessorRW( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + + int num_infr_tokens = bc->num_active_infr_tokens(); + int num_peft_tokens = bc->num_active_peft_tokens(); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false, true); + printf("\tw=[%i,%i] @ out_grad=[%i,%i] -> in_grad[%i,%i]\n", + in_dim, + out_dim, + out_dim, + num_peft_tokens, + in_dim, + num_peft_tokens); + } + peft_bwd_kernel_wrapper(m, + input_grad.ptr, + output_grad.ptr, + weight.ptr, + in_dim, + out_dim, + num_infr_tokens, + num_peft_tokens); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Linear::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); } } @@ -782,7 +899,7 @@ void Linear::backward(FFModel const &ff) { launcher.add_field(rid++, FID_DATA); // regions[1](I/O): replica_grad assert(replica == NULL); - if (trainableInputs[0]) { + if (trainable_inputs[0]) { launcher.add_region_requirement( RegionRequirement(inputs[0]->part_grad, 0 /*projection id*/, @@ -878,17 +995,17 @@ void Linear::backward_task_with_dim(Task const *task, Runtime *runtime) { // Linear* linear = (Linear*) task->args; LinearMeta const *m = *((LinearMeta **)task->local_args); - assert(regions.size() == (5 + static_cast(m->trainableInputs[0]) + + assert(regions.size() == (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); assert(task->regions.size() == - (5 + static_cast(m->trainableInputs[0]) + + (5 + static_cast(m->trainable_inputs[0]) + static_cast(m->use_bias))); DT *input_grad = nullptr; size_t rid = 0; TensorAccessorR acc_input( regions[rid], task->regions[rid], FID_DATA, ctx, runtime); rid++; - if (m->trainableInputs[0]) { + if (m->trainable_inputs[0]) { Domain domain = runtime->get_index_space_domain( ctx, task->regions[rid].region.get_index_space()); if (domain.get_dim() == NDIM + 1) { @@ -1119,7 +1236,10 @@ bool Linear::measure_operator_cost(Simulator *sim, int input_n = sub_input.get_volume() / input_c; int output_c = sub_output.dims[0].size; int output_n = sub_output.get_volume() / 
output_c; - LinearMeta *m = sim->linear_meta; + + MemoryAllocator gpu_mem_allocator(sim->memory); + LinearMeta *m = new LinearMeta( + sim->handler, output_n, this, gpu_mem_allocator, input_c * output_c); m->activation = activation; m->kernel_reg_type = kernel_reg_type; m->kernel_reg_lambda = kernel_reg_lambda; @@ -1164,7 +1284,7 @@ bool Linear::measure_operator_cost(Simulator *sim, }; if (sim->computationMode == COMP_MODE_TRAINING) { void *input_grad_ptr = NULL; - if (trainableInputs[0]) { + if (trainable_inputs[0]) { input_grad_ptr = sim->allocate(sub_input.get_volume(), inputs[0]->data_type); } else { @@ -1313,7 +1433,7 @@ LinearParams Linear::get_params() const { params.kernel_reg_lambda = this->kernel_reg_lambda; params.quantization_type = this->quantization_type; params.offload = this->offload; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/lora_linear.cc b/src/ops/lora_linear.cc new file mode 100644 index 0000000000..fde6bc2b28 --- /dev/null +++ b/src/ops/lora_linear.cc @@ -0,0 +1,1316 @@ +#include "flexflow/ops/lora_linear.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/layer.h" +#include "flexflow/model.h" +#include "flexflow/ops/kernels/lora_linear_kernels.h" +#include "flexflow/utils/hash_utils.h" +#include "flexflow/utils/peft_weight_allocator.h" +#include "legion/legion_utilities.h" +#include +#include +#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) +#include "flexflow/utils/cuda_helper.h" +#else +#include "flexflow/utils/hip_helper.h" +#endif + +namespace FlexFlow { + +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::InlineLauncher; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::LoraLinear; + +bool check_lora_layer_match(Layer *potential_target, + std::string target_module_name) { + if (potential_target->op_type == OP_LINEAR && + potential_target->name != nullptr && strlen(potential_target->name) > 0) { + std::string s(potential_target->name); + if (s.find(target_module_name) != std::string::npos && + s.find("lora") == std::string::npos) { + return true; + } + } + return false; +} + +PEFTModelID *FFModel::add_lora_layer(LoraLinearConfig const peft_config) { + assert(config.enable_peft && + "Cannot add a LoRA layer if PEFT mode is not enabled"); + if (peft_config.target_modules.size() == 0) { + printf("PEFT config does not contain any target module\n"); + std::cout << peft_config << std::endl; + assert(false); + } + PEFTModelID *peft_model_id = new PEFTModelID(peft_model_global_guid++); + peft_configs[*peft_model_id] = peft_config; + + for (std::string target_module_name : peft_config.target_modules) { + assert(target_module_name.length() > 0 && + "LoRA target module name is empty"); + // find target layer + for (auto it = layers.begin(); it != layers.end(); ++it) { + Layer *target_module = *it; + bool match = check_lora_layer_match(target_module, target_module_name); + if (!match) { + continue; + } + + if (base_layer_to_peft_layer.find(target_module) != + base_layer_to_peft_layer.end()) { + // lora linear layer already added, no need to add again + Layer *peft_layer 
= base_layer_to_peft_layer[target_module]; + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } else { + Tensor const input = target_module->inputs[0]; + Tensor const output = target_module->outputs[0]; + assert(input->data_type == output->data_type); + std::string name_ = target_module->name + ? std::string(target_module->name) + : std::string(""); + size_t last_underscore = name_.length() - 1; + for (int i = name_.length() - 1; i > 0; i--) { + if (!(std::isdigit(target_module->name[i]) || + target_module->name[i] == '_')) { + break; + } else if (target_module->name[i] == '_') { + last_underscore = i; + } + } + name_.erase(last_underscore); + + name_ += ".lora"; + std::cout << "Adding layer " << name_ << std::endl; + Layer *peft_layer = new Layer(this, + OP_LORA, + output->data_type, + name_.c_str(), + 2 /*inputs*/, + 0 /*weights*/, + 1 /*outputs*/, + input, + output); + // fix LoRA layer's transformer layer ID and model ID + peft_layer->layer_guid.transformer_layer_id = + target_module->layer_guid.transformer_layer_id; + peft_layer->layer_guid.model_id = target_module->layer_guid.model_id; + { + int numdims = output->num_dims; + int dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdims; i++) { + dims[i] = output->dims[i]; + } + peft_layer->outputs[0] = + create_tensor_legion_ordering(numdims, + dims, + output->data_type, + peft_layer, + 0, + true /*create_grad*/); + } + it = layers.insert(it + 1, peft_layer); + ++it; + base_layer_to_peft_layer[target_module] = peft_layer; + peft_layer_to_peft_id[peft_layer] = std::vector(); + peft_layer_to_peft_id[peft_layer].push_back(*peft_model_id); + } + } + } + + // save finetuned lora model configs to file + if (peft_config.trainable) { + std::string finetuned_model_folder = join_path({ + peft_config.cache_folder, + "finetuned_models", + peft_config.peft_model_id, + }); + fs::remove_all(finetuned_model_folder); + std::string finetuned_model_config_folder = join_path({ + finetuned_model_folder, + "config", + }); + fs::create_directories(finetuned_model_config_folder); + std::string lora_linear_config_filepath = join_path({ + finetuned_model_config_folder, + "ff_config.json", + }); + serialize_to_json_file(peft_config, lora_linear_config_filepath); + std::string optimizer_config_filepath = join_path({ + finetuned_model_config_folder, + "ff_optimizer_config.json", + }); + if (typeid(*peft_config.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + peft_config.optimizer_config); + serialize_to_json_file(*sgd_config, optimizer_config_filepath); + } else if (typeid(*peft_config.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + LoraAdamOptimizerConfig const *adam_config = + static_cast( + peft_config.optimizer_config); + serialize_to_json_file(*adam_config, optimizer_config_filepath); + } else { + assert(false && "Optimizer not supported"); + } + } + + return peft_model_id; +} + +Op *LoraLinear::create_operator_from_layer( + FFModel &model, + Layer const *layer, + std::vector const &inputs) { + std::unordered_map _peft_configs; + std::vector const &peft_ids = + model.peft_layer_to_peft_id[(Layer *)layer]; + for (int i = 0; i < peft_ids.size(); i++) { + _peft_configs.emplace( + std::make_pair(peft_ids[i], model.peft_configs[peft_ids[i]])); + } + return new LoraLinear(model, + layer->layer_guid, + layer->op_type, + inputs[0], + inputs[1], + _peft_configs, + layer->name); +} + +LoraLinear::LoraLinear(FFModel &model, + LoraLinear const &other, + ParallelTensor const input, 
+ ParallelTensor const output) + : LoraLinear(model, + other.layer_guid, + other.op_type, + input, + output, + other.peft_configs, + other.name) {} + +LoraLinear::LoraLinear(FFModel &model, + Params const ¶ms, + Input const &inputs, + char const *name) + : LoraLinear(model, + params.layer_guid, + params.type, + inputs.first, + inputs.second, + params.peft_configs, + params.name) {} + +LoraLinear::LoraLinear( + FFModel &model, + LayerID const &_layer_guid, + OperatorType _op_type, + ParallelTensor const _input, + ParallelTensor const _output, + std::unordered_map const &_peft_configs, + char const *name) + : Op(model, + _op_type, + _output->data_type, + name, + 2 /*inputs*/, + 0 /*weights*/, + false, + 1 /*outputs*/, + _input, + _output) { + assert(_input->data_type == _output->data_type); + // overwrite layer_guid + layer_guid = _layer_guid; + data_type = _output->data_type; + + ParallelTensorShape input_shape = this->inputs[0]->get_shape(); + LoraLinearParams params = this->get_params(); + + // Create output tensor + { + int numdim = inputs[1]->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = inputs[1]->dims[i]; + } + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, inputs[1]->data_type, this); + } + for (auto const &kv : _peft_configs) { + peft_configs.insert(kv); + } + // assert(check_output_input_weight_parallel_dims(allocate_weights)); +} + +void LoraLinear::init(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +void LoraLinear::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_init_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(LoraLinear)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, output_tensor); +} + +template +void load_peft_from_file(DT *ptr, + size_t num_rows, + size_t num_columns, + int num_shards, + int shard_id, + std::string filepath) { + std::ifstream in(filepath, std::ios::in | std::ios::binary); + if (!in.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(in.good() && "incorrect weight file path"); + + // HuggingFace dims (serialized in row-major order) + // lora_A: [rank, intermediate_dim] + // lora_B: [hidden_dim, rank] + // FlexFlow dims (serialized in column-major order) + // lora_A: [intermediate_dim, rank] + // lora_B: [rank, out_dim] + // Tensor parallelism: shard lora_A along intermediate_dim, replicate lora_B + assert(num_rows % num_shards == 0); + size_t chunk_size = num_rows / num_shards; + size_t offset = (num_shards > 1) ? shard_id * chunk_size : 0; + + // Allocate memory for the weight shard + std::vector
host_array(chunk_size * num_columns); + // Read the chunk + size_t total_size_read = 0; + for (int i = 0; i < num_columns; ++i) { + in.seekg((i * num_rows + offset) * sizeof(DT)); + in.read(reinterpret_cast(host_array.data() + i * chunk_size), + chunk_size * sizeof(DT)); + total_size_read += in.gcount(); + } + // Check weight shard size + size_t expected_data_size = chunk_size * num_columns * sizeof(DT); + if (total_size_read != expected_data_size) { + printf("load weight data error: expected %lu bytes, got: %lu bytes, data " + "size: %lu\n", + expected_data_size, + total_size_read, + sizeof(DT)); + assert(false); + } + assert(host_array.size() == chunk_size * num_columns); + // Copy weight to device memory + copy_tensor_host_to_dev(ptr, host_array.data(), chunk_size * num_columns); + in.close(); +} + +/* + regions[0](O): output + regions[1](I): kernel + regions[2](I): bias +*/ +OpMeta *LoraLinear::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinear const *lora = (LoraLinear *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + GenericTensorAccessorR input = + helperGetGenericTensorAccessorRO(lora->inputs[0]->data_type, + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW output = + helperGetGenericTensorAccessorRW(lora->outputs[0]->data_type, + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + int batch_size = output.domain.get_volume() / out_dim; + assert(input.domain.get_volume() == in_dim * batch_size); + assert(output.domain.get_volume() == out_dim * batch_size); + + LoraLinearMeta *m = new LoraLinearMeta(handle, lora); + m->trainable_inputs[0] = lora->trainable_inputs[0]; + std::strcpy(m->op_name, lora->name); + m->layer_guid = lora->layer_guid; + + int num_shards = lora->inputs[0]->dims[0].degree; + int shard_id = task->index_point.point_data[0]; + int num_dims = lora->inputs[0]->num_dims; + assert(in_dim == lora->inputs[0]->dims[0].size / num_shards); + assert(out_dim == + lora->inputs[1]->dims[0].size / lora->inputs[1]->dims[0].degree); + + DataType dt = m->input_type[0]; + assert(dt == m->input_type[1]); + assert(dt == m->output_type[0]); + assert(dt == lora->inputs[0]->data_type); + assert(dt == lora->inputs[1]->data_type); + assert(dt == lora->outputs[0]->data_type); + + // get layer name + assert(lora->name != nullptr && + "Layer name is not set, cannot determine weights location"); + std::string lora_layername = std::string(lora->name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + + for (auto const &kv : lora->peft_configs) { + PEFTModelID const &model_id = kv.first; + LoraLinearConfig const &lora_config = kv.second; + + int rank = lora_config.rank; + + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + // values below represent total weight sizes before sharding. Lora B is not + // sharded. 
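
load_peft_from_file above reads the serialized column-major matrix one column at a time and seeks to this shard's row chunk inside each column, which is how lora_A ends up sharded along the input dimension while lora_B stays replicated. A standalone sketch of that offset arithmetic, with purely illustrative numbers (not taken from the PR):

```cpp
#include <cstddef>
#include <cstdio>

// Byte offset of shard `shard_id`'s row chunk within column `col` of a
// column-major [num_rows x num_cols] matrix with elements of size elem_size;
// mirrors the in.seekg((i * num_rows + offset) * sizeof(DT)) call above.
size_t shard_col_offset(size_t num_rows, size_t num_shards, size_t shard_id,
                        size_t col, size_t elem_size) {
  size_t chunk_rows = num_rows / num_shards; // num_rows must divide evenly
  size_t row_offset = shard_id * chunk_rows;
  return (col * num_rows + row_offset) * elem_size;
}

int main() {
  // Illustrative only: a lora_A with 4096 total rows, rank-16 columns,
  // half-precision (2-byte) weights, read by shard 2 of 4.
  for (size_t col = 0; col < 3; col++) {
    std::printf("col %zu: shard chunk starts at byte %zu\n",
                col, shard_col_offset(4096, 4, 2, col, 2));
  }
  return 0;
}
```
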
+ int lora_A_num_rows = in_dim * num_shards; + int lora_A_num_cols = rank; + int lora_B_num_rows = rank; + int lora_B_num_cols = out_dim; + int lora_A_num_shards = num_shards; + int lora_B_num_shards = 1; + + LoraLinearWeight weight; + weight.in_dim = in_dim; + weight.out_dim = out_dim; + weight.rank = rank; + weight.num_shards = num_shards; + PEFTWeightAllocator *allocator = m->handle.peft_weight_allocator; + weight.w0_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + + if (!lora_config.init_lora_weights) { + // load weights from file + std::string weights_folder_filepath = join_path({ + lora_config.cache_folder, + "weights", + lora_config.peft_model_id, + dt == DT_FLOAT ? "full-precision" : "half-precision", + }); + std::string w0_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weights_folder_filepath, lora_layername_substr + "_B.weight"}); + if (dt == DT_FLOAT) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((float *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else if (dt == DT_HALF) { + std::cout << "Loading LORA weight " + << lora_layername_substr + "_A.weight" + << ", num_rows: " << lora_A_num_rows + << ", num_cols: " << lora_A_num_cols + << ", num_shards: " << lora_A_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w0_ptr, + lora_A_num_rows, + lora_A_num_cols, + lora_A_num_shards, + shard_id, + w0_filepath); + std::cout << "Loading LORA weight " + << lora_layername_substr + "_B.weight" + << ", num_rows: " << lora_B_num_rows + << ", num_cols: " << lora_B_num_cols + << ", num_shards: " << lora_B_num_shards + << ", shard_id: " << shard_id << std::endl; + load_peft_from_file((half *)weight.w1_ptr, + lora_B_num_rows, + lora_B_num_cols, + lora_B_num_shards, + shard_id, + w1_filepath); + } else { + assert(false && "Data type not supported"); + } + } else { + // initialize weights + int seed = 0; + init_kernel_wrapper(m, seed); + } + + // allocate space for gradients if the LoRA layer is trainable + if (lora_config.trainable) { + // Ensure we have an optimizer + assert(lora_config.optimizer_config != nullptr && "Optimizer not set"); + assert(typeid(*lora_config.optimizer_config) != + typeid(LoraOptimizerConfig) && + "Optimizer config is not a subclass of LoraOptimizerConfig"); + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + // Input is partitioned (no replication) + // w0_grad is local weight gradients + weight.w0_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is sync weight gradients + weight.w1_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, 
w1_num_elements * data_type_size(dt)); + } else { + // Input is replicated + // w0_grad is sync weight gradients + weight.w0_grad_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + // w1_grad is local weight gradients + weight.w1_grad_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } + // allocate space for v_values if needed by optimizer + if (typeid(*lora_config.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + lora_config.optimizer_config); + if (sgd_config->momentum > 0.0f) { + if (lora->inputs[0]->dims[num_dims - 1].degree == 1) { + weight.w0_v_values_ptr = allocator->allocate_local_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_v_values_ptr = allocator->allocate_sync_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } else { + weight.w0_v_values_ptr = allocator->allocate_sync_weights_untyped( + model_id, w0_num_elements * data_type_size(dt)); + weight.w1_v_values_ptr = allocator->allocate_local_weights_untyped( + model_id, w1_num_elements * data_type_size(dt)); + } + } + } else if (typeid(*lora_config.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + assert(false && "Adam optim not yet implemented"); + } else { + assert(false && "Optimizer not supported"); + } + } + assert(m->model_state.find(model_id) == m->model_state.end()); + m->model_state[model_id].weights = weight; + m->model_state[model_id].optimizer_config = lora_config.optimizer_config; + m->model_state[model_id].lora_alpha = lora_config.lora_alpha; + m->model_state[model_id].cache_folder = lora_config.cache_folder; + m->model_state[model_id].peft_model_id = lora_config.peft_model_id; + } + return m; +} + +void LoraLinear::forward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal init"); +} + +FutureMap + LoraLinear::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(check_output_input_weight_same_parallel_is()); + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + // assert(check_output_input_weight_same_machine_view()); + // output is considered as an input to allow in-place optimization + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + size_t machine_view_hash = view->hash(); + set_argumentmap_for_inference(ff, argmap, output_tensor); + IndexLauncher launcher(LORA_LINEAR_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void LoraLinear::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + assert(regions.size() == 2); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); + // int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + // int out_dim = output.domain.hi()[0] - output.domain.lo()[0] + 1; + + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + inference_kernel_wrapper(m, bc, input, output); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + // get layer name + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + // print layer name + std::cout << "INF " << lora_layername_substr << std::endl; + + // build output filepath + fs::path dst_filepath = get_dst_folder("fwd", m->decoding_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ lora_layername_substr; + dst_filepath /= layername; + + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(dst_filepath.string() + ".batch_config"); + } + + std::string filename = dst_filepath.string() + ".input_0"; + if (input.data_type == DT_FLOAT) { + save_tensor( + input.get_float_ptr(), input.domain.get_volume(), filename.c_str()); + } else if (input.data_type == DT_HALF) { + save_tensor( + input.get_half_ptr(), input.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + + int rank, num_tokens; + for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_state[peft_model_id].weights; + rank = weight.rank; + num_tokens = input.domain.get_volume() / weight.in_dim; + fs::path dst_filepath_weights = + get_dst_folder("weights", m->decoding_step, shard_id) / layername; + std::string filenameA = + dst_filepath_weights.string() + ".weight_A.original"; + std::string filenameB = + dst_filepath_weights.string() + ".weight_B.original"; + if (m->input_type[0] == DT_FLOAT) { + save_tensor((float *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((float *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else if (m->input_type[0] == DT_HALF) { + save_tensor((half *)weight.w0_ptr, + weight.rank * weight.in_dim, + filenameA.c_str()); + save_tensor((half *)weight.w1_ptr, + weight.rank * weight.out_dim, + filenameB.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + + filename = dst_filepath.string() + ".output_0"; + if (output.data_type == DT_FLOAT) { + save_tensor( + output.get_float_ptr(), output.domain.get_volume(), filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor( + output.get_half_ptr(), output.domain.get_volume(), filename.c_str()); + } else { + assert(false); + } + + if (bc->num_active_peft_tokens() > 0) { + // input activation (intermediate) + filename = dst_filepath.string() + ".low_rank_activation"; + if (output.data_type == DT_FLOAT) { + save_tensor((float *)m->low_rank_activation, + rank * num_tokens, + filename.c_str()); + } else if (output.data_type == DT_HALF) { + save_tensor((half *)m->low_rank_activation, + rank * num_tokens, + filename.c_str()); + } else { + assert(false); + } + } + m->decoding_step++; + } +} + +FutureMap LoraLinear::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + assert(batch_inputs.size() == 2); + assert(batch_outputs.size() == 1); + // Assert that the output and the second input are mapped to the same + // region/part + assert(batch_outputs[0]->region == batch_inputs[1]->region); + assert(batch_outputs[0]->part == batch_inputs[1]->part); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + ParallelTensor output_tensor = batch_outputs[0]; + parallel_is = output_tensor->parallel_is; + MachineView const *view = mv ? 
mv : &output_tensor->machine_view; + set_argumentmap_for_inference(ff, argmap, output_tensor); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(LORA_LINEAR_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void lora_inference_debugging(LoraLinearMeta *m, + BatchConfig const *bc, + GenericTensorAccessorW input_grad, + GenericTensorAccessorR output_grad, + int shard_id) { + // get layer name + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + // print layer name + std::cout << "BWD " << lora_layername_substr << std::endl; + + // build output filepath + fs::path dst_filepath = get_dst_folder("bwd", m->bwd_step, shard_id); + if (m->layer_guid.model_id > 0) { + assert(false && "Model ID > 0 not supported yet"); + } + std::string layername = "layers." + + std::to_string(m->layer_guid.transformer_layer_id) + + "." 
+ lora_layername_substr; + dst_filepath /= layername; + + // save batch config, if passed + if (bc != nullptr) { + bc->save_to_file(dst_filepath.string() + ".batch_config"); + } + + // weights, weights gradients + fs::path dst_filepath_weights = + get_dst_folder("weights", m->bwd_step, shard_id) / layername; + assert(m->model_state.size() >= 1 && "Model state empty!"); + for (auto it = m->model_state.begin(); it != m->model_state.end(); ++it) { + PEFTModelID peft_model_id = it->first; + LoraLinearWeight weight = m->model_state[peft_model_id].weights; + std::string filename_weight_A = + dst_filepath_weights.string() + ".weight_A.finetuned"; + std::string filename_weight_B = + dst_filepath_weights.string() + ".weight_B.finetuned"; + std::string filename_grad_A = + dst_filepath_weights.string() + ".weight_A.gradient"; + std::string filename_grad_B = + dst_filepath_weights.string() + ".weight_B.gradient"; + if (m->input_type[0] == DT_FLOAT) { + // weight A + save_tensor((float *)weight.w0_ptr, + weight.rank * weight.in_dim, + filename_weight_A.c_str()); + // weight grad A + save_tensor((float *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filename_grad_A.c_str()); + // weight B + save_tensor((float *)weight.w1_ptr, + weight.rank * weight.out_dim, + filename_weight_B.c_str()); + // weight grad B + save_tensor((float *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filename_grad_B.c_str()); + } else if (m->input_type[0] == DT_HALF) { + // weight A + save_tensor((half *)weight.w0_ptr, + weight.rank * weight.in_dim, + filename_weight_A.c_str()); + // weight grad A + save_tensor((half *)weight.w0_grad_ptr, + weight.rank * weight.in_dim, + filename_grad_A.c_str()); + // weight B + save_tensor((half *)weight.w1_ptr, + weight.rank * weight.out_dim, + filename_weight_B.c_str()); + // weight grad B + save_tensor((half *)weight.w1_grad_ptr, + weight.rank * weight.out_dim, + filename_grad_B.c_str()); + } else { + assert(false && "Data type not supported"); + } + } + + std::string filename = dst_filepath.string() + ".input_gradient_0"; + if (input_grad.data_type == DT_FLOAT) { + save_tensor(input_grad.get_float_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else if (input_grad.data_type == DT_HALF) { + save_tensor(input_grad.get_half_ptr(), + input_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + + filename = dst_filepath.string() + ".output_gradient_0"; + if (output_grad.data_type == DT_FLOAT) { + save_tensor(output_grad.get_float_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else if (output_grad.data_type == DT_HALF) { + save_tensor(output_grad.get_half_ptr(), + output_grad.domain.get_volume(), + filename.c_str()); + } else { + assert(false); + } + m->bwd_step++; +} + +template +void save_peft_to_file(DT const *weight_ptr, + size_t size, + std::string filepath) { + std::ofstream out(filepath, std::ios::binary); + // Check if the file was opened successfully + if (!out || !out.is_open() || !out.good()) { + printf("Could not open file: %s\n", filepath.c_str()); + } + assert(out && out.is_open() && out.good() && + "can't write to lora weight file path"); + std::vector
host_array(size); + copy_tensor_dev_to_host(weight_ptr, host_array.data(), size); + + size_t target_data_size = sizeof(DT) * size; + out.write((char *)host_array.data(), target_data_size); + + size_t out_written_size = out.tellp(); + if (out_written_size != target_data_size) { + printf("save weight data error: %lu, %lu, %lu\n", + out_written_size, + target_data_size, + sizeof(DT)); + assert(false); + } + out.close(); +} + +void save_peft_weights_if_needed(LoraLinearMeta *m, + BatchConfig const *bc, + int in_dim, + int out_dim, + int shard_id) { + std::string lora_layername = std::string(m->op_name); + std::string searchString = "lora"; + size_t found = lora_layername.find(searchString); + if (found == std::string::npos) { + std::cout << "LoraLinear layer name not in the right format (does not " + "contain word 'lora')" + << std::endl; + assert(false); + } + std::string lora_layername_substr = + lora_layername.substr(0, found + searchString.length()); + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + // Skip PEFT forward-only requests + if (!bc->requestsInfo[i].peft_bwd) { + continue; + } + if (bc->requestsInfo[i].optimizer_tasks.save_updated_weights) { + assert(m->model_state.find(bc->requestsInfo[i].peft_model_id) != + m->model_state.end()); + std::string weight_export_folder = join_path({ + m->model_state[bc->requestsInfo[i].peft_model_id].cache_folder, + "finetuned_models", + m->model_state[bc->requestsInfo[i].peft_model_id].peft_model_id, + "weights", + "shard_" + std::to_string(shard_id), + }); + fs::create_directories(weight_export_folder); + + int rank = m->model_state[bc->requestsInfo[i].peft_model_id].weights.rank; + int w0_num_elements = rank * in_dim; + int w1_num_elements = rank * out_dim; + std::string w0_filepath = join_path( + {weight_export_folder, lora_layername_substr + "_A.weight"}); + std::string w1_filepath = join_path( + {weight_export_folder, lora_layername_substr + "_B.weight"}); + if (m->input_type[0] == DT_FLOAT) { + save_peft_to_file( + (float *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w0_ptr, + w0_num_elements, + w0_filepath); + if (shard_id == 0) { + save_peft_to_file( + (float *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w1_ptr, + w1_num_elements, + w1_filepath); + } + } else if (m->input_type[0] == DT_HALF) { + save_peft_to_file( + (half *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w0_ptr, + w0_num_elements, + w0_filepath); + if (shard_id == 0) { + save_peft_to_file( + (half *)m->model_state[bc->requestsInfo[i].peft_model_id] + .weights.w1_ptr, + w1_num_elements, + w1_filepath); + } + } else { + assert(false && "Data type not supported"); + } + } + } +} + +void LoraLinear::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + Domain input_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + LoraLinearMeta *m = *((LoraLinearMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(regions.size() == 2); + assert(task->regions.size() == regions.size()); + assert(m->input_type[0] == m->output_type[0]); + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + + GenericTensorAccessorW input_grad = 
helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + int in_dim = input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + int out_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + // int num_infr_tokens = bc->num_active_infr_tokens(); + // int num_peft_tokens = bc->num_active_peft_tokens(); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + + save_peft_weights_if_needed(m, bc, in_dim, out_dim, shard_id); + + if (m->inference_debugging) { + lora_inference_debugging(m, bc, input_grad, output_grad, shard_id); + } +} + +void LoraLinear::backward(FFModel const &ff) { + assert(false && "LoraLinear does not support normal backward"); +} + +void LoraLinear::print_layer(FFModel const &ff) {} + +void LoraLinear::map_output_tensors(FFModel &ff) { + assert(numOutputs == 1); + assert(numInputs == 2); + assert(outputs[0]->get_volume() == inputs[1]->get_volume()); + outputs[0]->parallel_is = inputs[1]->parallel_is; + outputs[0]->region = inputs[1]->region; + outputs[0]->part = inputs[1]->part; + outputs[0]->region_grad = inputs[1]->region_grad; + outputs[0]->part_grad = inputs[1]->part_grad; +} + +bool LoraLinear::measure_operator_cost(Simulator *sim, + MachineView const &mv, + CostMetrics &cost_metrics) const { + return false; +} + +bool operator==(LoraLinearParams const &lhs, LoraLinearParams const &rhs) { + if (lhs.layer_guid == rhs.layer_guid && lhs.type == rhs.type && + lhs.peft_configs.size() == rhs.peft_configs.size()) { + for (auto const &kv : lhs.peft_configs) { + auto it = rhs.peft_configs.find(kv.first); + if (it == rhs.peft_configs.end() || !(it->second == kv.second)) { + return false; + } + } + return true; + } + return false; +} + +fs::path create_unique_temp_directory() { + std::srand(static_cast(std::time(nullptr))); + + fs::path temp_dir = fs::temp_directory_path(); + fs::path unique_path; + + do { + std::string unique_name = "flexflow_tmp_" + std::to_string(std::rand()); + unique_path = temp_dir / unique_name; + } while (fs::exists(unique_path)); + + fs::create_directory(unique_path); + return unique_path; +} + +void serialize_string(Legion::Serializer &sez, + std::string string_to_serialize) { + sez.serialize(string_to_serialize.length()); + sez.serialize(string_to_serialize.c_str(), string_to_serialize.length()); +} + +std::string deserialize_string(Legion::Deserializer &dez) { + size_t string_size; + char buffer[4096] = {0}; + dez.deserialize(string_size); + dez.deserialize(buffer, string_size); + return std::string(buffer); +} + +void LoraLinear::serialize(Legion::Serializer &sez) const { + sez.serialize(this->layer_guid.id); + sez.serialize(this->layer_guid.transformer_layer_id); + sez.serialize(this->layer_guid.model_id); + sez.serialize(this->op_type); + sez.serialize(this->peft_configs.size()); + for (auto const &kv : this->peft_configs) { + // Serialize PEFTModelID + sez.serialize(kv.first.id); + + // Serialize LoraLinearConfig and OptimizerConfig to tmp folder + // 1. Create tmp dir and serialize it + fs::path unique_temp_dir = create_unique_temp_directory(); + serialize_string(sez, unique_temp_dir.string()); + // 2. 
Dump LoraLinearConfig to json file in tmp dir + std::string lora_config_filename = std::string("lora_linear_config_") + + std::to_string(kv.first.id) + + std::string(".json"); + fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; + serialize_to_json_file(kv.second, lora_config_json_filepath); + // 3. Dump optimizer to json file in tmp dir, and serialize optimizer type + std::string optimizer_filename = std::string("optimizer_config_") + + std::to_string(kv.first.id) + + std::string(".json"); + fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; + assert((kv.second.trainable) == (kv.second.optimizer_config != nullptr)); + if (kv.second.trainable) { + if (typeid(*kv.second.optimizer_config) == + typeid(LoraSGDOptimizerConfig)) { + sez.serialize(OPTIMIZER_TYPE_SGD); + LoraSGDOptimizerConfig const *sgd_config = + static_cast( + kv.second.optimizer_config); + serialize_to_json_file(*sgd_config, optim_config_filepath); + } else if (typeid(*kv.second.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + sez.serialize(OPTIMIZER_TYPE_ADAM); + LoraAdamOptimizerConfig const *adam_config = + static_cast( + kv.second.optimizer_config); + serialize_to_json_file(*adam_config, optim_config_filepath); + } else { + assert(false && "Optimizer type not yet supported"); + } + } + } + sez.serialize(strlen(this->name)); + sez.serialize(this->name, strlen(this->name)); +} + +/* static */ +using PCG::Node; +Node LoraLinear::deserialize(FFModel &ff, + Legion::Deserializer &dez, + ParallelTensor inputs[], + int num_inputs) { + assert(num_inputs == 2); + size_t id, transformer_layer_id, deserialized_model_id; + OperatorType op_type; + size_t num_pefts; + size_t name_len; + char name[MAX_OPNAME] = {0}; + + LoraLinearParams params; + + dez.deserialize(id); + dez.deserialize(transformer_layer_id); + dez.deserialize(deserialized_model_id); + dez.deserialize(op_type); + dez.deserialize(num_pefts); + for (int i = 0; i < num_pefts; i++) { + // Deserialize PEFTModelID + size_t pid; + dez.deserialize(pid); + PEFTModelID peft_model_id(pid); + // Deserialize tmp folder containing LoraLinearConfig and optimizer config + fs::path unique_temp_dir = fs::path(deserialize_string(dez)); + // 1. Deserialize LoraLinearConfig + std::string lora_config_filename = std::string("lora_linear_config_") + + std::to_string(pid) + + std::string(".json"); + fs::path lora_config_json_filepath = unique_temp_dir / lora_config_filename; + std::unique_ptr lora_linear_config = + deserialize_from_json_file(lora_config_json_filepath); + // 2. 
Deserialize optimizer if needed + if (lora_linear_config->trainable) { + std::string optimizer_filename = std::string("optimizer_config_") + + std::to_string(pid) + + std::string(".json"); + fs::path optim_config_filepath = unique_temp_dir / optimizer_filename; + OptimizerType type_; + dez.deserialize(type_); + if (type_ == OPTIMIZER_TYPE_SGD) { + std::unique_ptr sgd_optimizer_config = + deserialize_from_json_file( + optim_config_filepath); + lora_linear_config->optimizer_config = + dynamic_cast(sgd_optimizer_config.release()); + } else if (type_ == OPTIMIZER_TYPE_ADAM) { + std::unique_ptr adam_optimizer_config = + deserialize_from_json_file( + optim_config_filepath); + lora_linear_config->optimizer_config = + dynamic_cast( + adam_optimizer_config.release()); + } else { + printf("Optimizer type: %d\n", type_); + assert(false && "Optimizer type not yet supported"); + } + } + try { + fs::remove_all(unique_temp_dir); + } catch (fs::filesystem_error const &e) { + std::cerr << "Error removing tmp directory: " << e.what() << std::endl; + } + params.peft_configs.emplace( + std::make_pair(peft_model_id, *lora_linear_config)); + } + dez.deserialize(name_len); + dez.deserialize(name, name_len); + LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); + + params.layer_guid = layer_guid; + params.type = op_type; + strcpy(params.name, name); + return ff.get_or_create_node({inputs[0], inputs[1]}, params); +} + +Op *LoraLinear::materialize(FFModel &ff, + ParallelTensor inputs[], + int num_inputs) const { + LoraLinearParams params = get_params(); + return new LoraLinear(ff, params, {inputs[0], inputs[1]}, this->name); +} + +LoraLinearParams LoraLinear::get_params() const { + LoraLinearParams params; + params.layer_guid = this->layer_guid; + params.type = this->op_type; + if (strlen(this->name) < MAX_OPNAME) { + strcpy(params.name, this->name); + } + params.peft_configs = this->peft_configs; + return params; +} + +bool LoraLinearParams::is_valid( + std::pair const &input_shape) + const { + return true; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::LoraLinearParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.layer_guid.id); + hash_combine(key, params.layer_guid.transformer_layer_id); + hash_combine(key, params.layer_guid.model_id); + for (auto const &kv : params.peft_configs) { + hash_combine(key, kv.first.id); + hash_combine(key, kv.second.rank); + hash_combine(key, kv.second.trainable); + hash_combine(key, kv.second.cache_folder); + hash_combine(key, kv.second.peft_model_id); + hash_combine(key, kv.second.lora_alpha); + hash_combine(key, kv.second.lora_dropout); + hash_combine(key, kv.second.target_modules); + hash_combine(key, kv.second.init_lora_weights); + } + return key; +} +}; // namespace std diff --git a/src/ops/lora_linear_params.cc b/src/ops/lora_linear_params.cc new file mode 100644 index 0000000000..6e0c60e057 --- /dev/null +++ b/src/ops/lora_linear_params.cc @@ -0,0 +1,221 @@ +#include "flexflow/ops/lora_linear_params.h" +#include +#include +#include +using json = nlohmann::json; + +namespace FlexFlow { + +// ---------------- Optimizer configs ---------------- +// --------------------------------------------------- + +// empty optimizer +LoraOptimizerConfig::LoraOptimizerConfig() {} + +// SGD optimizer +LoraSGDOptimizerConfig::LoraSGDOptimizerConfig() + : lr(0.001f), momentum(0.0f), nesterov(false), weight_decay(0.0f) {} + +LoraSGDOptimizerConfig::LoraSGDOptimizerConfig(double lr_, + double momentum_, + bool nesterov_, 
+ bool weight_decay_) + : lr(lr_), momentum(momentum_), nesterov(nesterov_), + weight_decay(weight_decay_) {} + +std::ostream &operator<<(std::ostream &os, LoraSGDOptimizerConfig const &llc) { + os << "SGD Optimizer (lr=" << llc.lr << ",momentum=" << llc.momentum + << ",nesterov=" << llc.nesterov << ",weight_decay=" << llc.weight_decay + << ")"; + return os; +} + +// Adam optimizer +LoraAdamOptimizerConfig::LoraAdamOptimizerConfig() + : alpha(0.001f), beta1(0.9f), beta2(0.999f), weight_decay(0.0f), + epsilon(1e-8) {} + +LoraAdamOptimizerConfig::LoraAdamOptimizerConfig(double alpha_, + double beta1_, + double beta2_, + double weight_decay_, + double epsilon_) + : alpha(alpha_), beta1(beta1_), beta2(beta2_), weight_decay(weight_decay_), + epsilon(epsilon_) {} + +std::ostream &operator<<(std::ostream &os, LoraAdamOptimizerConfig const &llc) { + os << "SGD Optimizer (alpha=" << llc.alpha << ",beta1=" << llc.beta1 + << ",beta2=" << llc.beta2 << ",weight_decay=" << llc.weight_decay + << ",epsilon=" << llc.epsilon << ")"; + return os; +} + +// Serialization helpers +template +void serialize_to_json_file(T const &obj, fs::path const &filepath) { + json j = obj; + std::ofstream file(filepath); + file << j.dump(4); +} + +template +std::unique_ptr deserialize_from_json_file(fs::path const &filepath) { + std::ifstream file(filepath); + json j; + file >> j; + return std::make_unique(j.get()); +} + +template void + serialize_to_json_file(LoraLinearConfig const &obj, + fs::path const &filepath); +template void serialize_to_json_file( + LoraSGDOptimizerConfig const &obj, fs::path const &filepath); +template void serialize_to_json_file( + LoraAdamOptimizerConfig const &obj, fs::path const &filepath); +template std::unique_ptr + deserialize_from_json_file(fs::path const &filepath); +template std::unique_ptr + deserialize_from_json_file( + fs::path const &filepath); +template std::unique_ptr + deserialize_from_json_file( + fs::path const &filepath); + +// ------------------ LoRA configs ------------------- +// --------------------------------------------------- +const LoraLinearConfig LoraLinearConfig::EmptyConfig = LoraLinearConfig("", ""); + +LoraLinearConfig::LoraLinearConfig( + std::string const &cache_folder_, + std::string const &peft_model_id_, + bool trainable_, + LoraOptimizerConfig *optimizer_config_, + bool init_lora_weights_, + std::string const &base_model_name_or_path_, + std::string const &precision_, + int rank_, + float lora_alpha_, + float lora_dropout_, + std::vector const &target_modules_) + : cache_folder(cache_folder_), peft_model_id(peft_model_id_), rank(rank_), + lora_alpha(lora_alpha_), lora_dropout(lora_dropout_), + trainable(trainable_), optimizer_config(optimizer_config_), + init_lora_weights(init_lora_weights_), + base_model_name_or_path(base_model_name_or_path_), precision(precision_), + target_modules(target_modules_) { + + if (peft_model_id.empty()) { + return; + } + assert(!cache_folder.empty() && + "cache_folder must be provided when using PEFT"); + if (trainable) { + assert(optimizer_config != nullptr && + "optimizer_config must be provided when using PEFT"); + assert( + !base_model_name_or_path.empty() && + "base_model_name_or_path must be provided when training a PEFT model"); + assert(!precision.empty() && + "precision must be provided when training a PEFT model"); + } else { + assert(init_lora_weights == false && + "init_lora_weights must be false when LORA not trainable"); + assert(optimizer_config == nullptr && + "optimizer_config must be nullptr when not trainable"); 
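+    // Inference-only adapters therefore always load their weights from an
+    // existing checkpoint under cache_folder (init_lora_weights == false)
+    // and carry no optimizer state.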
+ } + // if we are not initializing LORA from scratch, load the configs from + // existing repository + if (!init_lora_weights) { + std::string peft_inference_config_file_path = + join_path({cache_folder, "configs", peft_model_id, "config.json"}); + std::ifstream config_file(peft_inference_config_file_path); + if (config_file.is_open()) { + try { + json model_config; + config_file >> model_config; + rank = model_config["r"]; + lora_alpha = float(model_config["lora_alpha"]); + lora_dropout = model_config["lora_dropout"]; + for (auto &s : model_config["target_modules"]) { + target_modules.push_back(s); + } + // do not load the base_model_name_or_path from the HF config because we + // may be applying LoRA to another model + } catch (json::exception const &e) { + std::cerr << "Error parsing PEFT config from JSON file: " << e.what() + << std::endl; + assert(false); + } + } else { + std::cerr << "Error opening JSON file " << peft_inference_config_file_path + << std::endl; + assert(false); + } + } + assert(rank > 0 && "rank must be greater than 0"); + assert(lora_alpha > 0.0f && "lora_alpha must be greater than 0.0"); + assert(lora_dropout >= 0.0f && lora_dropout <= 1.0f && + "lora_dropout must be in [0.0, 1.0]"); + assert(target_modules.size() > 0 && "target_modules must not be left empty"); +} + +// constructor used to support unordered_map +LoraLinearConfig::LoraLinearConfig() : LoraLinearConfig("", "") {} + +bool operator==(LoraLinearConfig const &lhs, LoraLinearConfig const &rhs) { + if (lhs.cache_folder == rhs.cache_folder && + lhs.peft_model_id == rhs.peft_model_id && lhs.rank == rhs.rank && + lhs.lora_alpha == rhs.lora_alpha && + lhs.lora_dropout == rhs.lora_dropout && + lhs.target_modules.size() == rhs.target_modules.size() && + lhs.trainable == rhs.trainable && + lhs.init_lora_weights == rhs.init_lora_weights && + lhs.optimizer_config == rhs.optimizer_config && + lhs.base_model_name_or_path == rhs.base_model_name_or_path && + lhs.precision == rhs.precision) { + for (int i = 0; i < lhs.target_modules.size(); i++) { + if (lhs.target_modules[i] != rhs.target_modules[i]) { + return false; + } + } + return true; + } + return false; +} + +std::ostream &operator<<(std::ostream &os, LoraLinearConfig const &llc) { + os << "LoraLinearConfig: "; + os << "cache_folder: " << llc.cache_folder << ", "; + os << "peft_model_id: " << llc.peft_model_id << ", "; + os << "rank: " << llc.rank << ", "; + os << "lora_alpha: " << llc.lora_alpha << ", "; + os << "lora_dropout: " << llc.lora_dropout << ", "; + os << "target_modules: ["; + for (int i = 0; i < llc.target_modules.size(); i++) { + os << llc.target_modules[i]; + if (i < llc.target_modules.size() - 1) { + os << ", "; + } + } + os << "], "; + os << "trainable: " << llc.trainable << ", "; + if (llc.optimizer_config != nullptr) { + os << "optimizer_config: "; + if (typeid(*llc.optimizer_config) == typeid(LoraSGDOptimizerConfig)) { + os << *static_cast(llc.optimizer_config); + } else if (typeid(*llc.optimizer_config) == + typeid(LoraAdamOptimizerConfig)) { + os << *static_cast(llc.optimizer_config); + } else { + os << "Unknown optimizer config type"; + } + std::cout << std::endl; + } + os << "init_lora_weights: " << llc.init_lora_weights << std::endl; + os << "base_model_name_or_path: " << llc.base_model_name_or_path << std::endl; + os << "precision: " << llc.precision << std::endl; + return os; +} + +}; // namespace FlexFlow diff --git a/src/ops/mean.cc b/src/ops/mean.cc index b2ec94fdf8..0d41276735 100644 --- a/src/ops/mean.cc +++ b/src/ops/mean.cc @@ 
-87,8 +87,7 @@ OpMeta *Mean::init_task(Task const *task, Context ctx, Runtime *runtime) { FFHandler handler = *((FFHandler const *)task->local_args); - OpMeta *m = new OpMeta(handler); - return m; + return nullptr; } void Mean::forward(FFModel const &ff) {} diff --git a/src/ops/noop.cc b/src/ops/noop.cc index da2d4922e3..45bd76d59d 100644 --- a/src/ops/noop.cc +++ b/src/ops/noop.cc @@ -90,8 +90,9 @@ OpMeta *NoOp::init_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { + NoOp *no_op = (NoOp *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - OpMeta *m = new OpMeta(handle); + OpMeta *m = new OpMeta(handle, no_op); return m; } @@ -167,7 +168,7 @@ void NoOp::init_inference(FFModel const &ff, set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, @@ -244,7 +245,7 @@ void NoOp::init(FFModel const &ff) { set_argumentmap_for_init(ff, argmap); IndexLauncher launcher(NOOP_INIT_TASK_ID, parallel_is, - TaskArgument(NULL, 0), + TaskArgument(this, sizeof(NoOp)), argmap, Predicate::TRUE_PRED, false /*must*/, diff --git a/src/ops/pool_2d.cc b/src/ops/pool_2d.cc index 4621ab5909..c8b194afa9 100644 --- a/src/ops/pool_2d.cc +++ b/src/ops/pool_2d.cc @@ -315,7 +315,7 @@ OpMeta *Pool2D::init_task(Task const *task, assert(task->regions.size() == 2); Pool2D const *pool = (Pool2D *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - Pool2DMeta *m = new Pool2DMeta(handle); + Pool2DMeta *m = new Pool2DMeta(handle, pool); m->profiling = pool->profiling; m->inference_debugging = pool->inference_debugging; std::strcpy(m->op_name, pool->name); @@ -545,7 +545,7 @@ bool Pool2D::measure_operator_cost(Simulator *sim, int output_n = sub_output.dims[3].size; int pad_h = ((output_h - 1) * stride_h + kernel_h - input_h + 1) / 2; int pad_w = ((output_w - 1) * stride_w + kernel_w - input_w + 1) / 2; - Pool2DMeta *m = sim->pool2d_meta; + Pool2DMeta *m = new Pool2DMeta(sim->handler, this); init_kernel(m, input_w, diff --git a/src/ops/reduce.cc b/src/ops/reduce.cc index 454a35caf4..1c0566e9ca 100644 --- a/src/ops/reduce.cc +++ b/src/ops/reduce.cc @@ -41,7 +41,7 @@ ReduceParams Reduce::get_params() const { } params.keepdims = keepdims; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/reduce.cpp b/src/ops/reduce.cpp index c062955ed6..fe122b13eb 100644 --- a/src/ops/reduce.cpp +++ b/src/ops/reduce.cpp @@ -25,7 +25,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(miopenCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(miopenCreateTensorDescriptor(&inputTensor)); checkCUDNN(miopenCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reduce.cu b/src/ops/reduce.cu index 65efd90e9b..1352787a12 100644 --- a/src/ops/reduce.cu +++ b/src/ops/reduce.cu @@ -24,7 +24,7 @@ using Legion::Domain; ReduceMeta::ReduceMeta(FFHandler handler, Reduce const *rd, Domain const &input_domain) - : OpMeta(handler) { + : OpMeta(handler, rd) { checkCUDNN(cudnnCreateReduceTensorDescriptor(&reduceDesc)); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&outputTensor)); diff --git a/src/ops/reshape.cc 
b/src/ops/reshape.cc index 49f99e2cb5..4e7fd2eb96 100644 --- a/src/ops/reshape.cc +++ b/src/ops/reshape.cc @@ -180,7 +180,7 @@ OpMeta *Reshape::init_task(Task const *task, Runtime *runtime) { Reshape const *reshape = (Reshape *)task->args; FFHandler handle = *((FFHandler const *)task->local_args); - ReshapeMeta *m = new ReshapeMeta(handle); + ReshapeMeta *m = new ReshapeMeta(handle, reshape); std::strcpy(m->op_name, reshape->name); m->layer_guid = reshape->layer_guid; m->data_type = reshape->outputs[0]->data_type; @@ -296,7 +296,7 @@ ReshapeParams Reshape::get_params() const { ReshapeParams params; params.shape = shape_vec; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/residual_layer_norm.cc b/src/ops/residual_layer_norm.cc index 8dd670eea3..2a30d12d6d 100644 --- a/src/ops/residual_layer_norm.cc +++ b/src/ops/residual_layer_norm.cc @@ -44,7 +44,8 @@ bool operator==(ResidualLayerNormParams const &lhs, return lhs.layer_guid == rhs.layer_guid && lhs.axes == rhs.axes && lhs.elementwise_affine == rhs.elementwise_affine && lhs.use_bias == rhs.use_bias && - lhs.use_two_residuals == rhs.use_two_residuals; + lhs.use_two_residuals == rhs.use_two_residuals && + lhs.inplace_residual == rhs.inplace_residual; } bool ResidualLayerNormParams::is_valid( @@ -63,7 +64,8 @@ ResidualLayerNormParams ResidualLayerNorm::get_params() const { params.eps = this->eps; params.use_bias = this->use_bias; params.use_two_residuals = this->use_two_residuals; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -78,6 +80,7 @@ void FFModel::residual_layer_norm(const Tensor input, bool elementwise_affine, float eps, bool use_bias, + bool inplace_residual, DataType data_type, char const *name) { // In PyTorch, axes must be the sizes of the last axes.size() dimensions of @@ -117,7 +120,6 @@ void FFModel::residual_layer_norm(const Tensor input, } int num_weights = elementwise_affine ? (use_bias ? 2 : 1) : 0; - Layer *ln = nullptr; Tensor casted_input = (data_type != input->data_type) ? cast(input, data_type, "type cast for residual_layer_norm") @@ -133,20 +135,20 @@ void FFModel::residual_layer_norm(const Tensor input, ? 
cast(residual2, data_type, "type cast for residual2_layer_norm") : residual2; } - ln = new Layer(this, - OP_RESIDUAL_LAYERNORM, - data_type, - name, - 2 + use_two_residuals /*inputs*/, - num_weights, - 2 /*outputs*/, - casted_input, - casted_residual1, - casted_residual2); + Layer *ln = new Layer(this, + OP_RESIDUAL_LAYERNORM, + data_type, + name, + 2 + use_two_residuals /*inputs*/, + num_weights, + 2 /*outputs*/, + casted_input, + casted_residual1, + casted_residual2); ln->outputs[0] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 0, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 0, true /*create_grad*/); ln->outputs[1] = create_tensor_legion_ordering( - input->num_dims, input->dims, data_type, ln, 1, false /*create_grad*/); + input->num_dims, input->dims, data_type, ln, 1, true /*create_grad*/); { int numdims = axes.size(); int dims[numdims]; @@ -179,6 +181,7 @@ void FFModel::residual_layer_norm(const Tensor input, ln->add_int_vector_property("axes", axes); ln->add_float_property("eps", eps); ln->add_int_property("use_two_residuals", use_two_residuals); + ln->add_int_property("inplace_residual", inplace_residual); layers.push_back(ln); outputs[0] = ln->outputs[0]; outputs[1] = ln->outputs[1]; @@ -199,6 +202,9 @@ Op *ResidualLayerNorm::create_operator_from_layer( layer->get_float_property("eps", eps); layer->get_int_property("use_two_residuals", value); bool use_two_residuals = (bool)value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; + return new ResidualLayerNorm(model, layer->layer_guid, inputs[0], @@ -209,6 +215,7 @@ Op *ResidualLayerNorm::create_operator_from_layer( elementwise_affine, use_bias, eps, + inplace_residual, false, // allocate_weights layer->name); } @@ -230,6 +237,7 @@ ResidualLayerNorm::ResidualLayerNorm( params.elementwise_affine, params.use_bias, params.eps, + params.inplace_residual, allocate_weights, params.name) {} @@ -243,6 +251,7 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, bool _elementwise_affine, bool _use_bias, float _eps, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -256,7 +265,8 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, _residual1, _use_two_residuals ? 
_residual2 : nullptr), elementwise_affine(_elementwise_affine), eps(_eps), axes(_axes), - use_bias(_use_bias), use_two_residuals(_use_two_residuals) { + use_bias(_use_bias), use_two_residuals(_use_two_residuals), + inplace_residual(_inplace_residual) { // overwrite layer_guid layer_guid = _layer_guid; outputs[0] = model.create_parallel_tensor_legion_ordering( @@ -326,6 +336,22 @@ ResidualLayerNorm::ResidualLayerNorm(FFModel &model, } } +void ResidualLayerNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + void ResidualLayerNorm::init_inference( FFModel const &ff, std::vector const &batch_inputs, @@ -347,13 +373,19 @@ void ResidualLayerNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -371,13 +403,15 @@ void ResidualLayerNorm::init_inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -422,13 +456,17 @@ void ResidualLayerNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); int field_id = 0; // input - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(inputs[1]->part, @@ -439,20 +477,21 @@ void ResidualLayerNorm::init(FFModel const &ff) { launcher.add_field(field_id++, FID_DATA); // residual2 if (use_two_residuals) { - launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + launcher.add_region_requirement(RegionRequirement(inputs[2]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, - inputs[1]->region)); + inputs[2]->region)); + launcher.add_field(field_id++, FID_DATA); + } + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); // layer norm output launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, @@ -516,7 +555,323 @@ void ResidualLayerNorm::forward(FFModel const &ff) { } void ResidualLayerNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_LAYERNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + int field_id = 0; + // output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // added output + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + // gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_bias) { + // beta_grad + launcher.add_region_requirement( + RegionRequirement(weights[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + } + runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::backward_task( + Task const *task, 
+ std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta const *m = + *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == + 4 + m->use_two_residuals + + (m->elementwise_affine ? (m->use_bias ? 3 : 2) : 0)); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR added_output = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + GenericTensorAccessorW gamma_grad, beta_grad; + if (m->elementwise_affine) { + assert(m->use_bias == (regions.size() == 6)); + gamma = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + gamma_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + if (m->use_bias) { + beta_grad = + helperGetGenericTensorAccessorRW(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + } + ResidualLayerNorm::backward_kernel_wrapper(m, + output_grad, + added_output, + input_grad, + residual1_grad, + residual2_grad, + gamma, + gamma_grad, + beta_grad); +} + +Legion::FutureMap ResidualLayerNorm::peft_bwd( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int field_id = 0; + // output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // input grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + // residual grad 1 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + if (use_two_residuals) { + // residual grad 2 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[2]->part_grad, + 0 /*projection id*/, + reset_input_grads[2] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[2]->region_grad)); + launcher.add_field(field_id++, FID_DATA); + } + if (elementwise_affine) { + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); +} + +void ResidualLayerNorm::peft_bwd_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + assert(task->regions.size() == regions.size()); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); + assert(regions.size() == 3 + m->use_two_residuals + m->elementwise_affine); + + int region_idx = 0, task_region_idx = 0; + + GenericTensorAccessorR output_grad = + helperGetGenericTensorAccessorRO(m->output_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual1_grad = + helperGetGenericTensorAccessorRW(m->input_type[1], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual2_grad; + if (m->use_two_residuals) { + GenericTensorAccessorW residual2_grad = + helperGetGenericTensorAccessorRW(m->input_type[2], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + GenericTensorAccessorR gamma; + if (m->elementwise_affine) { + gamma = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } + ResidualLayerNorm::peft_bwd_kernel_wrapper( + m, output_grad, input_grad, residual1_grad, residual2_grad, gamma); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + std::vector input_accessors; + input_accessors.push_back(input_grad); + input_accessors.push_back(residual1_grad); + if (m->use_two_residuals) { + input_accessors.push_back(residual2_grad); + } + std::vector weights_accessors; + if (m->elementwise_affine) { + weights_accessors.push_back(gamma); + } + ResidualLayerNorm::save_inference_tensors_to_file(m, + shard_id, + bc, + input_accessors, + weights_accessors, + {output_grad}, + false); + } } Op *ResidualLayerNorm::materialize(FFModel &ff, @@ -554,13 +909,19 @@ FutureMap ResidualLayerNorm::inference( 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } int field_id = 0; // input 
- launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); + // added: input + residual(s) + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); launcher.add_field(field_id++, FID_DATA); // residual1 launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, @@ -578,13 +939,15 @@ FutureMap ResidualLayerNorm::inference( batch_inputs[2]->region)); launcher.add_field(field_id++, FID_DATA); } - // added: input + residual(s) - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(field_id++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(field_id++, FID_DATA); + } // layer norm output launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, @@ -620,14 +983,13 @@ void ResidualLayerNorm::inference_task( assert(task->regions.size() == regions.size()); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); if (bc->num_tokens == 0) { return; } - ResidualLayerNormMeta *m = *((ResidualLayerNormMeta **)task->local_args); - assert(regions.size() == - 4 + m->use_two_residuals + + 3 + m->use_two_residuals + (m->elementwise_affine ? (m->use_bias ? 2 : 1) : 0)); int region_idx = 0, task_region_idx = 0; @@ -655,13 +1017,23 @@ void ResidualLayerNorm::inference_task( ctx, runtime); } - GenericTensorAccessorW added_output = - helperGetGenericTensorAccessorWO(m->output_type[0], - regions[region_idx++], - task->regions[task_region_idx++], - FID_DATA, - ctx, - runtime); + GenericTensorAccessorW added_output; + if (m->inplace_residual) { + added_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + added_output = + helperGetGenericTensorAccessorWO(m->output_type[0], + regions[region_idx++], + task->regions[task_region_idx++], + FID_DATA, + ctx, + runtime); + } GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(m->output_type[1], regions[region_idx++], @@ -699,8 +1071,14 @@ void ResidualLayerNorm::inference_task( assert(in_domain.get_volume() == residual2_domain.get_volume()); assert(residual2_domain == in_domain); } - Domain added_out_domain = runtime->get_index_space_domain( - ctx, task->regions[task_region_idx++].region.get_index_space()); + Domain added_out_domain; + if (m->inplace_residual) { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + } else { + added_out_domain = runtime->get_index_space_domain( + ctx, task->regions[task_region_idx++].region.get_index_space()); + } Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[task_region_idx++].region.get_index_space()); Domain gamma_domain, beta_domain; @@ -734,13 +1112,13 @@ void ResidualLayerNorm::inference_task( m->effective_num_elements * m->effective_batch_size); ResidualLayerNorm::inference_kernel_wrapper( - m, input, residual1, residual2, added_output, output, gamma, beta); + m, bc, input, residual1, residual2, added_output, output, 
gamma, beta); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; std::vector input_accessors; - input_accessors.push_back(input); + // input_accessors.push_back(input); input_accessors.push_back(residual1); if (m->use_two_residuals) { input_accessors.push_back(residual2); @@ -779,6 +1157,7 @@ void ResidualLayerNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->eps); sez.serialize(this->use_bias); sez.serialize(this->use_two_residuals); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -794,6 +1173,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, bool elementwise_affine; bool use_bias; bool use_two_residuals; + bool inplace_residual; float eps; size_t id, transformer_layer_id, deserialized_model_id; dez.deserialize(id); @@ -810,6 +1190,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, dez.deserialize(eps); dez.deserialize(use_bias); dez.deserialize(use_two_residuals); + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -827,6 +1208,7 @@ Node ResidualLayerNorm::deserialize(FFModel &ff, params.eps = eps; params.use_bias = use_bias; params.use_two_residuals = use_two_residuals; + params.inplace_residual = inplace_residual; strcpy(params.name, name); if (use_two_residuals) { return ff.get_or_create_node( @@ -853,6 +1235,7 @@ size_t hash::operator()( hash_combine(key, params.elementwise_affine); hash_combine(key, params.use_bias); hash_combine(key, params.use_two_residuals); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff --git a/src/ops/residual_layer_norm.cpp b/src/ops/residual_layer_norm.cpp index f1b7a537b0..582e0752ef 100644 --- a/src/ops/residual_layer_norm.cpp +++ b/src/ops/residual_layer_norm.cpp @@ -23,11 +23,12 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; @@ -36,6 +37,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, profiling = ln->profiling; inference_debugging = ln->inference_debugging; eps = ln->eps; + inplace_residual = ln->inplace_residual; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); @@ -45,6 +47,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { @@ -75,7 +78,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -84,9 +87,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } 
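  // Note: at this point each warp has reduced its own partial sum and lane 0
  // of every warp has staged that partial in shared[wid]. The __syncthreads()
  // below publishes those per-warp partials before warp 0 combines them. The
  // former max_num_threads argument becomes unnecessary because the reduction
  // loops in this file now stride by blockDim.x, so every one of the
  // blockDim.x / C10_WARP_SIZE warps contributes a partial.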
__syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -110,8 +111,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T residual2_val = (residual2_ptr == nullptr) ? T(0) @@ -120,12 +120,10 @@ __global__ void ResidualLayerNormKernel(int64_t N, sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -137,7 +135,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); @@ -161,19 +159,9 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, T const *beta_ptr, hipStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - hipLaunchKernelGGL(HIP_KERNEL_NAME(ResidualLayerNormKernel), - num_blocks, - num_threads, + m->effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), 0, stream, m->effective_num_elements, @@ -188,10 +176,41 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, beta_ptr, output_ptr); } +template +void save_inference_tensors(ResidualLayerNormMeta const *m) { + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } +} /*static*/ void 
ResidualLayerNorm::inference_kernel_wrapper( - ResidualLayerNormMeta const *m, + ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -208,12 +227,13 @@ void ResidualLayerNorm::inference_kernel_wrapper( checkCUDA(hipEventCreate(&t_end)); checkCUDA(hipEventRecord(t_start, stream)); } + if (m->input_type[0] == DT_FLOAT) { ResidualLayerNorm::inference_kernel( m, input.get_float_ptr(), residual1.get_float_ptr(), - residual2.get_float_ptr(), + m->use_two_residuals ? residual2.get_float_ptr() : nullptr, added_output.get_float_ptr(), output.get_float_ptr(), m->elementwise_affine ? gamma.get_float_ptr() : nullptr, @@ -224,7 +244,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( m, input.get_half_ptr(), residual1.get_half_ptr(), - residual2.get_half_ptr(), + m->use_two_residuals ? residual2.get_half_ptr() : nullptr, added_output.get_half_ptr(), output.get_half_ptr(), m->elementwise_affine ? gamma.get_half_ptr() : nullptr, @@ -234,6 +254,76 @@ void ResidualLayerNorm::inference_kernel_wrapper( assert(false && "unsupport datatype in layernorm"); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->inference_debugging) { + if (m->input_type[0] == DT_FLOAT) { + save_inference_tensors(m); + } else if (m->input_type[0] == DT_HALF) { + save_inference_tensors(m); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + if (m->profiling) { checkCUDA(hipEventRecord(t_end, stream)); checkCUDA(hipEventSynchronize(t_end)); @@ -245,4 +335,551 @@ 
void ResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? 
T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } + if (dX_residual2 != nullptr) { + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual1, + dX_residual2, + reset_input_grad, + reset_residual_grad1, + reset_residual_grad2, + N, + buf); +} + +/*static*/ +template +void backward_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeInternalGradientsCUDAKernel), + M, + kCUDABlockReduceNumThreads, + 0, + stream, + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(ComputeGradientFusedParamsCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardSimpleCUDAKernel), + B, + kCUDANumThreads, + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + hipLaunchKernelGGL(HIP_KERNEL_NAME(GammaBetaBackwardCUDAKernel), + B, + dim3(kThreadX, kThreadY), + 0, + stream, + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + 
beta_grad_ptr); + } + } +} + +/*static*/ +void ResidualLayerNorm::backward_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void peft_bwd_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + hipStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = 
(num_threads / warp_size) * sizeof(T); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(layer_norm_grad_input_kernel), + blocks, + num_threads, + nshared, + stream, + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); +} + +/*static*/ +void ResidualLayerNorm::peft_bwd_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/residual_layer_norm.cu b/src/ops/residual_layer_norm.cu index e5ebdce6ed..8cdf87a92c 100644 --- a/src/ops/residual_layer_norm.cu +++ b/src/ops/residual_layer_norm.cu @@ -22,11 +22,12 @@ namespace FlexFlow { #define C10_WARP_SIZE 32 constexpr int kCUDABlockReduceNumThreads = 512; constexpr int kCUDANumThreads = 256; +constexpr int kColwiseReduceTileSize = 32; ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, ResidualLayerNorm const *ln, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ln) { elementwise_affine = ln->elementwise_affine; use_bias = ln->use_bias; use_two_residuals = ln->use_two_residuals; @@ -35,6 +36,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, profiling = ln->profiling; inference_debugging = ln->inference_debugging; eps = ln->eps; + inplace_residual = ln->inplace_residual; DataType data_type = ln->data_type; size_t totalSize = effective_batch_size * data_type_size(data_type) * 3; gpu_mem_allocator.create_legion_instance(reserveInst, totalSize); @@ -44,6 +46,7 @@ ResidualLayerNormMeta::ResidualLayerNormMeta(FFHandler handle, data_type_size(data_type) * effective_batch_size); bias_ptr = gpu_mem_allocator.allocate_instance_untyped( data_type_size(data_type) * effective_batch_size); + allocated_peft_buffer_size = 0; } ResidualLayerNormMeta::~ResidualLayerNormMeta(void) { @@ -74,7 +77,7 @@ __inline__ __device__ T WarpReduceSum(T val) { } template -__inline__ __device__ T BlockReduceSum(T val, T *shared, int 
max_num_threads) { +__inline__ __device__ T BlockReduceSum(T val, T *shared) { int const lid = threadIdx.x % C10_WARP_SIZE; int const wid = threadIdx.x / C10_WARP_SIZE; val = WarpReduceSum(val); @@ -83,9 +86,7 @@ __inline__ __device__ T BlockReduceSum(T val, T *shared, int max_num_threads) { shared[wid] = val; } __syncthreads(); - val = (threadIdx.x < min(blockDim.x, max_num_threads) / C10_WARP_SIZE) - ? shared[lid] - : 0; + val = (threadIdx.x < (blockDim.x / C10_WARP_SIZE)) ? shared[lid] : T(0); if (wid == 0) { val = WarpReduceSum(val); } @@ -109,8 +110,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, const int64_t i = blockIdx.x; float sum1 = 0.0f; float sum2 = 0.0f; - for (int64_t j = threadIdx.x; j < N; - j += min(blockDim.x, kCUDABlockReduceNumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T residual2_val = (residual2_ptr == nullptr) ? T(0) @@ -119,12 +119,10 @@ __global__ void ResidualLayerNormKernel(int64_t N, sum1 += static_cast(X[index]); sum2 += static_cast(X[index]) * static_cast(X[index]); } - if (threadIdx.x < kCUDABlockReduceNumThreads) { - sum1 = BlockReduceSum( - sum1, m_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - sum2 = BlockReduceSum( - sum2, v_shared, min(blockDim.x, kCUDABlockReduceNumThreads)); - } + + sum1 = BlockReduceSum(sum1, m_shared); + sum2 = BlockReduceSum(sum2, v_shared); + if (threadIdx.x == 0) { float const scale = float(1) / static_cast(N); sum1 *= scale; @@ -136,7 +134,7 @@ __global__ void ResidualLayerNormKernel(int64_t N, __syncthreads(); using T_ACC = T; - for (int64_t j = threadIdx.x; j < N; j += min(blockDim.x, kCUDANumThreads)) { + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { const int64_t index = i * N + j; const T_ACC gamma_v = gamma == nullptr ? 
T_ACC(1) : static_cast(gamma[j]); @@ -160,33 +158,57 @@ void ResidualLayerNorm::inference_kernel(ResidualLayerNormMeta const *m, T const *beta_ptr, cudaStream_t stream) { - std::pair kernel1_parallelism = - std::make_pair(m->effective_batch_size, kCUDABlockReduceNumThreads); - std::pair kernel2_parallelism = - std::make_pair(m->effective_batch_size, kCUDANumThreads); - - int num_blocks = - std::max(kernel1_parallelism.first, kernel2_parallelism.first); - int num_threads = - std::max(kernel1_parallelism.second, kernel2_parallelism.second); - ResidualLayerNormKernel - <<>>(m->effective_num_elements, - m->eps, - input_ptr, - residual1_ptr, - residual2_ptr, - added_output_ptr, - static_cast(m->mean_ptr), - static_cast(m->rstd_ptr), - gamma_ptr, - beta_ptr, - output_ptr); + <<effective_batch_size, + std::min(CUDA_NUM_THREADS, (int)m->effective_num_elements), + 0, + stream>>>(m->effective_num_elements, + m->eps, + input_ptr, + residual1_ptr, + residual2_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + beta_ptr, + output_ptr); +} +template +void save_inference_tensors(ResidualLayerNormMeta const *m) { + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "fwd_step_" + std::to_string(m->decoding_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } } /*static*/ void ResidualLayerNorm::inference_kernel_wrapper( - ResidualLayerNormMeta const *m, + ResidualLayerNormMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input, GenericTensorAccessorR const &residual1, GenericTensorAccessorR const &residual2, @@ -203,6 +225,7 @@ void ResidualLayerNorm::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + if (m->input_type[0] == DT_FLOAT) { ResidualLayerNorm::inference_kernel( m, @@ -229,6 +252,76 @@ void ResidualLayerNorm::inference_kernel_wrapper( assert(false && "unsupport datatype in layernorm"); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + int num_peft_tokens = 
bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int first_token_offset = bc->requestsInfo[i].first_token_offset_in_batch; + int in_dim = input.domain.hi()[0] - input.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t activation_size_needed = + data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_float_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync( + m->input_activation, + added_output.get_half_ptr() + first_token_offset * in_dim, + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + + if (m->inference_debugging) { + if (m->input_type[0] == DT_FLOAT) { + save_inference_tensors(m); + } else if (m->input_type[0] == DT_HALF) { + save_inference_tensors(m); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + if (m->profiling) { cudaEventRecord(t_end, stream); checkCUDA(cudaEventSynchronize(t_end)); @@ -240,4 +333,529 @@ void ResidualLayerNorm::inference_kernel_wrapper( } } +template +__global__ void ComputeInternalGradientsCUDAKernel( + int64_t N, T const *dY, T const *X, T const *gamma, T *ds, T *db) { + using T_ACC = T; + __shared__ T_ACC ds_shared[C10_WARP_SIZE]; + __shared__ T_ACC db_shared[C10_WARP_SIZE]; + const int64_t i = blockIdx.x; + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t j = threadIdx.x; j < N; j += blockDim.x) { + const int64_t index = i * N + j; + const T_ACC gamma_v = + gamma == nullptr ? T_ACC(1) : static_cast(gamma[j]); + sum1 += + static_cast(dY[index]) * static_cast(X[index]) * gamma_v; + sum2 += static_cast(dY[index]) * gamma_v; + } + sum1 = BlockReduceSum(sum1, ds_shared); + sum2 = BlockReduceSum(sum2, db_shared); + if (threadIdx.x == 0) { + ds[i] = sum1; + db[i] = sum2; + } +} + +template +__global__ void ComputeGradientFusedParamsCUDAKernel(int64_t M, + int64_t N, + T const *mean, + T const *rstd, + T const *ds, + T const *db, + T *c1, + T *c2) { + using T_ACC = T; + const int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < M) { + const T_ACC s = T_ACC(1) / static_cast((int)N); + const T_ACC a = (db[index] * static_cast(mean[index]) - ds[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * + static_cast(rstd[index]) * s; + c1[index] = a; + c2[index] = -(a * static_cast(mean[index]) + + db[index] * static_cast(rstd[index]) * s); + } +} + +template +__global__ void GammaBetaBackwardSimpleCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j < N) { + T_ACC sum1 = 0; + T_ACC sum2 = 0; + for (int64_t i = 0; i < M; ++i) { + const int64_t index = i * N + j; + sum1 += dg == nullptr ? 
T_ACC(0) + : static_cast(dY[index]) * + (static_cast(X[index]) - + static_cast(mean[i])) * + static_cast(rstd[i]); + sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index]); + } + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } +} + +template +__global__ void GammaBetaBackwardCUDAKernel(int64_t M, + int64_t N, + T const *dY, + T const *X, + T const *mean, + T const *rstd, + T *dg, + T *db) { + using T_ACC = T; + __shared__ T_ACC g_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + __shared__ T_ACC b_shared[kColwiseReduceTileSize][kColwiseReduceTileSize + 1]; + const int64_t j = blockIdx.x * blockDim.x + threadIdx.x; + T_ACC dg_sum1 = 0; + T_ACC dg_sum2 = 0; + T_ACC db_sum1 = 0; + T_ACC db_sum2 = 0; + if (j < N) { + for (int64_t i = threadIdx.y; i < M; i += blockDim.y * 2) { + const int64_t i1 = i; + const int64_t i2 = i + blockDim.y; + const int64_t index1 = i1 * N + j; + const int64_t index2 = i2 * N + j; + dg_sum1 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index1]) * + (static_cast(X[index1]) - + static_cast(mean[i1])) * + static_cast(rstd[i1]); + db_sum1 += db == nullptr ? T_ACC(0) : static_cast(dY[index1]); + if (i2 < M) { + dg_sum2 += dg == nullptr ? T_ACC(0) + : static_cast(dY[index2]) * + (static_cast(X[index2]) - + static_cast(mean[i2])) * + static_cast(rstd[i2]); + db_sum2 += db == nullptr ? T_ACC(0) : static_cast(dY[index2]); + } + } + } + g_shared[threadIdx.y][threadIdx.x] = dg_sum1; + g_shared[threadIdx.y + blockDim.y][threadIdx.x] = dg_sum2; + b_shared[threadIdx.y][threadIdx.x] = db_sum1; + b_shared[threadIdx.y + blockDim.y][threadIdx.x] = db_sum2; + __syncthreads(); + T_ACC sum1 = g_shared[threadIdx.x][threadIdx.y]; + T_ACC sum2 = b_shared[threadIdx.x][threadIdx.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } + sum1 = g_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum2 = b_shared[threadIdx.x][threadIdx.y + blockDim.y]; + sum1 = WarpReduceSum(sum1); + sum2 = WarpReduceSum(sum2); + if (threadIdx.x == 0) { + const int64_t j = blockIdx.x * blockDim.x + threadIdx.y + blockDim.y; + if (j < N) { + if (dg != nullptr) { + dg[j] = sum1; + } + if (db != nullptr) { + db[j] = sum2; + } + } + } +} + +template +__device__ __inline__ void compute_gI(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N, + T *buf) { + auto const i1 = blockIdx.x; + const T mean_val = mean[i1]; + const T rstd_val = rstd[i1]; + T stats_x1{0}, stats_x2{0}; + constexpr int unroll = 4; + auto l = unroll * threadIdx.x; + T const *X_i = X + i1 * N; + T const *dY_i = dY + i1 * N; + T *dX_i = dX + i1 * N; + T *dX_residual1_i = dX_residual1 + i1 * N; + T *dX_residual2_i = + (dX_residual2 != nullptr) ? dX_residual2 + i1 * N : nullptr; + // vectorized reads don't improve perf, so use regular unrolling + + for (; l + unroll - 1 < N; l += blockDim.x * unroll) { +#pragma unroll + for (int k = 0; k < unroll; k++) { + T gamma_val = (gamma != nullptr) ? 
static_cast(gamma[l + k]) : T(1); + const T c_h = static_cast(X_i[l + k]); + const T c_loss = static_cast(dY_i[l + k]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + } + for (; l < N; l++) { + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + const T c_h = static_cast(X_i[l]); + const T c_loss = static_cast(dY_i[l]); + stats_x1 += c_loss * gamma_val; + stats_x2 += c_loss * gamma_val * (c_h - mean_val) * rstd_val; + } + + stats_x1 = BlockReduceSum(stats_x1, buf); + stats_x2 = BlockReduceSum(stats_x2, buf); + if (threadIdx.x == 0) { + buf[0] = stats_x1; + buf[1] = stats_x2; + } + __syncthreads(); + stats_x1 = buf[0]; + stats_x2 = buf[1]; + T fH = N; + T term1 = (T(1) / fH) * rstd_val; + + for (int l = threadIdx.x; l < N; l += blockDim.x) { + const T x = X_i[l]; + const T dy = dY_i[l]; + T gamma_val = (gamma != nullptr) ? static_cast(gamma[l]) : T(1); + T f_grad_input = fH * gamma_val * dy; + f_grad_input -= (x - mean_val) * rstd_val * stats_x2; + f_grad_input -= stats_x1; + f_grad_input *= term1; + if (reset_input_grad) { + dX_i[l] = f_grad_input; + } else { + dX_i[l] += f_grad_input; + } + if (reset_residual_grad1) { + dX_residual1_i[l] = f_grad_input; + } else { + dX_residual1_i[l] += f_grad_input; + } + if (dX_residual2 != nullptr) { + if (reset_residual_grad2) { + dX_residual2_i[l] = f_grad_input; + } else { + dX_residual2_i[l] += f_grad_input; + } + } + } +} + +template +__global__ void layer_norm_grad_input_kernel(T const *__restrict__ dY, + T const *__restrict__ X, + T const *__restrict__ mean, + T const *__restrict__ rstd, + T const *__restrict__ gamma, + T *dX, + T *dX_residual1, + T *dX_residual2, + bool reset_input_grad, + bool reset_residual_grad1, + bool reset_residual_grad2, + int const N) { + alignas(sizeof(double)) extern __shared__ char s_data1[]; + T *buf = reinterpret_cast(&s_data1); + compute_gI(dY, + X, + mean, + rstd, + gamma, + dX, + dX_residual1, + dX_residual2, + reset_input_grad, + reset_residual_grad1, + reset_residual_grad2, + N, + buf); +} + +/*static*/ +template +void backward_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T const *added_output_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + T *gamma_grad_ptr, + T *beta_grad_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + ComputeInternalGradientsCUDAKernel + <<>>( + N, + output_grad_ptr, + added_output_ptr, + gamma_ptr, + static_cast(m->ds_ptr), + static_cast(m->db_ptr)); + const int64_t B = (M + kCUDANumThreads - 1) / kCUDANumThreads; + ComputeGradientFusedParamsCUDAKernel + <<>>(M, + N, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + static_cast(m->ds_ptr), + static_cast(m->db_ptr), + static_cast(m->scale_ptr), + static_cast(m->bias_ptr)); + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); + + if (gamma_grad_ptr != NULL || beta_grad_ptr != NULL) { + if (M < 512) { + // For small batch size, do colwise reduce directly + const int64_t B = (N + kCUDANumThreads - 1) / kCUDANumThreads; + 
GammaBetaBackwardSimpleCUDAKernel + <<>>(M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } else { + const int64_t B = + (N + kColwiseReduceTileSize - 1) / kColwiseReduceTileSize; + constexpr int kThreadX = kColwiseReduceTileSize; + constexpr int kThreadY = kColwiseReduceTileSize / 2; + GammaBetaBackwardCUDAKernel + <<>>( + M, + N, + output_grad_ptr, + added_output_ptr, + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_grad_ptr, + beta_grad_ptr); + } + } +} + +/*static*/ +void ResidualLayerNorm::backward_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &added_output, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma, + GenericTensorAccessorW const &gamma_grad, + GenericTensorAccessorW const &beta_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + backward_kernel( + m, + output_grad.get_float_ptr(), + added_output.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_float_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? beta_grad.get_float_ptr() + : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + backward_kernel( + m, + output_grad.get_half_ptr(), + added_output.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma.get_half_ptr() : nullptr, + m->elementwise_affine ? gamma_grad.get_half_ptr() : nullptr, + (m->elementwise_affine && m->use_bias) ? 
beta_grad.get_half_ptr() + : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] backward time (CF) = %.2fms\n", elapsed); + } +} + +/*static*/ +template +void peft_bwd_kernel(ResidualLayerNormMeta const *m, + T const *output_grad_ptr, + T *input_grad_ptr, + T *residual1_grad_ptr, + T *residual2_grad_ptr, + T const *gamma_ptr, + cudaStream_t stream) { + const int64_t M = m->effective_batch_size; + const int64_t N = m->effective_num_elements; + + if (m->inference_debugging) { + // save stuff here + std::string op_name_without_uid = + ResidualLayerNorm::get_op_name_without_uid(m); + char const *folder_path = "./inference_tensors/"; + std::string base_filepath = std::string(folder_path); + if (m->layer_guid.model_id > 0) { + base_filepath += "model_" + std::to_string(m->layer_guid.model_id) + "_"; + } + base_filepath += "bwd_step_" + std::to_string(m->bwd_step); + base_filepath += "_layers_" + + std::to_string(m->layer_guid.transformer_layer_id) + "_" + + op_name_without_uid + "_shard_" + std::to_string(0); + + std::string filename1 = base_filepath + "_mean"; + save_tensor(static_cast(m->mean_ptr), + m->effective_batch_size, + filename1.c_str()); + std::string filename2 = base_filepath + "_rstd"; + save_tensor(static_cast(m->rstd_ptr), + m->effective_batch_size, + filename2.c_str()); + std::string filename3 = base_filepath + "_input_activation"; + save_tensor(static_cast(m->input_activation), + m->effective_batch_size * m->effective_num_elements, + filename3.c_str()); + } + + int const warp_size = C10_WARP_SIZE; + int const num_threads = 128; + const dim3 blocks(M); + int nshared = (num_threads / warp_size) * sizeof(T); + + layer_norm_grad_input_kernel<<>>( + output_grad_ptr, + static_cast(m->input_activation), + static_cast(m->mean_ptr), + static_cast(m->rstd_ptr), + gamma_ptr, + input_grad_ptr, + residual1_grad_ptr, + residual2_grad_ptr, + m->reset_input_grads[0], + m->reset_input_grads[1], + m->reset_input_grads[2], + N); +} + +/*static*/ +void ResidualLayerNorm::peft_bwd_kernel_wrapper( + ResidualLayerNormMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorW const &residual1_grad, + GenericTensorAccessorW const &residual2_grad, + GenericTensorAccessorR const &gamma) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + if (m->output_type[0] == DT_FLOAT) { + peft_bwd_kernel(m, + output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + residual1_grad.get_float_ptr(), + m->use_two_residuals ? residual2_grad.get_float_ptr() + : nullptr, + m->elementwise_affine ? gamma.get_float_ptr() : nullptr, + stream); + } else if (m->output_type[0] == DT_HALF) { + peft_bwd_kernel(m, + output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + residual1_grad.get_half_ptr(), + m->use_two_residuals ? residual2_grad.get_half_ptr() + : nullptr, + m->elementwise_affine ? 
gamma.get_half_ptr() : nullptr, + stream); + } else { + assert(false && "Unsupported data type"); + } + + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[ResidualLayerNorm] peft_bwd time (CF) = %.2fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/residual_rms_norm.cc b/src/ops/residual_rms_norm.cc index b3ee7179d0..744902f908 100644 --- a/src/ops/residual_rms_norm.cc +++ b/src/ops/residual_rms_norm.cc @@ -42,7 +42,8 @@ using namespace FlexFlow::Kernels::ResidualRMSNorm; bool operator==(ResidualRMSNormParams const &lhs, ResidualRMSNormParams const &rhs) { - return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps; + return lhs.layer_guid == rhs.layer_guid && lhs.eps == rhs.eps && + lhs.dim == rhs.dim && lhs.inplace_residual == rhs.inplace_residual; } bool ResidualRMSNormParams::is_valid( @@ -55,7 +56,8 @@ ResidualRMSNormParams ResidualRMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; - if (this->name != nullptr) { + params.inplace_residual = this->inplace_residual; + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -66,6 +68,7 @@ void FFModel::residual_rms_norm(const Tensor input1, Tensor *outputs, float eps, int dim, + bool inplace_residual, DataType data_type, char const *name) { if (data_type == DT_NONE) { @@ -90,9 +93,9 @@ void FFModel::residual_rms_norm(const Tensor input1, casted_input2); rm->outputs[0] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 0, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 0, true /*create_grad*/); rm->outputs[1] = create_tensor_legion_ordering( - input1->num_dims, input1->dims, data_type, rm, 1, false /*create_grad*/); + input1->num_dims, input1->dims, data_type, rm, 1, true /*create_grad*/); // weights int weight_dims[1] = {dim}; @@ -100,12 +103,13 @@ void FFModel::residual_rms_norm(const Tensor input1, weight_dims, data_type, rm, - true /*create_grad*/, + false /*create_grad*/, nullptr, CHOSEN_SYNC_TYPE); rm->add_float_property("eps", eps); rm->add_int_property("dim", dim); + rm->add_int_property("inplace_residual", inplace_residual); layers.push_back(rm); outputs[0] = rm->outputs[0]; outputs[1] = rm->outputs[1]; @@ -120,6 +124,8 @@ Op *ResidualRMSNorm::create_operator_from_layer( long long value; layer->get_int_property("dim", value); int dim = value; + layer->get_int_property("inplace_residual", value); + bool inplace_residual = (bool)value; return new ResidualRMSNorm(model, layer->layer_guid, @@ -127,6 +133,7 @@ Op *ResidualRMSNorm::create_operator_from_layer( inputs[1], eps, dim, + inplace_residual, false, layer->name); } @@ -143,6 +150,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, params.eps, params.dim, + params.inplace_residual, allocate_weights, params.name) {} @@ -157,6 +165,7 @@ ResidualRMSNorm::ResidualRMSNorm( inputs.second, other.eps, other.dim, + other.inplace_residual, allocate_weights, other.name) {} ResidualRMSNorm::ResidualRMSNorm(FFModel &model, @@ -165,6 +174,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, const ParallelTensor _input2, float _eps, int dim, + bool _inplace_residual, bool allocate_weights, char const *name) : Op(model, @@ -177,6 +187,7 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, _input1, _input2) { eps = _eps; + 
inplace_residual = _inplace_residual; inputs[0] = _input1; inputs[1] = _input2; layer_guid = _layer_guid; @@ -234,6 +245,22 @@ ResidualRMSNorm::ResidualRMSNorm(FFModel &model, } } +void ResidualRMSNorm::map_output_tensors(FFModel &ff) { + assert(numOutputs == 2); + assert(outputs[0]->get_volume() == inputs[0]->get_volume()); + if (inplace_residual) { + outputs[0]->parallel_is = inputs[0]->parallel_is; + outputs[0]->region = inputs[0]->region; + outputs[0]->part = inputs[0]->part; + outputs[0]->region_grad = inputs[0]->region_grad; + outputs[0]->part_grad = inputs[0]->part_grad; + // map output 1 to new region + ff.map_tensor(outputs[1], this); + } else { + Op::map_output_tensors(ff); + } +} + void ResidualRMSNorm::init(FFModel const &ff) { assert(check_output_input_weight_same_parallel_is()); parallel_is = outputs[0]->parallel_is; @@ -249,36 +276,44 @@ void ResidualRMSNorm::init(FFModel const &ff) { false /*must*/, 0 /*mapper_id*/, outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(outputs[0]->part == inputs[0]->part); + assert(outputs[0]->region == inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? READ_WRITE : READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap(ff, fm); @@ -306,36 +341,45 @@ void ResidualRMSNorm::init_inference( false /*must*/, 0 /*mapper_id*/, machine_view_hash); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); FutureMap fm = runtime->execute_index_space(ctx, launcher); fm.wait_all_results(); set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); @@ -383,73 +427,131 @@ FutureMap 0 /*mapper_id*/, machine_view_hash); launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); + if (inplace_residual) { + assert(batch_outputs[0]->part == batch_inputs[0]->part); + assert(batch_outputs[0]->region == batch_inputs[0]->region); + } + int fid = 0; + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + inplace_residual ? 
READ_WRITE : READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(batch_inputs[1]->part, 0 /*projection id*/, READ_ONLY, EXCLUSIVE, batch_inputs[1]->region)); - launcher.add_field(1, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(2, FID_DATA); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(fid++, FID_DATA); + } launcher.add_region_requirement(RegionRequirement(batch_outputs[1]->part, 0 /*projection id*/, WRITE_ONLY, EXCLUSIVE, batch_outputs[1]->region)); - launcher.add_field(3, FID_DATA); + launcher.add_field(fid++, FID_DATA); launcher.add_region_requirement(RegionRequirement(weights[0]->part, 0 /*projection id*/, - READ_WRITE, + READ_ONLY, EXCLUSIVE, weights[0]->region)); - launcher.add_field(4, FID_DATA); + launcher.add_field(fid++, FID_DATA); return runtime->execute_index_space(ctx, launcher); } /* - regions[0](I): input1 + regions[0](I/O): input1 / residual output regions[1](I): input2 - regions[2](O): residual output - regions[3](O): output - regions[4](I/O): weight + regions[2](O): output + regions[3](I): weight */ void ResidualRMSNorm::inference_task(Task const *task, std::vector const ®ions, Context ctx, Runtime *runtime) { - assert(task->regions.size() == 5); - assert(regions.size() == 5); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; } ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + assert(task->regions.size() == 5 - m->inplace_residual); + assert(regions.size() == 5 - m->inplace_residual); GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( m->input_type[1], regions[1], task->regions[1], FID_DATA, ctx, runtime); - GenericTensorAccessorW residual_output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[1], regions[3], task->regions[3], FID_DATA, ctx, runtime); - GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( - m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input1, input2, weight, residual_output, output); + + GenericTensorAccessorW residual_output, output; + GenericTensorAccessorR weight; + if (m->inplace_residual) { + // residual_output is mapped to the same region as the input + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + } else { + residual_output = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + output = helperGetGenericTensorAccessorWO(m->output_type[1], + regions[3], + task->regions[3], + 
FID_DATA, + ctx, + runtime); + weight = helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[4], + task->regions[4], + FID_DATA, + ctx, + runtime); + } + + inference_kernel_wrapper( + m, bc, input1, input2, weight, residual_output, output); + if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; - ResidualRMSNorm::save_inference_tensors_to_file( - m, shard_id, bc, {input1, input2}, {weight}, {residual_output, output}); + if (m->inplace_residual) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input2}, {weight}, {residual_output, output}); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input1, input2}, + {weight}, + {residual_output, output}); + } } } @@ -459,6 +561,7 @@ void ResidualRMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.model_id); sez.serialize(this->eps); sez.serialize(this->dim); + sez.serialize(this->inplace_residual); sez.serialize(strlen(this->name)); sez.serialize(this->name, strlen(this->name)); } @@ -479,6 +582,8 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, LayerID layer_guid(id, transformer_layer_id, deserialized_model_id); dez.deserialize(eps); dez.deserialize(dim); + int inplace_residual; + dez.deserialize(inplace_residual); size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); @@ -487,13 +592,285 @@ Node ResidualRMSNorm::deserialize(FFModel &ff, params.layer_guid = layer_guid; params.eps = eps; params.dim = dim; + params.inplace_residual = inplace_residual; strcpy(params.name, name); return ff.get_or_create_node({inputs[0], inputs[1]}, params); } void ResidualRMSNorm::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RESIDUAL_RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): RMS output_grad + launcher.add_region_requirement(RegionRequirement(outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[1]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): residual output / RMS input + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I/O): residual input grad 0 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + // regions[3](I/O): residual input grad 1 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(3, FID_DATA); + // regions[4](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(4, FID_DATA); + // regions[5](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(5, FID_DATA); + + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): RMS output_grad + regions[1](I): 
Residual output / RMS input + regions[2](I/O): Residual input 0 grad + regions[3](I/O): Residual input 1 grad + regions[4](I): weight + regions[5](I/O): weight_grad +*/ +void ResidualRMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 6); + assert(regions.size() == 6); + ResidualRMSNormMeta const *m = *((ResidualRMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW residual_output_rms_input = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[1], + task->regions[1], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_input0_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW residual_input1_grad = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[3], + task->regions[3], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[4], task->regions[4], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], regions[5], task->regions[5], FID_DATA, ctx, runtime); + backward_kernel_wrapper(m, + output_grad, + residual_output_rms_input, + residual_input0_grad, + residual_input1_grad, + weight, + weight_grad); } + +Legion::FutureMap + ResidualRMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + int fid = 0; + // residual input grad 0 + launcher.add_region_requirement(RegionRequirement( + batch_inputs[0]->part_grad, + 0 /*projection id*/, + inplace_residual && !reset_input_grads[0] ? READ_WRITE : WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // residual input grad 1 + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + if (!inplace_residual && !reset_input_grads[0]) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(fid++, FID_DATA); + } + // RMS output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[1]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[1]->region_grad)); + launcher.add_field(fid++, FID_DATA); + // gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(fid++, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): RMS output_grad + regions[1](I/O): Residual input 0 grad + regions[2](I/O): Residual input 1 grad + regions[3](I): weight +*/ +void ResidualRMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ResidualRMSNormMeta *m = *((ResidualRMSNormMeta **)task->local_args); + int expected_regions = + (m->inplace_residual || m->reset_input_grads[0]) ? 4 : 5; + assert(task->regions.size() == expected_regions); + assert(regions.size() == expected_regions); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + int rid = 0, t_rid = 0; + GenericTensorAccessorW input_grad_0 = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorW input_grad_1 = + helperGetGenericTensorAccessorRW(m->input_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + + GenericTensorAccessorR output_grad_0; + if (!m->reset_input_grads[0]) { + if (m->inplace_residual) { + // mapped to input 0 + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[0], + task->regions[0], + FID_DATA, + ctx, + runtime); + } else { + output_grad_0 = helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + } + } + GenericTensorAccessorR output_grad_1 = + helperGetGenericTensorAccessorRO(m->output_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + GenericTensorAccessorR weight = + helperGetGenericTensorAccessorRO(m->weight_type[0], + regions[rid++], + task->regions[t_rid++], + FID_DATA, + ctx, + runtime); + + peft_bwd_kernel_wrapper( + m, bc, output_grad_0, output_grad_1, input_grad_0, input_grad_1, weight); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + if (!m->reset_input_grads[0]) { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_0, output_grad_1}, + false); + } else { + ResidualRMSNorm::save_inference_tensors_to_file( + m, + shard_id, + bc, + {input_grad_0, input_grad_1}, + {weight}, + {output_grad_1}, + false); + } + } +} + Op *ResidualRMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { @@ -516,6 +893,7 @@ size_t hash::operator()( hash_combine(key, params.eps); hash_combine(key, params.layer_guid.id); hash_combine(key, params.dim); + hash_combine(key, params.inplace_residual); return key; } }; // namespace std diff 
--git a/src/ops/rms_norm.cc b/src/ops/rms_norm.cc index 79dce65c57..8dadd7dcc3 100644 --- a/src/ops/rms_norm.cc +++ b/src/ops/rms_norm.cc @@ -53,7 +53,7 @@ RMSNormParams RMSNorm::get_params() const { params.layer_guid = this->layer_guid; params.eps = this->eps; params.dim = this->dim; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -422,7 +422,7 @@ void RMSNorm::inference_task(Task const *task, m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); - forward_kernel_wrapper(m, input, weight, output); + inference_kernel_wrapper(m, bc, input, weight, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -431,6 +431,166 @@ void RMSNorm::inference_task(Task const *task, } } +void RMSNorm::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(RMSNORM_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // regions[0](I): output_grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I): input + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // regions[2](I/O): input_grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + // regions[3](I): gamma + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(3, FID_DATA); + // regions[4](I/O): gamma_grad + launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + weights[0]->region_grad)); + launcher.add_field(4, FID_DATA); + + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I): input + regions[2](I/O): input_grad + regions[3](I): weight + regions[4](I/O): weight_grad +*/ +void RMSNorm::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 5); + assert(regions.size() == 5); + RMSNormMeta const *m = *((RMSNormMeta **)task->local_args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW weight_grad = helperGetGenericTensorAccessorRW( + m->weight_type[0], 
regions[4], task->regions[4], FID_DATA, ctx, runtime); + backward_kernel_wrapper( + m, output_grad, input, input_grad, weight, weight_grad); +} + +Legion::FutureMap + RMSNorm::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(RMSNORM_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // regions[0](I): output_grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // regions[1](I/O): input_grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + // regions[2](I): weight + launcher.add_region_requirement(RegionRequirement(weights[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + weights[0]->region)); + launcher.add_field(2, FID_DATA); + + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output_grad + regions[1](I/O): input_grad + regions[2](I): weight +*/ +void RMSNorm::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == 3); + assert(regions.size() == 3); + RMSNormMeta *m = *((RMSNormMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorR weight = helperGetGenericTensorAccessorRO( + m->weight_type[0], regions[2], task->regions[2], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, bc, output_grad, input_grad, weight); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + RMSNorm::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {weight}, {output_grad}, false); + } +} + void RMSNorm::serialize(Legion::Serializer &sez) const { sez.serialize(this->layer_guid.id); sez.serialize(this->layer_guid.transformer_layer_id); @@ -474,11 +634,9 @@ Op *RMSNorm::materialize(FFModel &ff, ParallelTensor inputs[], int num_inputs) const { RMSNormParams params = get_params(); - return new RMSNorm(ff, params, inputs[0], true, this->name); + return new RMSNorm(ff, params, inputs[0], true, params.name); } -void RMSNorm::backward(FFModel const &ff) {} - bool RMSNorm::measure_operator_cost(Simulator *sim, MachineView const &mv, CostMetrics &cost_metrics) const { diff --git a/src/ops/sampling.cc b/src/ops/sampling.cc index b38c68843b..0358a2cd31 100644 --- a/src/ops/sampling.cc +++ 
b/src/ops/sampling.cc @@ -88,7 +88,7 @@ Op *Sampling::create_operator_from_layer( SamplingParams Sampling::get_params() const { SamplingParams params; params.top_p = this->top_p; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -302,7 +302,7 @@ InferenceResult GenericTensorAccessorW indices = helperGetGenericTensorAccessorWO( DT_INT32, regions[1], task->regions[1], FID_DATA, ctx, runtime); - int batch_size = bc->num_active_tokens(); + int batch_size = bc->num_active_infr_tokens(); Sampling::forward_kernel_wrapper(m, input, indices, batch_size); if (m->inference_debugging) { @@ -313,7 +313,7 @@ InferenceResult } InferenceResult ir; - download_tensor( + copy_tensor_dev_to_host( indices.get_int32_ptr(), ir.token_ids, batch_size); return ir; } diff --git a/src/ops/sigmoid_silu_multi.cc b/src/ops/sigmoid_silu_multi.cc index 3d1c8d9094..e7c2fea19c 100644 --- a/src/ops/sigmoid_silu_multi.cc +++ b/src/ops/sigmoid_silu_multi.cc @@ -52,7 +52,7 @@ bool SigmoidSiluMultiParams::is_valid( SigmoidSiluMultiParams SigmoidSiluMulti::get_params() const { SigmoidSiluMultiParams params; params.layer_guid = this->layer_guid; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -254,7 +254,188 @@ void SigmoidSiluMulti::forward(FFModel const &ff) { } void SigmoidSiluMulti::backward(FFModel const &ff) { - assert(false); + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + set_argumentmap_for_backward(ff, argmap); + IndexLauncher launcher(SIGMOID_SILU_MULTI_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + // output grad + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // input 1 + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(1, FID_DATA); + // input 2 + launcher.add_region_requirement(RegionRequirement(inputs[1]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[1]->region)); + launcher.add_field(2, FID_DATA); + // input 1 grad + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(3, FID_DATA); + // input 2 grad + launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[1]->region_grad)); + launcher.add_field(4, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[1](I): input 1 + regions[2](I): input 2 + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 5); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR input1 = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[1], task->regions[1], 
FID_DATA, ctx, runtime); + GenericTensorAccessorR input2 = helperGetGenericTensorAccessorRO( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[3], task->regions[3], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[4], task->regions[4], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::backward_kernel_wrapper( + m, output_grad, input1, input2, input1_grad, input2_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file( + m, + shard_id, + nullptr, + {output_grad, input1, input2}, + {}, + {input1_grad, input2_grad}); + } +} + +FutureMap + SigmoidSiluMulti::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + // output grad + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + // input 1 grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + // input 2 grad + launcher.add_region_requirement( + RegionRequirement(batch_inputs[1]->part_grad, + 0 /*projection id*/, + reset_input_grads[1] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[1]->region_grad)); + launcher.add_field(2, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/* + regions[0](I): output grad + regions[3](I/O): input 1 grad + regions[4](I/O): input 2 grad +*/ +void SigmoidSiluMulti::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + + assert(task->regions.size() == regions.size()); + assert(regions.size() == 3); + + SigmoidSiluMultiMeta *m = *((SigmoidSiluMultiMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input1_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW input2_grad = helperGetGenericTensorAccessorRW( + m->input_type[1], regions[2], task->regions[2], FID_DATA, ctx, runtime); + + SigmoidSiluMulti::peft_bwd_kernel_wrapper( + m, bc, output_grad, input1_grad, input2_grad); + + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + SigmoidSiluMulti::save_inference_tensors_to_file(m, + shard_id, + nullptr, + {input1_grad, input2_grad}, + {}, + {output_grad}, + false); + } } FutureMap SigmoidSiluMulti::inference( @@ -347,7 +528,7 @@ void SigmoidSiluMulti::inference_task( assert(input1_domain == input2_domain); assert(input1_domain == output_domain); - SigmoidSiluMulti::inference_kernel_wrapper(m, input1, input2, output); + SigmoidSiluMulti::inference_kernel_wrapper(m, bc, input1, input2, output); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; diff --git a/src/ops/sigmoid_silu_multi.cpp b/src/ops/sigmoid_silu_multi.cpp index 7b7f30a288..ceaa1a7788 100644 --- a/src/ops/sigmoid_silu_multi.cpp +++ b/src/ops/sigmoid_silu_multi.cpp @@ -23,7 +23,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } @@ -34,36 +34,56 @@ SigmoidSiluMultiMeta::~SigmoidSiluMultiMeta(void) { } } -__device__ __forceinline__ float sigmoid_float(float x) { - return 1.0 / (1.0 + expf(-x)); -} - -__device__ __forceinline__ half sigmoid_half(half x) { - return (half)1.0 / ((half)1.0 + hexp(-x)); -} - -__global__ void SigmoidSiluMultiKernelFloat(int num_elements, - float const *input1_ptr, - float const *input2_ptr, - float *output_ptr) { +template +__global__ void SigmoidSiluMultiKernel(int num_elements, + T const *input1_ptr, + T const *input2_ptr, + T *output_ptr) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = - input1_ptr[i] * sigmoid_float(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + output_ptr[i] = input1_ptr[i] * T(sigmoid_val) * input2_ptr[i]; } } -__global__ void SigmoidSiluMultiKernelHalf(int num_elements, - half const *input1_ptr, - half const *input2_ptr, - half *output_ptr) { +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T 
*input1_grad_ptr, + T *input2_grad_ptr, + bool reset_input_grad1, + bool reset_input_grad2) { CUDA_KERNEL_LOOP(i, num_elements) { - output_ptr[i] = input1_ptr[i] * sigmoid_half(input1_ptr[i]) * input2_ptr[i]; + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + if (reset_input_grad2) { + input2_grad_ptr[i] = + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } else { + input2_grad_ptr[i] += + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + if (reset_input_grad1) { + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + } else { + input1_grad_ptr[i] += ss_grad_val * T(sigmoid_val); + } + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = x1_grad_val * sigmoid_val * (1.0f - sigmoid_val); + input1_grad_ptr[i] += T(x1_grad_val); } } /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( - SigmoidSiluMultiMeta const *m, + SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -81,8 +101,84 @@ void SigmoidSiluMulti::inference_kernel_wrapper( checkCUDA(hipEventRecord(t_start, stream)); } + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + size_t activation_size_needed = + 2 * data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(hipMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + 
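// A quick sanity check on SigmoidSiluMultiBackwardKernel above: with
// silu(x) = x * sigmoid(x) and out = silu(x1) * x2, the gradients it
// accumulates are
//   d out / d x2 = x1 * sigmoid(x1)
//   d out / d x1 = x2 * (sigmoid(x1) + x1 * sigmoid(x1) * (1 - sigmoid(x1)))
// which is what the ss_grad_val / sig_grad split in the kernel computes.
// A scalar reference version, illustrative only (names invented here; expf is
// from <cmath>):
inline void swiglu_grad_ref(float g, float x1, float x2, float &dx1, float &dx2) {
  float s = 1.0f / (1.0f + expf(-x1));
  dx2 += g * x1 * s;                          // gradient through the gating input
  dx1 += g * x2 * (s + x1 * s * (1.0f - s));  // gradient through silu(x1)
}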
checkCUDA(hipMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + hipMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelFloat), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -92,7 +188,7 @@ void SigmoidSiluMulti::inference_kernel_wrapper( input2.get_float_ptr(), output.get_float_ptr()); } else if (m->input_type[0] == DT_HALF) { - hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernelHalf), + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiKernel), GET_BLOCKS(num_elements), min(CUDA_NUM_THREADS, num_elements), 0, @@ -116,4 +212,159 @@ void SigmoidSiluMulti::inference_kernel_wrapper( } } +/*static*/ +void SigmoidSiluMulti::backward_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorR const &input1, + GenericTensorAccessorR const &input2, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + int num_elements = output_grad.domain.get_volume(); + assert(input1.domain.get_volume() == num_elements); + assert(input2.domain.get_volume() == num_elements); + assert(input1_grad.domain.get_volume() == num_elements); + assert(input2_grad.domain.get_volume() == num_elements); + + hipEvent_t t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + assert(input1_grad.domain.get_volume() == output_grad.domain.get_volume()); + assert(input2_grad.domain.get_volume() == input1_grad.domain.get_volume()); + + hipEvent_t 
t_start, t_end; + if (m->profiling) { + checkCUDA(hipEventCreate(&t_start)); + checkCUDA(hipEventCreate(&t_end)); + checkCUDA(hipEventRecord(t_start, stream)); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_elements = in_dim * num_peft_tokens; + + if (m->input_type[0] == DT_FLOAT) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + num_elements, + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + hipLaunchKernelGGL(HIP_KERNEL_NAME(SigmoidSiluMultiBackwardKernel), + GET_BLOCKS(num_elements), + min(CUDA_NUM_THREADS, num_elements), + 0, + stream, + num_elements, + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + checkCUDA(hipEventRecord(t_end, stream)); + checkCUDA(hipEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(hipEventElapsedTime(&elapsed, t_start, t_end)); + checkCUDA(hipEventDestroy(t_start)); + checkCUDA(hipEventDestroy(t_end)); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/sigmoid_silu_multi.cu b/src/ops/sigmoid_silu_multi.cu index 590b641b5a..929d557a17 100644 --- a/src/ops/sigmoid_silu_multi.cu +++ b/src/ops/sigmoid_silu_multi.cu @@ -22,7 +22,7 @@ namespace FlexFlow { SigmoidSiluMultiMeta::SigmoidSiluMultiMeta(FFHandler handle, SigmoidSiluMulti const *ssm, MemoryAllocator &gpu_mem_allocator) - : OpMeta(handle) { + : OpMeta(handle, ssm) { profiling = ssm->profiling; inference_debugging = ssm->inference_debugging; } @@ -45,9 +45,44 @@ __global__ void SigmoidSiluMultiKernel(int num_elements, } } +template +__global__ void SigmoidSiluMultiBackwardKernel(int num_elements, + T const *output_grad_ptr, + T const *input1_ptr, + T const *input2_ptr, + T *input1_grad_ptr, + T *input2_grad_ptr, + bool reset_input_grad1, + bool reset_input_grad2) { + CUDA_KERNEL_LOOP(i, num_elements) { + float sigmoid_val = static_cast(input1_ptr[i]); + sigmoid_val = 1.0f / (1.0f + exp(-sigmoid_val)); + + if (reset_input_grad2) { + input2_grad_ptr[i] = + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } else { + input2_grad_ptr[i] += + output_grad_ptr[i] * (input1_ptr[i] * T(sigmoid_val)); + } + T ss_grad_val = output_grad_ptr[i] * input2_ptr[i]; + if (reset_input_grad1) { + input1_grad_ptr[i] = ss_grad_val * T(sigmoid_val); + } else { + input1_grad_ptr[i] += ss_grad_val * 
T(sigmoid_val); + } + T sig_grad = ss_grad_val * input1_ptr[i]; + + float x1_grad_val = static_cast(sig_grad); + x1_grad_val = x1_grad_val * sigmoid_val * (1.0f - sigmoid_val); + input1_grad_ptr[i] += T(x1_grad_val); + } +} + /*static*/ void SigmoidSiluMulti::inference_kernel_wrapper( - SigmoidSiluMultiMeta const *m, + SigmoidSiluMultiMeta *m, + BatchConfig const *bc, GenericTensorAccessorR const &input1, GenericTensorAccessorR const &input2, GenericTensorAccessorW const &output) { @@ -64,6 +99,83 @@ void SigmoidSiluMulti::inference_kernel_wrapper( cudaEventCreate(&t_end); cudaEventRecord(t_start, stream); } + + // save input activation if needed for PEFT + if (bc->num_active_peft_tokens() > 0) { + // Check that we have at most one request that requires peft_bwd + int num_peft_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + } + } + assert(num_peft_requests <= 1); + + int tokens_previous_requests = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + // Skip non-PEFT requests + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + // FIXME: use the new approach to computing token offset + tokens_previous_requests += bc->requestsInfo[i].num_tokens_in_batch; + continue; + } + int num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + int max_peft_tokens = bc->requestsInfo[i].max_sequence_length; + int in_dim = input1.domain.hi()[0] - input1.domain.lo()[0] + 1; + if (bc->requestsInfo[i].peft_bwd) { + size_t input_tensor_size = + data_type_size(m->input_type[0]) * num_peft_tokens * in_dim; + size_t activation_size_needed = + 2 * data_type_size(m->input_type[0]) * max_peft_tokens * in_dim; + if (activation_size_needed > m->allocated_peft_buffer_size) { + MemoryAllocator *allocator = m->handle.peft_activation_allocator; + m->input_activation = + allocator->allocate_instance_untyped(activation_size_needed); + m->allocated_peft_buffer_size = activation_size_needed; + } + // copy input activation + if (m->input_type[0] == DT_FLOAT) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_float_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_float_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + } else if (m->input_type[0] == DT_HALF) { + checkCUDA(cudaMemcpyAsync(m->input_activation, + input1.get_half_ptr() + + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + checkCUDA(cudaMemcpyAsync( + (void *)((char *)m->input_activation + input_tensor_size), + input2.get_half_ptr() + tokens_previous_requests * in_dim, + input_tensor_size, + cudaMemcpyDeviceToDevice, + stream)); + } else { + assert(false && "unsupport datatype in layernorm"); + } + } + } + } + if (m->input_type[0] == DT_FLOAT) { SigmoidSiluMultiKernel<<profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_float_ptr(), + input1.get_float_ptr(), + input2.get_float_ptr(), + input1_grad.get_float_ptr(), + 
input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>(output_grad.domain.get_volume(), + output_grad.get_half_ptr(), + input1.get_half_ptr(), + input2.get_half_ptr(), + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] backward time (CF) = %.9fms\n", elapsed); + } +} + +/*static*/ +void SigmoidSiluMulti::peft_bwd_kernel_wrapper( + SigmoidSiluMultiMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &output_grad, + GenericTensorAccessorW const &input1_grad, + GenericTensorAccessorW const &input2_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + + assert(input1_grad.domain.get_volume() == output_grad.domain.get_volume()); + assert(input2_grad.domain.get_volume() == input1_grad.domain.get_volume()); + + cudaEvent_t t_start, t_end; + if (m->profiling) { + cudaEventCreate(&t_start); + cudaEventCreate(&t_end); + cudaEventRecord(t_start, stream); + } + + int num_peft_requests = 0; + int num_peft_tokens = 0; + for (int i = 0; i < bc->max_requests_per_batch(); i++) { + if (bc->request_completed[i]) { + continue; + } + if (bc->requestsInfo[i].peft_model_id == PEFTModelID::NO_ID) { + continue; + } + if (bc->requestsInfo[i].peft_bwd) { + num_peft_requests++; + num_peft_tokens = bc->requestsInfo[i].num_tokens_in_batch; + } + } + if (num_peft_requests == 0) { + // No PEFT requests + return; + } else { + // Otherwise assume at most 1 peft request + assert(num_peft_requests == 1); + assert(num_peft_tokens >= 1); + } + int in_dim = output_grad.domain.hi()[0] - output_grad.domain.lo()[0] + 1; + int num_elements = in_dim * num_peft_tokens; + + if (m->input_type[0] == DT_FLOAT) { + SigmoidSiluMultiBackwardKernel<<>>( + num_elements, + output_grad.get_float_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_float_ptr(), + input2_grad.get_float_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else if (m->input_type[0] == DT_HALF) { + SigmoidSiluMultiBackwardKernel<<>>( + num_elements, + output_grad.get_half_ptr(), + static_cast(m->input_activation), + static_cast(m->input_activation) + + num_peft_tokens * in_dim, + input1_grad.get_half_ptr(), + input2_grad.get_half_ptr(), + m->reset_input_grads[0], + m->reset_input_grads[1]); + } else { + assert(false && "unsupport datatype in SigmoidSiluMulti"); + } + if (m->profiling) { + cudaEventRecord(t_end, stream); + checkCUDA(cudaEventSynchronize(t_end)); + float elapsed = 0; + checkCUDA(cudaEventElapsedTime(&elapsed, t_start, t_end)); + cudaEventDestroy(t_start); + cudaEventDestroy(t_end); + printf("[SigmoidSiluMulti] peft_bwd time (CF) = %.9fms\n", elapsed); + } +} + }; // namespace FlexFlow diff --git a/src/ops/softmax.cc b/src/ops/softmax.cc index 03618423be..a02d88b98b 100644 --- a/src/ops/softmax.cc +++ b/src/ops/softmax.cc @@ -86,7 +86,7 @@ SoftmaxParams Softmax::get_params() const { SoftmaxParams params; params.layer_guid = this->layer_guid; params.dim = this->dim; - if (this->name != nullptr) { + if (strlen(this->name) < 
MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -270,52 +270,12 @@ OpMeta *Softmax::init_task(Task const *task, domain = input_domain; } SoftmaxMeta *m = new SoftmaxMeta(handle, softmax, domain); - m->input_type = softmax->inputs[0]->data_type; - m->output_type = softmax->outputs[0]->data_type; // checkCUDNN(cudnnCreateTensorDescriptor(&m->outputTensor)); std::strcpy(m->op_name, softmax->name); m->layer_guid = softmax->layer_guid; return m; } -FutureMap Softmax::inference(FFModel const &ff, - BatchConfigFuture const &bc, - std::vector const &batch_inputs, - std::vector const &batch_outputs, - MachineView const *mv) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = batch_outputs[0]->parallel_is; - MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; - set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - size_t machine_view_hash = view->hash(); - /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv - << std::endl; */ - IndexLauncher launcher(SOFTMAX_INF_TASK_ID, - parallel_is, - TaskArgument(nullptr, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - machine_view_hash); - launcher.add_future(bc); - launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - batch_inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, - EXCLUSIVE, - batch_outputs[0]->region)); - launcher.add_field(1, FID_DATA); - return runtime->execute_index_space(ctx, launcher); -} - void Softmax::forward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -354,17 +314,11 @@ void Softmax::forward_task(Task const *task, ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - if (m->output_type == DT_HALF) { - forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); - } else if (m->output_type == DT_FLOAT) { - forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else { - assert(false && "Unsupported data type"); - } + forward_kernel_wrapper(m, input, output); } void Softmax::backward(FFModel const &ff) { @@ -402,52 +356,69 @@ void Softmax::backward_task(Task const *task, Domain in_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - switch (in_domain.get_dim()) { -#define DIMFUNC(DIM) \ - case DIM: \ - if (m->output_type == DT_HALF) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else if (m->output_type == DT_FLOAT) { \ - return backward_task_with_dim(task, regions, ctx, runtime); \ - } else { \ - assert(false && "Unsupported data type"); \ - } - LEGION_FOREACH_N(DIMFUNC) -#undef DIMFUNC - default: - assert(false); - } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], 
FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + backward_kernel_wrapper(m, input_grad, output_grad); } -/* - regions[0](I/O): input_grad - regions[1](I): output_grad -*/ -// Note that the backward task of softmax is actually a no op (i.e., input_grad -// = output_grad) since the upstream cross_entropy_loss function computes -// performs softmax_cross_entropy_loss to avoid intermediate zeros -template -void Softmax::backward_task_with_dim(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - // const Softmax* softmax = (Softmax*) task->args; - SoftmaxMeta const *m = *((SoftmaxMeta **)task->local_args); - TensorAccessorW acc_input_grad(regions[0], - task->regions[0], - FID_DATA, - ctx, - runtime, - true /*readOutput*/); - TensorAccessorR acc_output_grad( - regions[1], task->regions[1], FID_DATA, ctx, runtime); - // make sure the image indices match! - assert(acc_input_grad.rect == acc_output_grad.rect); - - backward_kernel_wrapper( - m, acc_input_grad.ptr, acc_output_grad.ptr, acc_input_grad.rect.volume()); +FutureMap Softmax::inference(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_INF_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + // if this is the last operator, we add the region below in order to copy the + // output to the grad tensor + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + int last_op = ff.operators.size() - 1; + assert(ff.operators[last_op]->op_type == OP_ARGMAX || + ff.operators[last_op]->op_type == OP_ARG_TOPK || + ff.operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (ff.operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + if (ff.operators[last_op] == this) { + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(2, FID_DATA); + } + return runtime->execute_index_space(ctx, launcher); } void Softmax::inference_task(Task const *task, @@ -455,8 +426,8 @@ void Softmax::inference_task(Task const *task, Context ctx, Runtime *runtime) { assert(task->regions.size() == regions.size()); - assert(regions.size() == 2); - assert(task->regions.size() == 2); + assert(regions.size() == 3 || regions.size() == 2); + bool is_last_op = 
(regions.size() == 3); BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); if (bc->num_tokens == 0) { return; @@ -465,16 +436,19 @@ void Softmax::inference_task(Task const *task, ctx, task->regions[0].region.get_index_space()); SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->output_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); - if (m->output_type == DT_HALF) { - forward_kernel_wrapper(m, input.get_half_ptr(), output.get_half_ptr()); - } else if (m->output_type == DT_FLOAT) { - forward_kernel_wrapper(m, input.get_float_ptr(), output.get_float_ptr()); - } else { - assert(false && "Unsupported data type"); + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + GenericTensorAccessorW output_grad; + if (is_last_op) { + output_grad = helperGetGenericTensorAccessorWO(m->output_type[0], + regions[2], + task->regions[2], + FID_DATA, + ctx, + runtime); } + inference_kernel_wrapper(m, bc, is_last_op, input, output, output_grad); if (m->inference_debugging) { assert(task->index_point.get_dim() == 1); int shard_id = task->index_point.point_data[0]; @@ -483,6 +457,73 @@ void Softmax::inference_task(Task const *task, } } +FutureMap Softmax::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + MachineView const *view = mv ? mv : &batch_outputs[0]->machine_view; + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + /* std::cout << "Softmax op machine_view: " << *(MachineView const *)mv + << std::endl; */ + IndexLauncher launcher(SOFTMAX_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + reset_input_grads[0] ? 
WRITE_ONLY : READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +void Softmax::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(task->regions.size() == regions.size()); + assert(regions.size() == 2); + assert(task->regions.size() == 2); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + Domain in_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + SoftmaxMeta *m = *((SoftmaxMeta **)task->local_args); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + Softmax::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } +} + bool Softmax::get_int_parameter(PMParameter para, int *value) const { switch (para) { case PM_SOFTMAX_DIM: @@ -508,29 +549,35 @@ bool Softmax::measure_operator_cost(Simulator *sim, sim->free_all(); float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorR input_acc(DT_FLOAT, sub_input.get_domain(), input_ptr); assert(input_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorW output_acc( + DT_FLOAT, sub_output.get_domain(), output_ptr); assert(output_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); std::function forward, backward; - forward = [&] { forward_kernel_wrapper(m, input_ptr, output_ptr); }; + forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); }; if (sim->computationMode == COMP_MODE_TRAINING) { float *input_grad_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT); + GenericTensorAccessorW input_grad_acc( + DT_FLOAT, sub_input.get_domain(), input_grad_ptr); assert(input_grad_ptr != NULL); cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset); float *output_grad_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT); + GenericTensorAccessorW output_grad_acc( + DT_FLOAT, sub_output.get_domain(), output_grad_ptr); assert(output_grad_ptr != NULL); cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset); backward = [&] { - backward_kernel_wrapper( - m, input_grad_ptr, output_grad_ptr, sub_output.get_volume()); + backward_kernel_wrapper(m, input_grad_acc, output_grad_acc); }; } diff --git a/src/ops/spec_inc_multihead_self_attention.cc b/src/ops/spec_inc_multihead_self_attention.cc index 68d3a4c205..52da51fb26 100644 --- a/src/ops/spec_inc_multihead_self_attention.cc +++ b/src/ops/spec_inc_multihead_self_attention.cc @@ -850,7 +850,7 @@ SpecIncMultiHeadSelfAttentionParams params.scaling_factor = this->scaling_factor; 
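// Note on the name-copy guard a few lines below (repeated for the other operators in this patch):
// `name` here is a fixed-size character buffer rather than a heap pointer, so the old
// `this->name != nullptr` test was always true; the real precondition for the strcpy into
// params.name is that the string fits within MAX_OPNAME, which is what the strlen() check enforces.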
params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } diff --git a/src/ops/spec_inc_multihead_self_attention.cpp b/src/ops/spec_inc_multihead_self_attention.cpp index b1687d12a2..aebd5e8892 100644 --- a/src/ops/spec_inc_multihead_self_attention.cpp +++ b/src/ops/spec_inc_multihead_self_attention.cpp @@ -141,7 +141,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, hipStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; // printf("curr depth: %d\n", curr_depth); // assert(curr_depth < 3); @@ -200,15 +200,16 @@ void compute_attention_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int tokens_previous_requests = 0; int tokens_prev_requests_squares = 0; // int qkv_block_size = diff --git a/src/ops/spec_inc_multihead_self_attention.cu b/src/ops/spec_inc_multihead_self_attention.cu index a00ea9c95f..4688a8233c 100644 --- a/src/ops/spec_inc_multihead_self_attention.cu +++ b/src/ops/spec_inc_multihead_self_attention.cu @@ -361,7 +361,7 @@ template void update_kv_cache_kernel(SpecIncMultiHeadSelfAttentionMeta const *m, BeamSearchBatchConfig const *bc, cudaStream_t stream) { - int num_tokens = bc->num_active_tokens(); + int num_tokens = bc->num_active_infr_tokens(); int curr_depth = bc->beamRequestsInfo[0].current_depth; if (num_tokens > 0) { int parallelism = m->hidden_size * KV_WEIGHT_NUM * num_tokens; @@ -471,17 +471,18 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half 
precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int num_tokens = bc->num_active_tokens(); int tokens_previous_requests = 0; @@ -541,20 +542,9 @@ void compute_attention_kernel_prompt(SpecIncMultiHeadSelfAttentionMeta const *m, DT const *A = static_cast
<DT *>(m->devQKVProjArray) + bc->requestsInfo[i].first_token_offset_in_batch * m->qProjSize * m->num_q_heads * QKV_WEIGHT_NUM; - // To get B, skip over K entries from previous requests (all heads + - // padding) - - // print_tensor((float*)A, 32, "A"); DT const *B = static_cast
<DT *>(m->keyCache) + i * kt_req_block_size; + DT *C = static_cast<DT *>
(m->qk_prods); - // if (i == 0 && sub_req_id == 0 && - // bc->beam_slots.at(0).current_depth == 1) { - // int offset = (float *)B - m->keyCache; - // printf("key cache offset %d\n", kt_req_block_size); - // } - // To get C, skip over QK^T products from previous requests - DT *C = static_cast<DT *>
(m->qk_prods) + - m->num_q_heads * tokens_prev_requests_squares; checkCUDA(cublasGemmStridedBatchedEx(m->handle.blas, CUBLAS_OP_T, CUBLAS_OP_N, @@ -854,29 +844,15 @@ SpecIncMultiHeadSelfAttentionMeta::SpecIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { beam_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo)); - + static_cast( + handler.batch_config_metadata->beamTokenInfo); beam_request_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo)); - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BeamSearchBatchConfig::beamTokenInfo) + - sizeof(BeamSearchBatchConfig::beamRequestsInfo) + - sizeof(BatchConfig::causalMask)); + static_cast( + handler.batch_config_metadata->beamRequestsInfo); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/ops/split.cc b/src/ops/split.cc index 7c6b631b20..92cfbd49e9 100644 --- a/src/ops/split.cc +++ b/src/ops/split.cc @@ -50,7 +50,7 @@ SplitParams Split::get_params() const { SplitParams params; params.splits = this->splits; params.legion_axis = this->legion_axis; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/topk.cc b/src/ops/topk.cc index 7d30a8aff3..0e88befa68 100644 --- a/src/ops/topk.cc +++ b/src/ops/topk.cc @@ -87,7 +87,7 @@ TopKParams TopK::get_params() const { TopKParams params; params.k = this->k; params.sorted = this->sorted; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -226,7 +226,7 @@ OpMeta *TopK::init_task(Task const *task, Runtime *runtime) { TopK *topk = (TopK *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - TopKMeta *m = new TopKMeta(handle); + TopKMeta *m = new TopKMeta(handle, topk); m->profiling = topk->profiling; m->inference_debugging = topk->inference_debugging; m->sorted = topk->sorted; @@ -474,7 +474,7 @@ bool TopK::measure_operator_cost(Simulator *sim, return false; } - TopKMeta *m = new TopKMeta(sim->handler); + TopKMeta *m = new TopKMeta(sim->handler, this); m->sorted = sorted; // allocate diff --git a/src/ops/topk.cpp b/src/ops/topk.cpp index b6e898b654..303c6e85e9 100644 --- a/src/ops/topk.cpp +++ b/src/ops/topk.cpp @@ -513,6 +513,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, // TODO: missing profiling here } -TopKMeta::TopKMeta(FFHandler handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/topk.cu b/src/ops/topk.cu index cc87ee8a42..cfb2bf6448 100644 --- a/src/ops/topk.cu +++ b/src/ops/topk.cu @@ -509,6 +509,7 @@ void TopK::backward_kernel_wrapper(TopKMeta const *m, } } -TopKMeta::TopKMeta(FFHandler 
handler) : OpMeta(handler) {} +TopKMeta::TopKMeta(FFHandler handler, TopK const *topk) + : OpMeta(handler, topk) {} }; // namespace FlexFlow diff --git a/src/ops/transpose.cc b/src/ops/transpose.cc index 7a179c4f7d..bffde477de 100644 --- a/src/ops/transpose.cc +++ b/src/ops/transpose.cc @@ -51,7 +51,7 @@ TransposeParams Transpose::get_params() const { for (int i = 0; i < outputs[0]->num_dims; i++) { params.perm.push_back(this->perm[i]); } - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -193,7 +193,7 @@ OpMeta *Transpose::init_task(Task const *task, Domain out_domain = runtime->get_index_space_domain( ctx, task->regions[1].region.get_index_space()); - TransposeMeta *m = new TransposeMeta(handle); + TransposeMeta *m = new TransposeMeta(handle, transpose); transpose->init_meta(m, in_domain, out_domain); m->profiling = transpose->profiling; m->inference_debugging = transpose->inference_debugging; @@ -320,7 +320,7 @@ bool Transpose::measure_operator_cost(Simulator *sim, return false; } - TransposeMeta *m = sim->transpose_meta; + TransposeMeta *m = new TransposeMeta(sim->handler, this); this->init_meta(m, sub_input.get_domain(), sub_output.get_domain()); sim->free_all(); diff --git a/src/ops/tree_inc_multihead_self_attention.cc b/src/ops/tree_inc_multihead_self_attention.cc index df722a3d51..132a48be40 100644 --- a/src/ops/tree_inc_multihead_self_attention.cc +++ b/src/ops/tree_inc_multihead_self_attention.cc @@ -924,7 +924,7 @@ TreeIncMultiHeadSelfAttentionParams params.qk_prod_scaling = this->qk_prod_scaling; params.position_bias = this->position_bias; params.tensor_parallelism_degree = this->tensor_parallelism_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/ops/tree_inc_multihead_self_attention.cpp b/src/ops/tree_inc_multihead_self_attention.cpp index 26291fb3b4..890d32bc87 100644 --- a/src/ops/tree_inc_multihead_self_attention.cpp +++ b/src/ops/tree_inc_multihead_self_attention.cpp @@ -16,6 +16,8 @@ #include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" +#include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" +#include "flexflow/ops/tree_inc_multihead_self_attention.h" #include "flexflow/utils/hip_helper.h" #include #include @@ -26,11 +28,333 @@ namespace FlexFlow { using Legion::coord_t; using Legion::Memory; +#define WARP_SIZE 32 + using namespace Kernels::IncMultiHeadAttention; namespace Kernels { namespace TreeIncMultiHeadAttention { +template +__device__ __forceinline__ T + WARP_SHFL(unsigned mask, T var, int srcLane, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_sync(mask, var, srcLane, width); +#else + return __shfl(var, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T + WARP_SHFL_XOR(unsigned mask, T var, int laneMask, int width = warpSize) { +#ifndef __HIP_PLATFORM_HCC__ + return __shfl_xor_sync(mask, var, laneMask, width); +#else + return __shfl_xor(var, laneMask, width); +#endif +} + +template +__global__ void compute_attention_kernel_fused_kernel( + DT const *query, + DT const *key_cache, + DT const *value_cache, + DT *output_ptr, + float const scale, + int const max_seq_length, + int const max_token_per_batch, + int per_head_size, + int hidden_size, + BatchConfig::PerRequestInfo *request_infos, + int num_heads, + int num_requests, + 
BatchConfig::BitMask *causalMask, + bool *request_completed, + int qk_smem_sz) { + + // q, k + using Q_vec = typename VEC_K::Type; + using K_vec = typename VEC_K::Type; + using V_vec = typename VEC_V
::Type; + using Out_sum = typename Vec_fp32_::Type; + + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(DT); + constexpr int K_ELTS_PER_THREAD = Dh / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + // constexpr int QK_ELTS_IN_16B = 16 / sizeof(DT); + + // thread id + int const tidx = threadIdx.x; + // head id + int const head_idx = blockIdx.x; + // request idx + int const request_idx = blockIdx.y; + + int const batch_config_request_id = + request_infos[request_idx].batch_config_request_id; + + int const first_step = 0; + + int const tlength = + request_infos[batch_config_request_id].first_token_depth_in_request + + request_infos[batch_config_request_id].num_tokens_in_batch; + int const qlength = + request_infos[batch_config_request_id].num_tokens_in_batch; + + BatchConfig::BitMask bitmask = causalMask[batch_config_request_id]; + + int first_token_idx = 0; + for (int r = 0; r < batch_config_request_id; r++) { + first_token_idx += + request_completed[r] ? 0 : request_infos[r].num_tokens_in_batch; + } + + bool prompt_phase = request_infos[batch_config_request_id].prompt_phase; + int q_start = + request_infos[batch_config_request_id].first_token_depth_in_request; + + // shared memory objects + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + float *out_smem = reinterpret_cast(smem_ + qk_smem_sz); + + float qk_max = -FLT_MAX; + + // first WARPS_PER_BLOCK for store qk_max, second WARPS_PER_BLOCK for sum + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + + const DT *q_ptr = query + first_token_idx * hidden_size * QKV_WEIGHT_NUM + + head_idx * per_head_size; + __shared__ Q_vec q_vecs[THREADS_PER_KEY][K_VECS_PER_THREAD]; + + // the start offset of the element eg. (0, 1, 2, 3) * K_VEC_SIZE + int ki = tidx % THREADS_PER_KEY * K_VEC_SIZE; + int ki_o = tidx % THREADS_PER_KEY; + // the first key's offset for this thread + // ko = 0, 0, 0, 0, 1, 1, 1, 1, .... + int ko = tidx / THREADS_PER_KEY; + // load q tensor + Q_vec q_vec[K_VECS_PER_THREAD]; + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + // The number of keys per warp. + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + DT const *k_cache_batch = + key_cache + batch_config_request_id * max_seq_length * hidden_size + ki; + + int ti_end = + div_up(tlength - first_step, K_PER_WARP) * K_PER_WARP + first_step; + + for (int qi = 0; qi < qlength; qi += 1) { +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + q_vecs[ki_o][ii] = *reinterpret_cast( + q_ptr + (hidden_size * QKV_WEIGHT_NUM * qi) + ki + + ii * THREADS_PER_KEY * K_VEC_SIZE); + + // if (head_idx == 0 && request_idx == 1 && tidx == 0) { + // printf("laod q %d, %d %.10f\n", + // request_idx, + // qi,q_vecs[ki_o][ii].x); + // } + } + + __syncthreads(); + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + int const ti_circ = ti % max_seq_length; + + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * THREADS_PER_KEY * K_VEC_SIZE; + if (ti < tlength) { + k[ii] = *reinterpret_cast( + k_cache_batch + ti_circ * hidden_size + head_idx * per_head_size + + jj); + } + } + float qk = scale * Qk_dot::dot(q_vecs[ki_o], k); + + if (ti < tlength && tidx % THREADS_PER_KEY == 0) { + bool const mask = + prompt_phase ? (qi + q_start < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + + qk_max = mask ? 
qk_max : fmaxf(qk_max, qk); + + // if (head_idx == 0 && !mask) { + // printf("tree attn qkqkqkqk request id %d qi%d, ti %d, %.10f, %.10f, + // %.10f, %d\n", + // request_idx, + // qi, + // ti, + // qk, + // q_vecs[ki_o][0].x, + // k[0].x, + // bitmask.non_tree_cache_size); + // } + qk_smem[ti - first_step] = mask ? 0.0f : qk; + } + } + + __syncthreads(); + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Decompose the thread index into warp and lane. + int const warp = tidx / WARP_SIZE; + int const lane = tidx % WARP_SIZE; + + // The warp leader writes the max to shared memory. + if (lane == 0) { + red_smem[warp] = qk_max; + } + + // Make sure the products are in shared memory. + __syncthreads(); + + // The warps finalize the reduction. + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, WARP_SHFL_XOR(uint32_t(-1), qk_max, mask)); + } + + // Broadcast to all the threads in the warp. + qk_max = WARP_SHFL(uint32_t(-1), qk_max, 0); + + // if (head_idx == 0 && qi == 9 && tidx == 0) { + // printf("tree attn first token qk_max %f\n", qk_max); + // } + + float exp_sum = 0.f; + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + bool const mask = + prompt_phase ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + float logit = mask ? 0.0f : __expf(qk_smem[ti - first_step] - qk_max); + exp_sum += logit; + qk_smem[ti - first_step] = mask ? 0.0f : logit; + } + + // Compute the sum. + exp_sum = block_sum(&red_smem[WARPS_PER_BLOCK], exp_sum); + + // softmax + float inv_sum = __fdividef(1.f, exp_sum + 1.e-6); + for (int ti = first_step + tidx; ti < tlength; ti += THREADS_PER_BLOCK) { + qk_smem[ti - first_step] *= inv_sum; + } + + __syncthreads(); + + // value projection + constexpr int V_VEC_SIZE = 16 / sizeof(DT); + // A vector of V elements for the current timestep. + // using V_vec_k = typename V_vec_k_::Type; + // using V_vec_acum = typename V_vec_acum_fp32_::Type; + + // The value computed by this thread. + int vo = tidx / THREADS_PER_VALUE; + // The hidden dimensions computed by this particular thread. + int vi = tidx % THREADS_PER_VALUE * V_VEC_SIZE; + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + + Out_sum out; + zero(out); + + // The base pointer for the value in the cache buffer. + DT const *v_cache_batch = + value_cache + batch_config_request_id * max_seq_length * hidden_size + + vi; + + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = first_step + vo; ti < tlength; ti += V_PER_ITER) { + // Load the values from the cache. + int const ti_circ = ti % max_seq_length; + // int const real_cache_idx = topology.real_token_pos[sub_req_idx][ti]; + V_vec v = *reinterpret_cast( + v_cache_batch + ti_circ * hidden_size + head_idx * per_head_size); + + if (ti < tlength) { + bool const mask = + prompt_phase + ? (q_start + qi < ti) + : (ti >= bitmask.non_tree_cache_size && + (!(bitmask.mask[ti - bitmask.non_tree_cache_size] & + (1 << qi)))); + float logit = mask ? 0.0f : qk_smem[ti - first_step]; + out = FlexFlow::fma(logit, cast_to_float(v), out); + } + } + } + + // // Make sure we can start writing to shared memory. + __syncthreads(); + + // Run the final reduction amongst the different groups computing different + // partial outputs. 
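+ // At this point each of the V_PER_ITER thread groups holds a partial attention-weighted
+ // sum of value vectors for this (request, head, query) combination. The loop below halves
+ // the number of active groups each iteration: the upper half of the groups spills its
+ // partials to out_smem, the lower half accumulates them, and after the final iteration
+ // group 0 (vo == 0) holds the complete output row that is written back below.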
+ if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + + // The midpoint in the number of active groups. + int midpoint = active_groups / 2; + + // The upper part of active threads store to shared memory. + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { + *reinterpret_cast(out_smem + (vo - midpoint) * Dh + vi) = + out; + } + __syncthreads(); + + // The bottom warps update their values. + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = add(*reinterpret_cast(out_smem + vo * Dh + vi), + out); + } + __syncthreads(); + } + } + + // Output the final values. + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { + convert_from_float(*reinterpret_cast( + output_ptr + (first_token_idx + qi) * hidden_size + + head_idx * per_head_size + vi), + out); + // if (blockIdx.y == 0 && blockIdx.x == 0 && tidx == 0 && qi == 1) { + // printf("tree attn final value, %.9f, %.9f, %.9f, %.9f, %d, %d\n", + // out.x, + // out.y, + // out.z, + // out.w, + // vi, + // (first_token_idx + qi) * hidden_size + head_idx * + // per_head_size + + // vi); + // } + } + } +} + template __global__ void commit_tokens_kernel( DT const *devQKVProjArray, @@ -45,15 +369,15 @@ __global__ void commit_tokens_kernel( int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size * 2) { + CUDA_KERNEL_LOOP(i, num_tokens_to_commit * hidden_size) { - int token_pos = i / (hidden_size * KV_WEIGHT_NUM); + int token_pos = i / (hidden_size); int token_idx_in_last_batch = committedTokenInfos[token_pos].token_index; int offset = i % hidden_size; assert(token_idx_in_last_batch < num_active_tokens_in_last_batch); - size_t val_idx = - token_idx_in_last_batch * 3 * hidden_size + hidden_size + offset; + size_t val_idx = token_idx_in_last_batch * QKV_WEIGHT_NUM * hidden_size + + hidden_size + offset; DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; @@ -89,8 +413,9 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch - BatchConfig::max_sequence_length(), + m->num_active_infr_tokens, // number of active tokens in previous batch + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), m->hidden_size); } } @@ -109,12 +434,15 @@ __global__ void update_tree_branch_kv_cache( int total_tokens_in_batch, int max_seq_len, int hidden_size) { - CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size * 2) { - int token_idx = i / (hidden_size * KV_WEIGHT_NUM); + CUDA_KERNEL_LOOP(i, num_tokens_in_branch * hidden_size) { + + int token_idx = i / (hidden_size); int offset = i % hidden_size; token_idx += processed_tokens_in_batch; // get index in the whole batch - size_t val_idx = token_idx * 3 * hidden_size + hidden_size + offset; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + DT kVal = devQKVProjArray[val_idx]; DT vVal = devQKVProjArray[val_idx + hidden_size]; @@ -127,6 +455,53 @@ __global__ void update_tree_branch_kv_cache( } } +template +__global__ void update_tree_branch_kv_cache_fused( + DT const *devQKVProjArray, + DT *kCache_ptr, + DT *vCache_ptr, + TreeVerifyBatchConfig::PerTokenInfo const *tokenInfos, + BatchConfig::PerRequestInfo *request_infos, + int qProjSize, + int kProjSize, + int vProjSize, + int num_new_tokens, + int max_seq_len, + int hidden_size) { + CUDA_KERNEL_LOOP(i, num_new_tokens * 
hidden_size) { + + int token_idx = i / hidden_size; + int offset = i % hidden_size; + size_t val_idx = + token_idx * QKV_WEIGHT_NUM * hidden_size + hidden_size + offset; + + DT kVal = devQKVProjArray[val_idx]; + DT vVal = devQKVProjArray[val_idx + hidden_size]; + + int const req_id = tokenInfos[token_idx].request_index; + // int const tok_id = tokenInfos[token_idx].abs_depth_in_request; + + int const request_token_offset = + request_infos[req_id].first_token_offset_in_batch; + int const first_token_depth = + request_infos[req_id].first_token_depth_in_request; + + // if(i % hidden_size == 0){ + // printf("update token request id: %d, %d, %d real id %d, value%.10f\n", + // req_id, token_idx, request_token_offset,(token_idx + first_token_depth + // - request_token_offset), kVal); + // } + kCache_ptr[req_id * (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = kVal; + vCache_ptr[req_id * (hidden_size * max_seq_len) + + (token_idx + first_token_depth - request_token_offset) * + hidden_size + + offset] = vVal; + } +} + template __global__ void tree_fill_entries_above_diagonal(DT *matrix, size_t new_tokens, @@ -157,13 +532,14 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, hipblasDatatype_t hipblas_data_type = ff_to_cuda_datatype(m->output_type[0]); miopenDataType_t miopen_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) hipblasDatatype_t compute_type = hipblas_data_type; -#else - // TODO: currently use the hipblas_data_type - // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - hipblasDatatype_t compute_type = hipblas_data_type; -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // hipblasDatatype_t compute_type = hipblas_data_type; + // #else + // // TODO: currently use the hipblas_data_type + // // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // hipblasDatatype_t compute_type = hipblas_data_type; + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = @@ -171,16 +547,20 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, int q_block_size = m->qProjSize; int kt_block_size = m->kProjSize; int kt_req_block_size = - kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + kt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(); int vt_block_size = m->vProjSize; int vt_req_block_size = - vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length(); + vt_block_size * m->num_q_heads * BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(); assert(m->qProjSize == m->kProjSize); for (int i = 0; i < bc->max_requests_per_batch(); i++) { if (bc->request_completed[i]) { continue; } + assert(processed_tokens_in_batch == + bc->requestsInfo[i].first_token_offset_in_batch); int last_token_idx_of_the_request = processed_tokens_in_batch + bc->requestsInfo[i].num_tokens_in_batch - 1; while (processed_tokens_in_batch <= last_token_idx_of_the_request) { @@ -213,7 +593,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch 
BatchConfig::max_sequence_length(), m->hidden_size); } @@ -335,24 +715,23 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, MIOPEN_SOFTMAX_MODE_CHANNEL)); // Matmul softmax(QK^T/sqrt(d_k)) by V alpha = 1.0f, beta = 0.0f; - m_ = num_new_tokens; - n = m->vProjSize; + m_ = m->vProjSize; + n = num_new_tokens; k = total_tokens_in_request; - lda = m_, ldb = n * m->num_q_heads, ldc = m_; - strideA = num_new_tokens * total_tokens_in_request; - strideB = vt_block_size; - strideC = num_new_tokens * m->vProjSize; - // To get A, skip over softmax(QK^T/sqrt(d_k)) entries from previous - // requests (all heads) - A = C_softmax; - // To get B, skip over V^T entries from previous requests (all heads + + lda = m_ * m->num_q_heads, ldb = n, ldc = m_ * m->num_q_heads; + strideA = vt_block_size; + strideB = num_new_tokens * total_tokens_in_request; + strideC = m->vProjSize; + // To get A, skip over V^T entries from previous requests (all heads + // padding) - B = static_cast
<DT *>(m->valueCache) + i * vt_req_block_size; + A = static_cast<DT *>
(m->valueCache) + i * vt_req_block_size; + // To get B, skip over softmax(QK^T/sqrt(d_k)) entries from previous + // requests (all heads) + B = C_softmax; // To get C, skip over softmax(QK^T/sqrt(d_k))V products from previous // requests C = static_cast<DT *>
(m->attn_heads) + processed_tokens_in_batch * m->num_q_heads * m->vProjSize; - checkCUDA(hipblasGemmStridedBatchedEx(m->handle.blas, HIPBLAS_OP_N, HIPBLAS_OP_T, @@ -376,45 +755,44 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->num_q_heads, compute_type, HIPBLAS_GEMM_DEFAULT)); - - // Project to output, save result directly on output tensor - alpha = 1.0f, beta = 0.0f; - m_ = m->oProjSize; - k = m->vProjSize * m->num_q_heads; - n = num_new_tokens; - lda = k, ldb = n, ldc = m_; - A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + - m->kProjSize * m->num_q_heads + - m->vProjSize * m->num_q_heads); - B = C; - C = static_cast
(output_ptr) + - processed_tokens_in_batch * m->oProjSize; - - checkCUDA(hipblasGemmEx(m->handle.blas, - HIPBLAS_OP_T, - HIPBLAS_OP_T, - m_, - n, - k, - &alpha, - A, - hipblas_data_type, - lda, - B, - hipblas_data_type, - ldb, - &beta, - C, - hipblas_data_type, - ldc, - compute_type, - HIPBLAS_GEMM_DEFAULT)); processed_tokens_in_batch += num_new_tokens; } // Before moving to the next request // check that we have finished all tokens of the request assert(last_token_idx_of_the_request + 1 == processed_tokens_in_batch); } + // Project to output, save result directly on output tensor + DT alpha = 1.0f, beta = 0.0f; + int m_ = m->oProjSize; + int k = m->vProjSize * m->num_q_heads; + int n = processed_tokens_in_batch; + int lda = k, ldb = k, ldc = m_; + DT const *A = weight_ptr + m->qSize * (m->qProjSize * m->num_q_heads + + m->kProjSize * m->num_q_heads + + m->vProjSize * m->num_q_heads); + DT const *B = static_cast
<DT *>(m->attn_heads); + DT *C = static_cast<DT *>
(output_ptr); + + checkCUDA(hipblasGemmEx(m->handle.blas, + HIPBLAS_OP_T, + HIPBLAS_OP_T, + m_, + n, + k, + &alpha, + A, + hipblas_data_type, + lda, + B, + hipblas_data_type, + ldb, + &beta, + C, + hipblas_data_type, + ldc, + compute_type, + HIPBLAS_GEMM_DEFAULT)); + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + @@ -432,7 +810,85 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); +} + +#define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ + DT, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ + smem_size_in_bytes_tree
(m->qProjSize, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::max_spec_tree_token_num(), \ + THDS_PER_VALUE, \ + THDS_PER_BLOCK, \ + bc, \ + smem_sz); \ + compute_attention_kernel_fused_kernel \ + <<>>( \ + static_cast
<DT *>(m->devQKVProjArray), \ + static_cast
<DT *>(m->keyCache), \ + static_cast<DT *>
(m->valueCache), \ + output_ptr, \ + scale, \ + BatchConfig::max_sequence_length() + \ + BatchConfig::BatchConfig::max_spec_tree_token_num(), \ + BatchConfig::max_tokens_per_batch(), \ + m->qProjSize, \ + m->hidden_size, \ + m->request_infos, \ + m->num_q_heads, \ + bc->num_active_requests(), \ + m->causalMask, \ + m->request_completed, \ + smem_sz[0]) + +template +void compute_attention_kernel_fused(TreeIncMultiHeadSelfAttentionMeta const *m, + TreeVerifyBatchConfig const *bc, + DT *output_ptr, + hipStream_t stream) { + + // update the kv cache + // update K-V cache + int num_new_tokens = bc->num_active_tokens(); + int parallelism = m->hidden_size * num_new_tokens; + update_tree_branch_kv_cache_fused<<>>( + static_cast
<DT *>(m->devQKVProjArray), + static_cast
<DT *>(m->keyCache), + static_cast<DT *>
(m->valueCache), + m->token_infos, + m->request_infos, + m->qProjSize, + m->kProjSize, + m->vProjSize, + num_new_tokens, + BatchConfig::max_sequence_length() + + BatchConfig::max_spec_tree_token_num(), + m->hidden_size); + + dim3 grid(m->num_q_heads, bc->num_active_requests()); + int const per_head_size = m->qProjSize; + float scale = (*m->qk_prod_scaling) ? 1.0f / sqrt(m->kProjSize) : 1.0f; + // 0->qk production size, 1->total shared size + int smem_sz[2]; + if (per_head_size == 64) { + constexpr int THREADS_PER_VALUE_64 = threads_per_value_t::value; + LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( + DT, 64, 64, 4, THREADS_PER_VALUE_64, 128, stream); + } else if (per_head_size == 128) { + constexpr int THREADS_PER_VALUE_128 = threads_per_value_t::value; + LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( + DT, 128, 128, 4, THREADS_PER_VALUE_128, 128, stream); + } else { + assert(false && "a unsupported head size"); + } } template @@ -461,21 +917,17 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache - checkCUDA( - hipMemcpyAsync(m->committed_token_infos, - &(bc->committed_tokens), - bc->num_tokens_to_commit * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo), - hipMemcpyHostToDevice, - stream)); + // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << + // "\n"; + commit_tokens
<DT>(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -483,12 +935,6 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, m->bias_ptr, bias_ptr, m->biasSize, hipMemcpyHostToDevice, stream)); bias_ptr = static_cast<DT *>
(m->bias_ptr); } - checkCUDA(hipMemcpyAsync(m->token_infos, - &(bc->tokensInfo), - bc->num_active_tokens() * - sizeof(TreeVerifyBatchConfig::PerTokenInfo), - hipMemcpyHostToDevice, - stream)); // phase 1: Implement kernel to compute KQV for input tokens compute_qkv_kernel(m, bc, @@ -502,11 +948,20 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, // phase 2: No need to update key/val cache // IncMultiHeadSelfAttention::update_kv_cache_kernel( // m, bc, stream); + // use the new kernel + compute_attention_kernel_fused
( + m, bc, static_cast
(m->attn_heads), stream); + + int processed_tokens_in_batch = bc->num_active_tokens(); - // phase 3: Compute attention score - // 3 kernels for pahse 3: matmul1 - softmax - matmal2 - compute_attention_kernel( - m, bc, shard_id, output_ptr, bias_ptr, weight_ptr, stream); + compute_o_prod_bias(m, + bc, + shard_id, + output_ptr, + weight_ptr, + bias_ptr, + processed_tokens_in_batch, + stream); } } // namespace TreeIncMultiHeadAttention @@ -622,34 +1077,21 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(miopenSetStream(handler.dnn, stream)); // allocate memory for the seqArray and reserve space { - int max_tokens_per_batch = BatchConfig::max_tokens_per_batch(); - size_t committed_tokeninfo_size = max_tokens_per_batch; - size_t total_size = committed_tokeninfo_size * - sizeof(TreeVerifyBatchConfig::CommittedTokensInfo); - if (offload) { - // assert that we have enough reserved work space left - assert(gpu_mem_allocator.reserved_total_size - - gpu_mem_allocator.reserved_allocated_size >= - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_reserved( - committed_tokeninfo_size); - } else { - gpu_mem_allocator.create_legion_instance(committed_token_reserve_inst, - total_size); - committed_token_infos = - gpu_mem_allocator - .allocate_instance( - committed_tokeninfo_size); - } + + causalMask = static_cast( + handler.batch_config_metadata->causalMask); + committed_token_infos = + static_cast( + handler.batch_config_metadata->committed_tokens); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } checkCUDA(hipStreamSynchronize(stream)); diff --git a/src/ops/tree_inc_multihead_self_attention.cu b/src/ops/tree_inc_multihead_self_attention.cu index 50c056c816..86c53d7ea1 100644 --- a/src/ops/tree_inc_multihead_self_attention.cu +++ b/src/ops/tree_inc_multihead_self_attention.cu @@ -12,9 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) #include "cuComplex.h" -#endif #include "flexflow/ffconst_utils.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_kernels.h" #include "flexflow/ops/kernels/inc_multihead_self_attention_utils.cuh" @@ -390,7 +388,7 @@ void commit_tokens(TreeIncMultiHeadSelfAttentionMeta const *m, m->kProjSize, m->vProjSize, num_tokens_to_commit, - m->num_active_tokens, // number of active tokens in previous batch + m->num_active_infr_tokens, // number of active tokens in previous batch BatchConfig::max_sequence_length() + BatchConfig::max_spec_tree_token_num(), m->hidden_size); @@ -509,17 +507,18 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, cudaDataType_t cublas_data_type = ff_to_cuda_datatype(m->output_type[0]); cudnnDataType_t cudnn_data_type = ff_to_cudnn_datatype(m->output_type[0]); assert(data_type_size(m->output_type[0]) == sizeof(DT)); -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) cudaDataType_t compute_type = cublas_data_type; -#else - // For best performance, set the default cublas compute type to - // CUBLAS_COMPUTE_16F for half precision and to - // CUBLAS_COMPUTE_32F_FAST_16F for full precision - cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; - if (m->output_type[0] == DT_FLOAT) { - compute_type = CUBLAS_COMPUTE_32F_FAST_16F; - } -#endif + // #if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + // cudaDataType_t compute_type = cublas_data_type; + // #else + // // For best performance, set the default cublas compute type to + // // CUBLAS_COMPUTE_16F for half precision and to + // // CUBLAS_COMPUTE_32F_FAST_16F for full precision + // cublasComputeType_t compute_type = CUBLAS_COMPUTE_16F; + // if (m->output_type[0] == DT_FLOAT) { + // compute_type = CUBLAS_COMPUTE_32F_FAST_16F; + // } + // #endif // int num_requests = bc->num_active_requests(); int processed_tokens_in_batch = 0; // int qkv_block_size = @@ -571,7 +570,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->vProjSize, num_new_tokens, // num_tokens_in_branch processed_tokens_in_batch, // num_processed_tokens_in_batch - m->num_active_tokens, // total_tokens_in_batch + m->num_active_infr_tokens, // total_tokens_in_batch BatchConfig::max_sequence_length(), m->hidden_size); } @@ -773,6 +772,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + if (*m->final_bias && shard_id == 0) { int parallelism = m->oProjSize * processed_tokens_in_batch; int qkv_weight_size = m->qProjSize * m->global_num_q_heads + @@ -788,7 +788,7 @@ void compute_attention_kernel(TreeIncMultiHeadSelfAttentionMeta const *m, m->oProjSize); } - assert(processed_tokens_in_batch == bc->num_active_tokens()); + assert(processed_tokens_in_batch == bc->num_active_infr_tokens()); } #define LAUNCH_TREE_VERIFY_ATTENTION_SCORE_KERNEL( \ @@ -896,7 +896,7 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, } // copy committed tokens info to GPU for the commit_tokens kernel - // Note that m->num_active_tokens stores the number of active + // Note that m->num_active_infr_tokens stores the number of active // tokens in the previous batch, which is needed for committing // keys/values to the key-value cache // std::cout << "tokens to be committed: " << bc->num_tokens_to_commit << @@ -904,9 +904,9 @@ void inference_kernel(TreeIncMultiHeadSelfAttentionMeta *m, commit_tokens
(m, bc, stream); - // After commit we update m->num_active_tokens to be the number of active + // After commit we update m->num_active_infr_tokens to be the number of active // tokens for the current batch - m->num_active_tokens = bc->num_active_tokens(); + m->num_active_infr_tokens = bc->num_active_infr_tokens(); // here because we need postion info in infernece 1 if (m->offload && m->biasSize > 0) { @@ -1052,7 +1052,7 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( _num_kv_heads, attn->quantization_type, attn->offload), - num_active_tokens(0) { + num_active_infr_tokens(0) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); checkCUDNN(cudnnSetStream(handler.dnn, stream)); @@ -1060,21 +1060,13 @@ TreeIncMultiHeadSelfAttentionMeta::TreeIncMultiHeadSelfAttentionMeta( // allocate memory for the seqArray and reserve space { - causalMask = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo)); + causalMask = static_cast( + handler.batch_config_metadata->causalMask); committed_token_infos = - reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + - sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask)); - - request_completed = reinterpret_cast( - reinterpret_cast(handler.batch_config_metadata) + - sizeof(BatchConfig::tokensInfo) + sizeof(BatchConfig::requestsInfo) + - sizeof(BatchConfig::causalMask) + - sizeof(TreeVerifyBatchConfig::committed_tokens)); + static_cast( + handler.batch_config_metadata->committed_tokens); + request_completed = + static_cast(handler.batch_config_metadata->request_completed); } cudaStreamSynchronize(stream); diff --git a/src/parallel_ops/allreduce.cc b/src/parallel_ops/allreduce.cc index 5d38e28903..52c4ec2e28 100644 --- a/src/parallel_ops/allreduce.cc +++ b/src/parallel_ops/allreduce.cc @@ -45,7 +45,8 @@ using namespace FlexFlow::Kernels::AllReduce; /* Params */ bool operator==(AllReduceParams const &lhs, AllReduceParams const &rhs) { - return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim; + return lhs.allreduce_legion_dim == rhs.allreduce_legion_dim && + std::strcmp(lhs.name, rhs.name) == 0; } bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { @@ -55,7 +56,7 @@ bool AllReduceParams::is_valid(ParallelTensorShape const &input) const { AllReduceParams AllReduce::get_params() const { AllReduceParams params; params.allreduce_legion_dim = this->allreduce_dim; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -110,6 +111,7 @@ OpMeta *AllReduce::init_task(Task const *task, meta->input_type[0] = ar->inputs[0]->data_type; meta->output_type[0] = ar->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, ar->name); return meta; } @@ -146,6 +148,102 @@ void AllReduce::init(FFModel const &ff) { set_opmeta_from_futuremap(ff, fm); } +void AllReduce::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + 
launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void AllReduce::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void AllReduce::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + void AllReduce::init_inference(FFModel const &ff, std::vector const &batch_inputs, std::vector const &batch_outputs, @@ -224,64 +322,103 @@ FutureMap AllReduce::inference(FFModel const &ff, return runtime->execute_index_space(ctx, launcher); } -void AllReduce::forward(FFModel const &ff) { - ArgumentMap argmap; - Context ctx = ff.config.lg_ctx; - Runtime *runtime = ff.config.lg_hlr; - parallel_is = outputs[0]->parallel_is; - assert(numOutputs == 1); - assert(numInputs == 1); - set_argumentmap_for_forward(ff, argmap); - IndexLauncher launcher(ALLREDUCE_FWD_TASK_ID, - outputs[0]->parallel_is, - TaskArgument(NULL, 0), - argmap, - Predicate::TRUE_PRED, - false /*must*/, - 0 /*mapper_id*/, - outputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - inputs[0]->region)); - launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part, - 0 /*projection id*/, - WRITE_ONLY, 
- EXCLUSIVE, - outputs[0]->region)); - launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); +/*static*/ +void AllReduce::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + AllReduce::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); + } } -void AllReduce::backward(FFModel const &ff) { +FutureMap AllReduce::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; assert(numOutputs == 1); assert(numInputs == 1); - IndexLauncher launcher(ALLREDUCE_BWD_TASK_ID, - inputs[0]->parallel_is, - TaskArgument(NULL, 0), + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(ALLREDUCE_PEFT_BWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), argmap, Predicate::TRUE_PRED, false /*must*/, 0 /*mapper_id*/, - inputs[0]->machine_view.hash()); - launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, - 0 /*projection id*/, - READ_WRITE, - EXCLUSIVE, - inputs[0]->region_grad)); + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); launcher.add_field(0, FID_DATA); - launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, - 0 /*projection id*/, - READ_ONLY, - EXCLUSIVE, - outputs[0]->region_grad)); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); launcher.add_field(1, FID_DATA); - runtime->execute_index_space(ctx, launcher); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void AllReduce::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + AllReduceMeta *m = *((AllReduceMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = 
helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + AllReduce::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } } bool AllReduce::measure_operator_cost(Simulator *sim, @@ -318,62 +455,6 @@ bool AllReduce::append_parallel_op_info( return true; } -/*static*/ -void AllReduce::inference_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - inference_kernel_wrapper(m, bc, input, output); -} - -/*static*/ -void AllReduce::forward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input.data_type == output.data_type); - forward_kernel_wrapper(m, input, output); -} - -void AllReduce::backward_task(Task const *task, - std::vector const ®ions, - Context ctx, - Runtime *runtime) { - assert(regions.size() == 2); - assert(task->regions.size() == 2); - AllReduceMeta const *m = *((AllReduceMeta **)task->local_args); - - GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( - m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); - GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( - m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); - - assert(input_grad.data_type == output_grad.data_type); - backward_kernel_wrapper(m, input_grad, output_grad); -} - }; // namespace FlexFlow namespace std { diff --git a/src/parallel_ops/combine.cc b/src/parallel_ops/combine.cc index acc5c414c7..ce9c032350 100644 --- a/src/parallel_ops/combine.cc +++ b/src/parallel_ops/combine.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Combine; /* Params */ bool operator==(CombineParams const &lhs, CombineParams const &rhs) { return lhs.combine_legion_dim == rhs.combine_legion_dim && - lhs.combine_degree == rhs.combine_degree; + lhs.combine_degree == rhs.combine_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool CombineParams::is_valid(ParallelTensorShape const &input) const { @@ -58,7 +59,7 @@ CombineParams Combine::get_params() const { CombineParams params; params.combine_legion_dim = this->combine_dim; params.combine_degree = this->combine_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ 
-102,10 +103,11 @@ OpMeta *Combine::init_task(Task const *task, Runtime *runtime) { Combine *cmb = (Combine *)task->args; FFHandler handle = *((FFHandler *)task->local_args); - CombineMeta *m = new CombineMeta(handle); + CombineMeta *m = new CombineMeta(handle, cmb); m->input_type[0] = cmb->inputs[0]->data_type; m->output_type[0] = cmb->outputs[0]->data_type; assert(m->input_type[0] == m->output_type[0]); + std::strcpy(m->op_name, cmb->name); return m; } @@ -202,12 +204,23 @@ void Combine::create_input_partition_inference( assert(ff.config.computationMode == COMP_MODE_INFERENCE); assert(batch_outputs[0]->part != LogicalPartition::NO_PART); assert(batch_inputs[0]->part != LogicalPartition::NO_PART); - // input_lp is a disjoint partition + // partition batch_inputs[0]->region into inference_input_lps[batch_inputs[0]] + // according to the partitioning of batch_outputs[0] (i.e. make the + // partitioned dimension whole again by combining the partitions) ff.create_disjoint_partition(batch_outputs[0]->num_dims, batch_outputs[0]->dims, batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // partition batch_outputs[0]->region_grad into + // inference_output_grad_lps[batch_outputs[0]] according to the partitioning + // of batch_inputs[0] (i.e. restore the partition in the dimension that was + // combined in the forward pass) + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap Combine::inference(FFModel const &ff, @@ -226,7 +239,7 @@ FutureMap Combine::inference(FFModel const &ff, size_t machine_view_hash = mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); - IndexLauncher launcher(COMBINE_FWD_TASK_ID, + IndexLauncher launcher(COMBINE_INF_TASK_ID, batch_outputs[0]->parallel_is, TaskArgument(nullptr, 0), argmap, @@ -234,6 +247,7 @@ FutureMap Combine::inference(FFModel const &ff, false /*must*/, 0 /*mapper_id*/, machine_view_hash); + launcher.add_future(bc); launcher.add_region_requirement( RegionRequirement(inference_input_lps[batch_inputs[0]], 0 /*projection id*/, @@ -278,6 +292,52 @@ void Combine::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Combine::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = inputs[0]->data_type; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_inputs[0]->machine_view; + + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(COMBINE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(&data_type, sizeof(DataType)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Combine::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -357,6 +417,37 @@ tl::optional Combine::as_dot() const { return rf; } +/*static*/ +void Combine::inference_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + CombineMeta const *m = *((CombineMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + DataType data_type = m->input_type[0]; + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } + if (data_type == DT_HALF) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_FLOAT) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_DOUBLE) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT32) { + forward_task_with_type(task, regions, ctx, runtime); + } else if (data_type == DT_INT64) { + forward_task_with_type(task, regions, ctx, runtime); + } else { + assert(false && "Unsupported data type in Combine forward"); + } +} + /*static*/ void Combine::forward_task(Task const *task, std::vector const ®ions, @@ -400,6 +491,56 @@ void Combine::forward_task_with_type(Task const *task, forward_kernel
(input_ptr, output_ptr, output_domain.get_volume()); } +void Combine::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + // CombineMeta const *m = *((CombineMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + // TODO: figure out why m->output_type[0] or m->input_type[0] are not working + DataType data_type = *((DataType *)task->args); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + data_type, regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + data_type, regions[1], task->regions[1], FID_DATA, ctx, runtime); + assert(input_grad.data_type == data_type); + assert(output_grad.domain == input_grad.domain); + CombineMeta const *m = *((CombineMeta **)task->local_args); + int shard_id = task->index_point.point_data[0]; + if (shard_id == 0 && m->inference_debugging) { + // m is null when shard_id > 0 for some reason + std::cout << "BWD " << m->op_name << std::endl; + } + if (data_type == DT_HALF) { + backward_kernel(output_grad.get_half_ptr(), + input_grad.get_half_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_FLOAT) { + backward_kernel(output_grad.get_float_ptr(), + input_grad.get_float_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_DOUBLE) { + backward_kernel(output_grad.get_double_ptr(), + input_grad.get_double_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT32) { + backward_kernel(output_grad.get_int32_ptr(), + input_grad.get_int32_ptr(), + output_grad.domain.get_volume()); + } else if (data_type == DT_INT64) { + backward_kernel(output_grad.get_int64_ptr(), + input_grad.get_int64_ptr(), + output_grad.domain.get_volume()); + } else { + assert(false && "Unsupported data type in Combine backward"); + } +} + void Combine::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/parallel_ops/fused_parallel_op.cc b/src/parallel_ops/fused_parallel_op.cc index 1a76cbfc40..dec7b20fb2 100644 --- a/src/parallel_ops/fused_parallel_op.cc +++ b/src/parallel_ops/fused_parallel_op.cc @@ -59,7 +59,7 @@ FusedParallelOpParams FusedParallelOp::get_params() const { std::vector ops(std::begin(this->parallel_ops), std::end(this->parallel_ops)); params.parallel_ops = ops; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; diff --git a/src/parallel_ops/kernels/allreduce_kernels.cpp b/src/parallel_ops/kernels/allreduce_kernels.cpp index 8d7e20e395..7067035465 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cpp +++ b/src/parallel_ops/kernels/allreduce_kernels.cpp @@ -20,26 +20,23 @@ namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - size_t hidden_dim_size 
= input.domain.hi()[0] - input.domain.lo()[0] + 1; - size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - num_elements, + input.domain.get_volume(), nccl_data_type, ncclSum, m->handle.ncclComm, @@ -49,19 +46,27 @@ void inference_kernel_wrapper(AllReduceMeta const *m, #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - input.domain.get_volume(), + num_elements, nccl_data_type, ncclSum, m->handle.ncclComm, @@ -71,10 +76,29 @@ void forward_kernel_wrapper(AllReduceMeta const *m, #endif } -void backward_kernel_wrapper(AllReduceMeta const *m, +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use AllReduce operators"); +#endif } } // namespace AllReduce diff --git a/src/parallel_ops/kernels/allreduce_kernels.cu b/src/parallel_ops/kernels/allreduce_kernels.cu index 2c000137a1..3041f9adf9 100644 --- a/src/parallel_ops/kernels/allreduce_kernels.cu +++ b/src/parallel_ops/kernels/allreduce_kernels.cu @@ -13,32 +13,30 @@ * limitations under the License. 
*/ +#include "flexflow/ffconst_utils.h" #include "flexflow/parallel_ops/kernels/allreduce_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { AllReduceMeta::AllReduceMeta(FFHandler handle, AllReduce const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace AllReduce { -void inference_kernel_wrapper(AllReduceMeta const *m, - BatchConfig const *bc, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void forward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); - size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; - size_t num_elements = bc->num_tokens * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - num_elements, + input.domain.get_volume(), nccl_data_type, ncclSum, m->handle.ncclComm, @@ -48,18 +46,27 @@ void inference_kernel_wrapper(AllReduceMeta const *m, #endif } -void forward_kernel_wrapper(AllReduceMeta const *m, - GenericTensorAccessorR const &input, - GenericTensorAccessorW const &output) { +void backward_kernel_wrapper(AllReduceMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(input.data_type == output.data_type); assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; #ifdef FF_USE_NCCL ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input.data_type); checkNCCL(ncclAllReduce(input.ptr, output.ptr, - input.domain.get_volume(), + num_elements, nccl_data_type, ncclSum, m->handle.ncclComm, @@ -69,10 +76,23 @@ void forward_kernel_wrapper(AllReduceMeta const *m, #endif } -void backward_kernel_wrapper(AllReduceMeta const *m, +void peft_bwd_kernel_wrapper(AllReduceMeta const *m, + BatchConfig const *bc, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - assert(false && "To be implemented"); + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(output_grad.data_type); + checkCUDA(cudaMemcpyAsync(input_grad.ptr, + output_grad.ptr, + hidden_dim_size * num_elements * data_size, + cudaMemcpyDeviceToDevice, + stream)); } } // namespace AllReduce diff --git a/src/parallel_ops/kernels/combine_kernels.cpp b/src/parallel_ops/kernels/combine_kernels.cpp index d6e9568223..2a29be1ad4 100644 --- a/src/parallel_ops/kernels/combine_kernels.cpp +++ b/src/parallel_ops/kernels/combine_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/combine_kernels.h" +#include "flexflow/parallel_ops/combine.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { 
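[Editor's note] The Meta-constructor changes running through these kernel files (CombineMeta just below, and AllReduceMeta, RepartitionMeta, ReductionMeta, ReplicateMeta, ParallelIdentityMeta elsewhere in this patch) all follow one pattern: the per-device meta object now receives the operator, so it can carry the operator's name and its inference_debugging flag into the tasks. That is what enables the new "INF <op_name>" / "BWD <op_name>" logging and the save_inference_tensors_to_file calls added above. A minimal, self-contained sketch of that pattern follows; OperatorStub and OpMetaStub are illustrative stand-ins, not FlexFlow's real Op/OpMeta classes, and MAX_OPNAME here is a placeholder value.

#include <cstring>
#include <iostream>

constexpr int MAX_OPNAME = 128;   // placeholder; FlexFlow defines its own MAX_OPNAME

struct OperatorStub {             // stand-in for the operator handed to init_task
  char name[MAX_OPNAME];
  bool inference_debugging;
};

struct OpMetaStub {               // stand-in for the OpMeta(handle, op) overload
  char op_name[MAX_OPNAME];
  bool inference_debugging;
  explicit OpMetaStub(OperatorStub const *op)
      : inference_debugging(op->inference_debugging) {
    // copy the operator's name so device-side tasks can identify themselves
    std::strncpy(op_name, op->name, MAX_OPNAME - 1);
    op_name[MAX_OPNAME - 1] = '\0';
  }
};

int main() {
  OperatorStub combine_op{"combine_0", /*inference_debugging=*/true};
  OpMetaStub m(&combine_op);
  if (m.inference_debugging) {
    std::cout << "INF " << m.op_name << std::endl;  // mirrors the logging added in inference_task
  }
  return 0;
}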
-CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/combine_kernels.cu b/src/parallel_ops/kernels/combine_kernels.cu index 1ab79a7944..5809e2d4f3 100644 --- a/src/parallel_ops/kernels/combine_kernels.cu +++ b/src/parallel_ops/kernels/combine_kernels.cu @@ -13,12 +13,14 @@ * limitations under the License. */ +#include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/kernels/combine_kernels.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -CombineMeta::CombineMeta(FFHandler handler) : OpMeta(handler) {} +CombineMeta::CombineMeta(FFHandler handler, Combine const *comb) + : OpMeta(handler, comb) {} namespace Kernels { namespace Combine { diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cpp b/src/parallel_ops/kernels/parallel_identity_kernels.cpp new file mode 100644 index 0000000000..8378231fb2 --- /dev/null +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cpp @@ -0,0 +1,97 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/utils/hip_helper.h" +#include + +namespace FlexFlow { + +ParallelIdentityMeta::ParallelIdentityMeta(FFHandler handle, + ParallelIdentity const *reduct) + : OpMeta(handle, reduct) {} + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t data_size = data_type_size(input.data_type); + // copy input to output + checkCUDA(hipMemcpyAsync(output.ptr, + input.ptr, + input.domain.get_volume() * data_size, + hipMemcpyDeviceToDevice, + stream)); +} + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(input.data_type); + checkCUDA(hipMemcpyAsync(output.ptr, + input.ptr, + hidden_dim_size * num_elements * data_size, + hipMemcpyDeviceToDevice, + stream)); +} + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use ParallelIdentity operators"); +#endif +} + +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/parallel_identity_kernels.cu b/src/parallel_ops/kernels/parallel_identity_kernels.cu new file mode 100644 index 0000000000..6800f3ab16 --- /dev/null +++ b/src/parallel_ops/kernels/parallel_identity_kernels.cu @@ -0,0 +1,96 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/ffconst_utils.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/utils/cuda_helper.h" + +namespace FlexFlow { + +ParallelIdentityMeta::ParallelIdentityMeta(FFHandler handle, + ParallelIdentity const *reduct) + : OpMeta(handle, reduct) {} + +namespace Kernels { +namespace ParallelIdentity { + +void forward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t data_size = data_type_size(input.data_type); + // copy input to output + checkCUDA(cudaMemcpyAsync(output.ptr, + input.ptr, + input.domain.get_volume() * data_size, + cudaMemcpyDeviceToDevice, + stream)); +} + +void backward_kernel_wrapper(ParallelIdentityMeta const *m, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + assert(false && "To be implemented"); +} + +void inference_kernel_wrapper(ParallelIdentityMeta const *m, + BatchConfig const *bc, + GenericTensorAccessorR const &input, + GenericTensorAccessorW const &output) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input.data_type == output.data_type); + assert(input.domain == output.domain); + size_t hidden_dim_size = input.domain.hi()[0] - input.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens(); + size_t data_size = data_type_size(input.data_type); + checkCUDA(cudaMemcpyAsync(output.ptr, + input.ptr, + hidden_dim_size * num_elements * data_size, + cudaMemcpyDeviceToDevice, + stream)); +} + +void peft_bwd_kernel_wrapper(ParallelIdentityMeta const *m, + 
BatchConfig const *bc, + GenericTensorAccessorW const &input_grad, + GenericTensorAccessorR const &output_grad) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(input_grad.data_type == output_grad.data_type); + assert(input_grad.domain == output_grad.domain); + size_t hidden_dim_size = + input_grad.domain.hi()[0] - input_grad.domain.lo()[0] + 1; + size_t num_elements = bc->num_active_tokens() * hidden_dim_size; +#ifdef FF_USE_NCCL + ncclDataType_t nccl_data_type = ff_to_nccl_datatype(input_grad.data_type); + checkNCCL(ncclAllReduce(output_grad.ptr, + input_grad.ptr, + num_elements, + nccl_data_type, + ncclSum, + m->handle.ncclComm, + stream)); +#else + assert(false && "Must enable FF_USE_NCCL to use ParallelIdentity operators"); +#endif +} + +} // namespace ParallelIdentity +} // namespace Kernels +} // namespace FlexFlow diff --git a/src/parallel_ops/kernels/partition_kernels.cpp b/src/parallel_ops/kernels/partition_kernels.cpp index cfd76c0f18..bd1c96d4c7 100644 --- a/src/parallel_ops/kernels/partition_kernels.cpp +++ b/src/parallel_ops/kernels/partition_kernels.cpp @@ -14,12 +14,14 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/hip_helper.h" #include namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/partition_kernels.cu b/src/parallel_ops/kernels/partition_kernels.cu index 08008f1035..3a39b39fe4 100644 --- a/src/parallel_ops/kernels/partition_kernels.cu +++ b/src/parallel_ops/kernels/partition_kernels.cu @@ -14,11 +14,13 @@ */ #include "flexflow/parallel_ops/kernels/partition_kernels.h" +#include "flexflow/parallel_ops/partition.h" #include "flexflow/utils/cuda_helper.h" namespace FlexFlow { -RepartitionMeta::RepartitionMeta(FFHandler handler) : OpMeta(handler) {} +RepartitionMeta::RepartitionMeta(FFHandler handler, Repartition const *repart) + : OpMeta(handler, repart) {} namespace Kernels { namespace Repartition { diff --git a/src/parallel_ops/kernels/reduction_kernels.cpp b/src/parallel_ops/kernels/reduction_kernels.cpp index 2a3fe5cca1..1f3e8e0962 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cpp +++ b/src/parallel_ops/kernels/reduction_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/reduction_kernels.cu b/src/parallel_ops/kernels/reduction_kernels.cu index 34ae8007da..df7630976b 100644 --- a/src/parallel_ops/kernels/reduction_kernels.cu +++ b/src/parallel_ops/kernels/reduction_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReductionMeta::ReductionMeta(FFHandler handle, Reduction const *reduct) - : OpMeta(handle) {} + : OpMeta(handle, reduct) {} namespace Kernels { namespace Reduction { diff --git a/src/parallel_ops/kernels/replicate_kernels.cpp b/src/parallel_ops/kernels/replicate_kernels.cpp index 1647f014be..f49e0d4eb0 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cpp +++ b/src/parallel_ops/kernels/replicate_kernels.cpp @@ -20,7 +20,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace 
Replicate { diff --git a/src/parallel_ops/kernels/replicate_kernels.cu b/src/parallel_ops/kernels/replicate_kernels.cu index 35bc109bd3..0b5c434aa6 100644 --- a/src/parallel_ops/kernels/replicate_kernels.cu +++ b/src/parallel_ops/kernels/replicate_kernels.cu @@ -19,7 +19,7 @@ namespace FlexFlow { ReplicateMeta::ReplicateMeta(FFHandler handle, Replicate const *repl) - : OpMeta(handle) {} + : OpMeta(handle, repl) {} namespace Kernels { namespace Replicate { diff --git a/src/parallel_ops/parallel_identity.cc b/src/parallel_ops/parallel_identity.cc new file mode 100644 index 0000000000..883910ae09 --- /dev/null +++ b/src/parallel_ops/parallel_identity.cc @@ -0,0 +1,474 @@ +/* Copyright 2023 CMU, Facebook, LANL, MIT, NVIDIA, and Stanford (alphabetical) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flexflow/parallel_ops/parallel_identity.h" +#include "flexflow/ffconst_utils.h" +#include "flexflow/model.h" +#include "flexflow/parallel_ops/kernels/parallel_identity_kernels.h" +#include "flexflow/utils/hash_utils.h" + +namespace FlexFlow { +// declare Legion names +using Legion::ArgumentMap; +using Legion::Context; +using Legion::coord_t; +using Legion::Domain; +using Legion::Future; +using Legion::FutureMap; +using Legion::IndexLauncher; +using Legion::LogicalPartition; +using Legion::LogicalRegion; +using Legion::Machine; +using Legion::Memory; +using Legion::PhysicalRegion; +using Legion::Predicate; +using Legion::Rect; +using Legion::RegionRequirement; +using Legion::Runtime; +using Legion::Task; +using Legion::TaskArgument; +using Legion::TaskLauncher; + +using namespace FlexFlow::Kernels::ParallelIdentity; + +/* Params */ +bool operator==(ParallelIdentityParams const &lhs, + ParallelIdentityParams const &rhs) { + return lhs.parallel_identity_legion_dim == rhs.parallel_identity_legion_dim && + std::strcmp(lhs.name, rhs.name) == 0; +} + +bool ParallelIdentityParams::is_valid(ParallelTensorShape const &input) const { + return input.is_valid(); +} + +ParallelIdentityParams ParallelIdentity::get_params() const { + ParallelIdentityParams params; + params.parallel_identity_legion_dim = this->parallel_identity_dim; + if (strlen(this->name) < MAX_OPNAME) { + strcpy(params.name, this->name); + } + return params; +} + +ParallelIdentity::ParallelIdentity(FFModel &model, + const ParallelTensor _input, + int _parallel_identity_legion_dim, + char const *name) + : ParallelOp(model, OP_PARALLEL_IDENTITY, name, _input), + parallel_identity_dim(_parallel_identity_legion_dim) { + int numdim = _input->num_dims; + ParallelDim dims[MAX_TENSOR_DIM]; + for (int i = 0; i < numdim; i++) { + dims[i] = _input->dims[i]; + } + assert(dims[parallel_identity_dim].degree > 1); + // ParallelTensorBase::update_parallel_ids(numdim, dims); + outputs[0] = model.create_parallel_tensor_legion_ordering( + numdim, dims, _input->data_type, this); +} + +ParallelIdentity::ParallelIdentity(FFModel &model, + ParallelIdentityParams const ¶ms, + ParallelTensor const input, + char const *name) + : ParallelIdentity( 
+ model, input, params.parallel_identity_legion_dim, params.name) {} + +void ParallelIdentity::create_input_partition(FFModel &ff) { + // Do nothing + return; +} + +void ParallelIdentity::create_input_partition_inference( + FFModel &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs) { + assert(ff.config.computationMode == COMP_MODE_INFERENCE); + assert(batch_outputs[0]->part != LogicalPartition::NO_PART); + assert(batch_inputs[0]->part != LogicalPartition::NO_PART); + // Do nothing + return; +} + +OpMeta *ParallelIdentity::init_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + ParallelIdentity *ar = (ParallelIdentity *)task->args; + FFHandler handle = *((FFHandler const *)task->local_args); + ParallelIdentityMeta *meta = new ParallelIdentityMeta(handle, ar); + meta->input_type[0] = ar->inputs[0]->data_type; + meta->output_type[0] = ar->outputs[0]->data_type; + assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, ar->name); + return meta; +} + +void ParallelIdentity::init(FFModel const &ff) { + ArgumentMap argmap; + parallel_is = outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_init(ff, argmap); + IndexLauncher launcher(PARALLEL_IDENTITY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ParallelIdentity)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap(ff, fm); +} + +void ParallelIdentity::forward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + set_argumentmap_for_forward(ff, argmap); + IndexLauncher launcher(PARALLEL_IDENTITY_FWD_TASK_ID, + outputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + outputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + outputs[0]->region)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::forward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta const *m = *((ParallelIdentityMeta **)task->local_args); + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == 
output.data_type); + forward_kernel_wrapper(m, input, output); +} + +void ParallelIdentity::backward(FFModel const &ff) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + IndexLauncher launcher(PARALLEL_IDENTITY_BWD_TASK_ID, + inputs[0]->parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + inputs[0]->machine_view.hash()); + launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + runtime->execute_index_space(ctx, launcher); +} + +void ParallelIdentity::backward_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + ParallelIdentityMeta const *m = *((ParallelIdentityMeta **)task->local_args); + + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + backward_kernel_wrapper(m, input_grad, output_grad); +} + +void ParallelIdentity::init_inference( + FFModel const &ff, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + parallel_is = batch_outputs[0]->parallel_is; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + size_t machine_view_hash = + mv ? mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_init_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_INIT_TASK_ID, + parallel_is, + TaskArgument(this, sizeof(ParallelIdentity)), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + FutureMap fm = runtime->execute_index_space(ctx, launcher); + fm.wait_all_results(); + set_opmeta_from_futuremap_inference(ff, fm, batch_outputs[0]); +} + +FutureMap ParallelIdentity::inference( + FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_INF_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement(RegionRequirement(batch_inputs[0]->part, + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_inputs[0]->region)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement(RegionRequirement(batch_outputs[0]->part, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_outputs[0]->region)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::inference_task( + Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta *m = *((ParallelIdentityMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_tokens() == 0) { + return; + } + + GenericTensorAccessorR input = helperGetGenericTensorAccessorRO( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input.data_type == output.data_type); + inference_kernel_wrapper(m, bc, input, output); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ParallelIdentity::save_inference_tensors_to_file( + m, shard_id, bc, {input}, {}, {output}); + } +} + +FutureMap + ParallelIdentity::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + parallel_is = batch_outputs[0]->parallel_is; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + size_t machine_view_hash = + mv ? 
mv->hash() : batch_outputs[0]->machine_view.hash(); + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + IndexLauncher launcher(PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, + batch_outputs[0]->parallel_is, + TaskArgument(nullptr, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_future(bc); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + WRITE_ONLY, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_outputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + +/*static*/ +void ParallelIdentity::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + + ParallelIdentityMeta *m = *((ParallelIdentityMeta **)task->local_args); + BatchConfig const *bc = BatchConfig::from_future(task->futures[0]); + if (bc->num_active_peft_tokens() == 0) { + return; + } + GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW( + m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime); + GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO( + m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + + assert(input_grad.data_type == output_grad.data_type); + peft_bwd_kernel_wrapper(m, bc, input_grad, output_grad); + if (m->inference_debugging) { + assert(task->index_point.get_dim() == 1); + int shard_id = task->index_point.point_data[0]; + ParallelIdentity::save_inference_tensors_to_file( + m, shard_id, bc, {input_grad}, {}, {output_grad}, false); + } +} + +bool ParallelIdentity::measure_operator_cost(Simulator *sim, + MachineView const &pc, + CostMetrics &cost_metrics) const { + cost_metrics = CostMetrics(); + cost_metrics.forward_time = 0.0f; + cost_metrics.backward_time = 0.0f; + + cost_metrics.sync_time = 0; + cost_metrics.inputs_memory = 0; + cost_metrics.outputs_memory = 0; + cost_metrics.weights_memory = 0; + return true; +} + +bool ParallelIdentity::get_int_parameter(PMParameter para, int *value) const { + switch (para) { + case PM_PARALLEL_IDENTITY_DIM: + *value = parallel_identity_dim; + return true; + default: + return Op::get_int_parameter(para, value); + } +} + +bool ParallelIdentity::append_parallel_op_info( + std::vector ¶llel_ops) const { + ParallelOpInfo ret; + ret.op_type = op_type; + ret.parallel_dim = parallel_identity_dim; + ret.parallel_degree = -1; // ParallelIdentity does not affect parallel degree + parallel_ops.push_back(ret); + return true; +} + +}; // namespace FlexFlow + +namespace std { +size_t hash::operator()( + FlexFlow::ParallelIdentityParams const ¶ms) const { + size_t key = 0; + hash_combine(key, params.parallel_identity_legion_dim); + return key; +} + +} // namespace std diff --git a/src/parallel_ops/partition.cc b/src/parallel_ops/partition.cc index e6ab09d088..fddf739599 100644 --- a/src/parallel_ops/partition.cc +++ b/src/parallel_ops/partition.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Repartition; /* Params */ bool operator==(RepartitionParams const &lhs, RepartitionParams const &rhs) { return lhs.repartition_legion_dim == rhs.repartition_legion_dim && - lhs.repartition_degree == rhs.repartition_degree; + lhs.repartition_degree == 
rhs.repartition_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool RepartitionParams::is_valid(ParallelTensorShape const &input) const { @@ -60,7 +61,7 @@ RepartitionParams Repartition::get_params() const { RepartitionParams params; params.repartition_legion_dim = this->repartition_dim; params.repartition_degree = this->repartition_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -200,6 +201,11 @@ void Repartition::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } FutureMap diff --git a/src/parallel_ops/reduction.cc b/src/parallel_ops/reduction.cc index 5ca2b1301c..7306e04334 100644 --- a/src/parallel_ops/reduction.cc +++ b/src/parallel_ops/reduction.cc @@ -45,7 +45,8 @@ using namespace FlexFlow::Kernels::Reduction; /* Params */ bool operator==(ReductionParams const &lhs, ReductionParams const &rhs) { return lhs.reduction_legion_dim == rhs.reduction_legion_dim && - lhs.reduction_degree == rhs.reduction_degree; + lhs.reduction_degree == rhs.reduction_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool ReductionParams::is_valid(ParallelTensorShape const &input) const { @@ -56,7 +57,7 @@ ReductionParams Reduction::get_params() const { ReductionParams params; params.reduction_legion_dim = this->reduction_dim; params.reduction_degree = this->reduction_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -125,6 +126,13 @@ void Reduction::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is an aliased partitioning along the replica dim + ff.create_aliased_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + reduction_dim, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Reduction::init_task(Task const *task, @@ -137,6 +145,7 @@ OpMeta *Reduction::init_task(Task const *task, meta->input_type[0] = reduct->inputs[0]->data_type; meta->output_type[0] = reduct->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, reduct->name); return meta; } @@ -372,6 +381,10 @@ void Reduction::forward_task(Task const *task, GenericTensorAccessorW output = helperGetGenericTensorAccessorWO( m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime); + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } + assert(input.data_type == output.data_type); if (input.data_type == DT_HALF) { forward_kernel(input.get_half_ptr(), diff --git a/src/parallel_ops/replicate.cc b/src/parallel_ops/replicate.cc index ba7bb6677f..38215fc903 100644 --- a/src/parallel_ops/replicate.cc +++ b/src/parallel_ops/replicate.cc @@ -44,7 +44,8 @@ using namespace FlexFlow::Kernels::Replicate; /* Params */ bool operator==(ReplicateParams const &lhs, ReplicateParams const &rhs) { return lhs.replicate_legion_dim == rhs.replicate_legion_dim && - lhs.replicate_degree == rhs.replicate_degree; + lhs.replicate_degree == rhs.replicate_degree && + std::strcmp(lhs.name, rhs.name) == 0; } bool ReplicateParams::is_valid(ParallelTensorShape 
const &input) const { @@ -55,7 +56,7 @@ ReplicateParams Replicate::get_params() const { ReplicateParams params; params.replicate_legion_dim = this->replicate_dim; params.replicate_degree = this->replicate_degree; - if (this->name != nullptr) { + if (strlen(this->name) < MAX_OPNAME) { strcpy(params.name, this->name); } return params; @@ -125,6 +126,12 @@ void Replicate::create_input_partition_inference( batch_outputs[0]->parallel_is, batch_inputs[0]->region, inference_input_lps[batch_inputs[0]]); + // output_grad_lp is a disjoint partition + ff.create_disjoint_partition(batch_inputs[0]->num_dims, + batch_inputs[0]->dims, + batch_inputs[0]->parallel_is, + batch_outputs[0]->region_grad, + inference_output_grad_lps[batch_outputs[0]]); } OpMeta *Replicate::init_task(Task const *task, @@ -137,6 +144,7 @@ OpMeta *Replicate::init_task(Task const *task, meta->input_type[0] = repl->inputs[0]->data_type; meta->output_type[0] = repl->outputs[0]->data_type; assert(meta->input_type[0] == meta->output_type[0]); + std::strcpy(meta->op_name, repl->name); return meta; } @@ -276,6 +284,51 @@ void Replicate::forward(FFModel const &ff) { runtime->execute_index_space(ctx, launcher); } +FutureMap Replicate::peft_bwd(FFModel const &ff, + BatchConfigFuture const &bc, + std::vector const &batch_inputs, + std::vector const &batch_outputs, + MachineView const *mv) { + ArgumentMap argmap; + Context ctx = ff.config.lg_ctx; + Runtime *runtime = ff.config.lg_hlr; + assert(numOutputs == 1); + assert(numInputs == 1); + assert(batch_inputs[0]->data_type == batch_outputs[0]->data_type); + DataType data_type = batch_inputs[0]->data_type; + + // Warning: we need to use batch_inputs[0] here, instead of the usual + // batch_outputs[0] + parallel_is = batch_inputs[0]->parallel_is; + MachineView const *view = mv ? 
mv : &batch_inputs[0]->machine_view; + + set_argumentmap_for_inference(ff, argmap, batch_outputs[0]); + size_t machine_view_hash = view->hash(); + IndexLauncher launcher(REPLICATE_PEFT_BWD_TASK_ID, + parallel_is, + TaskArgument(NULL, 0), + argmap, + Predicate::TRUE_PRED, + false /*must*/, + 0 /*mapper_id*/, + machine_view_hash); + launcher.add_region_requirement( + RegionRequirement(inference_output_grad_lps[batch_outputs[0]], + 0 /*projection id*/, + READ_ONLY, + EXCLUSIVE, + batch_outputs[0]->region_grad)); + launcher.add_field(0, FID_DATA); + launcher.add_region_requirement( + RegionRequirement(batch_inputs[0]->part_grad, + 0 /*projection id*/, + READ_WRITE, + EXCLUSIVE, + batch_inputs[0]->region_grad)); + launcher.add_field(1, FID_DATA); + return runtime->execute_index_space(ctx, launcher); +} + void Replicate::backward(FFModel const &ff) { ArgumentMap argmap; Context ctx = ff.config.lg_ctx; @@ -350,6 +403,9 @@ void Replicate::forward_task(Task const *task, assert(task->regions.size() == 2); ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); + if (m->inference_debugging) { + std::cout << "INF " << m->op_name << std::endl; + } Domain input_domain = runtime->get_index_space_domain( ctx, task->regions[0].region.get_index_space()); @@ -381,6 +437,37 @@ void Replicate::forward_task(Task const *task, } } +void Replicate::peft_bwd_task(Task const *task, + std::vector const ®ions, + Context ctx, + Runtime *runtime) { + assert(regions.size() == 2); + assert(task->regions.size() == 2); + Domain output_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[0].region.get_index_space()); + Domain input_grad_domain = runtime->get_index_space_domain( + ctx, task->regions[1].region.get_index_space()); + // Currently only support the outter most dimension + for (int i = 0; i < output_grad_domain.get_dim() - 1; i++) { + assert(output_grad_domain.lo()[i] == input_grad_domain.lo()[i]); + assert(output_grad_domain.hi()[i] == input_grad_domain.hi()[i]); + } + size_t num_elements = input_grad_domain.get_volume(); + size_t num_replicas = output_grad_domain.get_volume() / num_elements; + float const *output_grad_ptr = helperGetTensorPointerRO( + regions[0], task->regions[0], FID_DATA, ctx, runtime); + float *input_grad_ptr = helperGetTensorPointerRW( + regions[1], task->regions[1], FID_DATA, ctx, runtime); + + ReplicateMeta const *m = *((ReplicateMeta **)task->local_args); + if (m->inference_debugging) { + std::cout << "BWD " << m->op_name << std::endl; + } + + backward_kernel( + output_grad_ptr, input_grad_ptr, num_elements, num_replicas); +} + void Replicate::backward_task(Task const *task, std::vector const ®ions, Context ctx, diff --git a/src/runtime/batch_config.cc b/src/runtime/batch_config.cc index 7989b0799e..4c339750c7 100644 --- a/src/runtime/batch_config.cc +++ b/src/runtime/batch_config.cc @@ -25,7 +25,35 @@ Legion::Logger log_bc("BatchConfig"); using Legion::Future; using Legion::Memory; -BatchConfig::BatchConfig() : num_tokens(0) { +void set_optimizer_tasks(OptimizerTasks &tasks, + int max_training_steps, + int completed_training_steps, + int gradient_accumulation_steps) { + assert(max_training_steps > 0); + assert(completed_training_steps >= 0); + assert(gradient_accumulation_steps > 0); + assert(completed_training_steps < max_training_steps); + // Compute gradients should always be true + tasks.compute_gradients = true; + + // Reset gradients to zero in the first iteration and after weight updates + tasks.reset_gradients_to_zero = + (completed_training_steps == 0) || + 
(completed_training_steps % gradient_accumulation_steps == 0); + + // Update weights every gradient_accumulation_steps + tasks.update_weights = + ((completed_training_steps + 1) % gradient_accumulation_steps == 0); + + // Save updated weights only in the very last training step + tasks.save_updated_weights = + (completed_training_steps == max_training_steps - 1); + if (tasks.save_updated_weights) { + assert(tasks.update_weights); + } +} + +BatchConfig::BatchConfig() : num_tokens(0), num_peft_tokens(0) { for (int i = 0; i < MAX_NUM_REQUESTS; i++) { requestsInfo[i].first_token_depth_in_request = 0; requestsInfo[i].first_token_offset_in_batch = 0; @@ -74,6 +102,14 @@ int BatchConfig::num_active_tokens() const { return num_tokens; } +int BatchConfig::num_active_infr_tokens() const { + return num_tokens; +} + +int BatchConfig::num_active_peft_tokens() const { + return num_peft_tokens; +} + /*static*/ int BatchConfig::max_requests_per_batch() { return RequestManager::get_request_manager()->get_max_requests_per_batch(); @@ -107,8 +143,13 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { os << "Max number of tokens: " << bc.max_tokens_per_batch() << std::endl; os << "Max sequence length: " << bc.max_sequence_length() << std::endl; // Current values - os << "Number of tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of active tokens: " << bc.num_active_tokens() << std::endl; + os << "Number of inference tokens: " << bc.num_active_infr_tokens() + << std::endl; + os << "Number of peft tokens: " << bc.num_active_peft_tokens() << std::endl; os << "Number of requests: " << bc.num_active_requests() << std::endl; + os << "Number of generation tokens: " << bc.num_generation_tokens + << std::endl; // Per-request info os << "Per-request info:\n"; @@ -121,9 +162,27 @@ std::ostream &operator<<(std::ostream &os, BatchConfig const &bc) { << bc.requestsInfo[i].first_token_offset_in_batch << std::endl; os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; - os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; + os << " BatchConfig Req ID: " + << bc.requestsInfo[i].batch_config_request_id << std::endl; + os << " Prompt phase: " << bc.requestsInfo[i].prompt_phase + << std::endl; + os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; + os << " optimizer_tasks: {" + << "compute_gradients: " << std::boolalpha + << bc.requestsInfo[i].optimizer_tasks.compute_gradients + << ", reset_gradients_to_zero: " + << bc.requestsInfo[i].optimizer_tasks.reset_gradients_to_zero + << ", update_weights: " + << bc.requestsInfo[i].optimizer_tasks.update_weights + << ", save_updated_weights: " + << bc.requestsInfo[i].optimizer_tasks.save_updated_weights << "}" + << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; os << " Request running: " << bc.request_running[i] << std::endl; } diff --git a/src/runtime/beam_search_batch_config.cc b/src/runtime/beam_search_batch_config.cc index 0509c23afe..b10f8e82ab 100644 --- a/src/runtime/beam_search_batch_config.cc +++ b/src/runtime/beam_search_batch_config.cc @@ -137,6 +137,10 @@ std::ostream &operator<<(std::ostream &os, BeamSearchBatchConfig const &bc) { os << " Number of tokens in batch: " << 
bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/src/runtime/cuda_helper.cu b/src/runtime/cuda_helper.cu index 57bc5a0458..386a0c940b 100644 --- a/src/runtime/cuda_helper.cu +++ b/src/runtime/cuda_helper.cu @@ -36,7 +36,8 @@ cudaError_t get_legion_stream(cudaStream_t *stream) { using FlexFlow::get_legion_stream; -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +template +__global__ void scale_kernel(DT *ptr, coord_t size, DT a, DT b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } @@ -271,18 +272,10 @@ __host__ void print_beam_tensor(T const *ptr, template <> __host__ void save_tensor(float const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - float *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(float) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(float) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + float *host_ptr = (float *)calloc(num_elements, sizeof(float)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(float) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -293,26 +286,17 @@ __host__ void fprintf(tensor_file, "%.9f", host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(half const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - half *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(half) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(half) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + half *host_ptr = (half *)calloc(num_elements, sizeof(half)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(half) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -323,27 +307,18 @@ __host__ void fprintf(tensor_file, "%.9f", (float)host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(int32_t const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int32_t *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(int32_t) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(int32_t) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + int32_t *host_ptr = (int32_t *)calloc(num_elements, sizeof(int32_t)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(int32_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -354,27 +329,18 @@ __host__ void save_tensor(int32_t const *ptr, fprintf(tensor_file, "%d", host_ptr[i]); 
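[Editor's note] The save_tensor specializations in this hunk all switch from pinned host buffers with asynchronous copies to a simpler sequence: calloc a plain host buffer, cudaDeviceSynchronize, then a synchronous cudaMemcpy before writing the values to disk. A self-contained sketch of that pattern is below; save_device_floats and the CHECK macro are hypothetical names for this sketch only (not FlexFlow's checkCUDA), and the sketch closes the file handle explicitly.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Hypothetical error-checking macro, used only in this sketch.
#define CHECK(call)                                                     \
  do {                                                                  \
    cudaError_t err_ = (call);                                          \
    if (err_ != cudaSuccess) {                                          \
      fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));    \
      exit(1);                                                          \
    }                                                                   \
  } while (0)

// Copy a device buffer into ordinary host memory and dump it as text.
void save_device_floats(float const *device_ptr, size_t num_elements,
                        char const *file_name) {
  float *host_ptr = (float *)calloc(num_elements, sizeof(float));
  CHECK(cudaDeviceSynchronize());  // make sure producing kernels have finished
  CHECK(cudaMemcpy(host_ptr, device_ptr, num_elements * sizeof(float),
                   cudaMemcpyDeviceToHost));
  FILE *f = fopen(file_name, "w");
  if (f == NULL) {
    perror("fopen");
    free(host_ptr);
    return;
  }
  for (size_t i = 0; i < num_elements; i++) {
    fprintf(f, i == 0 ? "%.9f" : ", %.9f", host_ptr[i]);
  }
  fclose(f);
  free(host_ptr);
}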
} } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template <> __host__ void save_tensor(int64_t const *ptr, size_t num_elements, char const *file_name) { - cudaStream_t stream; - checkCUDA(get_legion_stream(&stream)); - int64_t *host_ptr; - checkCUDA(cudaHostAlloc(&host_ptr, - sizeof(int64_t) * num_elements, - cudaHostAllocPortable | cudaHostAllocMapped)); - checkCUDA(cudaMemcpyAsync(host_ptr, - ptr, - sizeof(int64_t) * num_elements, - cudaMemcpyDeviceToHost, - stream)); + int64_t *host_ptr = (int64_t *)calloc(num_elements, sizeof(int64_t)); checkCUDA(cudaDeviceSynchronize()); + checkCUDA(cudaMemcpy( + host_ptr, ptr, sizeof(int64_t) * num_elements, cudaMemcpyDeviceToHost)); FILE *tensor_file; tensor_file = fopen(file_name, "w"); assert(tensor_file != NULL); @@ -385,13 +351,12 @@ __host__ void save_tensor(int64_t const *ptr, fprintf(tensor_file, "%ld", host_ptr[i]); } } - fclose(tensor_file); - checkCUDA(cudaFreeHost(host_ptr)); + free(host_ptr); } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -404,14 +369,25 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { cudaStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(cudaMemcpyAsync( dst, ptr, sizeof(T) * num_elements, cudaMemcpyDeviceToHost, stream)); - return true; } + +template +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + cudaStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(cudaMemcpyAsync( + dst, src, sizeof(T) * num_elements, cudaMemcpyHostToDevice, stream)); +} + cudnnStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( cudnnTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; @@ -609,6 +585,48 @@ cudnnDataType_t cuda_to_cudnn_datatype(cudaDataType_t type) { return CUDNN_DATA_FLOAT; } +void check_device_vs_host_ptr(void const *maybe_devicePtr) { + cudaPointerAttributes attributes; + cudaError_t cudaStatus = + cudaPointerGetAttributes(&attributes, maybe_devicePtr); + + if (cudaStatus == cudaSuccess) { + // Check attributes and perform actions accordingly + if (attributes.type == cudaMemoryTypeDevice) { + printf("Pointer is allocated in device memory.\n"); + } else if (attributes.type == cudaMemoryTypeHost) { + printf("Pointer is allocated in host memory.\n"); + } else if (attributes.type == cudaMemoryTypeUnregistered) { + printf("Pointer is unregistered.\n"); + } else if (attributes.type == cudaMemoryTypeManaged) { + printf("Pointer is managed.\n"); + } else { + printf("Pointer is not allocated in recognized memory type.\n"); + } + } else { + fprintf(stderr, + "cudaPointerGetAttributes failed: %s\n", + cudaGetErrorString(cudaStatus)); + } +} + +void check_ptr_alignment(void const *ptr) { + if (!ptr) { + printf("Pointer is NULL\n"); + return; + } + bool aligned2 = ((uintptr_t)ptr % 2 == 0); + bool aligned4 = ((uintptr_t)ptr % 4 == 0); + bool aligned8 = ((uintptr_t)ptr % 8 == 0); + bool aligned16 = ((uintptr_t)ptr % 16 == 0); + printf("Pointer %p is aligned as follows: 2=%s, 4=%s, 8=%s, 16=%s\n", + ptr, + (aligned2 ? "yes" : "no"), + (aligned4 ? "yes" : "no"), + (aligned8 ? 
"yes" : "no"), + (aligned16 ? "yes" : "no")); +} + template __global__ void assign_kernel(half *ptr, coord_t size, half value); template __global__ void @@ -620,6 +638,13 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + scale_kernel(half *ptr, coord_t size, half a, half b); +template __global__ void + scale_kernel(float *ptr, coord_t size, float a, float b); +template __global__ void + scale_kernel(double *ptr, coord_t size, double a, double b); + template __global__ void add_kernel(half *dst, half const *src, size_t size); template __global__ void @@ -716,26 +741,43 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, - size_t num_elements); -template __host__ half *download_tensor(half const *ptr, - size_t num_elements); -template __host__ double *download_tensor(double const *ptr, - size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, - size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool - download_tensor(half const *ptr, half *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, - double *dst, - size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, - int32_t *dst, - size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, - int64_t *dst, - size_t num_elements); +template __host__ float *copy_tensor_dev_to_host(float const *ptr, + size_t num_elements); +template __host__ half *copy_tensor_dev_to_host(half const *ptr, + size_t num_elements); +template __host__ double *copy_tensor_dev_to_host(double const *ptr, + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(half const *ptr, + half *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, + double *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, + int32_t *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); diff --git a/src/runtime/ffconst_utils.cc b/src/runtime/ffconst_utils.cc index c7b6e1257a..5a7d98b4dc 100644 --- a/src/runtime/ffconst_utils.cc +++ b/src/runtime/ffconst_utils.cc @@ -188,6 +188,9 @@ std::string get_operator_type_name(OperatorType type) { return "Sampling"; case OP_ARGMAX: return 
"ArgMax"; + // PEFT Ops + case OP_LORA: + return "Lora Layer"; // Parallel Ops case OP_REPARTITION: return "Repartition"; @@ -199,6 +202,8 @@ std::string get_operator_type_name(OperatorType type) { return "Reduction"; case OP_ALLREDUCE: return "AllReduce"; + case OP_PARALLEL_IDENTITY: + return "ParallelIdentity"; case OP_PIPELINE: return "Pipeline"; case OP_FUSED_PARALLEL: diff --git a/src/runtime/fftype.cc b/src/runtime/fftype.cc index 819e6527e5..8213726e8a 100644 --- a/src/runtime/fftype.cc +++ b/src/runtime/fftype.cc @@ -30,4 +30,29 @@ bool operator==(LayerID const &lhs, LayerID const &rhs) { return lhs.id == rhs.id; } +const PEFTModelID PEFTModelID::NO_ID = PEFTModelID(); + +PEFTModelID::PEFTModelID() : id(0) {} + +PEFTModelID::PEFTModelID(size_t _id) : id(_id) { + assert(is_valid_id()); +} + +bool PEFTModelID::is_valid_id() const { + return (id >= PEFT_MODEL_ID_FIRST_VALID && id <= PEFT_MODEL_ID_LAST_VALID); +} + +bool operator==(PEFTModelID const &lhs, PEFTModelID const &rhs) { + return lhs.id == rhs.id; +} + +std::ostream &operator<<(std::ostream &os, PEFTModelID const &peft_model_id) { + if (peft_model_id == PEFTModelID::NO_ID) { + os << "NO_ID"; + } else { + os << peft_model_id.id; + } + return os; +} + }; // namespace FlexFlow diff --git a/src/runtime/file_loader.cc b/src/runtime/file_loader.cc index 43ce9d7005..c373e0da9b 100644 --- a/src/runtime/file_loader.cc +++ b/src/runtime/file_loader.cc @@ -136,12 +136,12 @@ void load_attention_bias_v2(DT *ptr, bool final_bias, std::string layer_name, std::string weights_folder) { - std::string q_file = layer_name + "_wq_bias"; - std::string k_file = layer_name + "_wk_bias"; - std::string v_file = layer_name + "_wv_bias"; + std::string q_file = layer_name + ".q_proj.bias"; + std::string k_file = layer_name + ".k_proj.bias"; + std::string v_file = layer_name + ".v_proj.bias"; std::vector bias_files = {q_file, k_file, v_file}; if (final_bias) { - std::string o_file = layer_name + "_wo_bias"; + std::string o_file = layer_name + ".o_proj.bias"; bias_files.push_back(o_file); } @@ -217,12 +217,10 @@ void load_attention_weights_v2(DT *ptr, std::string weights_folder, size_t volume, int tensor_parallelism_degree) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file}; int file_index = 0; @@ -407,12 +405,10 @@ void load_attention_weights_quantized(char *ptr, std::string weights_folder, DataType data_type, bool use_full_precision) { - // layers_0_attention_wq_weight - // layers_0_self_attn_q_proj_weight - std::string q_file = layer_name + "_wq_weight"; - std::string k_file = layer_name + "_wk_weight"; - std::string v_file = layer_name + "_wv_weight"; - std::string o_file = layer_name + "_wo_weight"; + std::string q_file = layer_name + ".q_proj.weight"; + std::string k_file = layer_name + ".k_proj.weight"; + std::string v_file = layer_name + ".v_proj.weight"; + std::string o_file = layer_name + ".o_proj.weight"; std::vector weight_filenames = {q_file, k_file, v_file, o_file}; int file_index = 0; @@ -690,7 +686,7 @@ void 
FileDataLoader::load_quantization_weight(FFModel *ff, if (weight_idx > 0) { assert(weight_idx == 0 || weight_idx == 1); if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? ".weight" : ".bias"; } } load_from_quantized_file(data, @@ -734,44 +730,34 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, if (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION || l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION) { - if (weight_filename.find("self_attention") != std::string::npos) { - load_attention_weights_multi_query( - data, weight_filename, weights_folder, hidden_dim, num_heads); - } else if (weight_filename.find("attention") != std::string::npos && - weight_filename.rfind("attention") == - weight_filename.length() - strlen("attention")) { - if (weight_idx == 0) { - load_attention_weights_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - weight_filename, - weights_folder, - volume, - tensor_parallelism_degree); - } else { - long long value; - l->get_int_property("final_bias", value); - bool final_bias = (bool)value; - load_attention_bias_v2(data, - num_heads, - num_kv_heads, - hidden_dim, - qkv_inner_dim, - final_bias, - weight_filename, - weights_folder); - } - + if (weight_idx == 0) { + load_attention_weights_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + weight_filename, + weights_folder, + volume, + tensor_parallelism_degree); } else { - assert(false); + long long value; + l->get_int_property("final_bias", value); + bool final_bias = (bool)value; + load_attention_bias_v2(data, + num_heads, + num_kv_heads, + hidden_dim, + qkv_inner_dim, + final_bias, + weight_filename, + weights_folder); } } else if (l->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM) { assert(weight_idx >= 0 || weight_idx <= 2); weight_filename += (weight_idx == 0) - ? "_attn_bias" - : ((weight_idx == 1) ? "_weight" : "_bias"); + ? ".attn_bias" + : ((weight_idx == 1) ? ".weight" : ".bias"); std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = join_path({weights_folder, weight_filename}); @@ -781,7 +767,7 @@ void FileDataLoader::load_single_weight_tensor(FFModel *ff, assert(weight_idx == 0 || weight_idx == 1); // handle exception if (weight_filename != "embed_tokens_weight_lm_head") { - weight_filename += weight_idx == 0 ? "_weight" : "_bias"; + weight_filename += weight_idx == 0 ? 
".weight" : ".bias"; } std::cout << "Loading weight file " << weight_filename << std::endl; std::string weight_filepath = @@ -809,6 +795,10 @@ void FileDataLoader::load_weights(FFModel *ff) { if (weight == NULL) { continue; } + // TODO: currently skip Lora layers + if (l->op_type == OP_LORA) { + continue; + } switch (weight->data_type) { case DT_HALF: load_single_weight_tensor(ff, l, i); diff --git a/src/runtime/graph.cc b/src/runtime/graph.cc index b023aced6e..1a38782e81 100644 --- a/src/runtime/graph.cc +++ b/src/runtime/graph.cc @@ -36,6 +36,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -54,6 +55,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -1992,6 +1994,7 @@ std::pair, std::unordered_map> mv.device_type = MachineView::GPU; mv.ndims = 1; int total_parallel_degree = 1; + assert(op->numOutputs > 0); for (int i = 0; i < op->outputs[0]->num_dims; i++) { total_parallel_degree *= op->outputs[0]->dims[i].degree; } @@ -2434,6 +2437,13 @@ GraphOptimalViewSerialized sez.serialize(allreduce->name, strlen(allreduce->name)); break; } + case OP_PARALLEL_IDENTITY: { + ParallelIdentity *parallel_identity = (ParallelIdentity *)op; + sez.serialize(parallel_identity->parallel_identity_dim); + sez.serialize(strlen(parallel_identity->name)); + sez.serialize(parallel_identity->name, strlen(parallel_identity->name)); + break; + } case OP_FUSED_PARALLEL: { FusedParallelOp *fused = (FusedParallelOp *)op; sez.serialize(fused->num_parallel_ops); @@ -2475,6 +2485,7 @@ namespace FlexFlow { using PCG::Edge; using PCG::Graph; using PCG::GraphCostResult; +using PCG::log_graph; using PCG::Node; void FFModel::register_all_machine_views( @@ -2759,6 +2770,10 @@ void FFModel::deserialize_graph_optimal_view( node = Linear::deserialize(*this, dez, inputs, num_inputs); break; } + case OP_LORA: { + node = LoraLinear::deserialize(*this, dez, inputs, num_inputs); + break; + } case OP_MULTIHEAD_ATTENTION: { assert(num_inputs == 3); int embed_dim, num_heads, k_dim, v_dim; @@ -3042,8 +3057,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], - {combine_dim, combine_degree}); + CombineParams params; + params.combine_legion_dim = combine_dim; + params.combine_degree = combine_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REPARTITION: { @@ -3055,8 +3073,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node( - inputs[0], {repartition_dim, repartition_degree}); + RepartitionParams params; + params.repartition_legion_dim = repartition_dim; + params.repartition_degree = repartition_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REPLICATE: { @@ -3068,8 +3089,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, 
name_len); - node = get_or_create_node(inputs[0], - {replicate_dim, replicate_degree}); + ReplicateParams params; + params.replicate_legion_dim = replicate_dim; + params.replicate_degree = replicate_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_REDUCTION: { @@ -3081,8 +3105,11 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], - {reduction_dim, reduction_degree}); + ReductionParams params; + params.reduction_legion_dim = reduction_dim; + params.reduction_degree = reduction_degree; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_ALLREDUCE: { @@ -3093,24 +3120,43 @@ void FFModel::deserialize_graph_optimal_view( char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], {allreduce_dim}); + AllReduceParams params; + params.allreduce_legion_dim = allreduce_dim; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(num_inputs == 1); + int parallel_identity_dim; + dez.deserialize(parallel_identity_dim); + size_t name_len; + char name[MAX_OPNAME] = {0}; + dez.deserialize(name_len); + dez.deserialize(name, name_len); + ParallelIdentityParams params; + params.parallel_identity_legion_dim = parallel_identity_dim; + strcpy(params.name, name); + node = get_or_create_node(inputs[0], params); break; } case OP_FUSED_PARALLEL: { assert(num_inputs == 1); - std::vector parallel_ops; + FusedParallelOpParams params; int num_parallel_ops; dez.deserialize(num_parallel_ops); for (int i = 0; i < num_parallel_ops; i++) { ParallelOpInfo info; dez.deserialize(info); - parallel_ops.push_back(info); + params.parallel_ops.push_back(info); } size_t name_len; char name[MAX_OPNAME] = {0}; dez.deserialize(name_len); dez.deserialize(name, name_len); - node = get_or_create_node(inputs[0], {parallel_ops}); + strcpy(params.name, name); + + node = get_or_create_node(inputs[0], params); break; } default: { @@ -3149,20 +3195,20 @@ void FFModel::deserialize_graph_optimal_view( optimal_views[guid_to_nodes[guid]] = view; } assert(dez.get_remaining_bytes() == 0); - printf("Deserialized Views...\n"); + log_graph.debug("Deserialized Views...\n"); for (auto const &it : optimal_views) { - printf("node[%zu]: type(%s) view(%d %d %d) ", - it.first.guid, - it.first.to_string().c_str(), - it.second.ndims, - it.second.dim[0], - it.second.start_device_id); + log_graph.debug("node[%zu]: type(%s) view(%d %d %d) ", + it.first.guid, + it.first.to_string().c_str(), + it.second.ndims, + it.second.dim[0], + it.second.start_device_id); auto const &list = graph->inEdges.at(it.first); for (auto const &it2 : list) { Edge e = it2; - printf(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); + log_graph.debug(" inEdge(node(%zu) idx(%d))", e.srcOp.guid, e.srcIdx); } - printf("\n"); + log_graph.debug("\n"); } } diff --git a/src/runtime/hip_helper.cpp b/src/runtime/hip_helper.cpp index 613df1cbcf..057be8f443 100644 --- a/src/runtime/hip_helper.cpp +++ b/src/runtime/hip_helper.cpp @@ -29,7 +29,8 @@ hipError_t get_legion_stream(hipStream_t *stream) { using FlexFlow::get_legion_stream; -__global__ void scale_kernel(float *ptr, coord_t size, float a, float b) { +template +__global__ void scale_kernel(DT *ptr, coord_t size, DT a, DT b) { CUDA_KERNEL_LOOP(i, size) { ptr[i] = (b - a) * ptr[i] + a; } 
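// --- Illustrative usage sketch (not part of the upstream patch) ---
// With scale_kernel now templated on the element type DT, a single kernel can
// rescale float, half, or double device buffers in place via
// ptr[i] = (b - a) * ptr[i] + a. Everything below is hypothetical: the buffer
// name `dev_buf`, the element count `n`, and the launch shape are chosen only
// to make the template instantiation concrete; real call sites may differ.
//
//   float *dev_buf = /* device allocation of n floats, assumed to exist */ nullptr;
//   size_t n = 1024;
//   hipStream_t stream;
//   checkCUDA(get_legion_stream(&stream));
//   int threads = 256;
//   int blocks = (int)((n + threads - 1) / threads);
//   // map values assumed to lie in [0, 1] into [-1, 1]
//   hipLaunchKernelGGL(HIP_KERNEL_NAME(scale_kernel<float>),
//                      dim3(blocks), dim3(threads), 0, stream,
//                      dev_buf, (coord_t)n, -1.0f, 1.0f);
// -------------------------------------------------------------------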
@@ -55,6 +56,14 @@ __global__ void copy_kernel(DT *dst, const DT *src, coord_t size) { } } +template +__global__ void + copy_kernel_discrete(DT *dst, const DT *src, coord_t size, size_t *index) { + CUDA_KERNEL_LOOP(i, size) { + dst[i] = src[index[i]]; + } +} + template __global__ void reluBackward(DT *grad_ptr, const DT *output, size_t n) { CUDA_KERNEL_LOOP(i, n) { @@ -224,22 +233,24 @@ __host__ void updateGAS(float *para_ptr, } template -__host__ void - print_tensor(T const *ptr, size_t num_elements, char const *prefix) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ void print_tensor(T const *ptr, + size_t num_elements, + char const *prefix, + int shard_id) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); T *host_ptr; - checkCUDA(hipHostMalloc((void **)&host_ptr, + checkCUDA(hipHostMalloc(&host_ptr, sizeof(T) * num_elements, hipHostMallocPortable | hipHostMallocMapped)); - checkCUDA(hipMemcpy( - host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost)); - // checkCUDA(hipDeviceSynchronize()); + checkCUDA(hipMemcpyAsync( + host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); + checkCUDA(hipDeviceSynchronize()); int idx = 0; - printf("%s", prefix); + printf("%s, %d---->", prefix, shard_id); for (idx = 0; idx < num_elements; idx++) { - printf(" %.4lf", (float)host_ptr[idx]); - if (idx >= 16) { + printf(" %.20lf", (float)host_ptr[idx]); + if (idx >= 100) { break; } } @@ -247,6 +258,40 @@ __host__ void checkCUDA(hipHostFree(host_ptr)); } +template +__host__ void print_beam_tensor(T const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + T *host_ptr; + checkCUDA(hipHostMalloc(&host_ptr, + sizeof(T) * channel * skip, + hipHostMallocPortable | hipHostMallocMapped)); + checkCUDA(hipMemcpyAsync(host_ptr, + ptr, + sizeof(T) * channel * skip, + hipMemcpyDeviceToHost, + stream)); + // checkCUDA(hipDeviceSynchronize()); + int idx = 0; + printf("%s", prefix); + + for (int i = 0; i < channel; i += 1) { + for (idx = 0; idx < num_elements; idx++) { + printf(" %.20lf", (float)host_ptr[idx + i * skip]); + if (idx >= 100) { + break; + } + } + printf("\n-----***********------\n"); + } + + checkCUDA(hipHostFree(host_ptr)); +} + template <> __host__ void save_tensor(float const *ptr, size_t num_elements, char const *file_name) { @@ -370,9 +415,7 @@ __host__ void save_tensor(int64_t const *ptr, } template -__host__ T *download_tensor(T const *ptr, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ T *copy_tensor_dev_to_host(T const *ptr, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); T *host_ptr; @@ -381,21 +424,27 @@ __host__ T *download_tensor(T const *ptr, size_t num_elements) { hipHostMallocPortable | hipHostMallocMapped)); checkCUDA(hipMemcpyAsync( host_ptr, ptr, sizeof(T) * num_elements, hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); return host_ptr; } template -__host__ bool download_tensor(T const *ptr, T *dst, size_t num_elements) { - // device synchronize to make sure the data are ready - // checkCUDA(hipDeviceSynchronize()); +__host__ void + copy_tensor_dev_to_host(T const *ptr, T *dst, size_t num_elements) { hipStream_t stream; checkCUDA(get_legion_stream(&stream)); assert(dst != nullptr); checkCUDA(hipMemcpyAsync( dst, ptr, sizeof(T) * num_elements, 
hipMemcpyDeviceToHost, stream)); - // checkCUDA(hipDeviceSynchronize()); - return true; +} + +template +__host__ void + copy_tensor_host_to_dev(T *dst, T const *src, size_t num_elements) { + hipStream_t stream; + checkCUDA(get_legion_stream(&stream)); + assert(src != nullptr); + checkCUDA(hipMemcpyAsync( + dst, src, sizeof(T) * num_elements, hipMemcpyHostToDevice, stream)); } miopenStatus_t cudnnSetTensorDescriptorFromDomain( @@ -450,22 +499,23 @@ miopenStatus_t cudnnSetTensorDescriptorFromDomain( return miopenStatusBadParm; } -miopenStatus_t - cudnnSetTensorDescriptorFromDomain4SoftMax(miopenTensorDescriptor_t tensor, - Domain domain) { +miopenStatus_t cudnnSetTensorDescriptorFromDomain4SoftMax( + miopenTensorDescriptor_t tensor, Domain domain, DataType data_type) { int dims[MAX_TENSOR_DIM]; + miopenDataType_t cudnn_data_type = ff_to_cudnn_datatype(data_type); switch (domain.get_dim()) { case 1: { Rect<1> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; - return miopenSet4dTensorDescriptor(tensor, miopenFloat, dims[0], 1, 1, 1); + return miopenSet4dTensorDescriptor( + tensor, cudnn_data_type, dims[0], 1, 1, 1); } case 2: { Rect<2> rect = domain; dims[0] = rect.hi[0] - rect.lo[0] + 1; dims[1] = rect.hi[1] - rect.lo[1] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[1], dims[0], 1, 1); } case 3: { Rect<3> rect = domain; @@ -473,7 +523,7 @@ miopenStatus_t dims[1] = rect.hi[1] - rect.lo[1] + 1; dims[2] = rect.hi[2] - rect.lo[2] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[2] * dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[2] * dims[1], dims[0], 1, 1); } case 4: { Rect<4> rect = domain; @@ -482,7 +532,7 @@ miopenStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; dims[3] = rect.hi[3] - rect.lo[3] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[3] * dims[2] * dims[1], dims[0], 1, 1); + tensor, cudnn_data_type, dims[3] * dims[2] * dims[1], dims[0], 1, 1); } case 5: { Rect<5> rect = domain; @@ -493,7 +543,7 @@ miopenStatus_t dims[2] = rect.hi[2] - rect.lo[2] + 1; dims[3] = rect.hi[3] - rect.lo[3] + 1; return miopenSet4dTensorDescriptor( - tensor, miopenFloat, dims[3], dims[2], dims[1], dims[0]); + tensor, cudnn_data_type, dims[3], dims[2], dims[1], dims[0]); } default: assert(false && "Unsupported dim number"); @@ -553,6 +603,49 @@ void handle_unimplemented_hip_kernel(OperatorType op_type) { throw std::runtime_error("Unimplemented hip kernel for Operator: " + FlexFlow::get_operator_type_name(op_type)); } +void check_device_vs_host_ptr(void const *maybe_devicePtr) { + hipPointerAttribute_t attributes; + hipError_t hipStatus = hipPointerGetAttributes(&attributes, maybe_devicePtr); + + if (hipStatus == hipSuccess) { + // Check attributes and perform actions accordingly + if (attributes.memoryType == hipMemoryTypeDevice) { + printf("Pointer is allocated in device memory.\n"); + } else if (attributes.memoryType == hipMemoryTypeHost) { + printf("Pointer is allocated in host memory.\n"); + } else if (attributes.memoryType == hipMemoryTypeArray) { + printf("Pointer points to array memory, physically located on device.\n"); + } else if (attributes.memoryType == hipMemoryTypeManaged) { + printf("Pointer points to managed memory, automaticallly managed by the " + "unified memory system.\n"); + } else if (attributes.memoryType == hipMemoryTypeUnified) { + printf("Pointer points to unified memory (not supported currently) \n"); + } else { + printf("Pointer is not allocated in 
recognized memory type.\n"); + } + } else { + fprintf(stderr, + "hipPointerGetAttributes failed: %s\n", + hipGetErrorString(hipStatus)); + } +} + +void check_ptr_alignment(void const *ptr) { + if (!ptr) { + printf("Pointer is NULL\n"); + return; + } + bool aligned2 = ((uintptr_t)ptr % 2 == 0); + bool aligned4 = ((uintptr_t)ptr % 4 == 0); + bool aligned8 = ((uintptr_t)ptr % 8 == 0); + bool aligned16 = ((uintptr_t)ptr % 16 == 0); + printf("Pointer %p is aligned as follows: 2=%s, 4=%s, 8=%s, 16=%s\n", + ptr, + (aligned2 ? "yes" : "no"), + (aligned4 ? "yes" : "no"), + (aligned8 ? "yes" : "no"), + (aligned16 ? "yes" : "no")); +} template __global__ void assign_kernel(half *ptr, coord_t size, half value); @@ -565,6 +658,13 @@ template __global__ void template __global__ void assign_kernel(int64_t *ptr, coord_t size, int64_t value); +template __global__ void + scale_kernel(half *ptr, coord_t size, half a, half b); +template __global__ void + scale_kernel(float *ptr, coord_t size, float a, float b); +template __global__ void + scale_kernel(double *ptr, coord_t size, double a, double b); + template __global__ void add_kernel(half *dst, half const *src, size_t size); template __global__ void @@ -587,6 +687,15 @@ template __global__ void template __global__ void copy_kernel(int64_t *dst, int64_t const *src, coord_t size); +template __global__ void copy_kernel_discrete(float *dst, + float const *src, + coord_t size, + size_t *index); +template __global__ void copy_kernel_discrete(int64_t *dst, + int64_t const *src, + coord_t size, + size_t *index); + template __global__ void apply_add_with_scale(float *data_ptr, float const *grad_ptr, size_t size, @@ -604,16 +713,42 @@ template __global__ void apply_add_with_scale(int64_t *data_ptr, size_t size, int64_t scale); -template __host__ void - print_tensor(float const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(double const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(int32_t const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(int64_t const *ptr, size_t rect, char const *prefix); -template __host__ void - print_tensor(half const *ptr, size_t rect, char const *prefix); +template __host__ void print_tensor(float const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(double const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(int32_t const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(int64_t const *ptr, + size_t rect, + char const *prefix, + int shard_id); +template __host__ void print_tensor(half const *ptr, + size_t rect, + char const *prefix, + int shard_id); + +template __host__ void print_beam_tensor(float const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); +template __host__ void print_beam_tensor(int32_t const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); +template __host__ void print_beam_tensor(int64_t const *ptr, + size_t num_elements, + int skip, + int channel, + char const *prefix); template __host__ void save_tensor(float const *ptr, size_t rect, char const *file_name); @@ -626,24 +761,43 @@ template __host__ void save_tensor(int64_t const *ptr, template __host__ void save_tensor(half const *ptr, size_t rect, char const *file_name); -template __host__ float *download_tensor(float const *ptr, - size_t num_elements); -template __host__ 
half *download_tensor(half const *ptr, - size_t num_elements); -template __host__ double *download_tensor(double const *ptr, - size_t num_elements); -template __host__ int32_t *download_tensor(int32_t const *ptr, - size_t num_elements); -template __host__ int64_t *download_tensor(int64_t const *ptr, - size_t num_elements); -template __host__ bool - download_tensor(float const *ptr, float *dst, size_t num_elements); -template __host__ bool download_tensor(double const *ptr, - double *dst, - size_t num_elements); -template __host__ bool download_tensor(int32_t const *ptr, - int32_t *dst, - size_t num_elements); -template __host__ bool download_tensor(int64_t const *ptr, - int64_t *dst, - size_t num_elements); +template __host__ float *copy_tensor_dev_to_host(float const *ptr, + size_t num_elements); +template __host__ half *copy_tensor_dev_to_host(half const *ptr, + size_t num_elements); +template __host__ double *copy_tensor_dev_to_host(double const *ptr, + size_t num_elements); +template __host__ int32_t * + copy_tensor_dev_to_host(int32_t const *ptr, size_t num_elements); +template __host__ int64_t * + copy_tensor_dev_to_host(int64_t const *ptr, size_t num_elements); +template __host__ void copy_tensor_dev_to_host(float const *ptr, + float *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(half const *ptr, + half *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(double const *ptr, + double *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int32_t const *ptr, + int32_t *dst, + size_t num_elements); +template __host__ void copy_tensor_dev_to_host(int64_t const *ptr, + int64_t *dst, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(float *dst, + float const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(half *dst, + half const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(double *dst, + double const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int32_t *dst, + int32_t const *src, + size_t num_elements); +template __host__ void copy_tensor_host_to_dev(int64_t *dst, + int64_t const *src, + size_t num_elements); diff --git a/src/runtime/inference_manager.cc b/src/runtime/inference_manager.cc index 3d299aeedd..1b65dfd869 100644 --- a/src/runtime/inference_manager.cc +++ b/src/runtime/inference_manager.cc @@ -54,10 +54,31 @@ bool parallel_tensor_list_overlaps(std::vector const &list1, } void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { + + // Check if the model object exists + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object exists." << std::endl; + } + // TODO: currently assume there is a single data-parallel pipeline // (i.e., data-parallel-degree == 1) assert(model->config.data_parallelism_degree == 1); model->config.batchSize = BatchConfig::max_tokens_per_batch(); + + // Check if the model object exists after importing config + if (model == nullptr) { + std::cout << "###PEFT DEBUGGING### Model object does not exist after " + "setting config and batch size." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### Model object still exists." 
<< std::endl; + } + model->compile_inference(); Context ctx = model->config.lg_ctx; Runtime *runtime = model->config.lg_hlr; @@ -117,7 +138,28 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { for (int i = 0; i < op->numOutputs; i++) { ParallelTensor pt_base = op->outputs[i]; assert(tensor_buffer.find(pt_base) == tensor_buffer.end()); - + // no need to map inplace tensor + // A tensor is inplace if it shares the same region as another tensor + { + bool inplace = false; + for (int j = 0; j < op->numInputs; j++) { + if (op->inputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->inputs[j]]; + inplace = true; + } + } + for (int j = 0; j < i; j++) { + if (op->outputs[j]->region == op->outputs[i]->region) { + assert(tensor_buffer.find(op->outputs[j]) != tensor_buffer.end()); + tensor_buffer[pt_base] = tensor_buffer[op->outputs[j]]; + inplace = true; + } + } + if (inplace) { + continue; + } + } if (op->op_type == OP_REPLICATE) { assert(op->numInputs == 1 && op->numOutputs == 1); } @@ -175,7 +217,7 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { } } if (!found_parallel_tensor) { - log_offload.print( + log_offload.debug( "Cannot find a previous tensor for operator(%d) output_idx(%d)", op_idx, i); @@ -191,6 +233,13 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { pt_base->region.get_field_space()); pt->part = runtime->get_logical_partition( ctx, pt->region, pt_base->part.get_index_partition()); + + pt->region_grad = + runtime->create_logical_region(ctx, + pt_base->region.get_index_space(), + pt_base->region.get_field_space()); + pt->part_grad = runtime->get_logical_partition( + ctx, pt->region_grad, pt_base->part.get_index_partition()); pt->machine_view = machine_views[j]; // std::cout << "output mv: " << pt->machine_view << std::endl; Domain part_domain = @@ -205,6 +254,30 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { // std::cout << std::endl; } + // Check whether we need to reset input grads + // We use a parallel tensor's region as the key + std::set reset_inputs; + for (int l = model->operators.size() - 1; l >= 0; l--) { + Op *op = model->operators[l]; + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i]->region != LogicalRegion::NO_REGION); + if (reset_inputs.find(op->inputs[i]->region) != reset_inputs.end()) { + // We should not reset input grads since other operators have already + // saved gradients into the region + op->reset_input_grads[i] = false; + } else if (i == 0 && (op->op_type == OP_RESIDUAL_LAYERNORM || + op->op_type == OP_RESIDUAL_RMS_NORM || + op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM)) { + if (reset_inputs.find(op->outputs[0]->region) != reset_inputs.end()) { + op->reset_input_grads[0] = false; + } + reset_inputs.insert(op->inputs[i]->region); + } else { + reset_inputs.insert(op->inputs[i]->region); + } + } + } + // Perform fusion optimizations if (model->config.perform_fusion) { fprintf(stderr, "Applying fusion optimizations during compilation...\n"); @@ -235,34 +308,35 @@ void InferenceManager::compile_model_and_allocate_buffer(FFModel *model) { if (op->op_type == OP_INPUT || op->op_type == OP_WEIGHT) { continue; } - printf("operator[%zu]: type(%s) guid(%lu)\n", - i, - get_operator_type_name(model->operators[i]->op_type).c_str(), - model->operators[i]->op_guid); + log_inf_mgr.debug( + "operator[%zu]: type(%s) guid(%lu)\n", + i, + 
get_operator_type_name(model->operators[i]->op_type).c_str(), + model->operators[i]->op_guid); for (int j = 0; j < op->numInputs; j++) { assert(tensor_buffer.find(op->inputs[j]) != tensor_buffer.end()); LogicalRegion handle = tensor_buffer[op->inputs[j]][0]->region; - printf("\tinputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tinputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numOutputs; j++) { LogicalRegion handle = tensor_buffer[op->outputs[j]][0]->region; - printf("\toutputs[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\toutputs[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } for (int j = 0; j < op->numWeights; j++) { LogicalRegion handle = op->weights[j]->region; - printf("\tweights[%d] mapped_region(%d,%d,%d)\n", - j, - handle.get_index_space().get_id(), - handle.get_field_space().get_id(), - handle.get_tree_id()); + log_inf_mgr.debug("\tweights[%d] mapped_region(%d,%d,%d)\n", + j, + handle.get_index_space().get_id(), + handle.get_field_space().get_id(), + handle.get_tree_id()); } } } @@ -290,9 +364,9 @@ void InferenceManager::init_operators_inference(FFModel *model) { assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); assert(tensor_buffer[op->outputs[i]].size() > batch_index); outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; - if (i > 0) { - assert(outputs[0]->machine_view == outputs[i]->machine_view); - } + // if (i > 0) { + // assert(outputs[0]->machine_view == outputs[i]->machine_view); + // } assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); } if (op->is_parallel_op()) { @@ -332,11 +406,12 @@ FutureMap InferenceManager::inference(FFModel *model, FutureMap InferenceManager::inference(FFModel *model, int index, BatchConfigFuture const &bc) { - // log_inf_mgr.print("mode(%d) num_active_tokens(%d) num_active_requests(%d)", + // log_inf_mgr.print("mode(%d) num_active_infr_tokens(%d) + // num_active_requests(%d)", // bc.get_mode(), - // bc.num_active_tokens(), + // bc.num_active_infr_tokens(), // bc.num_active_requests()); - // assert(bc.num_active_tokens() > 0 && bc.num_active_requests() > 0); + // assert(bc.num_active_infr_tokens() > 0 && bc.num_active_requests() > 0); // We currently assume that the index-th batch will be placed // on the device_index-th device (except for the experts layers) int batch_index = index % model->config.data_parallelism_degree; @@ -390,6 +465,53 @@ FutureMap InferenceManager::inference(FFModel *model, return fm; }; +void InferenceManager::peft_bwd(FFModel *model, + int index, + BatchConfigFuture const &bc) { + int batch_index = index % model->config.data_parallelism_degree; + FutureMap fm; + bool found_input_operator = false; + int last_op = model->operators.size() - 1; + // Assert that the last operator must be argmax or sampling + assert(model->operators[last_op]->op_type == OP_ARGMAX || + model->operators[last_op]->op_type == OP_ARG_TOPK || + model->operators[last_op]->op_type == OP_SAMPLING); + last_op -= 1; + while (model->operators[last_op]->op_type == OP_WEIGHT && last_op > 0) { + last_op -= 1; + } + for (int o = last_op; o >= 0; o--) { + Op *op = model->operators[o]; + if (op->op_type == 
OP_WEIGHT) { + continue; + } + if (op->op_type == OP_INPUT) { + continue; + } + std::vector inputs(op->numInputs); + std::vector outputs(op->numOutputs); + for (int i = 0; i < op->numInputs; i++) { + assert(op->inputs[i] != nullptr); + assert(op->inputs[i]->parallel_is != IndexSpace::NO_SPACE); + assert(tensor_buffer[op->inputs[i]].size() > batch_index); + inputs[i] = tensor_buffer[op->inputs[i]][batch_index]; + assert(inputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + for (int i = 0; i < op->numOutputs; i++) { + assert(op->outputs[i] != nullptr); + assert(op->outputs[i]->parallel_is != IndexSpace::NO_SPACE); + if (op->op_type == OP_INPUT && + tensor_buffer[op->outputs[i]].size() == 0) { + continue; + } + assert(tensor_buffer[op->outputs[i]].size() > batch_index); + outputs[i] = tensor_buffer[op->outputs[i]][batch_index]; + assert(outputs[i]->parallel_is != IndexSpace::NO_SPACE); + } + op->peft_bwd(*model, bc, inputs, outputs); + } +}; + void InferenceManager::load_input_tokens_from_batch_config( FFModel *model, BatchConfigFuture const &bc, @@ -509,17 +631,26 @@ void FFModel::set_position_offset(int offset) { } void FFModel::compile_inference() { + std::cout << "###PEFT DEBUGGING### Entering compile_inference." << std::endl; + // Request at least four CPU processors for inference runs assert( config.cpusPerNode >= 4 && "FlexFlow Serve requires at least four CPU cores per node, please add " "`-ll:cpu 4` in the command line if you are using the C++ interface or " "set `num_cpus` in `ff.init` if you are using the Python interface"); + + std::cout << "###PEFT DEBUGGING### Configuration check passed: At least four " + "CPU cores per node." + << std::endl; Context ctx = config.lg_ctx; Runtime *runtime = config.lg_hlr; config.computationMode = COMP_MODE_INFERENCE; create_operators_from_layers(); + // Launch the graph optimize task + std::cout << "###PEFT DEBUGGING### Launching graph optimization task." + << std::endl; { FFModel *model = this; TaskLauncher launcher(GRAPH_OPTIMIZE_TASK_ID, @@ -535,7 +666,7 @@ void FFModel::compile_inference() { deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor @@ -570,6 +701,14 @@ void FFModel::compile_inference() { } } } + + std::cout + << "###PEFT DEBUGGING### Operators reconstructed from optimized graph." + << std::endl; + // Perform inplace optimizations + std::cout << "###PEFT DEBUGGING### Starting inplace optimizations." + << std::endl; + loss_op = nullptr; metrics_op = nullptr; // Perform inplace optimizations @@ -609,6 +748,8 @@ void FFModel::compile_inference() { } } + // Output tensor mapping + std::cout << "###PEFT DEBUGGING### Mapping output tensors." << std::endl; for (size_t l = 0; l < operators.size(); l++) { Op *op = operators[l]; @@ -634,11 +775,14 @@ void FFModel::compile_inference() { } #ifdef FF_USE_NCCL + std::cout << "###PEFT DEBUGGING### Setting up NCCL communications." 
+ << std::endl; for (size_t l = 0; l < operators.size(); l++) { // Only create nccl for allreduce and fusedop for inference // (fusedop may include allreduces) if (operators[l]->op_type == OP_ALLREDUCE || - operators[l]->op_type == OP_FUSED) { + operators[l]->op_type == OP_PARALLEL_IDENTITY || + operators[l]->op_type == OP_LORA || operators[l]->op_type == OP_FUSED) { MachineView view = operators[l]->outputs[0]->machine_view; if (view_hash_to_nccl_comms.find(view.hash()) == view_hash_to_nccl_comms.end()) { @@ -670,6 +814,8 @@ void FFModel::compile_inference() { } } #endif + std::cout << "###PEFT DEBUGGING### compile_inference completed successfully." + << std::endl; } std::string join_path(std::vector const &paths) { diff --git a/src/runtime/model.cc b/src/runtime/model.cc index 4c67de1aa9..f46630db3c 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -47,6 +47,7 @@ #include "flexflow/ops/inc_multihead_self_attention.h" #include "flexflow/ops/layer_norm.h" #include "flexflow/ops/linear.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/ops/noop.h" #include "flexflow/ops/pool_2d.h" #include "flexflow/ops/reduce.h" @@ -66,6 +67,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -77,6 +79,7 @@ #include #include #include +#include namespace FlexFlow { @@ -135,19 +138,21 @@ Op::Op(FFModel &model, std::string pcname; if (_name == NULL) { pcname = get_operator_type_name(op_type); + pcname = pcname + "_" + std::to_string(op_guid); } else { pcname = std::string(_name); } - pcname = pcname + "_" + std::to_string(op_guid); assert(pcname.length() < MAX_OPNAME); + // std::cout << "Creating operator: " << pcname << std::endl; std::strcpy(name, pcname.c_str()); + // std::cout << "copied name into name var: " << this->name << std::endl; for (int i = 0; i < numInputs; i++) { assert(tensors[i] != NULL); inputs[i] = tensors[i]; } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = nullptr; @@ -191,8 +196,8 @@ Op::Op(FFModel &model, } } for (int i = 0; i < numInputs; i++) { - trainableInputs[i] = true; - // resetInputGrads[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_OUTPUTS; i++) { outputs[i] = NULL; @@ -1245,7 +1250,8 @@ void Op::set_argumentmap_for_init_inference(FFModel const &ff, int idx = 0; \ for (PointInRectIterator it(rect); it(); it++) { \ FFHandler handle = ff.handlers[view.get_device_id(*it)]; \ - if (op_type == OP_ALLREDUCE) { \ + if (op_type == OP_ALLREDUCE || op_type == OP_LORA || \ + op_type == OP_PARALLEL_IDENTITY) { \ ncclComm_t *nccl_comms = ff.find_nccl_comms(view); \ handle.ncclComm = nccl_comms[idx++]; \ } \ @@ -1475,10 +1481,12 @@ bool Op::get_weight_parameter(TNParameter tnp, return true; } +#ifdef DEADCODE OpMeta::OpMeta(FFHandler _handle) : handle(_handle), profiling(false), inference_debugging(false) { for (int i = 0; i < MAX_NUM_INPUTS; i++) { - trainableInputs[i] = true; + trainable_inputs[i] = true; + reset_input_grads[i] = true; } for (int i = 0; i < MAX_NUM_INPUTS; i++) { input_type[i] = DT_NONE; @@ -1490,9 +1498,17 @@ OpMeta::OpMeta(FFHandler _handle) 
output_type[i] = DT_NONE; } decoding_step = 0; + bwd_step = 0; } +#endif -OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { +OpMeta::OpMeta(FFHandler _handle, Op const *op) + : handle(_handle), profiling(op->profiling), + inference_debugging(op->inference_debugging) { + for (int i = 0; i < op->numInputs; i++) { + trainable_inputs[i] = op->trainable_inputs[i]; + reset_input_grads[i] = op->reset_input_grads[i]; + } for (int i = 0; i < op->numInputs; i++) { input_type[i] = op->inputs[i]->data_type; } @@ -1503,6 +1519,7 @@ OpMeta::OpMeta(FFHandler _handle, Op const *op) : OpMeta(_handle) { output_type[i] = op->outputs[i]->data_type; } decoding_step = 0; + bwd_step = 0; } FFRuntime::FFRuntime(FFConfig &config) { @@ -1520,6 +1537,10 @@ FFRuntime::FFRuntime(FFConfig &config) { info.workSpaceSize = config.workSpaceSize; info.offload_reserve_space_size = config.cpu_offload ? config.offload_reserve_space_size : 0; + info.peft_activation_reserve_space_size = + config.enable_peft ? config.peft_activation_reserve_space_size : 0; + info.peft_weight_reserve_space_size = + config.enable_peft ? config.peft_weight_reserve_space_size : 0; info.quantization_type = config.quantization_type; info.allowTensorOpMathConversion = config.allow_tensor_op_math_conversion; argmap.set_point(*it, TaskArgument(&info, sizeof(FFInitInfo))); @@ -1546,9 +1567,32 @@ FFRuntime *ffruntime_singleton = nullptr; int FFModel::model_counter = 0; +void make_debug_dirs() { + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + std::string debug_dir_ = + ff_cache_path ? std::string(ff_cache_path) + "/debug/flexflow" + : std::string("~/.cache/flexflow/debug/flexflow"); + wordexp_t p; + wordexp(debug_dir_.c_str(), &p, 0); + debug_dir_ = p.we_wordv[0]; + wordfree(&p); + fs::path debug_dir = debug_dir_; + if (fs::exists(debug_dir)) { + fs::remove_all(debug_dir); + } + fs::create_directories(debug_dir); + assert(fs::is_directory(debug_dir)); + std::vector debug_subdirs = {"fwd", "bwd", "optim", "weights"}; + for (auto const &subdir : debug_subdirs) { + fs::path subdir_path = debug_dir / subdir; + fs::create_directory(subdir_path); + } +} + FFModel::FFModel(FFConfig &_config, bool cpu_offload) : op_global_guid(OP_GUID_FIRST_VALID), layer_global_guid(LAYER_GUID_FIRST_VALID), + peft_model_global_guid(PEFT_MODEL_ID_FIRST_VALID), tensor_global_guid(TENSOR_GUID_FIRST_VALID), parallel_tensor_global_guid(PARALLEL_TENSOR_GUID_FIRST_VALID), node_global_guid(NODE_GUID_FIRST_VALID), current_transformer_layer_id(0), @@ -1586,6 +1630,9 @@ FFModel::FFModel(FFConfig &_config, bool cpu_offload) for (int idx = 0; idx < config.workersPerNode * config.numNodes; idx++) { handlers[idx] = ffruntime_singleton->handlers[idx]; } + if (config.inference_debugging) { + make_debug_dirs(); + } model_id = model_counter++; } @@ -2932,7 +2979,8 @@ bool FFModel::apply_fusion( // don't fuse parallel op except allReduce since they have different // parallel_is in forward/backward if (operators[l]->is_parallel_op() && - operators[l]->op_type != OP_ALLREDUCE) { + operators[l]->op_type != OP_ALLREDUCE && + operators[l]->op_type != OP_PARALLEL_IDENTITY) { continue; } size_t start = 0; @@ -2978,7 +3026,8 @@ bool FFModel::apply_fusion( // don't fuse parallel op except allReduce since they have different // parallel_is in forward/backward if (operators[i]->is_parallel_op() && - operators[i]->op_type != OP_ALLREDUCE) { + operators[i]->op_type != OP_ALLREDUCE && + operators[i]->op_type != OP_PARALLEL_IDENTITY) { continue; } fused_op = new FusedOp(*this, 
operators[i]); @@ -3010,8 +3059,19 @@ bool FFModel::apply_fusion( found = k; } } - assert(found >= 0); - op->inputs[idx] = fused_op->outputs[found]; + if (found >= 0) { + op->inputs[idx] = fused_op->outputs[found]; + } else { + for (int k = 0; k < fused_op->numInputs; k++) { + if (fused_op->inputs[k]->region == + op->inputs[idx]->region) { + assert(found == -1); + found = k; + } + } + assert(found >= 0); + op->inputs[idx] = fused_op->inputs[found]; + } } } // Insert op @@ -3287,6 +3347,12 @@ Op *FFModel::create_operator_from_layer( operators.push_back(op); return op; } + // PEFT layers + case OP_LORA: { + Op *op = LoraLinear::create_operator_from_layer(*this, layer, inputs); + operators.push_back(op); + return op; + } default: assert(false); } @@ -3313,9 +3379,123 @@ bool FFModel::is_mlp_block(int layer_idx) const { return false; } +bool FFModel::need_to_add_combine(int layer_idx) const { + if (config.computationMode != COMP_MODE_INFERENCE || + config.tensor_parallelism_degree == 1 || layers.size() <= 2) { + return false; + } + auto const &l = layers[layer_idx]; + // softmax followed by argmax/arg_topk: add combine before softmax + if (layer_idx == layers.size() - 2) { + auto const &l_next = layers[layer_idx + 1]; + if (l->op_type == OP_SOFTMAX && + (l_next->op_type == OP_ARG_TOPK || l_next->op_type == OP_ARGMAX)) { + return true; + } else { + return false; + } + } + // argmax/arg_topk not precedent by softmax: add combine before + // argmax/arg_topk + if (layer_idx == layers.size() - 1 && + (l->op_type == OP_ARG_TOPK || l->op_type == OP_ARGMAX)) { + auto const &l_prev = layers[layer_idx - 1]; + if (l_prev->op_type == OP_SOFTMAX) { + return false; + } + return true; + } + return false; +} + +bool FFModel::need_to_add_allreduce(int layer_idx) const { + auto const &l = layers[layer_idx]; + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + // mlp layer + is_mlp_block(layer_idx) || + // llama mlp layer + (l->op_type == OP_LINEAR && layer_idx >= 2 && + layers[layer_idx - 1]->op_type == OP_GELU && + layers[layer_idx - 2]->op_type == OP_LINEAR) || + // LLAMA without element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 5 && + layers[layer_idx - 1]->op_type == OP_EW_MUL && + layers[layer_idx - 2]->op_type == OP_EW_MUL && + layers[layer_idx - 3]->op_type == OP_SIGMOID && + layers[layer_idx - 4]->op_type == OP_LINEAR && + layers[layer_idx - 5]->op_type == OP_LINEAR) || + // LLAMA with element-wise operator fusion + (l->op_type == OP_LINEAR && layer_idx >= 3 && + layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && + layers[layer_idx - 2]->op_type == OP_LINEAR && + layers[layer_idx - 3]->op_type == OP_LINEAR))) { + return true; + } + return false; +} + +#ifdef DEADCODE +bool FFModel::need_to_add_parallel_identity(int layer_idx) const { + auto const &l = layers[layer_idx]; + // add parallel identity (allreduce in the backward pass) before the lm head + // we find the lm head by looking for the linear layer right after a residual + // rms norm / layer norm, and before a softmax, followed by + // argmax/argtopk/sampling + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_RESIDUAL_RMS_NORM || + l->op_type == OP_RESIDUAL_LAYERNORM) && + // there are at least 2 layers before the norm, and at least 3 following + // the norm + layer_idx >= 2 && layer_idx < 
layers.size() - 3 && + // norm is followed by linear layer (lm head) + layers[layer_idx + 1]->op_type == OP_LINEAR && + // lm head is followed by softmax + layers[layer_idx + 2]->op_type == OP_SOFTMAX && + // softmax is followed by argmax/argtopk/sampling + (layers[layer_idx + 3]->op_type == OP_ARG_TOPK || + layers[layer_idx + 3]->op_type == OP_SAMPLING || + layers[layer_idx + 3]->op_type == OP_ARGMAX || + layers[layer_idx + 3]->op_type == OP_SCALAR_TRUE_DIV))) { + return true; + } + return false; +} +#endif +bool FFModel::need_to_add_parallel_identity(int layer_idx) const { + auto const &l = layers[layer_idx]; + // add parallel identity (allreduce in the backward pass) before the lm head + // we find the lm head by looking for the linear layer right after a residual + // rms norm / layer norm, and before a softmax, followed by + // argmax/argtopk/sampling + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_RMS_NORM || l->op_type == OP_RESIDUAL_RMS_NORM || + l->op_type == OP_LAYERNORM || l->op_type == OP_RESIDUAL_LAYERNORM) && + // there are at least 2 layers before the norm, and at least 1 following + // the norm + layer_idx >= 2 && layer_idx < layers.size() - 1 && + // norm is followed by linear layer or attention + (layers[layer_idx + 1]->op_type == OP_LINEAR || + layers[layer_idx + 1]->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || + layers[layer_idx + 1]->op_type == + OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || + layers[layer_idx + 1]->op_type == + OP_SPEC_INC_MULTIHEAD_SELF_ATTENTION))) { + return true; + } + return false; +} + void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; - // for (auto const &l : layers) { + std::map + op_before_allreduce_tensors_to_parallel_tensors; + std::map transformer_layer_allreduce_count; + std::map transformer_layer_parallel_identity_count; for (int layer_idx = 0; layer_idx < layers.size(); layer_idx++) { auto const &l = layers[layer_idx]; std::vector inputs; @@ -3323,14 +3503,19 @@ void FFModel::create_operators_from_layers() { // create new input tensors assert(tensors_to_parallel_tensors.find(l->inputs[i]) != tensors_to_parallel_tensors.end()); - inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); + if (l->op_type == OP_LORA && + op_before_allreduce_tensors_to_parallel_tensors.find(l->inputs[i]) != + op_before_allreduce_tensors_to_parallel_tensors.end()) { + inputs.push_back( + op_before_allreduce_tensors_to_parallel_tensors[l->inputs[i]]); + } else { + inputs.push_back(tensors_to_parallel_tensors[l->inputs[i]]); + } } Op *op = nullptr; - // add a combine before arg_topk - if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (l->op_type == OP_ARG_TOPK || l->op_type == OP_SOFTMAX || - l->op_type == OP_ARGMAX)) { + // add a combine before last arg_max / arg_topk or before second-to-last + // softmax + if (need_to_add_combine(layer_idx)) { std::vector partitioned_inputs; assert(inputs.size() == 1); Combine *comb = new Combine(*this, @@ -3353,37 +3538,97 @@ void FFModel::create_operators_from_layers() { // config.tensor_parallelism_degree); // operators.push_back(repl); // op = repl; - } else if (config.computationMode == COMP_MODE_INFERENCE && - config.tensor_parallelism_degree > 1 && - (l->op_type == OP_INC_MULTIHEAD_SELF_ATTENTION || - l->op_type == OP_TREE_INC_MULTIHEAD_SELF_ATTENTION || - // mlp layer - is_mlp_block(layer_idx) || - // llama mlp layer - (l->op_type == OP_LINEAR && layer_idx >= 2 && - 
layers[layer_idx - 1]->op_type == OP_GELU && - layers[layer_idx - 2]->op_type == OP_LINEAR) || - // LLAMA without element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 5 && - layers[layer_idx - 1]->op_type == OP_EW_MUL && - layers[layer_idx - 2]->op_type == OP_EW_MUL && - layers[layer_idx - 3]->op_type == OP_SIGMOID && - layers[layer_idx - 4]->op_type == OP_LINEAR && - layers[layer_idx - 5]->op_type == OP_LINEAR) || - // LLAMA with element-wise operator fusion - (l->op_type == OP_LINEAR && layer_idx >= 3 && - layers[layer_idx - 1]->op_type == OP_SIGMOID_SILU_MULTI && - layers[layer_idx - 2]->op_type == OP_LINEAR && - layers[layer_idx - 3]->op_type == OP_LINEAR))) { + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } + } else if (need_to_add_allreduce(layer_idx)) { assert(op->numOutputs == 1); - AllReduce *allreduce = - new AllReduce(*this, op->outputs[0], op->outputs[0]->num_dims - 1); + size_t transformer_layer_id = op->layer_guid.transformer_layer_id; + if (transformer_layer_allreduce_count.find(transformer_layer_id) == + transformer_layer_allreduce_count.end()) { + transformer_layer_allreduce_count[transformer_layer_id] = 0; + } + std::string allreduce_name = std::string( + "layers." + std::to_string(transformer_layer_id) + ".allreduce." + + std::to_string( + transformer_layer_allreduce_count[transformer_layer_id])); + transformer_layer_allreduce_count[transformer_layer_id]++; + AllReduce *allreduce = new AllReduce(*this, + op->outputs[0], + op->outputs[0]->num_dims - 1, + allreduce_name.c_str()); operators.push_back(allreduce); + op_before_allreduce_tensors_to_parallel_tensors[l->outputs[0]] = + op->outputs[0]; op = allreduce; + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } + } else if (need_to_add_parallel_identity(layer_idx)) { + assert(op->numOutputs == 1 || op->numOutputs == 2); + size_t transformer_layer_id = op->layer_guid.transformer_layer_id; + if (transformer_layer_parallel_identity_count.find( + transformer_layer_id) == + transformer_layer_parallel_identity_count.end()) { + transformer_layer_parallel_identity_count[transformer_layer_id] = 0; + } + std::string parallel_identity_name = std::string( + "layers." + std::to_string(transformer_layer_id) + + ".parallel_identity." 
+ + std::to_string( + transformer_layer_parallel_identity_count[transformer_layer_id])); + transformer_layer_parallel_identity_count[transformer_layer_id]++; + ParallelIdentity *parallel_identity = nullptr; + if (op->numOutputs == 1) { + parallel_identity = + new ParallelIdentity(*this, + op->outputs[0], + op->outputs[0]->num_dims - 1, + parallel_identity_name.c_str()); + } else if (op->numOutputs == 2) { + parallel_identity = + new ParallelIdentity(*this, + op->outputs[1], + op->outputs[1]->num_dims - 1, + parallel_identity_name.c_str()); + // output 0 is taken from the residual rms norm + assert(tensors_to_parallel_tensors.find(l->outputs[0]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[0]] = op->outputs[0]; + } else { + assert(false && + "Op needing ParallelIdentity has unexpected number of outputs"); + } + operators.push_back(parallel_identity); + assert(op->numOutputs == l->numOutputs); + // last output is taken from the parallel identity + assert(tensors_to_parallel_tensors.find(l->outputs[op->numOutputs - 1]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[l->numOutputs - 1]] = + parallel_identity->outputs[0]; + op = parallel_identity; + } else { + assert(op->numOutputs == l->numOutputs); + for (int i = 0; i < op->numOutputs; i++) { + assert(tensors_to_parallel_tensors.find(l->outputs[i]) == + tensors_to_parallel_tensors.end()); + tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + } } - assert(op->numOutputs == l->numOutputs); - for (int i = 0; i < op->numOutputs; i++) { - tensors_to_parallel_tensors[l->outputs[i]] = op->outputs[i]; + // if the operator has op_type==OP_LORA, and the second-to-last operator in + // the operators vector has op_type==OP_ALLREDUCE, move the operator before + // the ALLREDUCE + if (op->op_type == OP_LORA && operators.size() > 1 && + operators[operators.size() - 2]->op_type == OP_ALLREDUCE) { + Op *tmp = operators[operators.size() - 2]; + operators[operators.size() - 2] = operators[operators.size() - 1]; + operators[operators.size() - 1] = tmp; } } } @@ -3424,7 +3669,7 @@ void FFModel::compile(LossType loss_type, deserialize_graph_optimal_view(dez, best_graph, optimal_views); operators.clear(); convert_graph_to_operators(best_graph, optimal_views); - best_graph->print_dot(); + // best_graph->print_dot(); delete best_graph; for (auto const &layer : layers) { // map inputs to parallel tensor @@ -3549,7 +3794,7 @@ void FFModel::compile(LossType loss_type, for (int i = 0; i < op->numInputs; i++) { assert(op->inputs[i]->owner_op != nullptr); if (op->inputs[i]->owner_op->op_type == OP_INPUT) { - op->trainableInputs[i] = false; + op->trainable_inputs[i] = false; } } } @@ -3745,9 +3990,18 @@ bool FFModel::check_operators_integrity( } for (int i = 0; i < fused->op_num_outputs[op]; i++) { int my_off = fused->op_output_idx[i + ooff]; - assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT); - assert(FusedOp::use_same_regions( - fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + assert(fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT || + (fused->op_output_source[i + ooff] == FusedOp::SOURCE_INPUT && + (old_op->op_type == OP_RESIDUAL_LAYERNORM || + old_op->op_type == OP_RESIDUAL_RMS_NORM || + old_op->op_type == OP_ADD_BIAS_RESIDUAL_LAYERNORM))); + if (fused->op_output_source[i + ooff] == FusedOp::SOURCE_OUTPUT) { + assert(FusedOp::use_same_regions( + fused->outputs[my_off], old_op->outputs[i], pt_mapping)); + } else { + assert(FusedOp::use_same_regions( 
+ fused->inputs[my_off], old_op->outputs[i], pt_mapping)); + } } ioff += fused->op_num_inputs[op]; woff += fused->op_num_weights[op]; @@ -4086,6 +4340,12 @@ struct DefaultConfig { const static bool searchOverlapBackwardUpdate = false; const static size_t offloadReserveSpaceSize = (size_t)8 * 1024 * 1024 * 1024; // 8 GB + // PEFT related fields + const static bool enablePeft = false; + const static size_t peftActivationReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB + const static size_t peftWeightReserveSpaceSize = + (size_t)1 * 1024 * 1024 * 1024; // 1GB const static bool cpuOffload = false; const static bool onlyDataParallel = true; const static bool enableSampleParallel = true; @@ -4122,6 +4382,11 @@ FFConfig::FFConfig() { computationMode = COMP_MODE_TRAINING; cpu_offload = DefaultConfig::cpuOffload; offload_reserve_space_size = DefaultConfig::offloadReserveSpaceSize; + // PEFT related fields + enable_peft = DefaultConfig::enablePeft; + peft_activation_reserve_space_size = + DefaultConfig::peftActivationReserveSpaceSize; + peft_weight_reserve_space_size = DefaultConfig::peftWeightReserveSpaceSize; quantization_type = DT_NONE; only_data_parallel = DefaultConfig::onlyDataParallel; data_parallelism_degree = 1; @@ -4248,6 +4513,18 @@ void FFConfig::parse_args(char **argv, int argc) { quantization_type = DT_INT8; continue; } + if ((!strcmp(argv[i], "-enable-peft"))) { + enable_peft = true; + continue; + } + if (!strcmp(argv[i], "-peft-activation-reserve-space-size")) { + peft_activation_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } + if (!strcmp(argv[i], "-peft-weight-reserve-space-size")) { + peft_weight_reserve_space_size = atoll(argv[++i]) * 1024 * 1024; + continue; + } if ((!strcmp(argv[i], "--only-data-parallel"))) { only_data_parallel = true; continue; @@ -5383,6 +5660,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_BWD_TASK_ID, + "residual_layernorm_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_backward_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "residual_layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "residual_layernorm_peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // AddBiasResidualLayerNorm task { TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_INIT_TASK_ID, @@ -5419,6 +5728,40 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_BWD_TASK_ID, + "AddBiasResidualLayerNorm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::backward_task>( + registrar, "AddBiasResidualLayerNorm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + 
} + { + TaskVariantRegistrar registrar(ADD_BIAS_RESIDUAL_LAYERNORM_PEFT_BWD_TASK_ID, + "AddBiasResidualLayerNorm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + AddBiasResidualLayerNorm::peft_bwd_task>( + registrar, "AddBiasResidualLayerNorm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // SigmoidSiluMulti task { TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_INIT_TASK_ID, @@ -5452,6 +5795,38 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_BWD_TASK_ID, + "SigmoidSiluMulti Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(SIGMOID_SILU_MULTI_PEFT_BWD_TASK_ID, + "SigmoidSiluMulti PEFT Bwd"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "SigmoidSiluMulti PEFT Bwd Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // rms norm task { TaskVariantRegistrar registrar(RMSNORM_INIT_TASK_ID, "rmsnorm_init_task"); @@ -5495,7 +5870,36 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } - // rms norm task + { + TaskVariantRegistrar registrar(RMSNORM_BWD_TASK_ID, "RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RMSNORM_PEFT_BWD_TASK_ID, + "RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // residual rms norm task { TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_INIT_TASK_ID, "Residual RMS Norm Init"); @@ -5519,7 +5923,7 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar.set_leaf(); if (pre_register) { Runtime::preregister_task_variant( - registrar, "RMS Norm Inference Task"); + registrar, "Residual RMS Norm Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; @@ -5528,6 +5932,51 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_BWD_TASK_ID, + "Residual RMS Norm Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Residual RMS Norm Backward 
Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(RESIDUAL_RMSNORM_PEFT_BWD_TASK_ID, + "Residual RMS Norm PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Residual RMS Norm PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(LAYERNORM_PEFT_BWD_TASK_ID, + "layernorm_peft_bwd_task"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "peft_bwd_task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LAYERNORM_BWD_TASK_ID, "layernorm_bwd_task"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5571,6 +6020,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(LINEAR_PEFT_BWD_TASK_ID, + "Linear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Linear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(LINEAR_FWD_TASK_ID, "Linear Forward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -5699,6 +6163,22 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(SOFTMAX_PEFT_BWD_TASK_ID, + "Softmax PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Softmax PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // compute Loss { TaskVariantRegistrar registrar(LOSS_BWD_TASK_ID, "Loss Backward"); @@ -6303,6 +6783,24 @@ void register_flexflow_internal_tasks(Runtime *runtime, registrar); } } + { + TaskVariantRegistrar registrar( + INC_MULTIHEAD_SELF_ATTENTION_PEFT_BWD_TASK_ID, + "IncMultiHeadSelfAttention PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant< + IncMultiHeadSelfAttention::peft_bwd_task>( + registrar, "IncMultiHeadSelfAttention PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } // speculative MultiHeadAttention task { TaskVariantRegistrar registrar( @@ -6380,6 +6878,54 @@ void register_flexflow_internal_tasks(Runtime *runtime, TreeIncMultiHeadSelfAttention::inference_task>(registrar); } } + // PEFT tasks + // LoraLinear tasks + { + TaskVariantRegistrar registrar(LORA_LINEAR_INIT_TASK_ID, "LoraLinear Init"); + 
registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Init Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_INF_TASK_ID, + "LoraLinear Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(LORA_LINEAR_PEFT_BWD_TASK_ID, + "LoraLinear PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "LoraLinear PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + // NoOp { TaskVariantRegistrar registrar(NOOP_INIT_TASK_ID, "Weight NCCL Init"); @@ -6411,31 +6957,47 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Forward Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp Inference Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } { - TaskVariantRegistrar registrar(FUSEDOP_INF_TASK_ID, "FusedOp Inference"); + TaskVariantRegistrar registrar(FUSEDOP_PEFT_BWD_TASK_ID, + "FusedOp PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "FusedOp Inference Task"); + Runtime::preregister_task_variant( + registrar, "FusedOp PEFT Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); + } + } + + { + TaskVariantRegistrar registrar(FUSEDOP_FWD_TASK_ID, "FusedOp Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "FusedOp Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); } } { @@ -6529,6 +7091,20 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(COMBINE_INF_TASK_ID, "Combine Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Combine Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { 
TaskVariantRegistrar registrar(COMBINE_BWD_TASK_ID, "Combine Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); @@ -6543,6 +7119,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(COMBINE_PEFT_BWD_TASK_ID, + "Combine PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Combine PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Replicate { TaskVariantRegistrar registrar(REPLICATE_INIT_TASK_ID, "Replicate Init"); @@ -6586,6 +7177,21 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(REPLICATE_PEFT_BWD_TASK_ID, + "Replicate PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "Replicate PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } // Reduction { TaskVariantRegistrar registrar(REDUCTION_INIT_TASK_ID, "Reduction Init"); @@ -6644,6 +7250,34 @@ void register_flexflow_internal_tasks(Runtime *runtime, runtime->register_task_variant(registrar); } } + { + TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "AllReduce Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } { TaskVariantRegistrar registrar(ALLREDUCE_INF_TASK_ID, "AllReduce Inference"); @@ -6660,33 +7294,101 @@ void register_flexflow_internal_tasks(Runtime *runtime, } } { - TaskVariantRegistrar registrar(ALLREDUCE_FWD_TASK_ID, "AllReduce Forward"); + TaskVariantRegistrar registrar(ALLREDUCE_PEFT_BWD_TASK_ID, + "AllReduce PEFT Backward"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, "AllReduce Forward Task"); + Runtime::preregister_task_variant( + registrar, "AllReduce PEFT Backward Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant(registrar); } } + // ParallelIdentity { - TaskVariantRegistrar registrar(ALLREDUCE_BWD_TASK_ID, "AllReduce Backward"); + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_INIT_TASK_ID, + "ParallelIdentity Init"); registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); registrar.set_leaf(); if (pre_register) { - Runtime::preregister_task_variant( - registrar, 
"AllReduce Backward Task"); + Runtime::preregister_task_variant( + registrar, "ParallelIdentity init Task"); } else { if (enable_control_replication) { registrar.global_registration = false; } - runtime->register_task_variant(registrar); + runtime->register_task_variant( + registrar); } } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_FWD_TASK_ID, + "ParallelIdentity Forward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Forward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant(registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_BWD_TASK_ID, + "ParallelIdentity Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_INF_TASK_ID, + "ParallelIdentity Inference"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity Inference Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + { + TaskVariantRegistrar registrar(PARALLEL_IDENTITY_PEFT_BWD_TASK_ID, + "ParallelIdentity PEFT Backward"); + registrar.add_constraint(ProcessorConstraint(Processor::TOC_PROC)); + registrar.set_leaf(); + if (pre_register) { + Runtime::preregister_task_variant( + registrar, "ParallelIdentity PEFT Backward Task"); + } else { + if (enable_control_replication) { + registrar.global_registration = false; + } + runtime->register_task_variant( + registrar); + } + } + // FusedParallelOp { TaskVariantRegistrar registrar(FUSED_PARALLELOP_FWD_TASK_ID, diff --git a/src/runtime/model.cpp b/src/runtime/model.cpp index 62f6b89b7f..9f3e2fbb10 100644 --- a/src/runtime/model.cpp +++ b/src/runtime/model.cpp @@ -165,8 +165,8 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } diff --git a/src/runtime/model.cu b/src/runtime/model.cu index fd39ed0db0..5dab73e1a4 100644 --- a/src/runtime/model.cu +++ b/src/runtime/model.cu @@ -14,6 +14,8 @@ */ #include "flexflow/model.h" #include "flexflow/utils/cuda_helper.h" +#include "flexflow/utils/memory_allocator.h" +#include "flexflow/utils/peft_weight_allocator.h" namespace FlexFlow { // declare Legion names @@ -161,12 +163,51 @@ FFHandler 0, Realm::ProfilingRequestSet()) .wait(); - handle.batch_config_metadata = - workspaceInst.pointer_untyped(0, sizeof(char)); + handle.batch_config_metadata = static_cast( + workspaceInst.pointer_untyped(0, sizeof(char))); } else { handle.batch_config_metadata = nullptr; } + if (info->peft_activation_reserve_space_size > 0) { + // allocate memory for peft activation reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + 
.best_affinity_to(task->target_proc) + .first(); + Realm::RegionInstance workspaceInst; + handle.peft_activation_allocator = new MemoryAllocator(gpu_mem); + handle.peft_activation_allocator->create_legion_instance( + workspaceInst, info->peft_activation_reserve_space_size); + } else { + handle.peft_activation_allocator = nullptr; + } + + if (info->peft_weight_reserve_space_size > 0) { + // allocate memory for peft weight reserve space + Memory gpu_mem = Machine::MemoryQuery(Machine::get_machine()) + .only_kind(Memory::GPU_FB_MEM) + .best_affinity_to(task->target_proc) + .first(); + Realm::Rect<1, coord_t> bounds( + Realm::Point<1, coord_t>(0), + Realm::Point<1, coord_t>(info->peft_weight_reserve_space_size - 1)); + std::vector field_sizes; + field_sizes.push_back(sizeof(char)); + Realm::RegionInstance workspaceInst; + Realm::RegionInstance::create_instance(workspaceInst, + gpu_mem, + bounds, + field_sizes, + 0, + Realm::ProfilingRequestSet()) + .wait(); + void *ptr = workspaceInst.pointer_untyped(0, sizeof(char)); + handle.peft_weight_allocator = + new PEFTWeightAllocator(ptr, info->peft_weight_reserve_space_size); + } else { + handle.peft_weight_allocator = nullptr; + } // checkCUDA(cudaMalloc(&handle.workSpace, handle.workSpaceSize)); #ifdef FF_USE_NCCL handle.ncclComm = NULL; diff --git a/src/runtime/operator.cc b/src/runtime/operator.cc index 36ac02a3a3..dcac52397a 100644 --- a/src/runtime/operator.cc +++ b/src/runtime/operator.cc @@ -2,14 +2,7 @@ #include "flexflow/ffconst_utils.h" #include "flexflow/simulator.h" #include - -#include -#include -#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA) -#include "flexflow/utils/cuda_helper.h" -#else -#include "flexflow/utils/hip_helper.h" -#endif +#include namespace FlexFlow { @@ -25,4 +18,31 @@ size_t Op::get_params_hash() const { get_operator_type_name(this->op_type)); } +fs::path get_dst_folder(std::string const &subdir, + int step_idx, + int shard_idx, + bool before_kernel) { + std::vector debug_subdirs = {"fwd", "bwd", "optim", "weights"}; + assert(std::find(debug_subdirs.begin(), debug_subdirs.end(), subdir) != + debug_subdirs.end()); + std::string step_substr = "step_" + std::to_string(step_idx); + if (before_kernel) { + step_substr += "_pre"; + } + char const *ff_cache_path = std::getenv("FF_CACHE_PATH"); + std::string debug_dir_ = + ff_cache_path ? 
std::string(ff_cache_path) + "/debug/flexflow" + : std::string("~/.cache/flexflow/debug/flexflow"); + wordexp_t p; + wordexp(debug_dir_.c_str(), &p, 0); + debug_dir_ = p.we_wordv[0]; + wordfree(&p); + fs::path debug_dir = debug_dir_; + assert(fs::is_directory(debug_dir)); + fs::path dst_folder = + debug_dir / subdir / step_substr / ("shard_" + std::to_string(shard_idx)); + fs::create_directories(dst_folder); + return dst_folder; +} + }; // namespace FlexFlow \ No newline at end of file diff --git a/src/runtime/operator_params.cc b/src/runtime/operator_params.cc index 6b2d223f54..e9feb86eb5 100644 --- a/src/runtime/operator_params.cc +++ b/src/runtime/operator_params.cc @@ -42,6 +42,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -119,6 +120,8 @@ tl::optional get_op_parameters(Op const *op) { return ((Combine *)op)->get_params(); case OP_ALLREDUCE: return ((AllReduce *)op)->get_params(); + case OP_PARALLEL_IDENTITY: + return ((ParallelIdentity *)op)->get_params(); case OP_FUSED_PARALLEL: return ((FusedParallelOp *)op)->get_params(); case OP_TRANSPOSE: diff --git a/src/runtime/request_manager.cc b/src/runtime/request_manager.cc index bada87ab19..31a32dd3c8 100644 --- a/src/runtime/request_manager.cc +++ b/src/runtime/request_manager.cc @@ -14,6 +14,8 @@ */ #include "flexflow/request_manager.h" +#include "flexflow/ops/fused.h" +#include "flexflow/ops/lora_linear.h" #include "flexflow/parallel_ops/parallel_op.h" // #include "flexflow/tokenizers.h" #include @@ -21,6 +23,7 @@ #include #include #include +#include #include #include @@ -28,12 +31,16 @@ namespace FlexFlow { using namespace Legion; using tokenizers::Tokenizer; +using json = nlohmann::json; Legion::Logger log_req_mgr("RequestManager"); std::string LoadBytesFromFile(std::string const &path) { std::ifstream fs(path, std::ios::in | std::ios::binary); - assert(!fs.fail() && "no such file"); + if (fs.fail()) { + std::cerr << "Failed to open file: " << path << std::endl; + assert(false); + } std::string data; fs.seekg(0, std::ios::end); size_t size = static_cast(fs.tellg()); @@ -43,6 +50,52 @@ std::string LoadBytesFromFile(std::string const &path) { return data; } +std::ostream &operator<<(std::ostream &os, Request const &req) { + os << "Request {\n"; + os << " guid: " << req.guid << "\n"; + os << " peft_model_id: " << req.peft_model_id << "\n"; + os << " max_sequence_length: " << req.max_sequence_length << "\n"; + os << " initial_len: " << req.initial_len << "\n"; + os << " ssm_cache_size: " << req.ssm_cache_size << "\n"; + os << " llm_cache_size: " << req.llm_cache_size << "\n"; + os << " status: " << static_cast(req.status) << "\n"; + os << " tokens: ["; + for (auto const &token : req.tokens) { + os << token << " "; + } + os << "]\n"; + os << " prompt: " << req.prompt << "\n"; + // os << " beam_trees: ["; + // for (const auto& tree : req.beam_trees) { + // // Assuming BeamTree has its own << operator defined + // os << tree << " "; + // } + // os << "]\n"; + os << " req_type: " << static_cast(req.req_type) << "\n"; + os << " completed_training_steps: " << req.completed_training_steps << "\n"; + os << " gradient_accumulation_steps: " << req.gradient_accumulation_steps + << "\n"; + os << " max_training_steps: " << req.max_training_steps << "\n"; + os << " 
dataset_filepath: " << req.dataset_filepath << "\n"; + os << " dataset: ["; + for (auto const &pair : req.dataset) { + os << "["; + for (auto const &token : pair.first) { + os << token << " "; + } + os << "], ["; + for (auto const &token : pair.second) { + os << token << " "; + } + os << "] "; + } + os << "]\n"; + os << "}\n"; + return os; +} + +bool RequestManager::inference_finished = false; + RequestManager::RequestManager() : request_manager_status(INITIALIZED), verbose(false), next_available_guid(1000000), num_processed_requests(0), @@ -114,6 +167,14 @@ void RequestManager::push_spec_infer_tree_width(int tree_width) { spec_infer_tree_width.emplace_back(tree_width); } +void RequestManager::set_enable_peft_finetuning(bool enable_peft_finetuning_) { + enable_peft_finetuning = enable_peft_finetuning_; +} + +void RequestManager::set_inference_finished(bool finished) { + inference_finished = finished; +} + void RequestManager::register_tokenizer(ModelType type, int bos_token_id, int eos_token_id, @@ -121,33 +182,45 @@ void RequestManager::register_tokenizer(ModelType type, this->model_type = type; this->bos_token_id = bos_token_id; this->eos_token_id = eos_token_id; - std::string tokenizer_folder = - (!path.empty() && path.back() != '/') ? path + '/' : path; + std::filesystem::path tokenizer_folder(path); + if (model_type == ModelType::LLAMA) { - bool path_to_file = !path.empty() && - (path.size() >= strlen("tokenizer.model")) && - path.find("tokenizer.model") == - (path.size() - strlen("tokenizer.model")); - std::string tokenizer_filepath = - path_to_file ? path : tokenizer_folder + "tokenizer.model"; - this->tokenizer_ = - Tokenizer::FromBlobSentencePiece(LoadBytesFromFile(tokenizer_filepath)); + std::filesystem::path tokenizer_model_path; + if (std::filesystem::is_directory(tokenizer_folder)) { + tokenizer_model_path = + std::filesystem::path(tokenizer_folder) / "tokenizer.model"; + } else { + tokenizer_model_path = tokenizer_folder; + } + if (std::filesystem::exists(tokenizer_model_path)) { + // load from tokenizer.model + this->tokenizer_ = Tokenizer::FromBlobSentencePiece( + LoadBytesFromFile(tokenizer_model_path.string())); + } else { + // load from tokenizer.json + std::filesystem::path tokenizer_json_path = + tokenizer_folder / "tokenizer.json"; + if (!std::filesystem::exists(tokenizer_json_path)) { + std::cerr << "Failed to open file: " << tokenizer_json_path + << std::endl; + assert(false); + } + this->tokenizer_ = Tokenizer::FromBlobJSON( + LoadBytesFromFile(tokenizer_json_path.string())); + } } else if (model_type == ModelType::OPT) { - std::string vocab_file = tokenizer_folder + "vocab.json"; - std::string merges_file = tokenizer_folder + "merges.txt"; - std::string added_tokens_file = - tokenizer_folder + "special_tokens_map.json"; - std::filesystem::path path1(vocab_file); - std::filesystem::path path2(merges_file); - std::filesystem::path path3(added_tokens_file); - assert(std::filesystem::exists(path1) && + std::filesystem::path vocab_file = tokenizer_folder / "vocab.json"; + std::filesystem::path merges_file = tokenizer_folder / "merges.txt"; + std::filesystem::path added_tokens_file = + tokenizer_folder / "special_tokens_map.json"; + assert(std::filesystem::exists(vocab_file) && "Vocab file vocab.json does not exist at the specified path"); - assert(std::filesystem::exists(path2) && + assert(std::filesystem::exists(merges_file) && "Merge file merges.txt does not exist at the specified path"); // opt_tokenizer = new OptTokenizer(vocab_file, merges_file); - std::string 
vocab = LoadBytesFromFile(path1.string()); - std::string merges = LoadBytesFromFile(path2.string()); - std::string added_tokens = LoadBytesFromFile(path3.string()); + std::string vocab = LoadBytesFromFile(vocab_file.string()); + std::string merges = LoadBytesFromFile(merges_file.string()); + std::string added_tokens = LoadBytesFromFile(added_tokens_file.string()); this->tokenizer_ = Tokenizer::FromBlobByteLevelBPE(vocab, merges, added_tokens); @@ -182,28 +255,40 @@ size_t RequestManager::get_num_ssms() { } RequestManager::RequestGuid - RequestManager::register_new_request(std::vector const &prompt, - int max_sequence_length) { + RequestManager::register_new_request(Request const &request_) { const std::lock_guard lock(request_queue_mutex); - // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - - if (prompt.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << prompt.size() << ".\n"; - - printf("tokens size: %zu\n", request.tokens.size()); - return INVALID_GUID; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; + request.warmup = request_.warmup; + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + request.tokens.push_back(bos_token_id); + } + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens < get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + request.tokens.insert(request.tokens.end(), + request_.benchmarking_tokens, + 15); // insert random number } else { - request.initial_len = prompt.size(); - request.tokens = prompt; + std::vector tokens = this->tokenizer_->Encode(request_.prompt); + if (tokens.size() >= get_max_sequence_length()) { + std::cout << "Warning: too many tokens in prompt, only load up to " + << get_max_sequence_length() << " tokens, but got " + << tokens.size() << ".\n"; + return INVALID_GUID; + } + for (int i = 0; i < tokens.size(); i++) { + std::cout << "[" << i << "]" << tokens.at(i) << "\n"; + } + request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); } + request.initial_len = request.tokens.size(); + if (get_num_ssms() == 0) { std::cout << "No small speculative model registered, using incremental " "decoding." 
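
The hunk above replaces the old (token vector, max_sequence_length) registration API with a single Request argument, and the hunks that follow add a separate register_new_peft_request entry point for finetuning. A minimal sketch of how a caller might drive the two new paths is shown below; it is not part of the patch. The Request fields (prompt, max_sequence_length, peft_model_id, req_type, dataset_filepath, max_training_steps) and the two registration methods are taken from this diff, while the surrounding setup (an initialized RequestManager *rm, a valid PEFTModelID, and PEFT being enabled via -enable-peft) is assumed.

// Hedged usage sketch, not part of the patch. Assumes FlexFlow headers,
// an initialized RequestManager *rm, and a previously registered LoRA
// adapter identified by peft_id.
#include "flexflow/request_manager.h"

using namespace FlexFlow;

void submit_requests(RequestManager *rm, PEFTModelID const &peft_id) {
  // Inference request: the prompt string is tokenized inside
  // register_new_request, and a BOS token is prepended for non-Falcon models.
  Request inf_req;
  inf_req.prompt = "Explain tensor parallelism in one sentence.";
  inf_req.max_sequence_length = 128;
  inf_req.peft_model_id = peft_id; // optional LoRA adapter to apply
  RequestManager::RequestGuid inf_guid = rm->register_new_request(inf_req);

  // Finetuning request: reads a JSON dataset file, tokenizes each sample,
  // and is scheduled as peft_bwd work in the last batch slot. Requires
  // PEFT finetuning to be enabled, otherwise register_new_peft_request asserts.
  Request ft_req;
  ft_req.req_type = RequestType::REQ_FINETUNING;
  ft_req.peft_model_id = peft_id;
  ft_req.max_sequence_length = 128;
  ft_req.dataset_filepath = "/path/to/dataset.json"; // hypothetical path
  ft_req.max_training_steps = 2;
  RequestManager::RequestGuid ft_guid = rm->register_new_peft_request(ft_req);

  (void)inf_guid;
  (void)ft_guid;
}

Both calls return a RequestGuid whose completion can be awaited through the per-request promise created at registration time; for finetuning requests the per-step losses are reported back through GenerationResult::finetuning_losses, as the later hunks in this file show.
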
@@ -216,58 +301,111 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_infr_request_queue.push(request); all_requests[request.guid] = request; { const std::lock_guard lock(request_to_promise_mutex); request_to_promise[request.guid] = new std::promise(); } - if (verbose) { - std::cout << "new req: " << request.tokens.size() << std::endl; + { + std::string output = "New request tokens:"; + output = "[" + std::to_string(request.guid) + "]" + output; for (int i = 0; i < request.tokens.size(); i++) { - std::cout << i << " : " << request.tokens[i] << std::endl; + output = output + " " + std::to_string(request.tokens[i]); } + log_req_mgr.print("%s", output.c_str()); } GenerationResult gr; gr.guid = request.guid; - gr.input_text = ""; - gr.input_tokens = prompt; - gr.output_text = ""; - gr.output_tokens = prompt; + gr.input_text = request_.prompt; + gr.input_tokens = request.tokens; + gr.output_text = request_.prompt; + gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } RequestManager::RequestGuid - RequestManager::register_new_request(std::string const &prompt, - int max_sequence_length) { + RequestManager::register_new_peft_request(Request const &request_) { + assert(enable_peft_finetuning && "PEFT finetuning is not enabled"); const std::lock_guard lock(request_queue_mutex); // Add a new request Request request; request.status = Request::PENDING; request.guid = next_available_guid++; - request.max_sequence_length = max_sequence_length; - if (bos_token_id >= 0 && model_type != ModelType::FALCON) { - request.tokens.push_back(bos_token_id); + request.initial_len = 0; + request.max_sequence_length = request_.max_sequence_length; + request.peft_model_id = request_.peft_model_id; + request.req_type = RequestType::REQ_FINETUNING; + request.completed_training_steps = 0; + request.gradient_accumulation_steps = request_.gradient_accumulation_steps; + request.max_training_steps = request_.max_training_steps; + request.dataset_filepath = request_.dataset_filepath; + request.warmup = request_.warmup; + + // Load dataset + if (request_.benchmarking_tokens >= 0) { + assert(request_.benchmarking_tokens <= get_max_sequence_length()); + request.benchmarking_tokens = request_.benchmarking_tokens; + std::vector input_tokens; + std::vector output_tokens; + bool bos_added = (bos_token_id >= 0 && model_type != ModelType::FALCON); + if (bos_added) { + input_tokens.push_back(bos_token_id); + } + input_tokens.insert(input_tokens.end(), + request_.benchmarking_tokens - (int)bos_added, + 15); // insert random number + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + } else { + using json = nlohmann::json; + std::ifstream file_handle(request.dataset_filepath); + assert(file_handle.good() && "Dataset file does not exist."); + json dataset_json = json::parse(file_handle, + /*parser_callback_t */ nullptr, + /*allow_exceptions */ true, + /*ignore_comments */ true); + + for (auto &prompt : dataset_json) { + std::string text = prompt.get(); + std::string output_text(""); + std::vector input_tokens; + input_tokens = this->tokenizer_->Encode(text); + if (bos_token_id >= 0 && model_type != ModelType::FALCON) { + input_tokens.insert(input_tokens.begin(), bos_token_id); + } + std::vector output_tokens = + this->tokenizer_->Encode(output_text); + if 
(input_tokens.size() + output_tokens.size() > + get_max_sequence_length()) { + std::cout << "Warning: too many tokens in sample, only load up to " + << get_max_sequence_length() << " tokens, but got " + << input_tokens.size() + output_tokens.size() << ".\n"; + return INVALID_GUID; + } else { + request.dataset.push_back(std::make_pair(input_tokens, output_tokens)); + } + } } - std::vector tokens = this->tokenizer_->Encode(prompt); - if (tokens.size() >= get_max_sequence_length()) { - std::cout << "Warning: too many tokens in prompt, only load up to " - << get_max_sequence_length() << " tokens, but got " - << tokens.size() << ".\n"; - printf("tokens size: %zu\n", tokens.size()); - return INVALID_GUID; + if (request.gradient_accumulation_steps == -1) { + request.gradient_accumulation_steps = request.dataset.size(); } - for (int i = 0; i < tokens.size(); i++) { - std::cout << "[" << i << "]" << tokens.at(i) << "\n"; - } - request.tokens.insert(request.tokens.end(), tokens.begin(), tokens.end()); - request.initial_len = request.tokens.size(); + assert(request.gradient_accumulation_steps > 0 && + "Invalid gradient accumulation steps"); + assert(request.gradient_accumulation_steps <= request.max_training_steps && + "Gradient accumulation steps should be less than or equal to max " + "training steps"); + // Currently don't support speculative inference for PEFT + assert(get_num_ssms() == 0); if (get_num_ssms() == 0) { std::cout << "No small speculative model registered, using incremental " "decoding." @@ -280,29 +418,38 @@ RequestManager::RequestGuid } } - pending_request_queue.push(request); + pending_peft_request_queue.push(request); all_requests[request.guid] = request; { const std::lock_guard lock(request_to_promise_mutex); request_to_promise[request.guid] = new std::promise(); } - { - std::string output = "New request tokens:"; - output = "[" + std::to_string(request.guid) + "]" + output; - for (int i = 0; i < request.tokens.size(); i++) { - output = output + " " + std::to_string(request.tokens[i]); + for (size_t r = 0; r < request.dataset.size(); r++) { + std::string input = "[" + std::to_string(r) + "] input:"; + std::string output = "[" + std::to_string(r) + "] output:"; + for (size_t i = 0; i < request.dataset[r].first.size(); i++) { + input = input + " " + std::to_string(request.dataset[r].first[i]); } + for (size_t i = 0; i < request.dataset[r].second.size(); i++) { + output = output + " " + std::to_string(request.dataset[r].second[i]); + } + log_req_mgr.print("%s", input.c_str()); log_req_mgr.print("%s", output.c_str()); } GenerationResult gr; gr.guid = request.guid; - gr.input_text = prompt; - gr.input_tokens = request.tokens; - gr.output_text = prompt; - gr.output_tokens = request.tokens; + // gr.input_text = prompt; + // gr.input_tokens = request.tokens; + // gr.output_text = prompt; + // gr.output_tokens = request.tokens; request_generation_results[request.guid] = gr; + + ProfileInfo profile_info; + profile_info.registration_time = Realm::Clock::current_time_in_microseconds(); + profiling_requests[request.guid] = profile_info; + return request.guid; } @@ -363,51 +510,117 @@ BatchConfig RequestManager::prepare_next_batch_task( return rm->prepare_next_batch(*bc, result); } +bool RequestManager::check_inf_req_completion(BatchConfig const &old_bc, + int i) { + Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + bool request_completed = false; + // printf("model_type = %d\n", this->model_type); + if (request.tokens.size() >= 
old_bc.requestsInfo[i].max_sequence_length) { + request_completed = true; + } else if (request.tokens.back() == eos_token_id) { + // Encounter EOS token id + request_completed = true; + } + return request_completed; +} + +void RequestManager::check_batch(BatchConfig const &old_bc, + BatchConfig const &new_bc) { + int num_incomplete_prompts = 0; + for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + if (new_bc.request_completed[i]) { + continue; + } + // ensure there is no request with zero tokens + assert(new_bc.requestsInfo[i].num_tokens_in_batch > 0); + // ensure there is no more than one incomplete prompt + if (new_bc.requestsInfo[i].prompt_phase && + new_bc.requestsInfo[i].num_tokens_in_batch + + new_bc.requestsInfo[i].first_token_depth_in_request < + all_requests[new_bc.requestsInfo[i].request_guid].tokens.size()) { + num_incomplete_prompts++; + } + } + if (num_incomplete_prompts > 1) { + std::cout << "Error: more than one incomplete prompt in the batch\n"; + pid_t pid = getpid(); + std::string filenamen = "new_bc_" + std::to_string(pid) + ".txt"; + std::ofstream filen(filenamen); + if (filen.is_open()) { + filen << new_bc << std::endl; + filen.close(); + std::cout << "String written to file: " << filenamen << std::endl; + } else { + std::cout << "Unable to open file: " << filenamen << std::endl; + } + std::string filenameo = "old_bc_" + std::to_string(pid) + ".txt"; + std::ofstream fileo(filenameo); + if (fileo.is_open()) { + fileo << old_bc << std::endl; + fileo.close(); + std::cout << "String written to file: " << filenameo << std::endl; + } else { + std::cout << "Unable to open file: " << filenameo << std::endl; + } + assert(false); + } +} + BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, InferenceResult const &result) { const std::lock_guard lock(request_queue_mutex); - // Step 1: append result from previous iteration to request's tokens - for (int i = 0; i < old_bc.num_tokens; i++) { + for (int i = 0; i < old_bc.num_active_tokens(); i++) { size_t guid = old_bc.requestsInfo[old_bc.tokensInfo[i].request_index].request_guid; Request &request = all_requests[guid]; + if (request.req_type == RequestType::REQ_FINETUNING) { + continue; + } if (old_bc.tokensInfo[i].abs_depth_in_request + 1 < request.tokens.size()) { // This is a prompt token continue; } else { + // This is a decoding token assert(old_bc.tokensInfo[i].abs_depth_in_request + 1 == request.tokens.size()); - // This is a decoding token + if (!profiling_requests[guid].first_token_time_set) { + profiling_requests[guid].first_token_time = + Realm::Clock::current_time_in_microseconds(); + profiling_requests[guid].first_token_time_set = true; + } log_req_mgr.print("Output token is: %d", result.token_ids[i]); request.tokens.push_back(result.token_ids[i]); // std::string output = this->tokenizer_->Decode(request.tokens); // log_req_mgr.print("Output: %s", output.c_str()); } } + int num_generation_tokens = 0; int num_active_req = -1; - // Step 2: prepare the next batch for existing requests + // when finetuning is enabled, the last entry in the batch cannot be used for + // inference + int inference_batch_size = + BatchConfig::max_requests_per_batch() - (int)enable_peft_finetuning; + + // Step 2: prepare the next batch for existing inference requests BatchConfig new_bc; - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { - if (old_bc.request_completed[i]) { // add new requests to the next batch + for (int i = 0; i < inference_batch_size; i++) { + if 
(old_bc.request_completed[i]) { + // no need to carry over tokens to new batch for this request continue; } else { assert(old_bc.requestsInfo[i].num_tokens_in_batch > 0); Request &request = all_requests[old_bc.requestsInfo[i].request_guid]; + assert(request.req_type == RequestType::REQ_INFERENCE && + "Found misplaced finetuning request"); + int processed_tokens = old_bc.requestsInfo[i].first_token_depth_in_request + old_bc.requestsInfo[i].num_tokens_in_batch; assert(processed_tokens < request.tokens.size()); - bool request_completed = false; - // printf("model_type = %d\n", this->model_type); - if (request.tokens.size() >= old_bc.requestsInfo[i].max_sequence_length) { - request_completed = true; - } else if (request.tokens.back() == eos_token_id) { - // Encounter EOS token id - request_completed = true; - } + bool request_completed = check_inf_req_completion(old_bc, i); if (request_completed) { std::string output = this->tokenizer_->Decode(request.tokens); // Unlike Huggingface, the sentencepiece C++ library automatically @@ -435,32 +648,40 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, total_request_run_time += profile_info.finish_time - profile_info.start_time; profiling_requests[request.guid] = profile_info; - log_req_mgr.print( - "[Profile] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " - "finish(%.1lf) latency(%.1lf)", - request.guid, - profile_info.llm_decoding_steps, - profile_info.start_time, - profile_info.finish_time, - profile_info.finish_time - profile_info.start_time); + log_req_mgr.print("[%s] guid(%zu) llm_decoding_steps(%d) start(%.1lf) " + "finish(%.1lf) latency(%.1lf) ttft(%.1lf)", + request.warmup ? "Warmup" : "Profile", + request.guid, + profile_info.llm_decoding_steps, + profile_info.start_time, + profile_info.finish_time, + profile_info.finish_time - profile_info.start_time, + profile_info.first_token_time - + profile_info.registration_time); // Write output to file if needed: if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; - outputFile << "token IDs: "; - for (int i = 0; i < request.tokens.size(); i++) { - outputFile << request.tokens[i]; - if (i < request.tokens.size() - 1) { - outputFile << ","; + outputFile << "[" << (request.warmup ? 
"Warmup" : "Profile") + << "] guid(" << request.guid << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ") ttft(" << std::fixed << std::setprecision(3) + << (profile_info.first_token_time - + profile_info.registration_time) + << ")\n"; + if (request.benchmarking_tokens <= 0) { + outputFile << "token IDs: "; + for (int i = 0; i < request.tokens.size(); i++) { + outputFile << request.tokens[i]; + if (i < request.tokens.size() - 1) { + outputFile << ","; + } } + outputFile << std::endl; + outputFile << output; } - outputFile << std::endl; - outputFile << output; outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -468,13 +689,15 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, assert(false); } } - } else { new_bc.request_completed[i] = false; new_bc.requestsInfo[i].first_token_depth_in_request = processed_tokens; new_bc.requestsInfo[i].first_token_offset_in_batch = new_bc.num_tokens; new_bc.requestsInfo[i].request_guid = old_bc.requestsInfo[i].request_guid; + new_bc.requestsInfo[i].peft_model_id = + old_bc.requestsInfo[i].peft_model_id; + new_bc.requestsInfo[i].peft_bwd = old_bc.requestsInfo[i].peft_bwd; new_bc.requestsInfo[i].max_sequence_length = old_bc.requestsInfo[i].max_sequence_length; num_active_req++; @@ -487,8 +710,25 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, new_bc.requestsInfo[i].prompt_phase = false; } else { // Prompt phase + assert(old_bc.requestsInfo[i].prompt_phase == true); + int space_for_incr_dec_requests = 0; + // If the prompt can't fit in the batch, compute how much space we + // need to leave out for incomplete requests in decoding phase at + // higher indices. 
+ for (int ii = i + 1; ii < inference_batch_size; ii++) { + if (old_bc.request_completed[ii]) { + continue; + } + Request &old_request = + all_requests[old_bc.requestsInfo[ii].request_guid]; + bool req_completed = check_inf_req_completion(old_bc, ii); + if (!req_completed) { + space_for_incr_dec_requests++; + } + } new_bc.requestsInfo[i].num_tokens_in_batch = - std::min(get_max_tokens_per_batch() - new_bc.num_tokens, + std::min(get_max_tokens_per_batch() - new_bc.num_tokens - + space_for_incr_dec_requests, (int)request.tokens.size() - new_bc.requestsInfo[i].first_token_depth_in_request); new_bc.requestsInfo[i].prompt_phase = true; @@ -509,13 +749,14 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } new_bc.num_generation_tokens = num_generation_tokens; - // Step 3: add new requests to the next batch - for (int i = 0; i < BatchConfig::max_requests_per_batch(); i++) { + // Step 3: add new inference requests to the next batch if there is space + for (int i = 0; i < inference_batch_size; i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + assert(new_request.req_type == RequestType::REQ_INFERENCE); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; new_bc.requestsInfo[i].first_token_depth_in_request = 0; @@ -526,15 +767,16 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, (int)new_request.tokens.size()); new_bc.requestsInfo[i].max_sequence_length = new_request.max_sequence_length; + new_bc.requestsInfo[i].peft_model_id = new_request.peft_model_id; + new_bc.requestsInfo[i].peft_bwd = false; new_bc.request_completed[i] = false; new_bc.requestsInfo[i].prompt_phase = true; num_active_req++; new_bc.requestsInfo[num_active_req].batch_config_request_id = i; - // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 1; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + // add start time to profile_info for the new request + profiling_requests[new_request.guid].llm_decoding_steps = 1; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); for (int j = 0; j < new_bc.requestsInfo[i].num_tokens_in_batch; j++) { int depth = new_bc.requestsInfo[i].first_token_depth_in_request + j; new_bc.tokensInfo[new_bc.num_tokens].request_index = i; @@ -551,6 +793,170 @@ BatchConfig RequestManager::prepare_next_batch(BatchConfig const &old_bc, } } + if (enable_peft_finetuning && + !old_bc.request_completed[inference_batch_size]) { + assert(old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch > 0); + Request &request = + all_requests[old_bc.requestsInfo[inference_batch_size].request_guid]; + assert(request.req_type == RequestType::REQ_FINETUNING && + "Found misplaced inference request"); + + request.finetuning_losses.push_back(result.finetuning_loss); + + request.dataset_entry_processed_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + request.processed_finetuning_tokens += + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch; + request.finetuning_tokens_per_batch.push_back( + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch); + int 
dataset_entry = + request.completed_training_steps % request.dataset.size(); + if (old_bc.requestsInfo[inference_batch_size].first_token_depth_in_request + + old_bc.requestsInfo[inference_batch_size].num_tokens_in_batch == + request.dataset[dataset_entry].first.size()) { + // completed the current dataset entry + assert(request.dataset_entry_processed_tokens == + request.dataset[dataset_entry].first.size()); + request.completed_training_steps += 1; + request.dataset_entry_processed_tokens = 0; + } + + assert(request.completed_training_steps <= request.max_training_steps); + if (request.completed_training_steps == request.max_training_steps || + inference_finished) { + // check if the fine tuning request has completed + request.status = Request::COMPLETED; + + GenerationResult &gr = request_generation_results[request.guid]; + assert(gr.guid == request.guid); + gr.finetuning_losses = request.finetuning_losses; + trigger_request_completion_future(request.guid); + num_processed_requests++; + + ProfileInfo profile_info = profiling_requests[request.guid]; + profile_info.finish_time = Realm::Clock::current_time_in_microseconds(); + total_request_run_time += + profile_info.finish_time - profile_info.start_time; + profiling_requests[request.guid] = profile_info; + log_req_mgr.print("[%s] guid(%zu) completed_training_steps(%d) " + "processed_finetuning_tokens(%lu) latency(%.1lf)", + request.warmup ? "Warmup" : "Finetuning", + request.guid, + request.completed_training_steps, + request.processed_finetuning_tokens, + profile_info.finish_time - profile_info.start_time); + if (!output_filepath.empty()) { + std::ofstream outputFile(output_filepath, std::ios::app); + if (outputFile.is_open()) { + std::string tokens_str = "["; + for (size_t i = 0; i < request.finetuning_tokens_per_batch.size(); + i++) { + tokens_str += + std::to_string(request.finetuning_tokens_per_batch[i]); + if (i != request.finetuning_tokens_per_batch.size() - 1) { + tokens_str += ", "; + } + } + tokens_str += "]"; + outputFile << "[" << (request.warmup ? 
"Warmup" : "Finetuning") + << "] guid(" << request.guid + << ") completed_training_steps(" + << request.completed_training_steps + << ") processed_finetuning_tokens(" + << request.processed_finetuning_tokens << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ") tokens_per_batch(" << tokens_str << ")\n"; + outputFile.close(); + } else { + std::cout << "Unable to open the output file: " << output_filepath + << std::endl; + assert(false); + } + } + } + } + + // Step 4: add PEFT bwd requests, if there is additional space + while (pending_peft_request_queue.size() > 0) { + Request &request = pending_peft_request_queue.front(); + // assert(request.req_type = RequestType::REQ_FINETUNING); + Request &all_req_handle = all_requests[request.guid]; + // assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + if (all_req_handle.status == Request::COMPLETED) { + pending_peft_request_queue.pop(); + } else { + break; + } + } + + if (pending_peft_request_queue.size() > 0 && !inference_finished) { + Request &request = pending_peft_request_queue.front(); + assert(request.req_type = RequestType::REQ_FINETUNING); + assert(request.dataset.size() > 0); + // update status and training steps + Request &all_req_handle = all_requests[request.guid]; + assert(all_req_handle.req_type = RequestType::REQ_FINETUNING); + + request.completed_training_steps = all_req_handle.completed_training_steps; + request.processed_finetuning_tokens = + all_req_handle.processed_finetuning_tokens; + request.status = all_req_handle.status; + int dataset_entry = + request.completed_training_steps % request.dataset.size(); + request.dataset_entry_processed_tokens = + all_req_handle.dataset_entry_processed_tokens; + request.gradient_accumulation_steps = + all_req_handle.gradient_accumulation_steps; + + assert(request.status != Request::COMPLETED); + assert(request.max_training_steps > 0 && + request.completed_training_steps < request.max_training_steps); + assert(request.dataset_entry_processed_tokens <= + request.dataset[dataset_entry].first.size()); + + int num_peft_tokens = + min((int)request.dataset[dataset_entry].first.size() - + request.dataset_entry_processed_tokens, + get_max_tokens_per_batch() - new_bc.num_active_infr_tokens()); + int num_peft_label_tokens = request.dataset[dataset_entry].second.size(); + assert(num_peft_label_tokens == 0); + + if (num_peft_tokens > 0) { + assert(new_bc.request_completed[inference_batch_size]); + // request info + new_bc.request_completed[inference_batch_size] = false; + new_bc.requestsInfo[inference_batch_size].first_token_depth_in_request = + request.dataset_entry_processed_tokens; + new_bc.requestsInfo[inference_batch_size].first_token_offset_in_batch = + new_bc.num_active_infr_tokens(); + new_bc.requestsInfo[inference_batch_size].num_tokens_in_batch = + num_peft_tokens; + new_bc.requestsInfo[inference_batch_size].max_sequence_length = + request.max_sequence_length; + new_bc.requestsInfo[inference_batch_size].request_guid = request.guid; + new_bc.requestsInfo[inference_batch_size].peft_model_id = + request.peft_model_id; + new_bc.requestsInfo[inference_batch_size].peft_bwd = true; + set_optimizer_tasks( + new_bc.requestsInfo[inference_batch_size].optimizer_tasks, + request.max_training_steps, + request.completed_training_steps, + request.gradient_accumulation_steps); + // tokens info + for (size_t i = request.dataset_entry_processed_tokens; + i < request.dataset_entry_processed_tokens + num_peft_tokens; + i++) { + 
new_bc.tokensInfo[new_bc.num_tokens].token_id = + request.dataset[dataset_entry].first[i]; + new_bc.tokensInfo[new_bc.num_tokens].request_index = + inference_batch_size; + new_bc.tokensInfo[new_bc.num_tokens].abs_depth_in_request = i; + new_bc.num_tokens++; + new_bc.num_peft_tokens++; + } + } + } return new_bc; } @@ -722,11 +1128,17 @@ BeamSearchBatchConfig if (!output_filepath.empty()) { std::ofstream outputFile(output_filepath, std::ios::app); if (outputFile.is_open()) { - outputFile << "end-to-end latency: " << std::fixed - << std::setprecision(3) << total_request_run_time - << std::endl; - outputFile << "num decoding steps: " - << profile_info.llm_decoding_steps << std::endl; + outputFile << "[Profile] guid(" << request.guid + << ") llm_decoding_steps(" + << profile_info.llm_decoding_steps << ") latency(" + << std::fixed << std::setprecision(3) + << (profile_info.finish_time - profile_info.start_time) + << ")\n"; + // outputFile << "end-to-end latency: " << std::fixed + // << std::setprecision(3) << total_request_run_time + // << std::endl; + // outputFile << "num decoding steps: " + // << profile_info.llm_decoding_steps << std::endl; outputFile << "token IDs: "; for (int i = 0; i < request.tokens.size(); i++) { outputFile << request.tokens[i]; @@ -736,7 +1148,6 @@ BeamSearchBatchConfig } outputFile << std::endl; outputFile << output; - outputFile.close(); } else { std::cout << "Unable to open the output file: " << output_filepath @@ -884,10 +1295,10 @@ BeamSearchBatchConfig // Step 2: Initialize new request for (int i = 0; i < BeamSearchBatchConfig::max_requests_per_batch(); i++) { if (new_bc.request_completed[i]) { - if (!pending_request_queue.empty() && + if (!pending_infr_request_queue.empty() && new_bc.num_tokens < get_max_tokens_per_batch()) { - Request new_request = pending_request_queue.front(); - pending_request_queue.pop(); + Request new_request = pending_infr_request_queue.front(); + pending_infr_request_queue.pop(); // all_requests[new_request.guid] = new_request; num_active_req++; new_bc.requestsInfo[i].first_token_depth_in_request = 0; @@ -901,13 +1312,13 @@ BeamSearchBatchConfig new_bc.requestsInfo[num_active_req].batch_config_request_id = i; // add profile_info for the new request - ProfileInfo profile_info; - profile_info.llm_decoding_steps = 0; - profile_info.ssm_decoding_steps = 0; - profile_info.start_time = Realm::Clock::current_time_in_microseconds(); - profiling_requests[new_request.guid] = profile_info; + profiling_requests[new_request.guid].llm_decoding_steps = 0; + profiling_requests[new_request.guid].ssm_decoding_steps = 0; + profiling_requests[new_request.guid].start_time = + Realm::Clock::current_time_in_microseconds(); // init the beam search metadata per request - int ssm_decoding_steps = profile_info.ssm_decoding_steps; + int ssm_decoding_steps = + profiling_requests[new_request.guid].ssm_decoding_steps; new_bc.beamRequestsInfo[i].beam_size = spec_infer_tree_width.size() > ssm_decoding_steps @@ -1552,7 +1963,6 @@ TreeVerifyBatchConfig RequestManager::prepare_next_batch_verify( new_bc.requestsInfo[num_active_req].batch_config_request_id = i; new_bc.request_completed[i] = false; - new_bc.requestsInfo[i].num_tokens_in_batch = std::min(max_prompt_load_size, (int)request.initial_len - @@ -2105,7 +2515,7 @@ std::vector> // must in this branch. 
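
The finetuning bookkeeping added in the hunks above advances the current dataset entry by however many tokens each batch processed, counts a fully consumed entry as one completed training step, and marks the request COMPLETED once max_training_steps is reached or inference has finished. A condensed sketch of that state update, assuming a simplified stand-in for the C++ Request struct:

```python
from dataclasses import dataclass

@dataclass
class FinetuneReq:  # hypothetical stand-in for the Request fields used above
    dataset: list                      # one token list per dataset entry
    max_training_steps: int
    completed_training_steps: int = 0
    dataset_entry_processed_tokens: int = 0
    processed_finetuning_tokens: int = 0
    status: str = "RUNNING"

def advance_finetuning_state(req: FinetuneReq, tokens_in_batch: int,
                             inference_finished: bool) -> None:
    # The active entry cycles through the dataset as steps complete.
    entry = req.completed_training_steps % len(req.dataset)
    req.dataset_entry_processed_tokens += tokens_in_batch
    req.processed_finetuning_tokens += tokens_in_batch
    if req.dataset_entry_processed_tokens == len(req.dataset[entry]):
        req.completed_training_steps += 1       # finished one dataset entry
        req.dataset_entry_processed_tokens = 0  # start the next entry fresh
    if (req.completed_training_steps == req.max_training_steps
            or inference_finished):
        req.status = "COMPLETED"
```
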
int layer_slot = i - processed_whole_layer_tokens; int layer_slot_total = treeLayers[layer_num]; - if ((first_layer_slot == layer_slot)) { + if (first_layer_slot == layer_slot) { verifiedTree.push_back(output); new_committed_tokens.push_back(std::make_pair( input.second, committed_tokens.at(guid).at(i).second)); @@ -2297,19 +2707,34 @@ std::vector> } std::vector - FFModel::generate(std::vector &prompts, int max_seq_length) { + FFModel::generate(std::vector const &requests) { RequestManager *rm = RequestManager::get_request_manager(); - std::vector guids; - for (int i = 0; i < prompts.size(); i++) { - RequestManager::RequestGuid guid = - rm->register_new_request(prompts.at(i), max_seq_length); - if (guid != RequestManager::INVALID_GUID) { - guids.push_back(guid); + // reset inference_finished flag + rm->set_inference_finished(false); + std::vector inf_guids, peft_guids; + for (int i = 0; i < requests.size(); i++) { + RequestManager::RequestGuid guid; + if (requests.at(i).req_type == RequestType::REQ_INFERENCE) { + guid = rm->register_new_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + inf_guids.push_back(guid); + } + } else { + guid = rm->register_new_peft_request(requests.at(i)); + if (guid != RequestManager::INVALID_GUID) { + peft_guids.push_back(guid); + } } } std::vector results; - for (int i = 0; i < guids.size(); i++) { - results.push_back(rm->get_generation_result(guids[i])); + for (int i = 0; i < inf_guids.size(); i++) { + results.push_back(rm->get_generation_result(inf_guids[i])); + } + if (inf_guids.size() > 0) { + rm->set_inference_finished(); + } + for (int i = 0; i < peft_guids.size(); i++) { + results.push_back(rm->get_generation_result(peft_guids[i])); } return results; } @@ -2342,6 +2767,18 @@ void RequestManager::background_serving_task( std::vector const ®ions, Context ctx, Runtime *runtime) { + + auto print_timestamped_message = [](std::string const &message) { + auto now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::cout << std::put_time(std::localtime(&now), "%Y-%m-%d %X") << " - " + << message << std::endl; + }; + + // Print at the start of the task + print_timestamped_message( + "###PEFT DEBUGGING### Starting background serving task."); + RequestManager *rm = RequestManager::get_request_manager(); FFModel *llm = *(FFModel **)task->args; { @@ -2358,6 +2795,11 @@ void RequestManager::background_serving_task( ssm->config.lg_ctx = ctx; } } + + // Checkpoint print + print_timestamped_message( + "###PEFT DEBUGGING### Updated models' configuration."); + if (rm->get_num_ssms() == 0) { // No SSMs: perform incremental decoding rm->serve_incr_decoding(llm); @@ -2365,13 +2807,48 @@ void RequestManager::background_serving_task( // Registered SSMs: perform speculative inference rm->serve_spec_infer(llm); } + #ifdef FF_USE_NCCL llm->finish_nccl_comms(); #endif + + // Print at the end of the task + print_timestamped_message( + "###PEFT DEBUGGING### Background serving task completed."); +} + +std::string find_layer_name_from_guid(FFModel *model, LayerID guid) { + for (size_t i = 0; i < model->layers.size(); i++) { + if (model->layers[i]->layer_guid == guid) { + std::string layer_name(model->layers[i]->name); + return layer_name; + } + } + assert(false); + return "invalid_layer_name"; +} + +bool is_peft_operator_type(OperatorType type) { + switch (type) { + case OP_LORA: + return true; + default: + return false; + } } /*static*/ void RequestManager::serve_incr_decoding(FFModel *llm) { + + // Check if the model object exists 
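
The rewritten FFModel::generate above splits incoming work into inference and finetuning requests, blocks on the inference results first, and only then raises the inference-finished flag so any queued finetuning requests can drain before their results are collected. A rough sketch of that ordering against a hypothetical Python-side request-manager interface (names are illustrative, not the C++ API shown above):

```python
# Rough sketch of the ordering FFModel::generate now follows; `rm` is assumed
# to expose register/get-result calls analogous to the C++ RequestManager.
def generate(rm, requests):
    rm.set_inference_finished(False)
    inf_guids, peft_guids = [], []
    for req in requests:
        if req.is_inference:
            guid = rm.register_new_request(req)
            target = inf_guids
        else:
            guid = rm.register_new_peft_request(req)
            target = peft_guids
        if guid is not None:           # skip requests the manager rejected
            target.append(guid)
    results = [rm.get_generation_result(g) for g in inf_guids]  # blocks
    if inf_guids:
        rm.set_inference_finished(True)  # lets pending finetuning work drain
    results += [rm.get_generation_result(g) for g in peft_guids]
    return results
```
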
+ if (llm == nullptr) { + std::cout << "###PEFT DEBUGGING### LLM Model object does not exist." + << std::endl; + return; // Early return to prevent further operations on a nullptr + } else { + std::cout << "###PEFT DEBUGGING### LLM Model object exists." << std::endl; + } + Context ctx = llm->config.lg_ctx; Runtime *runtime = llm->config.lg_hlr; // Compile the llm @@ -2419,6 +2896,9 @@ void RequestManager::serve_incr_decoding(FFModel *llm) { BatchConfigFuture bcf = prepare_next_batch(next_batch.first, next_batch.second, ctx, runtime); FutureMap fm = im->inference(llm, 0, bcf); + if (llm->config.enable_peft) { + im->peft_bwd(llm, 0, bcf); + } assert(fm.get_future_map_domain().get_volume() == 1); InferenceResultFuture irf = fm.get_future(0); batch_pipeline.push(std::make_pair(bcf, irf)); diff --git a/src/runtime/request_manager.cpp b/src/runtime/request_manager.cpp index fadbf80d6d..8e5f302466 100644 --- a/src/runtime/request_manager.cpp +++ b/src/runtime/request_manager.cpp @@ -73,74 +73,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(hipMemcpyAsync(handle.batch_config_metadata, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, &(beam_batch_config->beamTokenInfo), sizeof(BeamSearchBatchConfig::beamTokenInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, &(beam_batch_config->beamRequestsInfo), sizeof(BeamSearchBatchConfig::beamRequestsInfo), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(beam_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::causalMask); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->causalMask, &(tree_batch_config->causalMask), sizeof(BatchConfig::causalMask), hipMemcpyHostToDevice, stream)); - total_copy_size += 
sizeof(BatchConfig::causalMask); - checkCUDA(hipMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->committed_tokens, &(tree_batch_config->committed_tokens), sizeof(TreeVerifyBatchConfig::committed_tokens), hipMemcpyHostToDevice, stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - } - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); + checkCUDA(hipMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + hipMemcpyHostToDevice, + stream)); + } } void RequestManager::load_positions_task( diff --git a/src/runtime/request_manager.cu b/src/runtime/request_manager.cu index 8380d6be73..343f1dd6e6 100644 --- a/src/runtime/request_manager.cu +++ b/src/runtime/request_manager.cu @@ -40,8 +40,21 @@ void RequestManager::load_tokens_task( printf("Warning: too many tokens in prompt, only load up to %d tokens\n", BatchConfig::max_tokens_per_batch()); printf("Got: %d tokens\n", batch_config->num_tokens); + + // pid_t pid = getpid(); + // std::string filename = "bc_" + std::to_string(pid) + ".txt"; + // std::ofstream file(filename); + // if (file.is_open()) { + // file << *batch_config << std::endl; + // file.close(); + // std::cout << "String written to file: " << filename << std::endl; + // } else { + // std::cout << "Unable to open file: " << filename << std::endl; + // } + } else if (batch_config->num_tokens > - BatchConfig::max_verify_tokens_per_batch()) { + BatchConfig::max_verify_tokens_per_batch() && + batch_config->get_mode() != INC_DECODING_MODE) { printf("Warning: Speculative decoding. too many tokens in prompt, only " "load up to %d tokens\n", BatchConfig::max_verify_tokens_per_batch()); @@ -80,91 +93,69 @@ void RequestManager::load_batch_config_task( // copy meta data to workSpace FFHandler handle = *((FFHandler const *)task->local_args); - size_t total_copy_size = 0; - checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->tokens_info, &(batch_config->tokensInfo), sizeof(BatchConfig::tokensInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::tokensInfo); - checkCUDA(cudaMemcpyAsync(static_cast(handle.batch_config_metadata) + - total_copy_size, + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->requestsInfo, &(batch_config->requestsInfo), sizeof(BatchConfig::requestsInfo), cudaMemcpyHostToDevice, stream)); - total_copy_size += sizeof(BatchConfig::requestsInfo); // load speculative metadata if (batch_config->get_mode() == BEAM_SEARCH_MODE) { BeamSearchBatchConfig const *beam_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamTokenInfo), - sizeof(BeamSearchBatchConfig::beamTokenInfo), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BeamSearchBatchConfig::beamTokenInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->beamRequestsInfo), - sizeof(BeamSearchBatchConfig::beamRequestsInfo), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BeamSearchBatchConfig::beamRequestsInfo); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(beam_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, 
- stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamTokenInfo, + &(beam_batch_config->beamTokenInfo), + sizeof(BeamSearchBatchConfig::beamTokenInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->beamRequestsInfo, + &(beam_batch_config->beamRequestsInfo), + sizeof(BeamSearchBatchConfig::beamRequestsInfo), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(beam_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); + } else if (batch_config->get_mode() == TREE_VERIFY_MODE) { TreeVerifyBatchConfig const *tree_batch_config = static_cast(batch_config); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->causalMask), - sizeof(BatchConfig::causalMask), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(BatchConfig::causalMask); - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(tree_batch_config->committed_tokens), - sizeof(TreeVerifyBatchConfig::committed_tokens), - cudaMemcpyHostToDevice, - stream)); - total_copy_size += sizeof(TreeVerifyBatchConfig::committed_tokens); - - checkCUDA(cudaMemcpyAsync( - static_cast(handle.batch_config_metadata) + total_copy_size, - &(batch_config->request_completed), - sizeof(BatchConfig::request_completed), - cudaMemcpyHostToDevice, - stream)); - - total_copy_size += sizeof(BatchConfig::request_completed); + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->causalMask, + &(tree_batch_config->causalMask), + sizeof(BatchConfig::causalMask), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->committed_tokens, + &(tree_batch_config->committed_tokens), + sizeof(TreeVerifyBatchConfig::committed_tokens), + cudaMemcpyHostToDevice, + stream)); + + checkCUDA(cudaMemcpyAsync(handle.batch_config_metadata->request_completed, + &(batch_config->request_completed), + sizeof(BatchConfig::request_completed), + cudaMemcpyHostToDevice, + stream)); } - - // add a size check - assert(total_copy_size <= handle.batch_config_metadata_size); } void RequestManager::load_positions_task( diff --git a/src/runtime/simulator.cpp b/src/runtime/simulator.cpp index 0daf151d2c..56931e0dc7 100644 --- a/src/runtime/simulator.cpp +++ b/src/runtime/simulator.cpp @@ -82,17 +82,17 @@ Simulator::Simulator(FFModel const *model, checkCUDA(hipEventCreate(&start_event)); checkCUDA(hipEventCreate(&end_event)); - conv2d_meta = new Conv2DMeta(handler); - // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); - // ele_binary_meta = new ElementBinaryMeta(handler); - // embedding_meta = new EmbeddingMeta(handler); - // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); - // 
dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); + // linear_meta = new LinearMeta(handler, 4096); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); + // ele_binary_meta = new ElementBinaryMeta(handler); + // embedding_meta = new EmbeddingMeta(handler); + // softmax_meta = new SoftmaxMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); + // dropout_meta = new DropoutMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; diff --git a/src/runtime/simulator.cu b/src/runtime/simulator.cu index b44ce1690a..056781f73d 100644 --- a/src/runtime/simulator.cu +++ b/src/runtime/simulator.cu @@ -81,17 +81,17 @@ Simulator::Simulator(FFModel const *model, cudaEventCreate(&start_event); cudaEventCreate(&end_event); - conv2d_meta = new Conv2DMeta(handler); + // conv2d_meta = new Conv2DMeta(handler); // linear_meta = new LinearMeta(handler, 4096); - pool2d_meta = new Pool2DMeta(handler); - ele_unary_meta = new ElementUnaryMeta(handler); + // pool2d_meta = new Pool2DMeta(handler); + // ele_unary_meta = new ElementUnaryMeta(handler); // ele_binary_meta = new ElementBinaryMeta(handler); // embedding_meta = new EmbeddingMeta(handler); // softmax_meta = new SoftmaxMeta(handler); - batch_matmul_meta = new BatchMatmulMeta(handler); - concat_meta = new ConcatMeta(handler); + // batch_matmul_meta = new BatchMatmulMeta(handler); + // concat_meta = new ConcatMeta(handler); // dropout_meta = new DropoutMeta(handler); - transpose_meta = new TransposeMeta(handler); + // transpose_meta = new TransposeMeta(handler); this->machine = machine; segment_size = model->config.simulator_segment_size; max_num_segments = model->config.simulator_max_num_segments; @@ -103,13 +103,13 @@ Simulator::~Simulator(void) { simulatorInst.destroy(); cudaEventDestroy(start_event); cudaEventDestroy(end_event); - delete conv2d_meta; - delete pool2d_meta; - delete ele_unary_meta; - delete batch_matmul_meta; - delete concat_meta; - delete transpose_meta; - delete task_manager; + // delete conv2d_meta; + // delete pool2d_meta; + // delete ele_unary_meta; + // delete batch_matmul_meta; + // delete concat_meta; + // delete transpose_meta; + // delete task_manager; } __host__ void diff --git a/src/runtime/substitution.cc b/src/runtime/substitution.cc index b86964049d..9b6510fe5e 100644 --- a/src/runtime/substitution.cc +++ b/src/runtime/substitution.cc @@ -43,6 +43,7 @@ #include "flexflow/parallel_ops/allreduce.h" #include "flexflow/parallel_ops/combine.h" #include "flexflow/parallel_ops/fused_parallel_op.h" +#include "flexflow/parallel_ops/parallel_identity.h" #include "flexflow/parallel_ops/partition.h" #include "flexflow/parallel_ops/reduction.h" #include "flexflow/parallel_ops/replicate.h" @@ -3754,14 +3755,17 @@ bool FFModel::convert_graph_to_operators( assert(inList.size() == 1); Softmax *softmax = (Softmax *)node.ptr; new_op = new Softmax( - *this, softmax->layer_guid, inputs[0], softmax->dim, NULL); + *this, softmax->layer_guid, inputs[0], softmax->dim, softmax->name); break; } case OP_COMBINE: { assert(inList.size() == 1); Combine *combine = (Combine *)node.ptr; - new_op = new Combine( - *this, inputs[0], combine->combine_dim, combine->combine_degree); + new_op = new Combine(*this, + inputs[0], + 
combine->combine_dim, + combine->combine_degree, + combine->name); break; } case OP_REPARTITION: { @@ -3770,7 +3774,8 @@ bool FFModel::convert_graph_to_operators( new_op = new Repartition(*this, inputs[0], repart->repartition_dim, - repart->repartition_degree); + repart->repartition_degree, + repart->name); break; } case OP_REPLICATE: { @@ -3779,7 +3784,8 @@ bool FFModel::convert_graph_to_operators( new_op = new Replicate(*this, inputs[0], replicate->replicate_dim, - replicate->replicate_degree); + replicate->replicate_degree, + replicate->name); break; } case OP_REDUCTION: { @@ -3788,13 +3794,24 @@ bool FFModel::convert_graph_to_operators( new_op = new Reduction(*this, inputs[0], reduction->reduction_dim, - reduction->reduction_degree); + reduction->reduction_degree, + reduction->name); break; } case OP_ALLREDUCE: { assert(inList.size() == 1); AllReduce *allreduce = (AllReduce *)node.ptr; - new_op = new AllReduce(*this, inputs[0], allreduce->allreduce_dim); + new_op = new AllReduce( + *this, inputs[0], allreduce->allreduce_dim, allreduce->name); + break; + } + case OP_PARALLEL_IDENTITY: { + assert(inList.size() == 1); + ParallelIdentity *parallel_identity = (ParallelIdentity *)node.ptr; + new_op = new ParallelIdentity(*this, + inputs[0], + parallel_identity->parallel_identity_dim, + parallel_identity->name); break; } case OP_FUSED_PARALLEL: { @@ -3819,8 +3836,9 @@ bool FFModel::convert_graph_to_operators( abr_ln->elementwise_affine, abr_ln->use_bias, abr_ln->eps, + abr_ln->inplace_residual, true, - NULL); + abr_ln->name); break; } case OP_SIGMOID_SILU_MULTI: { @@ -3828,7 +3846,7 @@ bool FFModel::convert_graph_to_operators( SigmoidSiluMulti *ssm = (SigmoidSiluMulti *)node.ptr; SigmoidSiluMultiParams params = ssm->get_params(); new_op = new SigmoidSiluMulti( - *this, ssm->layer_guid, inputs[0], inputs[1], NULL); + *this, ssm->layer_guid, inputs[0], inputs[1], ssm->name); break; } default: { diff --git a/src/runtime/tree_verify_batch_config.cc b/src/runtime/tree_verify_batch_config.cc index 49d42bb6dd..a71b1070b2 100644 --- a/src/runtime/tree_verify_batch_config.cc +++ b/src/runtime/tree_verify_batch_config.cc @@ -54,6 +54,10 @@ std::ostream &operator<<(std::ostream &os, TreeVerifyBatchConfig const &bc) { os << " Number of tokens in batch: " << bc.requestsInfo[i].num_tokens_in_batch << std::endl; os << " GUID: " << bc.requestsInfo[i].request_guid << std::endl; + // PEFT values + os << " PEFT Model ID: " << bc.requestsInfo[i].peft_model_id + << std::endl; + os << " PEFT bwd: " << bc.requestsInfo[i].peft_bwd << std::endl; os << " Max sequence length: " << bc.requestsInfo[i].max_sequence_length << std::endl; os << " Request completed: " << bc.request_completed[i] << std::endl; diff --git a/tests/.gitignore b/tests/.gitignore deleted file mode 100644 index f3732d54f4..0000000000 --- a/tests/.gitignore +++ /dev/null @@ -1 +0,0 @@ -inference/python_test_configs/*.json diff --git a/tests/align/test_all_operators.sh b/tests/align/test_all_operators.sh index 3fb361f25c..73b0cb30dc 100755 --- a/tests/align/test_all_operators.sh +++ b/tests/align/test_all_operators.sh @@ -11,7 +11,7 @@ function generate_torch_tensor(){ python tests/align/align_create_tensor_torch.py -o "$1" } -ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear gather) +ops=(add concat conv2d cos embedding exp flat getitem identity multiply pool2d reducesum relu reshape 
scalar_add scalar_multiply scalar_sub scalar_truediv sigmoid sin subtract tanh transpose view_embedding max min linear layernorm gather) #create flexflow tensors conda activate flexflow diff --git a/tests/cpp_gpu_tests.sh b/tests/cpp_gpu_tests.sh index 1e8dd4298f..c7206eac93 100755 --- a/tests/cpp_gpu_tests.sh +++ b/tests/cpp_gpu_tests.sh @@ -23,8 +23,8 @@ remove_mnist() { download_mnist() { if [[ ! -f train-images-idx3-ubyte || ! -f train-labels-idx1-ubyte ]]; then remove_mnist - wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz - wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz + wget https://mnist-backup.s3.us-east-2.amazonaws.com/train-images-idx3-ubyte.gz + wget https://mnist-backup.s3.us-east-2.amazonaws.com/train-labels-idx1-ubyte.gz gzip -d train-images-idx3-ubyte.gz gzip -d train-labels-idx1-ubyte.gz fi diff --git a/tests/inference/cpp_inference_tests.sh b/tests/inference/cpp_inference_tests.sh index 8beea55999..a9dd8809ba 100755 --- a/tests/inference/cpp_inference_tests.sh +++ b/tests/inference/cpp_inference_tests.sh @@ -10,26 +10,26 @@ cd "${BASH_SOURCE[0]%/*}" ############################################################################################### # LLAMA -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama.txt -pipeline-parallelism-degree 4 # LLAMA (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half.txt -pipeline-parallelism-degree 4 # OPT -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt.txt -pipeline-parallelism-degree 4 # OPT (half precision) -../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 +../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 
-ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -ssm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_llama_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (half precision) - ../../build/inference/spec_infer/spec_infer -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/spec_infer/spec_infer -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -ssm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/spec_inference_opt_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### @@ -37,63 +37,63 @@ fi ############################################################################################### # LLAMA (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 4 ../../build/inference/incr_decoding/incr_decoding -ll:gpu 1 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M.txt -pipeline-parallelism-degree 1 # LLAMA (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half.txt -pipeline-parallelism-degree 4 # LLAMA (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B.txt -pipeline-parallelism-degree 4 # LLAMA (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half.txt -pipeline-parallelism-degree 4 # OPT (small model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M.txt -pipeline-parallelism-degree 4 # OPT (small model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 
+../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half.txt -pipeline-parallelism-degree 4 # OPT (big model) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B.txt -pipeline-parallelism-degree 4 # OPT (big model, half precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half.txt -pipeline-parallelism-degree 4 # Falcon (full precision) -../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 40000 --fusion --use-full-precision -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # Falcon (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model tiiuae/falcon-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_falcon_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (full precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B.txt -pipeline-parallelism-degree 4 # # StarCoder (half precision) -# ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt 
../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 +# ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model bigcode/starcoderbase-7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_starcoder_7B_half.txt -pipeline-parallelism-degree 4 # Tensor parallelism tests if [ "$TENSOR_PARALLELISM_TESTS" = "ON" ]; then # LLAMA (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model JackFram/llama-160m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_160M_half_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # LLAMA (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + 
../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # LLAMA (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model meta-llama/Llama-2-7b-hf -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_llama_2_7B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (small model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_tp4.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (small model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-125m -prompt ../../inference/prompt/test.json -output-file 
../../inference/output/incr_decoding_opt_125M_half_tp.txt -pipeline-parallelism-degree 1 -tensor-parallelism-degree 4 # OPT (big model) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion --use-full-precision -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 # OPT (big model, half precision) - ../../build/inference/incr_decoding/incr_decoding -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 + ../../build/inference/incr_decoding/incr_decoding -ll:cpu 4 -ll:util 4 -ll:gpu 4 -ll:fsize 14000 -ll:zsize 30000 --fusion -llm-model facebook/opt-6.7b -prompt ../../inference/prompt/test.json -output-file ../../inference/output/incr_decoding_opt_6B_half_tp.txt -pipeline-parallelism-degree 2 -tensor-parallelism-degree 2 fi ############################################################################################### diff --git a/tests/inference/huggingface_inference.py b/tests/inference/huggingface_inference.py index 6857b5cbc1..5e563c9974 100644 --- a/tests/inference/huggingface_inference.py +++ b/tests/inference/huggingface_inference.py @@ -77,20 +77,18 @@ def main(): # Set default tensor type depending on argument indicating the float type to use if not args.use_full_precision: - torch.set_default_tensor_type(torch.HalfTensor) - + torch.set_default_dtype(torch.float16) + else: + torch.set_default_dtype(torch.float32) + # Run huggingface model cuda_availble = torch.cuda.is_available() device = "cuda" if args.gpu and cuda_availble else "cpu" # Get Model - model = AutoModelForCausalLM.from_pretrained(args.model_name).to(device) + model = AutoModelForCausalLM.from_pretrained(args.model_name, trust_remote_code=True).to(device) # Get Tokenizer hf_config = AutoConfig.from_pretrained(args.model_name, trust_remote_code=True) - hf_arch = getattr(hf_config, "architectures")[0] - if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": - tokenizer = LlamaTokenizer.from_pretrained(args.model_name, use_fast=True) - else: - tokenizer = AutoTokenizer.from_pretrained(args.model_name) + tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True) generation_config = GenerationConfig.from_pretrained(args.model_name) generation_config.do_sample = args.do_sample ################# debugging ################# diff --git a/tests/inference/python_inference_tests.sh b/tests/inference/python_inference_tests.sh index a1ee281914..a83464754f 100755 --- a/tests/inference/python_inference_tests.sh +++ b/tests/inference/python_inference_tests.sh @@ -84,12 +84,13 @@ function compare_decoding_steps_spec_infer_incr_decoding { local specInf_file="$2" # Read the number of decoding steps from the second line of the files - second_line=$(sed -n '2p' "$incrDec_file") - read -r line <<< "$second_line" - incrDec=${line#*: } - second_line=$(sed -n '2p' "$specInf_file") - read -r line <<< 
"$second_line" - specInf=${line#*: } + first_line=$(sed -n '1p' "$incrDec_file") + incr_dec_steps="${first_line##*llm_decoding_steps(}" + incr_dec_steps="${incr_dec_steps%%)*}" + + first_line=$(sed -n '1p' "$specInf_file") + spec_inf_steps="${first_line##*llm_decoding_steps(}" + spec_inf_steps="${spec_inf_steps%%)*}" if ! command -v bc &> /dev/null; then echo "bc is not installed. Installing..." @@ -97,8 +98,8 @@ function compare_decoding_steps_spec_infer_incr_decoding { fi # Perform the comparison - threshold=$(bc <<< "$specInf * 1.5") - if (( $(echo "$incrDec >= $threshold" | bc -l) )); then + threshold=$(bc <<< "$spec_inf_steps * 1.5") + if (( $(echo "$incr_dec_steps >= $threshold" | bc -l) )); then #echo "The decoding steps in $specInf_file are at least 1.5x less than those in $incrDec_file." : else @@ -184,13 +185,13 @@ python3 ./huggingface_inference.py --model-name "facebook/opt-6.7b" --use-full-p # Falcon (full precision) python3 ./huggingface_inference.py --model-name "tiiuae/falcon-7b" --use-full-precision --prompt-file "../../inference/prompt/test.json" --output-file "../../inference/output/huggingface_falcon_7B.txt" --max-length 128 -diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_160M.txt" <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_160M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-llama-160m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_llama_7B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_llama_7B_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-llama-2-7b-hf-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_opt_125M.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") -diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +4 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) -diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") -#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +4 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") -diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +4 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_opt_125M.txt" <(tail 
-n +3 "../../inference/output/incr_dec-python-opt-125m-full_prec-1_tp_4_pp.txt") +diff <( < ../../inference/output/huggingface_opt_125M_half.txt tr -s '[:space:]' '\n' | head -n 20) <(tail -n +3 "../../inference/output/incr_dec-python-opt-125m-half_prec-1_tp_4_pp.txt" | tr -s '[:space:]' '\n' | head -n 20) +diff "../../inference/output/huggingface_opt_6B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-full_prec-1_tp_4_pp.txt") +#diff "../../inference/output/huggingface_opt_6B_half.txt" <(tail -n +3 "../../inference/output/incr_dec-python-opt-6.7b-half_prec-1_tp_4_pp.txt") +diff "../../inference/output/huggingface_falcon_7B.txt" <(tail -n +3 "../../inference/output/incr_dec-python-falcon-7b-full_prec-1_tp_4_pp.txt") diff --git a/tests/inference/python_test_configs/generate_configs.py b/tests/inference/python_test_configs/generate_configs.py index 41703cf431..0a745c7984 100644 --- a/tests/inference/python_test_configs/generate_configs.py +++ b/tests/inference/python_test_configs/generate_configs.py @@ -14,9 +14,12 @@ "tensor_parallelism_degree": 1, "pipeline_parallelism_degree": 4, "offload": False, - "offload_reserve_space_size": 1024**2, + "offload_reserve_space_size": 8 * 1024, # 8 GB "use_4bit_quantization": False, "use_8bit_quantization": False, + "enable_peft": False, + "peft_activation_reserve_space_size": 1024, # 1GB + "peft_weight_reserve_space_size": 1024, # 1GB "profiling": False, "benchmarking": False, "inference_debugging": False, diff --git a/tests/peft/alignment/align_test_utils.py b/tests/peft/alignment/align_test_utils.py new file mode 100644 index 0000000000..93727bdc89 --- /dev/null +++ b/tests/peft/alignment/align_test_utils.py @@ -0,0 +1,510 @@ +import os, re, torch +import numpy as np +from typing import List +from enum import Enum +from dataclasses import dataclass + +abs_dirname = os.path.dirname(os.path.abspath(__file__)) +cache_folder = os.path.expanduser(os.getenv("FF_CACHE_PATH", "~/.cache/flexflow")) +hf_path = os.path.join(cache_folder, "debug/huggingface") +ff_path = os.path.join(cache_folder, "debug/flexflow") + + +def print_unique_files_list(dirname): + files_list = os.listdir(dirname) + for f in sorted(files_list): + match = re.search(r"layers.\d+", f) + if match: + if "layers." in match[0]: + layer_num = int(match[0].split(".")[1]) + if layer_num > 0: + files_list.remove(f) + elif "layers_" in match[0]: + layer_num = int(match[0].split("_")[1]) + if layer_num > 0 and layer_num != 100: + files_list.remove(f) + return sorted(files_list) + + +def compare_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, tolerance=1e-2): + """Check whether a HuggingFace tensor and a FlexFlow tensor are equal + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor_filepath (str): The file path of the FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the checks. Defaults to 1e-2. 
+ + Raises: + FileNotFoundError: If the HuggingFace tensor file does not exist + FileNotFoundError: If the FlexFlow tensor file does not exist + """ + if not os.path.exists(hf_tensor_filepath): + raise FileNotFoundError(f"HF tensor file: {hf_tensor_filepath} not found") + if not os.path.exists(ff_tensor_filepath): + raise FileNotFoundError(f"FF tensor file {ff_tensor_filepath} not found") + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert len(hf_tensor) == 1 + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=",") + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor = ff_tensor[:len_hf_tensor] + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print(f"mismatch between {hf_tensor_filepath} and {ff_tensor_filepath}") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + # print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def compare_tensors_difference( + hf_tensor_filepath: str, + ff_tensor1_filepath: str, + ff_tensor2_filepath: str, + tolerance: float = 1e-2, +): + """Check whether a HuggingFace tensor is equal to the difference between two FlexFlow tensors + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor1_filepath (str): The file path of the first FlexFlow tensor + ff_tensor2_filepath (str): The file path of the second FlexFlow tensor + tolerance (float, optional): The floating-point error tolerance for the equality check. Defaults to 1e-2.
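+
+    Example:
+        Illustrative sketch only; the three dump names below are hypothetical and
+        stand for an HF tensor that should equal the element-wise difference of
+        two FlexFlow dumps:
+
+        >>> compare_tensors_difference(
+        ...     f"{hf_path}/fwd_step_0_layers.0.mlp.down_proj.output_0",
+        ...     f"{ff_path}/fwd_step_0_layers_0_layers.0.mlp.down_proj_shard_0_output_0",
+        ...     f"{ff_path}/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_shard_0_output_0",
+        ... )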
+ """ + assert os.path.exists(hf_tensor_filepath) + assert os.path.exists(ff_tensor1_filepath) + assert os.path.exists(ff_tensor2_filepath) + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert len(hf_tensor) == 1 + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor1 = np.loadtxt(ff_tensor1_filepath, delimiter=",") + ff_tensor2 = np.loadtxt(ff_tensor2_filepath, delimiter=",") + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor1 = ff_tensor1[:len_hf_tensor] + ff_tensor2 = ff_tensor2[:len_hf_tensor] + ff_tensor = ff_tensor1 - ff_tensor2 + + mismatches = [] + if not np.allclose(ff_tensor, hf_tensor, atol=tolerance): + print( + f"mismatch between {hf_tensor_filepath} and {ff_tensor1_filepath} - {ff_tensor2_filepath}" + ) + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(ff_tensor, hf_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor, hf_tensor, atol=tolerance))[0] + print(mismatches) + # print(np.nonzero(hf_tensor)[0]) + # print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0]) + # print(ff_tensor[36], hf_tensor[36]) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def compare_hf_tensors(tensor1_fp: str, tensor2_fp: str): + """Checks whether two HuggingFace tensors are equal + + Args: + tensor1_fp (str): The file path of the first tensor + tensor2_fp (str): The file path of the second tensor + """ + if not os.path.exists(tensor1_fp): + raise FileNotFoundError(f"HF tensor file: {tensor1_fp} not found") + if not os.path.exists(tensor2_fp): + raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found") + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = torch.load(tensor2_fp) + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert len(hf_tensor1) == 1 + hf_tensor1 = hf_tensor1[0] + if type(hf_tensor2) == tuple or type(hf_tensor2) == list: + assert len(hf_tensor2) == 1 + hf_tensor2 = hf_tensor2[0] + assert torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape + hf_tensor1 = torch.nan_to_num(hf_tensor1) + hf_tensor2 = torch.nan_to_num(hf_tensor2) + if not ( + np.allclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + ): + print(f"mismatch between {tensor1_fp} and {tensor2_fp}") + print(hf_tensor1) + print(hf_tensor2) + print( + np.isclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + ) + mismatches = np.where( + ~np.isclose( + hf_tensor1.detach().cpu().numpy(), hf_tensor2.detach().cpu().numpy() + ) + )[0] + print(mismatches) + assert False + print("Ok!") + + +def check_hf_sum_tensors(tensor_sum_fp: str, tensor1_fp: str, tensor2_fp: str): + """Checks whether a HuggingFace tensor is equal to the sum of two other HuggingFace tensors + + Args: + tensor_sum_fp (str): The file path of the sum tensor + tensor1_fp (str): The file path of the first tensor + tensor2_fp (str): The file path of the second tensor + """ + if not os.path.exists(tensor_sum_fp): + raise FileNotFoundError(f"HF tensor file: {tensor_sum_fp} not found") + if not os.path.exists(tensor1_fp): + raise FileNotFoundError(f"HF tensor file {tensor1_fp} not found") + if not os.path.exists(tensor2_fp): + raise FileNotFoundError(f"HF tensor file {tensor2_fp} not found") + hf_tensor_sum = torch.load(tensor_sum_fp) + hf_tensor1 = torch.load(tensor1_fp) + hf_tensor2 = 
torch.load(tensor2_fp) + if type(hf_tensor_sum) == tuple or type(hf_tensor_sum) == list: + assert len(hf_tensor_sum) == 1 + hf_tensor_sum = hf_tensor_sum[0] + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert len(hf_tensor1) == 1 + hf_tensor1 = hf_tensor1[0] + if type(hf_tensor2) == tuple or type(hf_tensor2) == list: + assert len(hf_tensor2) == 1 + hf_tensor2 = hf_tensor2[0] + assert torch.squeeze(hf_tensor_sum).shape == torch.squeeze(hf_tensor1).shape + assert torch.squeeze(hf_tensor1).shape == torch.squeeze(hf_tensor2).shape + hf_tensor1 = torch.nan_to_num(hf_tensor1) + hf_tensor2 = torch.nan_to_num(hf_tensor2) + hf_tensor_sum = torch.nan_to_num(hf_tensor_sum) + sum_check_tensor = hf_tensor1 + hf_tensor2 + if not ( + np.allclose( + sum_check_tensor.detach().cpu().numpy(), + hf_tensor_sum.detach().cpu().numpy(), + ) + ): + print(f"mismatch between {sum_check_tensor} and {tensor1_fp} + {tensor2_fp}") + print(tensor_sum_fp) + print(sum_check_tensor) + print(hf_tensor1) + print(hf_tensor2) + print( + np.isclose( + sum_check_tensor.detach().cpu().numpy(), + hf_tensor_sum.detach().cpu().numpy(), + ) + ) + mismatches = np.where( + ~np.isclose( + sum_check_tensor.detach().cpu().numpy(), + hf_tensor_sum.detach().cpu().numpy(), + ) + )[0] + print(mismatches) + assert False + print("Ok!") + + +def check_hf_zero_tensor(hf_tensor_fp: str): + """Check whether a HuggingFace tensor is a zero tensor + + Args: + hf_tensor_fp (str): The file path of the HuggingFace tensor + """ + if not os.path.exists(hf_tensor_fp): + raise FileNotFoundError(f"HF tensor file: {hf_tensor_fp} not found") + hf_tensor1 = torch.load(hf_tensor_fp) + if type(hf_tensor1) == tuple or type(hf_tensor1) == list: + assert len(hf_tensor1) == 1 + hf_tensor1 = hf_tensor1[0] + assert torch.count_nonzero(torch.nan_to_num(hf_tensor1)).sum() == 0 + + +def print_tensors(hf_tensor_filepath: str, ff_tensor_filepath: str, txt: str = ""): + """Print the contents of a HuggingFace tensor and a FlexFlow tensor + + Args: + hf_tensor_filepath (str): The file path of the HuggingFace tensor + ff_tensor_filepath (str): The file path of the FlexFlow tensor + txt (str, optional): Additional text to prepend to the tensors. Defaults to "". + """ + assert os.path.exists(hf_tensor_filepath) and os.path.exists(ff_tensor_filepath) + hf_tensor = torch.load(hf_tensor_filepath) + if type(hf_tensor) == tuple or type(hf_tensor) == list: + assert len(hf_tensor) == 1 + hf_tensor = hf_tensor[0] + hf_tensor = torch.nan_to_num(hf_tensor) + hf_tensor = hf_tensor.flatten().detach().cpu().numpy() + ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=",") + + len_hf_tensor = hf_tensor.shape[0] + ff_tensor = ff_tensor[:len_hf_tensor] + + print(f"{txt} - HF tensor:") + print(hf_tensor) + print(f"{txt} - FF tensor: ") + print(ff_tensor) + + +def compare_flexflow_tensors( + ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5, max_len: int = -1 +): + """Check whether two FlexFlow tensors are equal + + Args: + ff_tensor1_fp (str): The file path of the first FlexFlow tensor + ff_tensor2_fp (str): The file path of the second FlexFlow tensor + tolerance (float, optional): Floating-point error tolernace for the check. Defaults to 1e-5. + max_len (int, optional): Maximum number of elements to check (if > 0). Defaults to -1. 
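+
+    Example:
+        Illustrative sketch only; it compares two FlexFlow dumps that should hold
+        the same activation (the input of down_proj and the input of its LoRA
+        adapter), assuming both files were produced by a debugging run:
+
+        >>> compare_flexflow_tensors(
+        ...     f"{ff_path}/fwd_step_0_layers_0_layers.0.mlp.down_proj_shard_0_input_0",
+        ...     f"{ff_path}/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_shard_0_input_0",
+        ...     tolerance=1e-5,
+        ... )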
+ + Raises: + FileNotFoundError: If the first FlexFlow tensor file does not exist + FileNotFoundError: If the second FlexFlow tensor file does not exist + """ + if not os.path.exists(ff_tensor1_fp): + raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found") + if not os.path.exists(ff_tensor2_fp): + raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found") + assert os.path.exists(ff_tensor1_fp) and os.path.exists(ff_tensor2_fp) + ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",") + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",") + + if ff_tensor1.shape != ff_tensor2.shape: + print(ff_tensor1.shape, ff_tensor2.shape) + assert ff_tensor1.shape == ff_tensor2.shape + + if max_len > -1: + ff_tensor1 = ff_tensor1[:max_len] + ff_tensor2 = ff_tensor2[:max_len] + + mismatches = [] + if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance): + print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}") + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0] + print(mismatches) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len(ff_tensor1) + print("Ok!") + + +def compare_flexflow_tensors_shortest( + ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance: float = 1e-5 +): + """Compare two FlexFlow tensors up to the maximum length of the shortest tensor + + Args: + ff_tensor1_fp (str): The file path of the first FlexFlow tensor + ff_tensor2_fp (str): The file path of the second FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5. + + Raises: + FileNotFoundError: If the first FlexFlow tensor file does not exist + FileNotFoundError: If the second FlexFlow tensor file does not exist + """ + if not os.path.exists(ff_tensor1_fp): + raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found") + if not os.path.exists(ff_tensor2_fp): + raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found") + ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",") + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",") + minlen = min(ff_tensor1.shape[0], ff_tensor2.shape[0]) + ff_tensor1 = ff_tensor1[:minlen] + ff_tensor2 = ff_tensor2[:minlen] + mismatches = [] + if not np.allclose(ff_tensor1, ff_tensor2, atol=tolerance): + print(f"mismatch between {ff_tensor1_fp} and {ff_tensor2_fp}") + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(np.isclose(ff_tensor1, ff_tensor2, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor1, ff_tensor2, atol=tolerance))[0] + print(mismatches) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len(ff_tensor1) + print("Ok!") + + +def check_flexflow_tensors_sum( + ff_tensor_sum_fp: str, ff_tensor1_fp: str, ff_tensor2_fp: str, tolerance=1e-5 +): + """Check whether a FlexFlow tensor is equal to the sum of two other FlexFlow tensors + + Args: + ff_tensor_sum_fp (str): The file path of the FlexFlow sum tensor + ff_tensor1_fp (str): The file path of the first FlexFlow tensor + ff_tensor2_fp (str): The file path of the second FlexFlow tensor + tolerance (float, optional): Floating-point error tolerance for the check. Defaults to 1e-5.
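+
+    Example:
+        Illustrative sketch only; the dump names below are hypothetical and stand
+        for a residual-style check where the first tensor should equal the sum of
+        the other two:
+
+        >>> check_flexflow_tensors_sum(
+        ...     f"{ff_path}/fwd_step_0_layers_0_layers.0.post_attention_layernorm_shard_0_input_0",
+        ...     f"{ff_path}/fwd_step_0_layers_0_layers.0.self_attn_shard_0_output_0",
+        ...     f"{ff_path}/fwd_step_0_layers_0_layers.0.input_layernorm_shard_0_input_0",
+        ... )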
+ + Raises: + FileNotFoundError: If the first FlexFlow tensor file does not exist + FileNotFoundError: If the second FlexFlow tensor file does not exist + """ + if not os.path.exists(ff_tensor1_fp): + raise FileNotFoundError(f"FF tensor file: {ff_tensor1_fp} not found") + if not os.path.exists(ff_tensor2_fp): + raise FileNotFoundError(f"FF tensor file {ff_tensor2_fp} not found") + ff_tensor1 = np.loadtxt(ff_tensor1_fp, delimiter=",") + ff_tensor2 = np.loadtxt(ff_tensor2_fp, delimiter=",") + ff_tensor_sum = np.loadtxt(ff_tensor_sum_fp, delimiter=",") + + ff_sum = ff_tensor1 + ff_tensor2 + assert ff_tensor1.shape == ff_tensor2.shape + + mismatches = [] + if not np.allclose(ff_tensor_sum, ff_sum, atol=tolerance): + print( + f"mismatch between {ff_tensor_sum_fp} and sum of {ff_tensor1_fp} + {ff_tensor2_fp}" + ) + print(f"Tensor1: {ff_tensor1}\nTensor2:{ff_tensor2}") + print(f"Sum Tensor: {ff_tensor_sum}\nActual sum:{ff_sum}") + print(np.isclose(ff_tensor_sum, ff_sum, atol=tolerance)) + mismatches = np.where(~np.isclose(ff_tensor_sum, ff_sum, atol=tolerance))[0] + print(mismatches) + # assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance)) + assert len(mismatches) <= 0.05 * len(ff_tensor1) + print("Ok!") + + +def load_ff_tensor(filename: str, shape: List[int]): + """Load a FlexFlow tensor from a file as a numpy array + + Args: + filename (str): The file path of the FF tensor + shape (List[int]): The shape of the FF tensor + + Returns: + np.ndarray: The FF tensor as a numpy array + """ + if ff_path not in filename: + filename = os.path.join(ff_path, filename) + ff_tensor = np.loadtxt(filename, delimiter=",").reshape(shape, order="F") + return ff_tensor + + +def load_hf_tensor(filename: str): + """Load a HuggingFace tensor from a file as a numpy array + + Args: + filename (str): The file path of the HF tensor + + Returns: + np.ndarray: The HF tensor as a numpy array + """ + if hf_path not in filename: + filename = os.path.join(hf_path, filename) + hf_tensor = torch.load(filename) + hf_tensor = hf_tensor.detach().cpu().numpy() + return hf_tensor + + +def compare_loaded_tensors(hf_tensor, ff_tensor, tolerance=1e-2): + """Check whether a HuggingFace tensor and a FlexFlow tensor, both already loaded to memory as numpy arrays, are equal + + Args: + hf_tensor (np.ndarray): The HuggingFace tensor (in numpy array form) + ff_tensor (np.ndarray): The FlexFlow tensor (in numpy array form) + tolerance (float, optional): The floating-point error tolerance for the check. Defaults to 1e-2.
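+
+    Example:
+        Illustrative sketch only; it assumes hf_t and ff_t are loaded with the
+        load_hf_tensor / load_ff_tensor helpers above and reshaped to a common
+        layout before the comparison:
+
+        >>> hf_t = load_hf_tensor("fwd_step_0_layers.0.self_attn.o_proj.output_0").squeeze().T
+        >>> ff_t = load_ff_tensor("fwd_step_0_layers_0_layers.0.self_attn_shard_0_output_0", list(hf_t.shape))
+        >>> compare_loaded_tensors(hf_t, ff_t, tolerance=1e-5)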
+ """ + assert hf_tensor.shape == ff_tensor.shape + mismatches = [] + if not np.allclose(hf_tensor, ff_tensor, atol=tolerance): + print(f"mismatch between hf_tensor and ff_tensor") + print(f"HF: {hf_tensor}\nFF:{ff_tensor}") + print(np.isclose(hf_tensor, ff_tensor, atol=tolerance)) + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(mismatches) + len_hf_tensor = hf_tensor.flatten().shape[0] + assert len(mismatches) <= 0.05 * len_hf_tensor + print("Ok!") + + +def are_np_arrays_identical(*np_arrays): + if len(np_arrays) < 2: + return True + + first = np_arrays[0] + + # Check shapes and dtypes + if not all( + t.shape == first.shape and t.dtype == first.dtype for t in np_arrays[1:] + ): + return False + + # Stack all tensors along a new axis + stacked = np.stack(np_arrays) + + # Check if all elements along the new axis are equal + return np.all(stacked == stacked[0]) + + +class TPType(Enum): + REPLICATE = 0 + PARTITION = 1 + TO_REDUCE = 2 + + +@dataclass +class TensorComparisonIdxs: + hf_tensor_type: str + ff_tensor_type: str + hf_tensor_idx: int + ff_tensor_idx: int + + +def replace_value(lst, old_value, new_value): + occurrences = lst.count(old_value) + if occurrences == 0: + raise ValueError(f"Value {old_value} not found in the list.") + elif occurrences > 1: + raise ValueError(f"Multiple instances of {old_value} found in the list.") + else: + index = lst.index(old_value) + lst[index] = new_value + return lst + + +def truncate_dimension(tensor, old_dim, new_dim): + # Check if old_dim appears exactly once in the tensor's shape + shape = tensor.shape + dim_occurrences = shape.count(old_dim) + + if dim_occurrences == 0: + raise ValueError(f"Dimension {old_dim} not found in the tensor shape.") + elif dim_occurrences > 1: + raise ValueError( + f"Multiple instances of dimension {old_dim} found in the tensor shape." + ) + + # Check if new_dim is less than or equal to old_dim + if new_dim > old_dim: + raise ValueError( + f"New dimension ({new_dim}) must be less than or equal to old dimension ({old_dim})." 
+ ) + + # Find the index of the dimension to truncate + dim_index = shape.index(old_dim) + + # Create a slice object for truncation + slices = [slice(None)] * len(shape) + slices[dim_index] = slice(0, new_dim) + + # Truncate the tensor + truncated_tensor = tensor[tuple(slices)] + + return truncated_tensor diff --git a/tests/peft/alignment/llama_alignment_tests.ipynb b/tests/peft/alignment/llama_alignment_tests.ipynb new file mode 100644 index 0000000000..86a4ef76c4 --- /dev/null +++ b/tests/peft/alignment/llama_alignment_tests.ipynb @@ -0,0 +1,2651 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/usr/FlexFlow/tests/peft/hf_peft_tensors /usr/FlexFlow/build/inference_tensors\n" + ] + } + ], + "source": [ + "print(hf_path, ff_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check weights (semi-automatically)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "def convert_hf_filename_to_ff_filename(f, num_layers=12):\n", + " if f.endswith(\".lm_head.weight\"):\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_lm_head_shard_0_weight_0\"\n", + " elif f == \"norm.weight\":\n", + " f_version = f\"fwd_step_0_layers_{num_layers-1}_norm_shard_0_weight_0\"\n", + " else:\n", + " f_version = \"fwd_step_0_\"\n", + " if f.startswith(\"layers.\"):\n", + " layernum = f.split(\"layers.\")[1].split(\".\")[0]\n", + " f_version += f\"layers_{layernum}_\"\n", + " f_version += f.split(\".weight\")[0].replace(\".base_layer\", \"\").replace(\".default\", \"\")\n", + " weight_index=\"0\"\n", + " if \"lora_A\" in f_version:\n", + " weight_index=\"A\"\n", + " elif \"lora_B\" in f_version:\n", + " weight_index=\"B\"\n", + " f_version = f_version.replace(\"lora_A\", \"lora\").replace(\"lora_B\", \"lora\")\n", + " f_version += f\"_shard_0_weight_{weight_index}\"\n", + " return f_version\n", + "\n", + "files_list = os.listdir(hf_path)\n", + "num_layers=12\n", + "for f in sorted(files_list):\n", + " if f.endswith(\".weight\"):\n", + " if \"self_attn\" in f:\n", + " continue\n", + " f_version = convert_hf_filename_to_ff_filename(f, num_layers=num_layers)\n", + " # print(f, f_version)\n", + " hf_w_path = os.path.join(hf_path, f)\n", + " ff_w_path = os.path.join(ff_path, f_version)\n", + " assert(os.path.isfile(hf_w_path))\n", + " assert(os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", os.path.isfile(hf_w_path), os.path.isfile(ff_w_path))\n", + " # print(\"\\t\", 
ff_w_path)\n", + "\n", + " # check equivalence\n", + " compare_tensors(hf_w_path, ff_w_path, tolerance=1e-5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load model for automatic check" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/opt/conda/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "from peft import PeftModel, PeftConfig\n", + "use_full_precision=True\n", + "peft_model_id=\"goliaro/llama-160m-lora\"\n", + "peft_config = PeftConfig.from_pretrained(peft_model_id)\n", + "if peft_config.peft_type != \"LORA\":\n", + " raise ValueError(f\"PEFT type {peft_config.peft_type} not supported yet\")\n", + "\n", + "peft_config.init_lora_weights = (\n", + " False\n", + ") # prevent HF from re-inizialing the weights randomly\n", + "model_name = peft_config.base_model_name_or_path\n", + "# Load base model, and apply the PEFT layer\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " torch_dtype=torch.float32 if use_full_precision else torch.float16,\n", + " device_map=\"auto\",\n", + ")\n", + "model = PeftModel.from_pretrained(model, peft_model_id, config=peft_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "embed_tokens True True\n", + "layers.0.self_attn.q_proj True True\n", + "layers.0.self_attn.k_proj True True\n", + "layers.0.self_attn.v_proj True True\n", + "layers.0.self_attn.o_proj True True\n", + "layers.0.self_attn.rotary_emb True True\n", + "layers.0.mlp.gate_proj True True\n", + "layers.0.mlp.up_proj True True\n", + "layers.0.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_A False False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.0.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.0.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_0_layers.0.mlp.act_fn_shard_0_output_0\n", + "layers.0.input_layernorm True True\n", + "layers.0.post_attention_layernorm True True\n", + "layers.1.self_attn.q_proj True True\n", + "layers.1.self_attn.k_proj True True\n", + "layers.1.self_attn.v_proj True True\n", + "layers.1.self_attn.o_proj True True\n", + "layers.1.self_attn.rotary_emb True True\n", + "layers.1.mlp.gate_proj True True\n", + "layers.1.mlp.up_proj True True\n", + "layers.1.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.1.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.1.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_1_layers.1.mlp.act_fn_shard_0_output_0\n", + "layers.1.input_layernorm True True\n", + "layers.1.post_attention_layernorm True True\n", + "layers.2.self_attn.q_proj True True\n", + "layers.2.self_attn.k_proj True True\n", + "layers.2.self_attn.v_proj True True\n", + "layers.2.self_attn.o_proj True True\n", + "layers.2.self_attn.rotary_emb True True\n", + "layers.2.mlp.gate_proj True True\n", + "layers.2.mlp.up_proj True True\n", + 
"layers.2.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.2.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.2.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_2_layers.2.mlp.act_fn_shard_0_output_0\n", + "layers.2.input_layernorm True True\n", + "layers.2.post_attention_layernorm True True\n", + "layers.3.self_attn.q_proj True True\n", + "layers.3.self_attn.k_proj True True\n", + "layers.3.self_attn.v_proj True True\n", + "layers.3.self_attn.o_proj True True\n", + "layers.3.self_attn.rotary_emb True True\n", + "layers.3.mlp.gate_proj True True\n", + "layers.3.mlp.up_proj True True\n", + "layers.3.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.3.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.3.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_3_layers.3.mlp.act_fn_shard_0_output_0\n", + "layers.3.input_layernorm True True\n", + "layers.3.post_attention_layernorm True True\n", + "layers.4.self_attn.q_proj True True\n", + "layers.4.self_attn.k_proj True True\n", + "layers.4.self_attn.v_proj True True\n", + "layers.4.self_attn.o_proj True True\n", + "layers.4.self_attn.rotary_emb True True\n", + "layers.4.mlp.gate_proj True True\n", + "layers.4.mlp.up_proj True True\n", + "layers.4.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.4.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.4.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_4_layers.4.mlp.act_fn_shard_0_output_0\n", + "layers.4.input_layernorm True True\n", + "layers.4.post_attention_layernorm True True\n", + "layers.5.self_attn.q_proj True True\n", + "layers.5.self_attn.k_proj True True\n", + "layers.5.self_attn.v_proj 
True True\n", + "layers.5.self_attn.o_proj True True\n", + "layers.5.self_attn.rotary_emb True True\n", + "layers.5.mlp.gate_proj True True\n", + "layers.5.mlp.up_proj True True\n", + "layers.5.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.5.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.5.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_5_layers.5.mlp.act_fn_shard_0_output_0\n", + "layers.5.input_layernorm True True\n", + "layers.5.post_attention_layernorm True True\n", + "layers.6.self_attn.q_proj True True\n", + "layers.6.self_attn.k_proj True True\n", + "layers.6.self_attn.v_proj True True\n", + "layers.6.self_attn.o_proj True True\n", + "layers.6.self_attn.rotary_emb True True\n", + "layers.6.mlp.gate_proj True True\n", + "layers.6.mlp.up_proj True True\n", + "layers.6.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_B.default True False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.6.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.6.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_6_layers.6.mlp.act_fn_shard_0_output_0\n", + "layers.6.input_layernorm True True\n", + "layers.6.post_attention_layernorm True True\n", + "layers.7.self_attn.q_proj True True\n", + "layers.7.self_attn.k_proj True True\n", + "layers.7.self_attn.v_proj True True\n", + "layers.7.self_attn.o_proj True True\n", + "layers.7.self_attn.rotary_emb True True\n", + "layers.7.mlp.gate_proj True True\n", + "layers.7.mlp.up_proj True True\n", + "layers.7.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.7.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.7.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_7_layers.7.mlp.act_fn_shard_0_output_0\n", + "layers.7.input_layernorm True True\n", + "layers.7.post_attention_layernorm True True\n", + 
"layers.8.self_attn.q_proj True True\n", + "layers.8.self_attn.k_proj True True\n", + "layers.8.self_attn.v_proj True True\n", + "layers.8.self_attn.o_proj True True\n", + "layers.8.self_attn.rotary_emb True True\n", + "layers.8.mlp.gate_proj True True\n", + "layers.8.mlp.up_proj True True\n", + "layers.8.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.8.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.8.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_8_layers.8.mlp.act_fn_shard_0_output_0\n", + "layers.8.input_layernorm True True\n", + "layers.8.post_attention_layernorm True True\n", + "layers.9.self_attn.q_proj True True\n", + "layers.9.self_attn.k_proj True True\n", + "layers.9.self_attn.v_proj True True\n", + "layers.9.self_attn.o_proj True True\n", + "layers.9.self_attn.rotary_emb True True\n", + "layers.9.mlp.gate_proj True True\n", + "layers.9.mlp.up_proj True True\n", + "layers.9.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_input_0 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.9.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.9.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_9_layers.9.mlp.act_fn_shard_0_output_0\n", + "layers.9.input_layernorm True True\n", + "layers.9.post_attention_layernorm True True\n", + "layers.10.self_attn.q_proj True True\n", + "layers.10.self_attn.k_proj True True\n", + "layers.10.self_attn.v_proj True True\n", + "layers.10.self_attn.o_proj True True\n", + "layers.10.self_attn.rotary_emb True True\n", + "layers.10.mlp.gate_proj True True\n", + "layers.10.mlp.up_proj True True\n", + "layers.10.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.10.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.10.mlp.act_fn True False\n", + "\t 
/usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.mlp.act_fn_shard_0_output_0\n", + "layers.10.input_layernorm True True\n", + "layers.10.post_attention_layernorm True True\n", + "layers.11.self_attn.q_proj True True\n", + "layers.11.self_attn.k_proj True True\n", + "layers.11.self_attn.v_proj True True\n", + "layers.11.self_attn.o_proj True True\n", + "layers.11.self_attn.rotary_emb True True\n", + "layers.11.mlp.gate_proj True True\n", + "layers.11.mlp.up_proj True True\n", + "layers.11.mlp.down_proj.base_layer True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.base_layer_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_dropout.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_dropout.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_A.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_A.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_B.default True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_B.default_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_A False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_A_shard_0_output_0\n", + "layers.11.mlp.down_proj.lora_embedding_B False False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.down_proj.lora_embedding_B_shard_0_output_0\n", + "layers.11.mlp.act_fn True False\n", + "\t /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_input_0 /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.mlp.act_fn_shard_0_output_0\n", + "layers.11.input_layernorm True True\n", + "layers.11.post_attention_layernorm True True\n", + "norm True True\n", + "lm_head True True\n" + ] + } + ], + "source": [ + "named_modules_ = [\n", + " name.replace(\"base_model.model.model.\", \"\").replace(\"base_model.model.model\", \"\").replace(\"base_model.model.\", \"\").replace(\"base_model.model\", \"\").replace(\"base_model.\", \"\").replace(\"base_model\", \"\")\n", + " for name, _ in model.named_modules()\n", + "]\n", + "\n", + "def remove_prefixes(named_modules):\n", + " i = 0\n", + " while i < len(named_modules) - 1:\n", + " if named_modules[i + 1].startswith(named_modules[i]):\n", + " named_modules.pop(i)\n", + " else:\n", + " i += 1\n", + " return named_modules\n", + "named_modules = remove_prefixes(named_modules_)\n", + "\n", + "def convert_hf_module_name_to_ff_filenames(n, num_layers=12):\n", + " if n == \"embed_tokens\":\n", + " ff_in_name = 
\"fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + " ff_out_name = \"fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + " elif n == \"lm_head\" or n == \"norm\":\n", + " ff_in_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{num_layers-1}_{n}_shard_0_output_0\"\n", + " elif n.startswith(\"layers.\"):\n", + " layernum = n.split(\"layers.\")[1].split(\".\")[0]\n", + " ff_in_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_input_0\"\n", + " ff_out_name = f\"fwd_step_0_layers_{layernum}_{n}_shard_0_output_0\"\n", + " else:\n", + " assert False, f\"Module {n} not supported yet\"\n", + " return os.path.join(ff_path, ff_in_name), os.path.join(ff_path, ff_out_name)\n", + "\n", + "# Compute the hf path, check if the input and output are there\n", + "for n in named_modules:\n", + " in_name = f\"fwd_step_0_{n}.input_0\"\n", + " out_name = f\"fwd_step_0_{n}.output_0\"\n", + " if n == \"lm_head\":\n", + " in_name = f\"fwd_step_0_base_model.model.{n}.input_0\"\n", + " out_name = f\"fwd_step_0_base_model.model.{n}.output_0\"\n", + " hf_mod_in = os.path.join(hf_path, in_name)\n", + " hf_mod_out = os.path.join(hf_path, out_name)\n", + " check = os.path.exists(hf_mod_in) and os.path.exists(hf_mod_out)\n", + " \n", + " check2=True\n", + " if \"self_attn\" not in n:\n", + " ff_mod_in, ff_mod_out = convert_hf_module_name_to_ff_filenames(n, num_layers=num_layers)\n", + " check2 = os.path.exists(ff_mod_in) and os.path.exists(ff_mod_out)\n", + " print(n, check, check2)\n", + " if not check2:\n", + " print(\"\\t\", ff_mod_in, ff_mod_out)\n", + " # print(n, check)\n", + " # print(\"\\t\", )\n", + " \n", + "\n", + "# Compute the corresponding ff path, check if the input and output are there\n", + "\n", + "# for x in named_modules:\n", + "# print(x)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'down_proj'}\n" + ] + } + ], + "source": [ + "print(model.peft_config['default'].target_modules)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manual check" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "hf_embed_input= \"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.input_0\"\n", + "ff_embed_input=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_input_0\"\n", + "compare_tensors(hf_embed_input, ff_embed_input)\n", + "hf_embed_output=\"/usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_embed_tokens.output_0\"\n", + "ff_embed_output=\"/usr/FlexFlow/tests/peft/inference_tensors/fwd_step_0_layers_0_embed_tokens_shard_0_output_0\"\n", + "compare_tensors(hf_embed_output, ff_embed_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.10.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_10_layers.10.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.06630182 6.3429456\n", + " -0.21220279]\n", + "FF:[ 0. 0. 0. ... 0.06630275 6.34293985\n", + " -0.21219885]\n", + "[ True True True ... True True True]\n", + "[15889]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.input_layernorm.input_0 and /usr/FlexFlow/build/inference_tensors/fwd_step_0_layers_11_layers.11.input_layernorm_shard_0_output_0\n", + "HF: [ 0. 0. 0. ... 0.14172177 9.79423\n", + " -6.2940273 ]\n", + "FF:[ 0. 0. 0. ... 0.14172006 9.79421902\n", + " -6.29402065]\n", + "[ True True True ... 
True True True]\n", + "[ 2878 3206 3367 3607 5183 5346 6257 6544 7466 7679 7805 8119\n", + " 8159 8911 9450 9897 13696 13938 14058 14599 15126 15839 16128 16195]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for i in range(tot_num_layers):\n", + " hf_input_ln_in = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.input_0\"\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_input_0\"\n", + " if i > 0:\n", + " ff_input_ln_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", + " compare_tensors(hf_input_ln_in, ff_input_ln_in, tolerance=1e-5)\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_layers.{i}.input_layernorm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_0\"\n", + " if i > 0:\n", + " ff_input_ln_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.input_layernorm_shard_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out, tolerance=1e-5)\n", + " hf_attn_out = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.o_proj.output_0\"\n", + " ff_attn_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.self_attn_shard_0_output_0\"\n", + " compare_tensors(hf_attn_out, ff_attn_out, tolerance=1e-5)\n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_layers.{i}.post_attention_layernorm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.post_attention_layernorm_shard_0_output_1\"\n", + " compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out, tolerance=1e-5)\n", + " # w1\n", + " hf_gate_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " ff_gate_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.gate_proj_shard_0_output_0\"\n", + " compare_tensors(hf_gate_proj_out, ff_gate_proj_out, tolerance=1e-5)\n", + " # w3\n", + " hf_up_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\" \n", + " ff_up_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.up_proj_shard_0_output_0\"\n", + " compare_tensors(hf_up_proj_out, ff_up_proj_out, tolerance=1e-5)\n", + " # w2\n", + " hf_down_proj_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.input_0\"\n", + " hf_down_proj_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.output_0\"\n", + " ff_down_proj_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_input_0\"\n", + " ff_down_proj_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj_shard_0_output_0\"\n", + " compare_tensors(hf_down_proj_in, ff_down_proj_in)\n", + " # compare_tensors(hf_down_proj_out, ff_down_proj_out)\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = 
f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/fwd_step_0_layers_{i}_layers.{i}.mlp.down_proj.lora_shard_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out, tolerance=1e-5)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out, tolerance=1e-5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-- LM head --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Final Norm --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "# ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "print(\"-- LM head --\")\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "# compare weights\n", + "hf_lm_head_weight = f\"{hf_path}/base_model.model.lm_head.weight\"\n", + "ff_lm_head_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_output_shard_0_weight_0\"\n", + "compare_tensors(hf_lm_head_weight, ff_lm_head_weight, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_output_shard_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, tolerance=1e-5)\n", + "# # Manually check the matmul\n", + "# ff_tensor_out = np.loadtxt(ff_BWD_lm_head_out, delimiter=',')\n", + "# ff_weight = np.loadtxt(ff_lm_head_weight, delimiter=',').reshape((4096,32000), order='F')\n", + "# ff_tensor_out = ff_tensor_out[:32000*24].reshape((32000,24), order='F')\n", + "# print(ff_tensor_out.shape)\n", + "# print(ff_weight.shape)\n", + "# print(np.matmul(ff_weight, ff_tensor_out))\n", + "# compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in)\n", + "# ff_tensor = np.loadtxt(ff_tensor_filepath, delimiter=',')\n", + "print(\"-- Final Norm --\")\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + 
"compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "ff_BWD_norm_weight = f\"{ff_path}/fwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_weight_0\"\n", + "hf_FWD_norm_weight = f\"{hf_path}/norm.weight\"\n", + "compare_tensors(hf_FWD_norm_weight, ff_BWD_norm_weight, tolerance=1e-5)\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/bwd_step_0_layers_{tot_num_layers-1}_norm_shard_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch import nn\n", + "class LlamaRotaryEmbedding(nn.Module):\n", + " def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):\n", + " super().__init__()\n", + "\n", + " self.dim = dim\n", + " self.max_position_embeddings = max_position_embeddings\n", + " self.base = base\n", + " inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))\n", + " self.register_buffer(\"inv_freq\", inv_freq, persistent=False)\n", + "\n", + " # Build here to make `torch.jit.trace` work.\n", + " self._set_cos_sin_cache(\n", + " seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()\n", + " )\n", + "\n", + " def _set_cos_sin_cache(self, seq_len, device, dtype):\n", + " self.max_seq_len_cached = seq_len\n", + " t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)\n", + "\n", + " freqs = torch.einsum(\"i,j->ij\", t, self.inv_freq)\n", + " # Different from paper, but it uses a different permutation in order to obtain the same calculation\n", + " emb = torch.cat((freqs, freqs), dim=-1)\n", + " self.register_buffer(\"cos_cached\", emb.cos().to(dtype), persistent=False)\n", + " self.register_buffer(\"sin_cached\", emb.sin().to(dtype), persistent=False)\n", + "\n", + " def forward(self, x, seq_len=None):\n", + " # x: [bs, num_attention_heads, seq_len, head_size]\n", + " if seq_len > self.max_seq_len_cached:\n", + " self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)\n", + "\n", + " return (\n", + " self.cos_cached[:seq_len].to(dtype=x.dtype),\n", + " self.sin_cached[:seq_len].to(dtype=x.dtype),\n", + " )\n", + "def rotate_half(x):\n", + " \"\"\"Rotates half the hidden dims of the input.\"\"\"\n", + " x1 = x[..., : x.shape[-1] // 2] # first half\n", + " x2 = x[..., x.shape[-1] // 2 :] # second half\n", + " return torch.cat((x2, -x1), dim=-1)\n", + "def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):\n", + " \"\"\"Applies Rotary Position Embedding to the query and key tensors.\n", + "\n", + " Args:\n", + " q (`torch.Tensor`): The query tensor.\n", + " k (`torch.Tensor`): The key tensor.\n", + " cos (`torch.Tensor`): The cosine part of the rotary embedding.\n", + " sin (`torch.Tensor`): The sine part of the rotary embedding.\n", + " position_ids (`torch.Tensor`):\n", + " The position indices of the tokens corresponding to the query and key tensors. For example, this can be\n", + " used to pass offsetted position ids when working with a KV-cache.\n", + " unsqueeze_dim (`int`, *optional*, defaults to 1):\n", + " The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and\n", + " sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note\n", + " that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. 
Then, if q and\n", + " k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes\n", + " cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have\n", + " the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.\n", + " Returns:\n", + " `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.\n", + " \"\"\"\n", + " cos = cos[position_ids].unsqueeze(unsqueeze_dim)\n", + " sin = sin[position_ids].unsqueeze(unsqueeze_dim)\n", + " q_embed = (q * cos) + (rotate_half(q) * sin)\n", + " k_embed = (k * cos) + (rotate_half(k) * sin)\n", + " return q_embed, k_embed\n", + "head_dim = 64\n", + "max_position_embeddings = 2048\n", + "rope_theta=10_000\n", + "kv_seq_len = 24\n", + "rotary_emb = LlamaRotaryEmbedding(\n", + " head_dim,\n", + " max_position_embeddings=max_position_embeddings,\n", + " base=rope_theta,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_feed_forward_w2_shard_0_input_0\n", + "HF: [ 6.4350547e+03 -6.4898600e+05 1.1761116e+05 ... 2.1410337e+01\n", + " 1.2096541e+01 3.6424692e+00]\n", + "FF:[ 6.43525000e+03 -6.48986062e+05 1.17611250e+05 ... 2.14103413e+01\n", + " 1.20965385e+01 3.64246368e+00]\n", + "[False True True ... True True True]\n", + "[ 0 162 185 308 339 745 747 820 830 909 933 968 1008 1156\n", + " 1160 1190 1212 1296 1304 1311 1323 1353 1395 1421 1523 1578 1689 1717\n", + " 1736 1748 1836 2074 2124 2192 2221 2313 2394 2515 2518 2693 2758 2825\n", + " 2888 2894 2937 3024]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_11_layers_11_attention_shard_0_o_proj_in_grad\n", + "HF: [ 1.2223595e+06 -2.6348565e+06 -5.0760525e+05 ... 
6.8275871e+01\n", + " -5.8116108e+01 9.5347488e+01]\n", + "FF:[ 1.22235925e+06 -2.63485625e+06 -5.07605000e+05 ... 6.82758865e+01\n", + " -5.81161423e+01 9.53475494e+01]\n", + "[ True True True ... True True True]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.22235950e+06 9.93645859e+01 -2.82157593e+01 ... -3.94578514e+01\n", + " -1.98409653e+01 -1.33438044e+01]\n", + " [-2.63485650e+06 -1.13461929e+02 1.14223976e+02 ... 7.52578735e+01\n", + " 1.33362747e+02 6.78501587e+01]\n", + " [-5.07605250e+05 4.34111862e+01 8.10619354e+01 ... 4.70537224e+01\n", + " 4.02149696e+01 6.98045502e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295319e+02 9.98417091e+00 ... 4.90895653e+01\n", + " 9.71413574e+01 6.82758713e+01]\n", + " [-3.64456375e+06 -2.43692596e+02 -6.85474396e+00 ... -3.71503868e+01\n", + " -1.34136658e+01 -5.81161079e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64005566e+00 ... 2.11662292e+00\n", + " 3.37400856e+01 9.53474884e+01]]\n", + "FF:[[ 1.22235925e+06 9.93645630e+01 -2.82157211e+01 ... -3.94577713e+01\n", + " -1.98408775e+01 -1.33438234e+01]\n", + " [-2.63485625e+06 -1.13461960e+02 1.14224037e+02 ... 7.52577744e+01\n", + " 1.33362701e+02 6.78501205e+01]\n", + " [-5.07605000e+05 4.34111404e+01 8.10619278e+01 ... 4.70536804e+01\n", + " 4.02149124e+01 6.98045578e+01]\n", + " ...\n", + " [ 3.02792250e+06 3.31295227e+02 9.98412323e+00 ... 4.90895386e+01\n", + " 9.71413727e+01 6.82758865e+01]\n", + " [-3.64456400e+06 -2.43692627e+02 -6.85472488e+00 ... -3.71504822e+01\n", + " -1.34137001e+01 -5.81161423e+01]\n", + " [ 3.31921500e+06 2.24193970e+02 -6.64004517e+00 ... 2.11670875e+00\n", + " 3.37400322e+01 9.53475494e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 51 77 95 168 175 232 725]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 1.2223588e+06 -2.6348530e+06 -5.0760291e+05 ... 3.0279325e+06\n", + " -3.6445672e+06 3.3192180e+06]\n", + " [-4.2496326e+02 1.1576636e+03 9.8397858e+02 ... 1.6480791e+03\n", + " -5.9697235e+02 6.2627173e+02]\n", + " [-2.2012039e+01 6.6097900e+01 3.9933994e+01 ... 5.7103355e+01\n", + " -1.5968766e+01 3.6536639e+00]\n", + " ...\n", + " [-1.2302110e+00 5.3052688e+00 2.1982718e+00 ... 1.3990868e+00\n", + " -5.5132383e-01 4.8985812e-01]\n", + " [-1.0771493e+00 6.9571300e+00 2.7373023e+00 ... 4.9663010e+00\n", + " -9.9705428e-01 2.1829298e+00]\n", + " [-5.9534687e-01 3.0272012e+00 3.1143982e+00 ... 2.4072502e+00\n", + " -2.0490403e+00 3.3617332e+00]]\n", + "FF:[[ 1.22235850e+06 -2.63485275e+06 -5.07602656e+05 ... 3.02793250e+06\n", + " -3.64456750e+06 3.31921800e+06]\n", + " [-4.24962585e+02 1.15766296e+03 9.83978577e+02 ... 1.64807898e+03\n", + " -5.96972351e+02 6.26271790e+02]\n", + " [-2.20120354e+01 6.60979462e+01 3.99340210e+01 ... 5.71033745e+01\n", + " -1.59687757e+01 3.65366316e+00]\n", + " ...\n", + " [-1.23020661e+00 5.30526114e+00 2.19826817e+00 ... 1.39908671e+00\n", + " -5.51325083e-01 4.89858717e-01]\n", + " [-1.07714510e+00 6.95712519e+00 2.73729825e+00 ... 4.96630049e+00\n", + " -9.97055829e-01 2.18292713e+00]\n", + " [-5.95347941e-01 3.02720070e+00 3.11439991e+00 ... 2.40725493e+00\n", + " -2.04904509e+00 3.36174107e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... 
True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.4363425925925934% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.52523500e+06 -1.27625415e+03 -4.39338150e+01 ... -3.34414902e+01\n", + " 2.38160934e+01 3.15938339e+01]\n", + " [-9.55138900e+06 6.71377197e+02 2.06871887e+02 ... -3.86393509e+01\n", + " 2.14816055e+01 -6.58599396e+01]\n", + " [ 1.14522670e+07 2.19898975e+03 -6.89673233e+00 ... 9.51593590e+00\n", + " -1.68612709e+01 6.02474251e+01]\n", + " ...\n", + " [ 2.10891925e+06 3.78648706e+03 1.02701221e+03 ... 3.59794388e+01\n", + " 5.03902206e+01 4.19777756e+01]\n", + " [ 2.11695300e+06 -2.36283508e+02 -1.08002625e+02 ... 9.36443710e+00\n", + " 3.84094887e+01 -7.51948738e+00]\n", + " [ 7.39155050e+06 1.11731885e+03 3.38369843e+02 ... 3.70399475e+01\n", + " 1.77629051e+01 9.76780853e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.52523600e+06 -1.27625293e+03 -4.39336700e+01 ... -3.34414597e+01\n", + " 2.38162422e+01 3.15938187e+01]\n", + " [-9.55138900e+06 6.71377319e+02 2.06871674e+02 ... -3.86393127e+01\n", + " 2.14817867e+01 -6.58600464e+01]\n", + " [ 1.14522660e+07 2.19898950e+03 -6.89660644e+00 ... 9.51594448e+00\n", + " -1.68611774e+01 6.02474518e+01]\n", + " ...\n", + " [ 2.10891850e+06 3.78648633e+03 1.02701196e+03 ... 3.59794846e+01\n", + " 5.03901253e+01 4.19777679e+01]\n", + " [ 2.11695400e+06 -2.36282440e+02 -1.08002762e+02 ... 9.36448860e+00\n", + " 3.84096107e+01 -7.51954842e+00]\n", + " [ 7.39155000e+06 1.11731921e+03 3.38370087e+02 ... 3.70398293e+01\n", + " 1.77627277e+01 9.76782227e+01]]\n", + "6.011284722222222% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750 900 1198 1249\n", + " 1287 1305 1414 1428 1490 1588 1600 1612 1625 1657 1676 1677\n", + " 1692 1694 1724 1730 1772 1822 1825 1838 1853 1910 2035 2043\n", + " 2053 2059 2073 2078 2123 2145 2214 2238 2241 2285 2292 2389\n", + " 2542 2582 2589 2599 2674 2688 2711 2840 2856 2961 2963 2980\n", + " 3064 3176 3192 3255 3262 3278 3338 3341 3412 3419 3492 3590\n", + " 3624 3646 3657 3807 3840 3842 3846 3883 3887 4005 4049 4071\n", + " 4076 4077 4079 4137 4142 4192 4193 4202 4218 4224 4273 4355\n", + " 4358 4381 4401 4435 4469 4499 4514 4546 4598 4619 4747 4846\n", + " 4872 4916 4952 4966 5016 5067 5107 5112 5116 5194 5225 5350\n", + " 5364 5403 5515 5537 5550 5578 5650 5653 5654 5736 5751 5837\n", + " 5870 5881 5972 5998 6006 6051 6061 6107 6129 6204 6236 6292\n", + " 6296 6327 6382 6393 6403 6420 6424 6436 6468 6542 6599 6675\n", + " 6681 6711 6723 6767 6823 6914 6983 7047 7064 7133 7167 7197\n", + " 7198 7209 7528 7537 7538 7686 7850 7855 7889 7910 7919 7927\n", + " 7937 7939 8089 8101 8157 8169 8175 8223 8292 8304 8306 8342\n", + " 8351 8414 8475 8500 8543 8558 8609 8656 8687 8704 8724 8726\n", + " 8777 8816 8826 8871 8904 8934 8983 9012 9033 9043 9068 9093\n", + " 9125 9133 9144 9151 9154 9217 9222 9320 9335 9367 9398 9421\n", + " 9434 9521 9547 9633 9702 9726 9763 9949 10018 10053 10062 10079\n", + " 10137 10149 10203 10261 10269 10292 10312 10332 10471 10478 10514 10596\n", + " 10645 10676 10678 10781 10795 10810 10833 10891 10904 10935 10957 10977\n", + " 10982 11028 11095 11172 11223 11251 11283 11303 11319 11374 11392 11437\n", + " 11486 11627 11678 11750 11759 11979 11996 12019 12126 12237 12262 12288\n", + " 12303 12309 12315 12387 12543 12569 12613 12648 12786 12852 12866 12879\n", + " 12947 12963 13037 13058 13261 13284 13312 13394 13399 13427 13526 13527\n", + " 13592 13695 13741 13752 13775 13803 13812 13866 13902 14049 14170 14241\n", + " 14354 14382 14426 14451 14455 14486 14502 14582 14820 14934 14961 14976\n", + " 15000 15003 15014 15077 15096 15108 15135 15148 15165 15219 15232 15290\n", + " 15339 15345 15819 15945 15994 16077 16135 16218 16231 16233 16239 16243\n", + " 16295 16311 16339 16356 16366 16417 16456 16498 16502 16503 16506 16547\n", + " 16585 16603 16611 16633 16661 16683 16704 16710 16723 16724 16745 16754\n", + " 16773 16787 16789 16818 16829 16833 16913 16933 17025 17033 17037 17055\n", + " 17084 17098 17109 17176 17225 17240 17292 17294 17339 17390 17427 17437\n", + " 17579 17626 17630 17654 17719 17902 17912 18023 18025 18124 18203 18339\n", + " 18344]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 4.9057606e+01\n", + " 4.7770348e+01 5.8564331e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 4.90575981e+01\n", + " 4.77703362e+01 5.85643845e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 4.7819588e+07 3.8833264e+07 4.7789860e+07 ... 1.0804405e+00\n", + " 2.7186510e-01 -2.9918199e+00]\n", + "FF:[ 4.78195960e+07 3.88332640e+07 4.77898600e+07 ... 1.08044124e+00\n", + " 2.71864563e-01 -2.99182224e+00]\n", + "[ True True True ... True True True]\n", + "[ 109 211 312 422 590 832 835 1016 1053 1076 1268 1353 1374 1693\n", + " 1701 1710 1722 1832 1954 1965 1997 2076 2124 2146 2378 2520 2605 2624\n", + " 2967 3007 3015]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_feed_forward_w2_shard_0_input_0\n", + "HF: [ 3.3558659e+09 1.3409817e+10 -1.4671958e+10 ... 7.2100967e+01\n", + " 6.5979071e+00 -2.1230124e+01]\n", + "FF:[ 3.35586406e+09 1.34098166e+10 -1.46719611e+10 ... 7.21009750e+01\n", + " 6.59790993e+00 -2.12301121e+01]\n", + "[ True True True ... True True True]\n", + "[ 4 95 111 163 179 191 279 305 363 406 447 487 489 494\n", + " 517 617 703 713 735 796 805 819 826 858 882 959 964 967\n", + " 986 1020 1035 1054 1067 1070 1077 1081 1095 1097 1123 1139 1181 1238\n", + " 1296 1342 1369 1489 1550 1557 1623 1669 1752 1757 1783 1819 1876 1949\n", + " 1963 1993 2034 2047 2091 2115 2153 2170 2306 2381 2419 2431 2456 2501\n", + " 2503 2591 2653 2768 2778 2791 2970 2980 3053 3067]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_output_0\n", + "HF: [-9.4779546e+09 -1.2174155e+10 1.4899113e+10 ... 9.3464905e+01\n", + " 7.5613129e+01 7.6598846e+01]\n", + "FF:[-9.47795558e+09 -1.21741548e+10 1.48991119e+10 ... 9.34649200e+01\n", + " 7.56131058e+01 7.65989227e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 88 138 187 203 232 242 493 657 750]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.10.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_10_layers_10_attention_shard_0_o_proj_in_grad\n", + "HF: [-9.4470595e+09 -7.3870331e+09 1.2659395e+10 ... -2.8149616e+01\n", + " 1.7019112e+02 -7.7236428e+00]\n", + "FF:[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... -2.81496239e+01\n", + " 1.70191177e+02 -7.72364044e+00]\n", + "[ True True True ... True True True]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 2.28078384e+01 3.18554016e+02 ... 1.17267204e+02\n", + " 2.06791725e+01 1.13138672e+02]\n", + " [-7.38703309e+09 -7.36898804e+00 7.93705673e+01 ... 2.04039650e+01\n", + " 3.18331490e+01 5.44241562e+01]\n", + " [ 1.26593946e+10 1.77534424e+02 -2.97175941e+01 ... 1.16716766e+01\n", + " 7.70214081e+01 2.81902496e+02]\n", + " ...\n", + " [ 4.51210445e+10 3.63867615e+02 -8.04915466e+01 ... -1.34332123e+02\n", + " -1.22151840e+02 -2.81496162e+01]\n", + " [-1.39591885e+10 1.59216873e+02 6.11343079e+01 ... 1.56675262e+02\n", + " 9.68551483e+01 1.70191116e+02]\n", + " [-1.29442345e+10 -2.39441833e+02 2.73647644e+02 ... -4.41197014e+01\n", + " -9.48526230e+01 -7.72364283e+00]]\n", + "FF:[[-9.44706150e+09 2.28079376e+01 3.18553864e+02 ... 1.17267227e+02\n", + " 2.06791859e+01 1.13138741e+02]\n", + " [-7.38703309e+09 -7.36921692e+00 7.93703690e+01 ... 2.04038925e+01\n", + " 3.18332825e+01 5.44241333e+01]\n", + " [ 1.26593966e+10 1.77534454e+02 -2.97174206e+01 ... 1.16717224e+01\n", + " 7.70213699e+01 2.81902618e+02]\n", + " ...\n", + " [ 4.51210527e+10 3.63867554e+02 -8.04915695e+01 ... -1.34332092e+02\n", + " -1.22151901e+02 -2.81496239e+01]\n", + " [-1.39591834e+10 1.59216995e+02 6.11343040e+01 ... 1.56675293e+02\n", + " 9.68551559e+01 1.70191177e+02]\n", + " [-1.29442304e+10 -2.39441772e+02 2.73647644e+02 ... -4.41196594e+01\n", + " -9.48526916e+01 -7.72364044e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 11 98 109 134 262 266 274 309 310 327 328 364 398 409 429 605 645]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-9.44705946e+09 -7.38703309e+09 1.26593946e+10 ... 4.51210445e+10\n", + " -1.39591885e+10 -1.29442345e+10]\n", + " [ 1.14852783e+03 4.39543152e+02 1.07877356e+03 ... -2.42416113e+03\n", + " 2.64504834e+03 4.68633453e+02]\n", + " [ 5.72417107e+01 4.12602806e+01 -2.27319489e+01 ... -3.40788422e+01\n", + " 4.86237946e+01 1.25752163e+01]\n", + " ...\n", + " [ 6.76848269e+00 8.23165894e+00 2.10253639e+01 ... -3.19590777e-01\n", + " 3.68098617e-01 -1.95310101e-01]\n", + " [ 4.08574820e+00 5.33035660e+00 1.41003275e+01 ... -1.35607815e+00\n", + " 4.06074905e+00 -7.67630756e-01]\n", + " [ 2.03186665e+01 9.77407932e+00 5.06271019e+01 ... -6.80029154e-01\n", + " 4.11142111e+00 -1.86585218e-01]]\n", + "FF:[[-9.44706150e+09 -7.38703309e+09 1.26593966e+10 ... 4.51210527e+10\n", + " -1.39591834e+10 -1.29442304e+10]\n", + " [ 1.14852808e+03 4.39542755e+02 1.07877344e+03 ... 
-2.42416138e+03\n", + " 2.64504932e+03 4.68633698e+02]\n", + " [ 5.72415771e+01 4.12602005e+01 -2.27318707e+01 ... -3.40787392e+01\n", + " 4.86236725e+01 1.25752039e+01]\n", + " ...\n", + " [ 6.76847696e+00 8.23167515e+00 2.10253181e+01 ... -3.19590837e-01\n", + " 3.68098557e-01 -1.95310280e-01]\n", + " [ 4.08574867e+00 5.33037567e+00 1.41003180e+01 ... -1.35607564e+00\n", + " 4.06074095e+00 -7.67629445e-01]\n", + " [ 2.03186874e+01 9.77407932e+00 5.06271439e+01 ... -6.80029511e-01\n", + " 4.11142349e+00 -1.86585203e-01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "6.640625% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-5.1505955e+10 -4.7166772e+03 -1.3288132e+02 ... -3.0123844e+00\n", + " -5.5234032e+01 6.0299168e+00]\n", + " [-3.5960029e+10 -5.3263096e+03 -1.9434322e+02 ... -5.6601189e+01\n", + " -1.0787462e+02 -6.0718418e+01]\n", + " [ 4.8131662e+10 1.1578307e+04 1.7744476e+02 ... -5.6970375e+01\n", + " -1.7497168e+01 -7.2297249e+00]\n", + " ...\n", + " [-9.0346426e+08 6.4752144e+03 3.2408417e+02 ... 6.1075470e+01\n", + " 8.5356834e+01 8.3221588e+01]\n", + " [-5.0754217e+09 -2.2929268e+03 -1.4913528e+02 ... 8.6639397e+01\n", + " 1.1156468e+02 1.0695674e+02]\n", + " [ 5.5844772e+09 3.0225920e+03 -6.3137859e+01 ... -6.5270996e+01\n", + " 8.2730171e+01 -1.0107367e+02]]\n", + "ff_attn_in: (768, 24)\n", + "[[-5.15059548e+10 -4.71667773e+03 -1.32881012e+02 ... -3.01225996e+00\n", + " -5.52339973e+01 6.02991867e+00]\n", + " [-3.59600292e+10 -5.32630957e+03 -1.94343079e+02 ... -5.66010437e+01\n", + " -1.07874649e+02 -6.07182846e+01]\n", + " [ 4.81316659e+10 1.15783076e+04 1.77444519e+02 ... -5.69703102e+01\n", + " -1.74972763e+01 -7.22990799e+00]\n", + " ...\n", + " [-9.03455232e+08 6.47521484e+03 3.24083832e+02 ... 6.10753632e+01\n", + " 8.53567886e+01 8.32217255e+01]\n", + " [-5.07543654e+09 -2.29292749e+03 -1.49135025e+02 ... 8.66392517e+01\n", + " 1.11564789e+02 1.06956917e+02]\n", + " [ 5.58446592e+09 3.02259229e+03 -6.31376152e+01 ... -6.52709351e+01\n", + " 8.27302551e+01 -1.01073837e+02]]\n", + "7.025824652777778% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415\n", + " 428 482 492 514 526 531 671 731 763 777 893 927\n", + " 984 1105 1184 1206 1418 1541 1548 1572 1577 1613 1619 1643\n", + " 1658 1661 1691 1701 1706 1726 1757 1784 1815 1833 1849 1856\n", + " 1880 1891 1921 1956 1969 2012 2021 2028 2030 2059 2065 2144\n", + " 2149 2183 2210 2238 2292 2342 2357 2384 2414 2495 2531 2565\n", + " 2597 2662 2713 2781 2821 2829 2877 2904 2921 2927 2962 2973\n", + " 3044 3066 3094 3100 3106 3159 3193 3251 3377 3389 3397 3427\n", + " 3436 3570 3594 3703 3729 3770 3772 3780 3811 3840 3842 3860\n", + " 3907 3920 3929 3946 3955 3969 4005 4009 4034 4048 4077 4089\n", + " 4104 4129 4134 4178 4202 4212 4219 4239 4245 4256 4273 4373\n", + " 4407 4463 4464 4465 4481 4511 4537 4541 4543 4549 4597 4599\n", + " 4633 4759 4760 4789 4846 4884 4901 4930 4954 4971 4993 5024\n", + " 5030 5041 5050 5116 5130 5163 5207 5224 5282 5313 5322 5349\n", + " 5363 5403 5410 5412 5454 5543 5581 5590 5654 5673 5784 5821\n", + " 5849 5880 5911 5917 5982 6000 6062 6165 6178 6193 6200 6272\n", + " 6322 6351 6366 6376 6380 6382 6393 6412 6420 6430 6433 6446\n", + " 6476 6482 6488 6490 6519 6527 6540 6556 6563 6567 6577 6600\n", + " 6619 6680 6709 6735 6768 6777 6780 6823 6825 6826 6830 6863\n", + " 6880 6912 6988 7006 7030 7071 7077 7102 7123 7244 7264 7367\n", + " 7389 7390 7434 7451 7452 7455 7505 7532 7539 7589 7598 7620\n", + " 7651 7653 7659 7709 7714 7740 7751 7759 7803 7808 7820 7917\n", + " 7923 7926 7949 7962 7966 7978 8002 8004 8040 8050 8052 8068\n", + " 8180 8223 8250 8253 8265 8341 8344 8375 8376 8386 8449 8468\n", + " 8501 8509 8522 8535 8585 8590 8593 8642 8657 8674 8687 8707\n", + " 8714 8726 8729 8737 8756 8769 8801 8846 8850 8865 8907 8998\n", + " 9018 9043 9059 9066 9083 9093 9098 9130 9131 9165 9189 9216\n", + " 9285 9337 9368 9526 9539 9563 9620 9659 9723 9793 9804 9817\n", + " 9820 9827 9908 9995 10053 10128 10135 10143 10205 10253 10274 10292\n", + " 10300 10311 10327 10356 10406 10441 10491 10494 10551 10562 10563 10634\n", + " 10649 10674 10710 10734 10821 10831 10833 10838 10845 10911 10966 10981\n", + " 10988 10990 10998 11008 11044 11049 11100 11127 11141 11197 11250 11269\n", + " 11285 11308 11361 11383 11437 11460 11494 11502 11511 11522 11546 11557\n", + " 11564 11588 11649 11658 11671 11674 11703 11729 11749 11759 11832 11892\n", + " 11979 11988 12000 12038 12063 12078 12107 12119 12165 12259 12269 12270\n", + " 12347 12369 12386 12415 12475 12518 12566 12569 12574 12652 12693 12792\n", + " 12833 12834 12852 12872 12900 12946 13117 13121 13124 13321 13345 13357\n", + " 13427 13431 13446 13473 13526 13635 13638 13662 13706 13733 13803 13807\n", + " 13852 13882 13912 13924 13962 13969 13986 14023 14036 14046 14085 14110\n", + " 14130 14141 14175 14183 14191 14220 14222 14223 14285 14310 14331 14336\n", + " 14354 14375 14425 14427 14451 14482 14493 14516 14560 14563 14581 14623\n", + " 14671 14677 14679 14680 14685 14688 14742 14799 14860 14868 14870 14872\n", + " 14900 14909 14916 14940 14964 14991 15003 15023 15027 15033 15038 15051\n", + " 15086 15100 15184 15214 15232 15290 15352 15363 15365 15407 15433 15451\n", + " 15522 15577 15707 15720 15725 15739 15830 15837 15875 15937 15965 15985\n", + " 16017 16054 16113 16136 16142 16169 16191 16232 16238 16250 16268 16282\n", + " 16285 16290 16295 16304 16327 16334 16353 16356 16363 16382 16403 16407\n", + " 16408 16409 16458 16459 16495 16497 16499 16500 16516 16532 16595 16603\n", + " 16611 16657 16678 16680 16695 16701 16704 16754 16768 
16807 16818 16856\n", + " 16870 16951 16971 16986 16989 16992 17048 17134 17181 17208 17217 17236\n", + " 17243 17319 17363 17398 17448 17471 17497 17557 17646 17654 17659 17692\n", + " 17754 17947 17957 17969 17975 18029 18128 18146 18196 18206 18207 18250\n", + " 18265 18313 18406]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-6.33203254e+13 -4.43651289e+13 6.35509366e+13 ... 1.08435585e+02\n", + " 9.42303467e+01 5.89958420e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 1.08435623e+02\n", + " 9.42303467e+01 5.89958954e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 5.0590863e+10 3.7823513e+11 -5.0394451e+11 ... -5.5814421e-01\n", + " 2.2970559e-01 -1.2293311e+00]\n", + "FF:[ 5.05906831e+10 3.78235290e+11 -5.03944544e+11 ... -5.58144033e-01\n", + " 2.29705781e-01 -1.22933090e+00]\n", + "[ True True True ... True True True]\n", + "[ 189 254 317 418 515 546 577 634 636 675 712 808 1011 1030\n", + " 1080 1091 1132 1168 1254 1265 1285 1287 1354 1381 1427 1459 1506 1620\n", + " 1654 1752 1887 1897 1900 1937 1981 1985 1986 2003 2029 2152 2181 2295\n", + " 2395 2426 2445 2673 2687 2859 2947 2977 3037]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_feed_forward_w2_shard_0_input_0\n", + "HF: [ 2.5211001e+13 -5.6630301e+13 -2.3639437e+13 ... -4.6000423e+01\n", + " 1.2655228e+01 7.1020460e+00]\n", + "FF:[ 2.52109673e+13 -5.66302930e+13 -2.36394182e+13 ... -4.60003510e+01\n", + " 1.26551876e+01 7.10206795e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 9 49 113 174 243 267 271 288 323 335 397 399 438 439\n", + " 457 475 506 568 569 652 680 689 715 735 739 758 766 777\n", + " 785 837 842 852 865 884 893 919 930 932 936 939 957 1018\n", + " 1095 1105 1112 1114 1129 1168 1217 1220 1229 1230 1233 1237 1283 1304\n", + " 1354 1453 1532 1542 1547 1550 1592 1597 1603 1615 1647 1679 1698 1699\n", + " 1712 1770 1819 1835 1875 1977 2007 2016 2039 2066 2078 2102 2153 2245\n", + " 2403 2447 2621 2698 2704 2728 2736 2743 2774 2792 2836 2858 2870 2881\n", + " 2932 2948 3018 3034 3066]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_output_0\n", + "HF: [-6.3320325e+13 -4.4365129e+13 6.3550937e+13 ... 7.2449814e+01\n", + " 8.6617142e+01 8.3981407e+01]\n", + "FF:[-6.33203296e+13 -4.43651289e+13 6.35509408e+13 ... 7.24498901e+01\n", + " 8.66170959e+01 8.39814606e+01]\n", + "[ True True True ... True True True]\n", + "[ 26 51 66 85 259 262 272 296 298 329 392 415 428 482 492 514 526 531\n", + " 671 731 763]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.9.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_9_layers_9_attention_shard_0_o_proj_in_grad\n", + "HF: [ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... 2.5297220e+02\n", + " -8.1722275e+01 -7.0014725e+01]\n", + "FF:[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... 2.52972260e+02\n", + " -8.17222137e+01 -7.00146637e+01]\n", + "[ True True True ... True True True]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.28854608e+13 6.37500977e+02 2.96775421e+02 ... 8.35403061e+01\n", + " 1.72460327e+02 2.90482426e+01]\n", + " [-6.08358210e+13 -5.23222847e+01 -2.34542664e+02 ... -1.87500763e+01\n", + " -8.99429398e+01 8.64021378e+01]\n", + " [-7.97326117e+13 -4.24736328e+02 -1.82208099e+02 ... 3.21808720e+00\n", + " -5.87415466e+01 -2.08511108e+02]\n", + " ...\n", + " [-1.13411917e+14 -3.48418640e+02 1.52205795e+02 ... 1.51519928e+02\n", + " 2.45651031e+02 2.52972198e+02]\n", + " [-3.75985275e+12 2.39696625e+02 1.51989685e+02 ... -2.85605354e+01\n", + " -1.79121232e+00 -8.17222748e+01]\n", + " [ 1.11016038e+14 -1.96372967e+01 -1.27668396e+02 ... 3.35008011e+01\n", + " -7.46116943e+01 -7.00147247e+01]]\n", + "FF:[[ 7.28854608e+13 6.37500977e+02 2.96775513e+02 ... 8.35403976e+01\n", + " 1.72460068e+02 2.90483646e+01]\n", + " [-6.08357832e+13 -5.23225098e+01 -2.34542755e+02 ... -1.87501526e+01\n", + " -8.99431992e+01 8.64022217e+01]\n", + " [-7.97326201e+13 -4.24736572e+02 -1.82207733e+02 ... 3.21793270e+00\n", + " -5.87416573e+01 -2.08511139e+02]\n", + " ...\n", + " [-1.13411925e+14 -3.48418640e+02 1.52205902e+02 ... 1.51519714e+02\n", + " 2.45650864e+02 2.52972260e+02]\n", + " [-3.75988630e+12 2.39696686e+02 1.51989319e+02 ... -2.85606136e+01\n", + " -1.79138493e+00 -8.17222137e+01]\n", + " [ 1.11016046e+14 -1.96372318e+01 -1.27668480e+02 ... 3.35009079e+01\n", + " -7.46116791e+01 -7.00146637e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... 
True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 6 36 43 55 60 82 101 110 117 217 221 229 236 256 289 392 421 429\n", + " 433 454 486 518 523 565 568 629 639 648 707 725 744]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[ 7.2885461e+13 -6.0835821e+13 -7.9732612e+13 ... -1.1341192e+14\n", + " -3.7598527e+12 1.1101604e+14]\n", + " [ 3.3241980e+03 -6.3044128e+02 -3.0447307e+03 ... 3.0137921e+02\n", + " 3.8262988e+02 -4.2889914e+02]\n", + " [ 3.5639046e+01 -1.6155790e+01 -2.4461178e+01 ... 2.7450909e+02\n", + " 1.6181946e+02 -2.5407137e+02]\n", + " ...\n", + " [ 4.6487908e+00 -9.6633381e-01 -2.7078497e-01 ... 3.6374569e+01\n", + " -1.7563061e+00 -7.1206141e+00]\n", + " [ 1.8901447e+00 8.9006472e-01 -4.3125896e+00 ... 2.6014965e+01\n", + " -3.7720141e-01 -7.8855257e+00]\n", + " [ 1.9513500e+00 5.8041654e+00 -1.4006979e+01 ... 7.2743622e+01\n", + " -2.3499712e+01 -2.0133139e+01]]\n", + "FF:[[ 7.28854608e+13 -6.08357832e+13 -7.97326201e+13 ... -1.13411925e+14\n", + " -3.75988630e+12 1.11016046e+14]\n", + " [ 3.32419922e+03 -6.30442505e+02 -3.04472998e+03 ... 3.01379364e+02\n", + " 3.82629669e+02 -4.28898712e+02]\n", + " [ 3.56390572e+01 -1.61558037e+01 -2.44611683e+01 ... 2.74509308e+02\n", + " 1.61819229e+02 -2.54071594e+02]\n", + " ...\n", + " [ 4.64879847e+00 -9.66338813e-01 -2.70792574e-01 ... 3.63745117e+01\n", + " -1.75632846e+00 -7.12060070e+00]\n", + " [ 1.89013767e+00 8.90062451e-01 -4.31257772e+00 ... 2.60149212e+01\n", + " -3.77217919e-01 -7.88551569e+00]\n", + " [ 1.95135939e+00 5.80417490e+00 -1.40069904e+01 ... 7.27435226e+01\n", + " -2.34996586e+01 -2.01330910e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "7.609953703703703% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-1.17282076e+14 -2.12461621e+03 8.80099030e+01 ... 4.34470520e+01\n", + " 7.55885468e+01 -2.88791332e+01]\n", + " [-2.07757936e+14 -3.81796265e+02 -2.33774780e+02 ... 8.11984329e+01\n", + " -4.41825638e+01 7.35064125e+00]\n", + " [ 4.11484165e+13 2.50572113e+02 1.91601822e+02 ... 1.00269365e+01\n", + " -3.41638985e+01 1.20433075e+02]\n", + " ...\n", + " [ 7.95562329e+13 1.55007373e+03 1.70351212e+02 ... -1.80320053e+01\n", + " 8.77533417e+01 2.14678173e+01]\n", + " [-1.86546485e+14 -5.18847070e+03 -3.34331085e+02 ... 2.51586838e+01\n", + " -4.06135368e+01 -6.27860641e+00]\n", + " [ 1.89751705e+14 -3.09853809e+03 -1.18278351e+01 ... -1.24640663e+02\n", + " 1.59719009e+01 -6.47173615e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-1.17282034e+14 -2.12461694e+03 8.80101547e+01 ... 4.34468918e+01\n", + " 7.55886002e+01 -2.88791542e+01]\n", + " [-2.07757920e+14 -3.81795776e+02 -2.33774765e+02 ... 8.11985397e+01\n", + " -4.41825829e+01 7.35066986e+00]\n", + " [ 4.11484543e+13 2.50570099e+02 1.91601196e+02 ... 1.00270777e+01\n", + " -3.41638451e+01 1.20433121e+02]\n", + " ...\n", + " [ 7.95562413e+13 1.55007288e+03 1.70350784e+02 ... -1.80321960e+01\n", + " 8.77533112e+01 2.14678249e+01]\n", + " [-1.86546469e+14 -5.18847070e+03 -3.34331268e+02 ... 
2.51588135e+01\n", + " -4.06132622e+01 -6.27861023e+00]\n", + " [ 1.89751521e+14 -3.09853711e+03 -1.18275299e+01 ... -1.24640862e+02\n", + " 1.59719791e+01 -6.47173767e+01]]\n", + "7.530381944444445% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181\n", + " 226 261 284 318 320 378 382 385 391 395 403 422\n", + " 434 495 515 523 524 549 579 610 644 710 764 772\n", + " 870 984 987 1045 1249 1330 1362 1489 1517 1550 1556 1588\n", + " 1595 1659 1672 1684 1689 1768 1792 1799 1808 1818 1842 1871\n", + " 1889 1899 1910 1915 1925 1936 1993 1997 2033 2041 2059 2062\n", + " 2066 2098 2111 2124 2129 2130 2146 2153 2159 2166 2197 2206\n", + " 2210 2212 2222 2234 2237 2320 2321 2357 2359 2362 2385 2428\n", + " 2518 2539 2553 2568 2598 2683 2689 2694 2711 2714 2733 2787\n", + " 2788 2795 2811 2815 2853 2881 2890 2917 2981 2997 3021 3037\n", + " 3089 3149 3163 3191 3196 3217 3225 3248 3277 3287 3292 3305\n", + " 3327 3361 3385 3402 3417 3425 3456 3479 3516 3521 3528 3555\n", + " 3587 3599 3608 3684 3702 3733 3770 3779 3819 3822 3823 3898\n", + " 3921 3942 3950 4012 4053 4077 4086 4091 4139 4185 4198 4225\n", + " 4241 4296 4347 4349 4368 4403 4407 4418 4453 4471 4472 4473\n", + " 4494 4537 4549 4555 4558 4598 4623 4648 4666 4698 4729 4782\n", + " 4848 4866 4886 4943 4959 5008 5010 5012 5057 5079 5177 5178\n", + " 5186 5211 5271 5281 5296 5313 5328 5356 5364 5409 5429 5440\n", + " 5453 5455 5457 5476 5529 5563 5591 5621 5625 5631 5654 5661\n", + " 5692 5705 5720 5740 5751 5758 5787 5799 5813 5835 5836 5867\n", + " 5872 5893 5953 5974 5980 5982 6000 6055 6082 6086 6102 6107\n", + " 6123 6159 6172 6193 6220 6230 6231 6263 6286 6297 6362 6396\n", + " 6401 6430 6436 6485 6497 6499 6502 6510 6537 6554 6555 6563\n", + " 6564 6579 6586 6598 6615 6625 6626 6649 6651 6661 6754 6764\n", + " 6776 6852 6863 6874 6883 6892 6913 6945 6969 7036 7057 7066\n", + " 7082 7138 7147 7150 7157 7197 7202 7231 7234 7235 7240 7270\n", + " 7278 7287 7322 7327 7345 7348 7361 7390 7402 7490 7539 7573\n", + " 7610 7714 7721 7758 7794 7812 7827 7829 7837 7839 7882 7894\n", + " 7943 7948 7952 7969 7975 7996 8024 8027 8037 8043 8055 8078\n", + " 8079 8088 8090 8095 8154 8258 8264 8283 8297 8313 8329 8336\n", + " 8359 8361 8376 8383 8416 8421 8428 8454 8475 8502 8521 8613\n", + " 8642 8653 8696 8756 8764 8777 8791 8837 8849 8859 8878 8955\n", + " 8991 8997 9006 9012 9040 9066 9093 9097 9098 9131 9158 9162\n", + " 9165 9214 9216 9280 9297 9301 9316 9355 9371 9412 9421 9475\n", + " 9510 9580 9620 9645 9696 9713 9732 9768 9802 9817 9819 9826\n", + " 9839 9846 9947 10004 10062 10065 10072 10103 10107 10108 10138 10167\n", + " 10173 10228 10262 10292 10326 10356 10360 10372 10421 10446 10466 10468\n", + " 10499 10505 10513 10517 10589 10606 
10612 10645 10664 10669 10726 10777\n", + " 10835 10838 10839 10848 10855 10877 10897 10941 10963 10971 10977 10997\n", + " 11030 11060 11065 11076 11088 11140 11167 11174 11231 11252 11257 11259\n", + " 11275 11297 11302 11319 11331 11333 11357 11358 11380 11382 11402 11423\n", + " 11446 11447 11500 11501 11522 11585 11623 11670 11728 11736 11759 11761\n", + " 11772 11785 11839 11894 11916 11924 11936 11962 11968 11969 11977 11984\n", + " 12008 12030 12054 12074 12123 12175 12182 12194 12237 12262 12282 12285\n", + " 12341 12348 12351 12370 12376 12386 12399 12449 12507 12513 12518 12522\n", + " 12549 12572 12643 12648 12663 12689 12696 12710 12769 12780 12788 12792\n", + " 12793 12852 12864 12879 12884 12985 13018 13041 13057 13176 13264 13272\n", + " 13274 13275 13292 13303 13333 13379 13427 13428 13442 13451 13454 13500\n", + " 13510 13533 13564 13588 13607 13640 13655 13686 13687 13688 13732 13747\n", + " 13786 13801 13803 13826 13841 13846 13850 13892 13909 13946 14036 14040\n", + " 14046 14060 14080 14152 14161 14183 14195 14210 14240 14278 14331 14354\n", + " 14370 14372 14386 14395 14409 14432 14434 14497 14506 14531 14559 14589\n", + " 14648 14663 14686 14698 14715 14743 14757 14799 14808 14810 14849 14893\n", + " 14902 14929 14937 14947 14953 14958 15005 15012 15018 15036 15066 15069\n", + " 15083 15152 15154 15196 15197 15212 15292 15309 15323 15340 15343 15375\n", + " 15389 15396 15408 15410 15454 15499 15532 15557 15605 15647 15677 15736\n", + " 15745 15756 15769 15809 15824 15876 15882 15900 15906 15941 16027 16030\n", + " 16040 16116 16190 16192 16205 16207 16239 16279 16285 16295 16348 16358\n", + " 16367 16384 16386 16394 16399 16455 16457 16458 16471 16495 16500 16502\n", + " 16520 16541 16542 16598 16623 16643 16651 16665 16673 16679 16713 16725\n", + " 16734 16736 16739 16751 16756 16768 16861 16870 16939 16976 17007 17028\n", + " 17040 17069 17087 17108 17125 17139 17151 17158 17174 17175 17178 17182\n", + " 17189 17221 17258 17341 17360 17370 17381 17395 17396 17415 17432 17450\n", + " 17463 17470 17472 17473 17496 17507 17536 17608 17626 17627 17649 17653\n", + " 17664 17771 17815 17822 17831 17864 17883 17931 17994 17999 18035 18174\n", + " 18209 18250 18274 18307 18327 18403 18423]\n", + "Ok!\n", + "Ok!\n", + "-- Lora --\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_B.default.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 7.7873253e+01\n", + " 8.6085976e+01 6.8200005e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 7.78733292e+01\n", + " 8.60859299e+01 6.82000580e+01]\n", + "[ True True True ... True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.lora_A.default.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_lora_shard_0_input_0\n", + "HF: [ 6.5550952e+14 4.9376585e+14 3.8510841e+14 ... 1.6802770e+00\n", + " -1.1248941e+00 -1.1701980e+00]\n", + "FF:[ 6.55509317e+14 4.93765882e+14 3.85108377e+14 ... 1.68027747e+00\n", + " -1.12489426e+00 -1.17019880e+00]\n", + "[ True True True ... 
True True True]\n", + "[ 6 79 111 149 155 168 187 195 220 223 252 261 329 343\n", + " 347 369 386 392 403 438 439 450 461 524 535 643 656 659\n", + " 661 668 722 727 732 742 754 801 816 820 835 837 849 850\n", + " 978 993 997 1012 1019 1034 1044 1071 1088 1094 1114 1135 1151 1170\n", + " 1190 1212 1273 1275 1277 1289 1290 1308 1311 1337 1364 1379 1394 1430\n", + " 1454 1460 1469 1474 1703 1725 1728 1732 1733 1741 1754 1757 1804 1806\n", + " 1856 1862 1932 1945 1996 2030 2044 2045 2065 2071 2075 2094 2149 2152\n", + " 2163 2180 2182 2215 2254 2357 2362 2370 2392 2398 2428 2484 2519 2521\n", + " 2524 2582 2618 2641 2645 2664 2674 2681 2691 2735 2747 2779 2872 2899\n", + " 2909 2935 2957 3000 3033]\n", + "Ok!\n", + "-- W2/W1/W3 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_SigmoidSiluMulti_shard_0_output_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.mlp.down_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_feed_forward_w2_shard_0_input_0\n", + "HF: [-1.3871785e+17 -8.3164397e+16 4.9509505e+16 ... 4.3806694e+01\n", + " 9.4386072e+00 -2.4460859e+01]\n", + "FF:[-1.38717840e+17 -8.31644654e+16 4.95094495e+16 ... 4.38065948e+01\n", + " 9.43864822e+00 -2.44608364e+01]\n", + "[ True True True ... True True True]\n", + "[ 80 83 172 173 176 184 215 285 329 338 341 395 403 465\n", + " 468 565 572 601 614 636 639 651 660 749 750 806 828 844\n", + " 873 952 971 988 992 1014 1082 1083 1085 1123 1152 1195 1200 1227\n", + " 1391 1397 1462 1546 1548 1563 1584 1629 1704 1706 1759 1764 1820 1833\n", + " 1851 1857 1864 1899 1929 1943 1958 1967 1980 1985 2002 2030 2069 2076\n", + " 2120 2127 2130 2157 2180 2187 2195 2212 2243 2249 2256 2299 2393 2505\n", + " 2516 2525 2546 2562 2604 2702 2712 2731 2745 2764 2789 2821 2873 2915\n", + " 2936 2945 2951 3013 3016]\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "-- Attention --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_output_0\n", + "HF: [-1.3223293e+17 -2.3794983e+17 4.7027590e+16 ... 3.5121140e+01\n", + " -3.5587997e+00 9.5641022e+01]\n", + "FF:[-1.32232886e+17 -2.37949812e+17 4.70276284e+16 ... 3.51211472e+01\n", + " -3.55898285e+00 9.56410980e+01]\n", + "[ True True True ... 
True True True]\n", + "[ 3 24 66 71 94 95 124 134 141 150 163 181 226 261 284 318 320 378\n", + " 382 385 391 395 403 422 434 495 515 523 524 549 579 610 644 710 764]\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.8.self_attn.o_proj.gi_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_8_layers_8_attention_shard_0_o_proj_in_grad\n", + "HF: [-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... -2.5844165e+02\n", + " 2.0677340e+01 -2.4573349e+01]\n", + "FF:[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... -2.58441467e+02\n", + " 2.06775093e+01 -2.45735531e+01]\n", + "[ True True True ... True True True]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -2.1968115e+02 8.5754425e+01 ... -6.9909119e+01\n", + " -2.6478451e+01 -7.4195160e+01]\n", + " [-3.5698813e+17 3.9582391e+02 5.5431940e+02 ... 1.9529277e+02\n", + " 1.2558211e+02 6.7965935e+01]\n", + " [ 3.4442975e+16 2.8310864e+02 -8.1522171e+01 ... -2.3606525e+01\n", + " -2.0410315e+01 -1.5228156e+02]\n", + " ...\n", + " [ 4.0923264e+16 -2.4507169e+02 -8.2614380e+02 ... -2.6583340e+02\n", + " -1.9878247e+02 -2.5844165e+02]\n", + " [ 6.9156258e+17 1.3969666e+02 -7.5639044e+02 ... -1.5231053e+02\n", + " -3.3650037e+02 2.0677340e+01]\n", + " [ 9.9511712e+16 -3.2348724e+01 3.0624988e+02 ... 1.0391423e+02\n", + " 6.0626881e+01 -2.4573349e+01]]\n", + "FF:[[-1.61869621e+17 -2.19681122e+02 8.57541504e+01 ... -6.99092026e+01\n", + " -2.64783611e+01 -7.41952515e+01]\n", + " [-3.56988336e+17 3.95823853e+02 5.54319275e+02 ... 1.95292725e+02\n", + " 1.25582062e+02 6.79659348e+01]\n", + " [ 3.44430865e+16 2.83108551e+02 -8.15224686e+01 ... -2.36064014e+01\n", + " -2.04101429e+01 -1.52281570e+02]\n", + " ...\n", + " [ 4.09233933e+16 -2.45071564e+02 -8.26143555e+02 ... -2.65833405e+02\n", + " -1.98782272e+02 -2.58441467e+02]\n", + " [ 6.91562577e+17 1.39696579e+02 -7.56390808e+02 ... -1.52310455e+02\n", + " -3.36500092e+02 2.06775093e+01]\n", + " [ 9.95114373e+16 -3.23486938e+01 3.06250122e+02 ... 1.03914482e+02\n", + " 6.06264191e+01 -2.45735531e+01]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[ 93 99 114 137 141 142 160 193 235 259 269 299 307 316 350 364 400 523\n", + " 608 702 720 731 759]\n", + "Ok!\n", + "mismatch between hf_tensor and ff_tensor\n", + "HF: [[-1.6186993e+17 -3.5698813e+17 3.4442975e+16 ... 4.0923264e+16\n", + " 6.9156258e+17 9.9511712e+16]\n", + " [-5.3483575e+02 2.6249797e+03 -6.7268573e+02 ... -6.1204077e+03\n", + " -4.3047915e+03 -9.5139771e+01]\n", + " [-1.2200641e+01 1.0347147e+02 -2.6777636e+01 ... -1.4766699e+02\n", + " -9.8514114e+01 1.2616925e+01]\n", + " ...\n", + " [-3.2097631e+00 9.1431990e+00 -1.6333975e+00 ... -6.9996667e+00\n", + " -6.4008064e+00 1.9126304e+00]\n", + " [-3.0982289e+00 1.2355285e+01 -3.1715555e+00 ... -4.6754313e+00\n", + " -6.2553053e+00 1.0515085e+00]\n", + " [-2.9516125e+00 2.7038031e+00 -6.0580249e+00 ... -1.6555168e+01\n", + " 1.3245420e+00 -1.5741113e+00]]\n", + "FF:[[-1.61869621e+17 -3.56988336e+17 3.44430865e+16 ... 
4.09233933e+16\n", + " 6.91562577e+17 9.95114373e+16]\n", + " [-5.34834961e+02 2.62497900e+03 -6.72686401e+02 ... -6.12040576e+03\n", + " -4.30479297e+03 -9.51402283e+01]\n", + " [-1.22006664e+01 1.03471611e+02 -2.67777309e+01 ... -1.47666946e+02\n", + " -9.85141525e+01 1.26169167e+01]\n", + " ...\n", + " [-3.20977211e+00 9.14321709e+00 -1.63339353e+00 ... -6.99966621e+00\n", + " -6.40081263e+00 1.91262615e+00]\n", + " [-3.09821057e+00 1.23552399e+01 -3.17152786e+00 ... -4.67541933e+00\n", + " -6.25528765e+00 1.05149710e+00]\n", + " [-2.95161533e+00 2.70380235e+00 -6.05802393e+00 ... -1.65551491e+01\n", + " 1.32455230e+00 -1.57412362e+00]]\n", + "[[ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " ...\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]\n", + " [ True True True ... True True True]]\n", + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n", + "Ok!\n", + "8.101851851851851% mismatch in QK prods softmax out grad\n", + "Ok!\n", + "hf_attn_in: (768, 24)\n", + "[[-7.3778828e+16 1.0956941e+03 1.1773144e+02 ... -4.0466427e+01\n", + " -3.1198654e+01 -1.7603550e+01]\n", + " [-1.2087128e+18 6.9384756e+03 6.1327003e+01 ... 1.5329468e+01\n", + " 7.6757736e+00 -4.5589094e+00]\n", + " [-6.7892266e+17 5.4895034e+03 7.6927376e+01 ... 9.1396770e+00\n", + " 2.3195824e+01 -6.1995559e+00]\n", + " ...\n", + " [ 2.6452032e+17 9.9761787e+03 2.2349066e+02 ... 5.7504387e+01\n", + " -8.6791611e-01 4.6890911e+01]\n", + " [-6.7528534e+16 3.3856902e+03 2.5189743e+02 ... 2.2824722e+01\n", + " 8.7917282e+01 -2.1569672e+01]\n", + " [-2.1779064e+17 5.2511855e+03 6.6282043e+01 ... 9.9689598e+00\n", + " -5.5022659e+00 -3.2573143e+01]]\n", + "ff_attn_in: (768, 24)\n", + "[[-7.37791458e+16 1.09569678e+03 1.17731285e+02 ... -4.04664154e+01\n", + " -3.11988506e+01 -1.76035423e+01]\n", + " [-1.20871251e+18 6.93847900e+03 6.13275528e+01 ... 1.53295393e+01\n", + " 7.67594433e+00 -4.55900288e+00]\n", + " [-6.78922523e+17 5.48950342e+03 7.69272308e+01 ... 9.13961220e+00\n", + " 2.31957569e+01 -6.19959354e+00]\n", + " ...\n", + " [ 2.64520284e+17 9.97617871e+03 2.23490509e+02 ... 5.75044785e+01\n", + " -8.67943764e-01 4.68908234e+01]\n", + " [-6.75287400e+16 3.38569165e+03 2.51897339e+02 ... 2.28247147e+01\n", + " 8.79171448e+01 -2.15696106e+01]\n", + " [-2.17790679e+17 5.25118652e+03 6.62821960e+01 ... 9.96885872e+00\n", + " -5.50213098e+00 -3.25731125e+01]]\n", + "9.809027777777777% mismatch in attention input grads\n", + "\n", + "Huggingface checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "FlexFlow checks:\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "\n", + "Huggingface-FlexFlow checks:\n", + "-- W2 --\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/bwd_step_0_layers.7.mlp.down_proj.go_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/bwd_step_0_layers_7_layers_7_feed_forward_w2_shard_0_output_0\n", + "HF: [-7.5522525e+19 -1.3283726e+21 -7.2549753e+20 ... 4.9017162e+01\n", + " -9.7436657e+00 8.5870697e+01]\n", + "FF:[-7.55228501e+19 -1.32837218e+21 -7.25497390e+20 ... 4.90171394e+01\n", + " -9.74382782e+00 8.58707886e+01]\n", + "[ True True True ... True False True]\n", + "[ 19 64 75 ... 
18418 18428 18430]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[23], line 95\u001b[0m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mHuggingface-FlexFlow checks:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 94\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- W2 --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m---> 95\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_BWD_w2_out\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtolerance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-5\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m compare_tensors(hf_w2_weight, ff_w2_weight, tolerance\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1e-5\u001b[39m)\n\u001b[1;32m 98\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-- Lora --\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:47\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 43\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 44\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 45\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 46\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 47\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "attention_tests=True\n", + "for i in range(tot_num_layers-1, -1, -1):\n", + " # HuggingFace filepaths\n", + " hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_norm.gi_0\"\n", + " hf_BWD_loraB_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.go_0\"\n", + " hf_BWD_loraB_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_B.default.gi_0\"\n", + " hf_BWD_loraA_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.go_0\"\n", + " hf_BWD_loraA_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_A.default.gi_0\"\n", + " hf_loraA_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_A.default.weight\"\n", + " hf_loraB_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.lora_B.default.weight\"\n", + " hf_BWD_lora_dropout_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.go_0\"\n", + " hf_BWD_lora_dropout_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.lora_dropout.default.gi_0\"\n", + " hf_BWD_w2_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.go_0\"\n", + " 
hf_BWD_w2_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.down_proj.gi_0\"\n", + " hf_w2_weight = f\"{hf_path}/layers.{i}.mlp.down_proj.weight\"\n", + " hf_BWD_w3_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.go_0\"\n", + " hf_BWD_w3_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.up_proj.gi_0\"\n", + " hf_BWD_w1_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.go_0\"\n", + " hf_BWD_w1_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.gate_proj.gi_0\"\n", + " hf_BWD_act_fn_in = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.gi_0\"\n", + " hf_BWD_act_fn_out = f\"{hf_path}/bwd_step_0_layers.{i}.mlp.act_fn.go_0\"\n", + " hf_BWD_ffn_norm_out = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.go_0\"\n", + " hf_BWD_ffn_norm_in = f\"{hf_path}/bwd_step_0_layers.{i}.post_attention_layernorm.gi_0\"\n", + " hf_BWD_attn_out_out = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.go_0\"\n", + " hf_BWD_attn_q_in = f\"{hf_path}/bwd_step_0_layers.11.self_attn.q_proj.gi_0\"\n", + " hf_FWD_w1_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.gate_proj.output_0\"\n", + " hf_FWD_w3_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.up_proj.output_0\"\n", + " hf_FWD_act_fn_out = f\"{hf_path}/fwd_step_0_layers.{i}.mlp.act_fn.output_0\"\n", + " hf_BWD_attn_oproj_in = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " hf_attn_qproj_weight = f\"{hf_path}/layers.{i}.self_attn.q_proj.weight\"\n", + " hf_attn_kproj_weight = f\"{hf_path}/layers.{i}.self_attn.k_proj.weight\"\n", + " hf_attn_vproj_weight = f\"{hf_path}/layers.{i}.self_attn.v_proj.weight\"\n", + " hf_attn_oproj_weight = f\"{hf_path}/layers.{i}.self_attn.o_proj.weight\"\n", + " \n", + " # FlexFlow filepaths\n", + " ff_BWD_w2_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_output_0\"\n", + " ff_BWD_w2_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_input_0\"\n", + " ff_BWD_w2_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_pre_input_0\"\n", + " ff_w2_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_shard_0_weight_0\"\n", + " ff_BWD_ssm_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_output_0\"\n", + " ff_BWD_ssm_in1 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_0\"\n", + " ff_BWD_ssm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_input_1\"\n", + " ff_BWD_w3_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_output_0\"\n", + " ff_BWD_w3_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w3_shard_0_input_0\"\n", + " ff_BWD_lora_A_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_input_0\"\n", + " ff_BWD_lora_B_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_output_0\"\n", + " ff_lora_A_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_A\"\n", + " ff_lora_B_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_feed_forward_w2_lora_shard_0_weight_B\"\n", + " ff_BWD_w1_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_output_0\"\n", + " ff_BWD_w1_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_input_0\"\n", + " ff_BWD_w1_in_pre = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_pre_input_0\"\n", + " ff_w1_weight = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_feed_forward_w1_shard_0_weight_0\"\n", + " ff_BWD_ffn_norm_in1 = 
f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_0\"\n", + " ff_BWD_ffn_norm_in2 = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_input_1\"\n", + " ff_BWD_ffn_norm_out = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_ffn_norm_shard_0_output_0\"\n", + " ff_BWD_attn_out = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_output_0\"\n", + " ff_BWD_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_input_0\"\n", + " ff_BWD_ssm_cached_w1_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w1_output\"\n", + " ff_BWD_ssm_cached_w3_input = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_cached_w3_output\"\n", + " ff_FWD_w1_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w1_shard_0_output_0\"\n", + " ff_FWD_w3_out = f\"{ff_path}/fwd_step_0_layers_0_layers_0_feed_forward_w3_shard_0_output_0\"\n", + " ff_FWD_act_fnc_out = f\"{ff_path}/bwd_step_0_layers_{i}_SigmoidSiluMulti_shard_0_act_fn_output\"\n", + " ff_BWD_attn_o_proj_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_oproj_weight = f\"{ff_path}/fwd_step_0_layers_{i}_layers_{i}_attention_shard_0_weight_0\"\n", + " \n", + " \n", + " # HuggingFace checks\n", + " print(\"\\nHuggingface checks:\")\n", + " if i == tot_num_layers-1:\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_loraB_out)\n", + " compare_hf_tensors(hf_BWD_norm_in, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_out, hf_BWD_w2_out)\n", + " compare_hf_tensors(hf_BWD_loraB_in, hf_BWD_loraA_out)\n", + "\n", + " compare_hf_tensors(hf_BWD_act_fn_in, hf_BWD_w1_out)\n", + " check_hf_sum_tensors(hf_BWD_ffn_norm_out, hf_BWD_w1_in, hf_BWD_w3_in)\n", + " if i == tot_num_layers-1:\n", + " check_hf_sum_tensors(hf_BWD_attn_out_out, hf_BWD_ffn_norm_in, hf_BWD_norm_in)\n", + "\n", + " # FlexFlow checks\n", + " print(\"\\nFlexFlow checks:\")\n", + " compare_flexflow_tensors(ff_BWD_w2_out, ff_BWD_lora_B_out)\n", + " compare_flexflow_tensors(ff_BWD_w2_in_pre, ff_BWD_lora_A_in)\n", + " compare_flexflow_tensors(ff_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in2, ff_BWD_w3_out)\n", + " compare_flexflow_tensors(ff_BWD_ssm_in1, ff_BWD_w1_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in, ff_BWD_ffn_norm_out)\n", + " compare_flexflow_tensors(ff_BWD_w1_in_pre, ff_BWD_w3_in)\n", + " compare_flexflow_tensors(ff_BWD_ffn_norm_in1, ff_BWD_ffn_norm_in2, max_len=24*768)\n", + " \n", + " # HF-FlexFlow checks\n", + " print(\"\\nHuggingface-FlexFlow checks:\")\n", + " print(\"-- W2 --\")\n", + " compare_tensors(hf_BWD_w2_out, ff_BWD_w2_out, tolerance=1e-5)\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " \n", + " print(\"-- Lora --\")\n", + " compare_tensors(hf_loraA_weight, ff_lora_A_weight, tolerance=1e-5)\n", + " compare_tensors(hf_loraB_weight, ff_lora_B_weight, tolerance=1e-5)\n", + "\n", + " compare_tensors(hf_BWD_loraB_out, ff_BWD_lora_B_out)\n", + " compare_tensors(hf_BWD_loraA_in, ff_BWD_lora_A_in)\n", + " \n", + " print(\"-- W2/W1/W3 --\")\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_ssm_out)\n", + " compare_tensors(hf_BWD_w2_in, ff_BWD_w2_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " compare_tensors_difference(hf_BWD_w1_in, ff_BWD_w1_in, ff_BWD_w1_in_pre)\n", + " compare_tensors(hf_BWD_w3_out, ff_BWD_w3_out)\n", + " compare_tensors(hf_BWD_w3_in, ff_BWD_w3_in)\n", + " compare_tensors(hf_BWD_w1_out, ff_BWD_w1_out)\n", + " \n", + " print(\"-- Attention 
--\")\n", + " compare_tensors(hf_BWD_attn_out_out, ff_BWD_attn_out)\n", + " hidden_size = 768\n", + " qProjSize = 64\n", + " num_heads = 12\n", + " num_new_tokens = num_tokens = 24\n", + " if attention_tests:\n", + " # compare attn weight tensors\n", + " ff_attn_weight_tensor = np.loadtxt(ff_attn_oproj_weight, delimiter=',')\n", + " ff_attn_qproj_weight_tensor = ff_attn_weight_tensor[:hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_kproj_weight_tensor = ff_attn_weight_tensor[hidden_size*qProjSize*num_heads:2*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_vproj_weight_tensor = ff_attn_weight_tensor[2*hidden_size*qProjSize*num_heads:3*hidden_size*qProjSize*num_heads].reshape((hidden_size,qProjSize*num_heads), order = 'F')\n", + " ff_attn_oproj_weight_tensor = ff_attn_weight_tensor[3*hidden_size*qProjSize*num_heads:].reshape((qProjSize*num_heads,hidden_size), order='F')\n", + " \n", + " hf_attn_qproj_weight_tensor = torch.load(hf_attn_qproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_kproj_weight_tensor = torch.load(hf_attn_kproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_vproj_weight_tensor = torch.load(hf_attn_vproj_weight).T.detach().cpu().numpy()\n", + " hf_attn_oproj_weight_tensor = torch.load(hf_attn_oproj_weight).T.detach().cpu().numpy()\n", + " \n", + " assert(np.allclose(ff_attn_qproj_weight_tensor, hf_attn_qproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_kproj_weight_tensor, hf_attn_kproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_vproj_weight_tensor, hf_attn_vproj_weight_tensor, atol=1e-5))\n", + " assert(np.allclose(ff_attn_oproj_weight_tensor, hf_attn_oproj_weight_tensor, atol=1e-5))\n", + " \n", + " # Compare attn outproj grad in tensors\n", + " compare_tensors(hf_BWD_attn_oproj_in, ff_BWD_attn_o_proj_in)\n", + " \n", + " ########### Compare value projs grads ######################\n", + " # 1. compare qk prods softmax\n", + " hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.qk_prods_softmax.output_0\"\n", + " ff_attn_qk_prods_softmax = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax\"\n", + " \n", + " hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)\n", + " ff_qk_prods_softmax = np.loadtxt(ff_attn_qk_prods_softmax, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + "\n", + " for head_idx in range(num_heads):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + " \n", + " # 2. compare attn heads grads\n", + " hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.o_proj.gi_0\"\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + "\n", + " hf_attn_heads_grads = torch.load(hf_attn_heads_grads).T.squeeze().detach().cpu().numpy()\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize*num_heads, num_new_tokens), order = 'F')\n", + " # NEED TO VISUALLY INSPECT\n", + " compare_loaded_tensors(hf_attn_heads_grads, ff_attn_heads_grads)\n", + "\n", + " # 3. 
vproj grads\n", + " hf_vproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.v_proj.go_0\"\n", + " ff_vproj_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_v_proj_in_grad\"\n", + "\n", + " hf_vproj_grads = torch.load(hf_vproj_grads).squeeze().detach().cpu().numpy()\n", + " ff_vproj_grads = np.loadtxt(ff_vproj_grads, delimiter=',').reshape((num_tokens, qProjSize*num_heads), order='F')\n", + " compare_loaded_tensors(hf_vproj_grads, ff_vproj_grads)\n", + " \n", + " \n", + " ##############################\n", + " hf_value_states = f\"{hf_path}/fwd_step_0_layers.{i}.self_attn.value_states.output_0\"\n", + " hf_value_states = torch.load(hf_value_states).squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # print(hf_value_states.shape)\n", + " ff_value_states = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_vcache\"\n", + " ff_value_states = np.loadtxt(ff_value_states, delimiter=',').reshape((qProjSize, num_heads, num_tokens), order='F')\n", + " # print(ff_value_states.shape)\n", + " assert(np.allclose(hf_value_states, ff_value_states, atol=1e-2))\n", + " \n", + " \n", + " \n", + " ########## Compare key and query projs grads ##################\n", + " ff_devQKVPRojArray = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_devQKVPRojArray = np.loadtxt(ff_devQKVPRojArray, delimiter=',').reshape((num_tokens, qProjSize*num_heads, 3), order = 'F')\n", + " ff_qProjGrads = ff_devQKVPRojArray[:,:,0]\n", + " ff_kProjGrads = ff_devQKVPRojArray[:,:,1]\n", + " ff_vProjGrads = ff_devQKVPRojArray[:,:,2]\n", + " assert(np.allclose(ff_vProjGrads, ff_vproj_grads, atol=1e-5))\n", + "\n", + " # simulate qk_prods_softmax\n", + " ff_attn_heads_grads = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_o_proj_in_grad\"\n", + " ff_attn_heads_grads = np.loadtxt(ff_attn_heads_grads, delimiter=',').reshape((qProjSize,num_heads, num_new_tokens), order = 'F')\n", + " ff_attn_heads_grads = torch.from_numpy(ff_attn_heads_grads)\n", + " ff_attn_heads_grads = ff_attn_heads_grads.permute(1,2,0)\n", + " ff_value_states = torch.from_numpy(ff_value_states)\n", + " ff_value_states = ff_value_states.permute(1,0,2)\n", + " # print(ff_attn_heads_grads.shape)\n", + " # print(ff_value_states.shape)\n", + " simulated_qk_prods_softmax_grads = torch.matmul(ff_attn_heads_grads, ff_value_states)\n", + " #simulated_qk_prods_softmax_grads = simulated_qk_prods_softmax_grads\n", + " #print(\"Simulated QK prods grads:\")\n", + " #print(simulated_qk_prods_softmax_grads[0,:,:])\n", + "\n", + " # qk prods softmax right before softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.qk_prods_softmax.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " \n", + " mismatches = np.where(~np.isclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_qk_prods_softmax2.shape[0] * hf_qk_prods_softmax2.shape[1] * hf_qk_prods_softmax2.shape[2])\n", 
+ " print(f\"{pct_mismatch*100}% mismatch in QK prods softmax out grad\")\n", + " # print(hf_qk_prods_softmax2[:2,:,0])\n", + " # print(ff_qk_prods_softmax2[:2,:,0])\n", + " assert(pct_mismatch <= 0.1)\n", + "\n", + " # qk prods softmax right after softmax\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.pre_softmax.gi_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " compare_loaded_tensors(hf_qk_prods_softmax2, ff_qk_prods_softmax2)\n", + " \n", + " # qk prods softmax after mask\n", + " hf_qk_prods_softmax2 = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.matmul_op.go_0\"\n", + " hf_qk_prods_softmax2 = torch.load(hf_qk_prods_softmax2)\n", + " ff_qk_prods_softmax2 = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_qk_prods_softmax_grad_in_masked\"\n", + " ff_qk_prods_softmax2 = np.loadtxt(ff_qk_prods_softmax2, delimiter=',').reshape((num_new_tokens, num_tokens, num_heads), order = 'F')\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.squeeze().permute(1,2,0)\n", + " hf_qk_prods_softmax2 = hf_qk_prods_softmax2.detach().cpu().numpy()\n", + " assert(np.allclose(ff_qk_prods_softmax2, hf_qk_prods_softmax2, atol=1e-2))\n", + "\n", + " # Compare query activation\n", + " hf_query_activation = hf_path + f\"/fwd_step_0_layers.11.self_attn.query_activation.output_0\"\n", + " hf_query_activation = torch.load(hf_query_activation)\n", + " ff_query_activation = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_query_activation\"\n", + " ff_query_activation = np.loadtxt(ff_query_activation, delimiter=',').reshape((qProjSize, num_heads, num_new_tokens), order = 'F')\n", + " hf_query_activation = hf_query_activation.squeeze().permute(2,0,1).detach().cpu().numpy()\n", + " # assert(np.allclose(ff_query_activation, hf_query_activation, atol=1e-2))\n", + " # print(hf_query_activation[:,0,:])\n", + " # print()\n", + " # print(ff_query_activation[:,0,:])\n", + " # assert False\n", + " # compare_loaded_tensors(hf_query_activation, ff_query_activation)\n", + " check_rope = False\n", + " if check_rope:\n", + " ########################################## ROPE and Kproj ##########################################\n", + "\n", + " # Compare FF kproj with intermediate kproj data from HF\n", + " hf_kproj_grads_post_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_post_rotary.go_0\"\n", + " hf_kproj_grads_post_rotary = torch.load(hf_kproj_grads_post_rotary)\n", + " hf_kproj_grads_post_rotary_copy = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary_copy.shape)\n", + " # print(hf_kproj_grads_post_rotary_copy[:,:,0])\n", + " # Check hf ROPE \n", + " cos, sin = rotary_emb(hf_kproj_grads_post_rotary, seq_len=24)\n", + " cos = cos.cuda()\n", + " sin = sin.cuda()\n", + " # query_states: torch.Size([1, 12, 24, 64])\n", + " # key_states: torch.Size([1, 12, 24, 64])\n", + " # position_ids: torch.Size([1, 24])\n", + " # tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " # 18, 19, 20, 21, 22, 23]], 
device='cuda:0')\n", + " query_states = torch.zeros([1, 12, 24, 64]).cuda()\n", + " position_ids = torch.arange(24).unsqueeze(0).cuda()\n", + " query_states, hf_kproj_grads_post_rotary = apply_rotary_pos_emb(query_states, hf_kproj_grads_post_rotary, cos, sin, position_ids)\n", + " hf_kproj_grads_post_rotary = hf_kproj_grads_post_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_post_rotary: \", hf_kproj_grads_post_rotary.shape)\n", + " # print(hf_kproj_grads_post_rotary[:,:,0])\n", + " \n", + " hf_kproj_grads_before_rotary = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.identity_kv_before_rotary.go_0\"\n", + " hf_kproj_grads_before_rotary = torch.load(hf_kproj_grads_before_rotary)\n", + " hf_kproj_grads_before_rotary = hf_kproj_grads_before_rotary.squeeze().permute(1,2,0).detach().cpu().numpy()\n", + " # print(\"hf_kproj_grads_before_rotary: \", hf_kproj_grads_before_rotary.shape)\n", + " # print(hf_kproj_grads_before_rotary[:,:,0])\n", + " # Compare HF rope with manual ROPE\n", + " assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " # Compare HF Kproj with FF Kproj (before ROPE) \n", + " ff_kproj_pre = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj_pre\"\n", + " ff_kproj_pre = np.loadtxt(ff_kproj_pre, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj_pre: \", ff_kproj_pre.shape)\n", + " #print(ff_kproj_pre[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj_pre.shape[0] * ff_kproj_pre.shape[1] * ff_kproj_pre.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (before applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj_pre, hf_kproj_grads_post_rotary_copy, atol=1e-5))\n", + " \n", + " ff_kproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devkproj\"\n", + " ff_kproj = np.loadtxt(ff_kproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads), order = 'F')\n", + " # print(\"ff_kproj: \", ff_kproj.shape)\n", + " #print(ff_kproj[:,:,0])\n", + " mismatches = np.where(~np.isclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " mismatches = [(mismatches[0][i],mismatches[1][i], mismatches[2][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (ff_kproj.shape[0] * ff_kproj.shape[1] * ff_kproj.shape[2])\n", + " print(f\"{pct_mismatch*100}% mismatch between HF and FF for kproj (after applying ROPE)\")\n", + " assert(pct_mismatch <= 0.05)\n", + " #assert(np.allclose(ff_kproj, hf_kproj_grads_before_rotary, atol=1e-5))\n", + " \n", + " \n", + " #assert(np.allclose(hf_kproj_grads_post_rotary, hf_kproj_grads_before_rotary, atol=1e-2))\n", + " hf_kproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.k_proj.go_0\"\n", + " hf_kproj_grads = torch.load(hf_kproj_grads).squeeze()\n", + " #print(\"hf_kproj_grads: \", hf_kproj_grads.shape)\n", + " #print(hf_kproj_grads[:,:64])\n", + " reshaped_tensor = hf_kproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " #print(reshaped_tensor.shape)\n", + " assert(np.allclose(ff_kproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " ########################################## Qproj (with ROPE) 
##########################################\n", + "\n", + " # Compare QProj\n", + " hf_qproj_grads = f\"{hf_path}/bwd_step_0_layers.{i}.self_attn.q_proj.go_0\"\n", + " hf_qproj_grads = torch.load(hf_qproj_grads).squeeze()\n", + " # print(\"HF Qproj:\")\n", + " # print(hf_qproj_grads.shape)\n", + " reshaped_tensor = hf_qproj_grads.view(24, 12, 64).transpose(1, 2).contiguous().detach().cpu().numpy()\n", + " # print(\"\\t reshaped: \", reshaped_tensor.shape)\n", + " # print(reshaped_tensor[:,:,0])\n", + " ff_qproj = ff_path + f\"/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_devQKVPRojArray\"\n", + " ff_qproj = np.loadtxt(ff_qproj, delimiter=',').reshape((num_tokens, qProjSize, num_heads, 3), order = 'F')[:,:,:,0]\n", + " # print(\"FF Qproj:\")\n", + " # print(ff_qproj.shape)\n", + " # print(ff_qproj[:,:,0])\n", + " assert(np.allclose(ff_qproj, reshaped_tensor, atol=1e-2))\n", + "\n", + " hf_attn_in = f\"{hf_path}/bwd_step_0_layers.{i}.input_layernorm.go_0\"\n", + " hf_attn_in = torch.load(hf_attn_in)\n", + " hf_attn_in = hf_attn_in.squeeze().T\n", + " hf_attn_in = hf_attn_in.detach().cpu().numpy()\n", + " print(\"hf_attn_in: \", hf_attn_in.shape)\n", + " print(hf_attn_in)\n", + "\n", + " ff_attn_in = f\"{ff_path}/bwd_step_0_layers_{i}_layers_{i}_attention_shard_0_attn_final_grad_in\"\n", + " ff_attn_in = np.loadtxt(ff_attn_in, delimiter=',').reshape((768,num_tokens), order = 'F')\n", + " print(\"ff_attn_in: \", ff_attn_in.shape)\n", + " print(ff_attn_in)\n", + " #assert(np.allclose(ff_attn_in, hf_attn_in, atol=1e-2))\n", + "\n", + " mismatches = np.where(~np.isclose(ff_attn_in, hf_attn_in))\n", + " mismatches = [(mismatches[0][i], mismatches[1][i]) for i in range(len(mismatches[0]))]\n", + " pct_mismatch = len(mismatches) / (hf_attn_in.shape[0] * hf_attn_in.shape[1])\n", + " print(f\"{pct_mismatch*100}% mismatch in attention input grads\")\n", + " assert(pct_mismatch <= 0.1)\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.01614726 0.01363804 0.01768043 ... 
0.00724926 -0.00149747\n", + " -0.01781223]\n" + ] + } + ], + "source": [ + "a = np.fromfile(\"/usr0/home/goliaro/.cache/flexflow/weights/goliaro/llama-160m-lora-full/full-precision/layers_11_feed_forward_w2_lora_A_weight\", dtype=np.float32)\n", + "print(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# value states: torch.Size([1, 12, 24, 64])\n", + "value_states=torch.from_numpy(hf_kproj_grads_post_rotary).permute(2,0,1).unsqueeze(0)\n", + "key_states = value_states\n", + "cos, sin = rotary_emb(value_states, seq_len=kv_seq_len)\n", + "# query_states: torch.Size([1, 12, 24, 64])\n", + "# key_states: torch.Size([1, 12, 24, 64])\n", + "# position_ids: torch.Size([1, 24])\n", + "# tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + "# 18, 19, 20, 21, 22, 23]], device='cuda:0')\n", + "query_states = torch.zeros([1, 12, 24, 64])\n", + "position_ids = torch.arange(24).unsqueeze(0)\n", + "query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)\n", + "key_states = key_states.squeeze()\n", + "print(key_states.shape)\n", + "print(key_states[0,:,:])\n", + "print(hf_kproj_grads_before_rotary.shape)\n", + "print(hf_kproj_grads_before_rotary[:,:,0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,\n", + " 18, 19, 20, 21, 22, 23]], device='cuda:0')" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.arange(24).unsqueeze(0).cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([1, 12, 24, 24])\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/alignment_tests.ipynb Cell 6\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 17\u001b[0m ff_qkps \u001b[39m=\u001b[39m ff_qk_prods_softmax[:,:,head_idx]\n\u001b[1;32m 18\u001b[0m \u001b[39massert\u001b[39;00m(np\u001b[39m.\u001b[39mallclose(ff_qkps, hf_qkps, atol\u001b[39m=\u001b[39m\u001b[39m1e-5\u001b[39m))\n\u001b[0;32m---> 19\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mFalse\u001b[39;00m)\n\u001b[1;32m 21\u001b[0m hf_value_states \u001b[39m=\u001b[39m torch\u001b[39m.\u001b[39mload(hf_value_states)\u001b[39m#.squeeze().T.detach().cpu().numpy()\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39mprint\u001b[39m(hf_value_states\u001b[39m.\u001b[39mshape)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "layer_num = 11\n", + "hf_qk_prods_softmax = f\"{hf_path}/fwd_step_0_layers.11.self_attn.qk_prods_softmax\"\n", + "ff_qk_prods_softmax = f\"{ff_path}/model_0_bwd-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_shard-id_0_qk_prods_softmax\"\n", + "\n", + "hf_value_states = f\"{hf_path}/fwd_step_0_layers.11.self_attn.value_states\"\n", + "\n", + "hf_qk_prods_softmax = torch.load(hf_qk_prods_softmax)#.squeeze().T.detach().cpu().numpy()\n", + "ff_qk_prods_softmax = np.loadtxt(ff_qk_prods_softmax, delimiter=',').reshape((24, 24, 12), order = 'F')\n", + 
"print(hf_qk_prods_softmax.shape)\n", + "#print(ff_qk_prods_softmax.shape)\n", + "#print(hf_qk_prods_softmax[:,:,0])\n", + "#print()\n", + "#print(ff_qk_prods_softmax[:,:,0])\n", + "\n", + "for head_idx in range(12):\n", + " hf_qkps = hf_qk_prods_softmax.squeeze()[head_idx, :, :].detach().cpu().numpy()\n", + " ff_qkps = ff_qk_prods_softmax[:,:,head_idx]\n", + " assert(np.allclose(ff_qkps, hf_qkps, atol=1e-5))\n", + "\n", + "\n", + "hf_value_states = torch.load(hf_value_states)#.squeeze().T.detach().cpu().numpy()\n", + "print(hf_value_states.shape)\n", + "attn_output = torch.matmul(hf_qk_prods_softmax, hf_value_states)\n", + "print()\n", + "print(attn_output.shape)\n", + "print(attn_output.transpose(1, 2).contiguous().shape)\n", + "print(\"Hf attn heads\")\n", + "print(torch.load(\"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_layers.11.self_attn.o_proj.input_0\").shape)\n", + "\n", + "print(\"Attn heads grads:\")\n", + "hf_attn_heads_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.o_proj.gi_0\"\n", + "print(torch.load(hf_attn_heads_grads).shape)\n", + "print(\"HF value grads:\")\n", + "vproj_grads = f\"{hf_path}/bwd_step_0_layers.{layer_num}.self_attn.v_proj.gi_0\"\n", + "print(torch.load(vproj_grads).shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([2, 3, 4])\n", + "torch.Size([4, 3, 2])\n" + ] + } + ], + "source": [ + "a = torch.randn(2,3,4)\n", + "print(a.shape)\n", + "print(a.T.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000,\n", + " 0.0000],\n", + " [ 27.8890, -21.5089, 45.8214, ..., 5.4010, -10.8787,\n", + " 39.7619],\n", + " [ 19.2197, 27.4681, -68.7141, ..., 102.3280, 66.7925,\n", + " -160.8711],\n", + " ...,\n", + " [ 63.9532, 17.4273, -29.4416, ..., 101.6105, 67.5937,\n", + " -198.4432],\n", + " [ 31.2799, 13.0724, -44.7179, ..., 132.4898, 42.3135,\n", + " -194.4037],\n", + " [ 42.3453, -16.2693, -55.7386, ..., 90.5921, 52.2032,\n", + " -124.1802]]], device='cuda:0')\n", + "tensor([[[-1.1845e+06, -6.7460e+05, 7.4494e+05, ..., -9.1441e+05,\n", + " -1.4912e+05, 3.5769e+06],\n", + " [-7.3920e+01, -7.9389e+01, 1.1027e+02, ..., -7.3020e+01,\n", + " -2.3540e+01, 3.4587e+02],\n", + " [-5.3885e+01, -1.7373e+01, -1.9780e+01, ..., 4.1291e+01,\n", + " 5.5099e+01, 5.5910e+01],\n", + " ...,\n", + " [-2.1948e+01, -3.2109e+01, 2.8364e+01, ..., 3.4321e+01,\n", + " 5.0713e+01, 5.6592e+01],\n", + " [-4.4339e+01, -2.8339e+01, 1.4070e+01, ..., 6.2797e+01,\n", + " 3.0760e+01, 6.1743e+01],\n", + " [-1.6287e+01, -5.0413e+01, -1.9940e+01, ..., 4.3766e+01,\n", + " 4.7833e+01, 4.7295e+01]]], device='cuda:0')\n" + ] + } + ], + "source": [ + "a = \"./hf_peft_tensors/bwd_step_0_layers.11.post_attention_layernorm.gi_0\"\n", + "b = \"./hf_peft_tensors/bwd_step_0_layers.11.self_attn.o_proj.go_0\"\n", + "a = torch.load(a)\n", + "b = torch.load(b)\n", + "print(a)\n", + "print(b)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "for layer_num in range(12):\n", + " hf_lora_A_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp, tolerance=1e-5)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp, tolerance=1e-5)\n", + " hf_w1_weight = f\"{hf_path}/layers.{layer_num}.mlp.gate_proj.weight\"\n", + " ff_w1_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w1_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w1_weight, ff_w1_weight, tolerance=1e-5)\n", + " hf_w3_weight = f\"{hf_path}/layers.{layer_num}.mlp.up_proj.weight\"\n", + " ff_w3_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w3_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w3_weight, ff_w3_weight, tolerance=1e-5)\n", + " hf_w2_weight = f\"{hf_path}/layers.{layer_num}.mlp.down_proj.weight\"\n", + " ff_w2_weight = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_shard-id_0_weight_0\"\n", + " compare_tensors(hf_w2_weight, ff_w2_weight, tolerance=1e-5)\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/alignment/opt_alignment_tests.ipynb b/tests/peft/alignment/opt_alignment_tests.ipynb new file mode 100644 index 0000000000..ca679b1857 --- /dev/null +++ b/tests/peft/alignment/opt_alignment_tests.ipynb @@ -0,0 +1,450 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import os, torch\n", + "from align_test_utils import *" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + 
"Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- Attn bias + residual ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "--- MLP ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- LM head ---\n", + "Ok!\n", + "Ok!\n", + "\n", + "--- Final Norm ---\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "qProjSize = 64\n", + "num_heads = 12\n", + "num_tokens = 25\n", + "for i in range(tot_num_layers):\n", + " hf_base = os.path.join(hf_path, f\"fwd_step_0_decoder.layers.{i}.\")\n", + " ff_base = os.path.join(ff_path, f\"fwd_step_0_layers_{i}_layers_{i}_\")\n", + " \n", + " # LayerNorm\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.input_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"self_attn_layer_norm.output_0\"\n", + " ff_tensor = ff_base + \"attention_layer_norm_shard_0_output_1\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "\n", + " # # Attention QKV proj\n", + " # print(\"---Attn---\")\n", + " # ff_tensor = ff_base + \"attention_shard_0_qkv_proj_output\"\n", + " # ff_tensor = load_ff_tensor(ff_tensor, [qProjSize, num_heads, 3, num_tokens])\n", + " # ff_q_proj = ff_tensor[:,:,0,:]\n", + " # ff_k_proj = ff_tensor[:,:,1,:]\n", + " # ff_v_proj = ff_tensor[:,:,2,:]\n", + " # hf_q_proj = hf_base + \"self_attn.q_proj.output_0\"\n", + " # hf_q_proj = load_hf_tensor(hf_q_proj).squeeze().T\n", + " # hf_q_proj = hf_q_proj.reshape(12,64,25)\n", + " # hf_q_proj = np.transpose(hf_q_proj, (1,0,2))\n", + " # hf_k_proj = hf_base + \"self_attn.k_proj.output_0\"\n", + " # hf_k_proj = load_hf_tensor(hf_k_proj).squeeze().T\n", + " # hf_k_proj = hf_k_proj.reshape(12,64,25)\n", + " # hf_k_proj = np.transpose(hf_k_proj, (1,0,2))\n", + " # hf_v_proj = hf_base + \"self_attn.v_proj.output_0\"\n", + " # hf_v_proj = load_hf_tensor(hf_v_proj).squeeze().T\n", + " # hf_v_proj = hf_v_proj.reshape(12,64,25)\n", + " # hf_v_proj = np.transpose(hf_v_proj, (1,0,2))\n", + " # compare_loaded_tensors(hf_q_proj/np.sqrt(qProjSize), ff_q_proj)\n", + " # compare_loaded_tensors(hf_k_proj, ff_k_proj)\n", + " # compare_loaded_tensors(hf_v_proj, ff_v_proj)\n", + "\n", + " # Compare attn bias, residuals\n", + " print(\"--- Attn bias + residual ---\")\n", + " ff_residual1 = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_input_1\"\n", + " ff_residual2 = ff_base + \"attention_layer_norm_shard_0_output_0\"\n", + " compare_flexflow_tensors(ff_residual1, ff_residual2)\n", + " hf_tensor = hf_base + 
\"self_attn_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_residual2)\n", + " ff_tensor = ff_path + f\"/fwd_step_0_layers_{i}_AddBiasResidualLayerNorm_shard_0_output_0\"\n", + " hf_tensor = hf_base + \"final_layer_norm.input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " \n", + " print(\"--- MLP ---\")\n", + " hf_tensor = hf_base + \"fc1.input_0\"\n", + " ff_tensor = ff_base + \"fc1_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + " hf_tensor = hf_base + \"fc2.input_0\"\n", + " ff_tensor = ff_base + \"fc2_shard_0_input_0\"\n", + " compare_tensors(hf_tensor, ff_tensor)\n", + "# LM head\n", + "print(\"\\n--- LM head ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "# Final layer norm\n", + "print(\"\\n--- Final Norm ---\")\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "ff_tensor1 = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_input_activation\"\n", + "# compare_flexflow_tensors_shortest(ff_tensor, ff_tensor1)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_output_1\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_1\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_mean\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n", + "hf_tensor = hf_path + \"/fwd_step_0_decoder.final_layer_norm.saved_result_2\"\n", + "ff_tensor = ff_path + \"/fwd_step_0_layers_11_final_layer_norm_shard_0_rstd\"\n", + "compare_tensors(hf_tensor, ff_tensor)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 22\u001b[0m\n\u001b[1;32m 19\u001b[0m compare_flexflow_tensors(ff_tensor, ff_tensor1)\n\u001b[1;32m 20\u001b[0m compare_tensors(hf_tensor, ff_tensor) \u001b[38;5;66;03m# fails\u001b[39;00m\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;66;03m# Compare fwd input/output of layernorm\u001b[39;00m\n\u001b[1;32m 25\u001b[0m hf_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_decoder.final_layer_norm.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "# Compare backward pass\n", + "hf_tensor = hf_path 
+ \"/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_output_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "hf_tensor = hf_path + \"/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_embed_tokens_weight_lm_head_shard_0_input_0\"\n", + "compare_tensors(hf_tensor, ff_tensor, tolerance=1e-5)\n", + "\n", + "hf_tensor1 = hf_path + \"/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "compare_hf_tensors(hf_tensor, hf_tensor1)\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_output_0\"\n", + "compare_tensors(hf_tensor1, ff_tensor)\n", + "\n", + "hf_tensor = hf_path + \"/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_tensor = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_0\"\n", + "ff_tensor1 = ff_path + \"/bwd_step_0_layers_11_final_layer_norm_shard_0_input_1\"\n", + "compare_flexflow_tensors(ff_tensor, ff_tensor1)\n", + "compare_tensors(hf_tensor, ff_tensor) # fails" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\n", + "HF: [ 0.0193019 -1.0467215 0.21579844 ... 0.04534929 -0.25642633\n", + " 0.10879952]\n", + "FF:[ 0.01458706 -1.02212262 0.20589906 ... 0.04446212 -0.25625792\n", + " 0.108039 ]\n", + "[ True False True ... True True True]\n", + "[ 1 3 7 ... 19170 19174 19188]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[11], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m hf_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 15\u001b[0m ff_fc1_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 16\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_fc1_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_fc1_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;66;03m# LORA input\u001b[39;00m\n\u001b[1;32m 20\u001b[0m hf_lora_A_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/fwd_step_0_layers.\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlayer_num\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.mlp.down_proj.lora_A.default.input_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:32\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 
27\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 29\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 31\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "for layer_num in range(tot_num_layers):\n", + " hf_input_ln_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.self_attn_layer_norm.output_0\"\n", + " ff_input_ln_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_attention_layer_norm_shard-id_0_output_1\"\n", + " compare_tensors(hf_input_ln_out, ff_input_ln_out)\n", + " \n", + " hf_ffn_norm_in = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.input_0\"\n", + " ff_ffn_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_0\"\n", + " # compare_tensors(hf_ffn_norm_in, ff_ffn_norm_in)\n", + " \n", + " hf_ffn_norm_out = f\"{hf_path}/fwd_step_0_decoder.layers.{layer_num}.final_layer_norm.output_0\"\n", + " ff_ffn_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_AddBiasResidualLayerNorm_shard-id_0_output_1\"\n", + " # compare_tensors(hf_ffn_norm_out, ff_ffn_norm_out)\n", + " hf_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.layers.0.fc1.input_0\"\n", + " ff_fc1_in = \"/usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_0_layer-name_layers_0_fc1_shard-id_0_input_0\"\n", + " compare_tensors(hf_fc1_in, ff_fc1_in)\n", + "\n", + "\n", + " # LORA input\n", + " hf_lora_A_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.input_0\"\n", + " ff_lora_A_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_input_0\"\n", + " compare_hf_tensors(hf_down_proj_in, hf_lora_A_in)\n", + " compare_tensors(hf_lora_A_in, ff_lora_A_in)\n", + " # LORA weights\n", + " hf_lora_A_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_A.default.weight\"\n", + " ff_lora_A_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_A\"\n", + " compare_tensors(hf_lora_A_weight_fp, ff_lora_A_weight_fp)\n", + " hf_lora_B_weight_fp = f\"{hf_path}/base_model.model.model.layers.{layer_num}.mlp.down_proj.lora_B.default.weight\"\n", + " ff_lora_B_weight_fp = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_weight_B\"\n", + " compare_tensors(hf_lora_B_weight_fp, ff_lora_B_weight_fp)\n", + " # LORA intermediate hf\n", + " hf_lora_A_out = 
f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_A.default.output_0\"\n", + " hf_lora_B_in = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.input_0\"\n", + " compare_hf_tensors(hf_lora_A_out, hf_lora_B_in)\n", + " # LORA output\n", + " hf_lora_out = f\"{hf_path}/fwd_step_0_layers.{layer_num}.mlp.down_proj.lora_B.default.output_0\"\n", + " ff_lora_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{layer_num}_layer-name_layers_{layer_num}_feed_forward_w2_lora_shard-id_0_output_0\"\n", + " # compare_tensors(hf_lora_out, ff_lora_out)\n", + " # compare_flexflow_tensors(ff_down_proj_out, ff_lora_out)\n", + " # compare_tensors(hf_down_proj_out, ff_lora_out)\n", + " compare_tensors_difference(hf_lora_out, ff_lora_out, ff_down_proj_out)\n", + " \n", + "\n", + "# After last layer only\n", + "hf_norm_out = f\"{hf_path}/fwd_step_0_norm.output_0\"\n", + "ff_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_norm_out, ff_norm_out)\n", + "hf_lm_head_out = f\"{hf_path}/fwd_step_0_base_model.model.lm_head.output_0\"\n", + "ff_lm_head_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_{tot_num_layers-1}_layer-name_output_shard-id_0_output_0\"\n", + "compare_tensors(hf_lm_head_out, ff_lm_head_out)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ok!\n", + "Ok!\n", + "Ok!\n", + "Ok!\n", + "mismatch between /usr0/home/goliaro/Desktop/FlexFlow/tests/peft/hf_peft_tensors/fwd_step_0_decoder.final_layer_norm.input_0 and /usr0/home/goliaro/Desktop/FlexFlow/build/inference_tensors/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\n", + "HF: [-0.00542103 -1.781267 0.16552497 ... -0.77217525 -0.5760026\n", + " 0.04363118]\n", + "FF:[ 0.03817766 -1.5644939 0.22477378 ... -0.94569921 -0.43960798\n", + " -0.06447437]\n", + "[False False False ... False False False]\n", + "[ 0 1 2 ... 
19197 19198 19199]\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[10], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m ff_FWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 21\u001b[0m ff_FWD_norm_out \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mff_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 22\u001b[0m \u001b[43mcompare_tensors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhf_FWD_norm_in\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mff_FWD_norm_in\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 23\u001b[0m compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n\u001b[1;32m 25\u001b[0m hf_BWD_norm_in \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mhf_weight_base_path\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/bwd_step_0_decoder.final_layer_norm.gi_0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/Desktop/FlexFlow/tests/peft/align_test_utils.py:29\u001b[0m, in \u001b[0;36mcompare_tensors\u001b[0;34m(hf_tensor_filepath, ff_tensor_filepath, tolerance)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28mprint\u001b[39m(mismatches)\n\u001b[1;32m 25\u001b[0m \u001b[38;5;66;03m#print(np.nonzero(hf_tensor)[0])\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;66;03m# print(np.where(np.isclose(ff_tensor, hf_tensor, atol=tolerance) ==0)[0])\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;66;03m# print(ff_tensor[36], hf_tensor[36])\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;66;03m#assert(np.allclose(ff_tensor, hf_tensor, atol=tolerance))\u001b[39;00m\n\u001b[0;32m---> 29\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m(\u001b[38;5;28mlen\u001b[39m(mismatches) \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m.05\u001b[39m\u001b[38;5;241m*\u001b[39mlen_hf_tensor)\n\u001b[1;32m 30\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOk!\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "tot_num_layers = 12\n", + "\n", + "ff_BWD_softmax_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_100_layer-name_Softmax_shard-id_0_input_0\"\n", + "\n", + "hf_BWD_lm_head_out = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.go_0\"\n", + "ff_BWD_lm_head_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_output_0\"\n", + "compare_tensors(hf_BWD_lm_head_out, ff_BWD_lm_head_out, tolerance=1e-5)\n", + "hf_BWD_lm_head_in = f\"{hf_path}/bwd_step_0_base_model.model.lm_head.gi_0\"\n", + "ff_BWD_lm_head_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_embed_tokens_weight_lm_head_shard-id_0_input_0\"\n", + "compare_tensors(hf_BWD_lm_head_in, ff_BWD_lm_head_in, 
tolerance=1e-5)\n", + "\n", + "hf_BWD_norm_out = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.go_0\"\n", + "ff_BWD_norm_out = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "compare_hf_tensors(hf_BWD_lm_head_in, hf_BWD_norm_out)\n", + "compare_tensors(hf_BWD_norm_out, ff_BWD_norm_out)\n", + "\n", + "# Compare fwd input/output of layernorm\n", + "hf_FWD_norm_in = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.input_0\"\n", + "hf_FWD_norm_out = f\"{hf_path}/fwd_step_0_decoder.final_layer_norm.output_0\"\n", + "ff_FWD_norm_in = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_0\"\n", + "ff_FWD_norm_out = f\"{ff_path}/model_0_decoding-step_0_layer-num_11_layer-name_final_layer_norm_shard-id_0_output_1\"\n", + "compare_tensors(hf_FWD_norm_in, ff_FWD_norm_in)\n", + "compare_tensors(hf_FWD_norm_out, ff_FWD_norm_out)\n", + "\n", + "hf_BWD_norm_in = f\"{hf_path}/bwd_step_0_decoder.final_layer_norm.gi_0\"\n", + "ff_BWD_norm_in = f\"{ff_path}/model_0_bwd-step_0_layer-num_{tot_num_layers-1}_layer-name_final_layer_norm_shard-id_0_input_1\"\n", + "compare_tensors(hf_BWD_norm_in, ff_BWD_norm_in, tolerance=1e-5)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/peft/hf_finetune.py b/tests/peft/hf_finetune.py new file mode 100644 index 0000000000..16b46cfa81 --- /dev/null +++ b/tests/peft/hf_finetune.py @@ -0,0 +1,129 @@ +import os, sys, shutil +import torch + +# Reproducibility +import random +import numpy as np + +torch.manual_seed(0) +random.seed(0) +np.random.seed(0) +# torch.use_deterministic_algorithms(True) + +# import bitsandbytes as bnb +import argparse +import transformers + +if transformers.__version__ < "4.31.0": + raise RuntimeError( + "Please update the transformers library version to 4.31.0 or above" + ) +from datasets import load_dataset + + +from hf_utils import * + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--peft-model-id", type=str, default="goliaro/llama-160m-lora") + parser.add_argument( + "--lora-alpha", + type=int, + default=-1, + help="The scaling coefficient for LoRA. Leave it set to -1 to use the original value from the HF config", + ) + parser.add_argument( + "--lora-dropout", + type=float, + default=0.0, + help="The dropout rate for LoRA. 
Set it to -1 to use the original value from the HF config",
+    )
+    parser.add_argument("-lr", "--learning-rate", type=float, default=0.001)
+    parser.add_argument("-n", "--max-steps", type=int, default=2)
+    parser.add_argument(
+        "--optimizer", type=str, choices=["sgd", "adam", "adamw"], default="sgd"
+    )
+    parser.add_argument(
+        "--use-full-precision", action="store_true", help="Use full precision"
+    )
+    parser.add_argument("--output-dir", type=str, default="")
+    parser.add_argument("--publish-peft-with-id", type=str, default="")
+    parser.add_argument(
+        "--save-peft-tensors",
+        action="store_true",
+        help="Save PEFT hidden states and weights to file",
+    )
+    args = parser.parse_args()
+
+    # Change working dir to folder storing this script
+    abspath = os.path.abspath(__file__)
+    dname = os.path.dirname(abspath)
+    os.chdir(dname)
+
+    # Get PEFT config, model, tokenizer, and optimizer type
+    peft_config = build_peft_config(args, finetuning=True)
+    tokenizer = get_peft_tokenizer(args, peft_config)
+    model = build_peft_model(args, peft_config)
+    optim_type = get_optim_type(args)
+
+    # Print model with PEFT
+    print(model)
+    for name, params in model.named_parameters():
+        print(name)
+    print_trainable_parameters(model)
+
+    # Add hooks to save PEFT tensors, save any weights of interest before finetuning
+    if args.save_peft_tensors:
+        make_debug_dirs()
+        register_peft_hooks(model)
+        save_peft_weights(model, target_modules=["lora", "lm_head", "down_proj"])
+
+    # Load fine-tuning dataset
+    data = load_dataset("Abirate/english_quotes")
+    # TODO: stop hard-coding a single dataset row for debugging
+    key_to_filter = "quote"
+    desired_value = "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”"
+    data = filter_dataset_for_debugging(data, key_to_filter, desired_value)
+    data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)
+
+    # Training loop
+    trainer = transformers.Trainer(
+        model=model,
+        train_dataset=data["train"],
+        args=transformers.TrainingArguments(
+            per_device_train_batch_size=1,
+            gradient_accumulation_steps=1,
+            max_grad_norm=None,  # Disable gradient clipping
+            warmup_steps=0,
+            max_steps=args.max_steps,
+            learning_rate=args.learning_rate,
+            fp16=True if not args.use_full_precision else False,
+            logging_steps=1,
+            output_dir=os.path.join(
+                args.output_dir if len(args.output_dir) > 0 else "./",
+                "lora_training_logs",
+            ),
+            optim=optim_type,
+            lr_scheduler_type=transformers.training_args.SchedulerType.CONSTANT,
+        ),
+        data_collator=transformers.DataCollatorForLanguageModeling(
+            tokenizer, mlm=False
+        ),
+        callbacks=[HFTrainingCallBack] if args.save_peft_tensors else None,
+    )
+    # Silence the warnings. Please re-enable for inference!
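Editor's note: the tensor dumps that hf_finetune.py relies on come from plain PyTorch forward hooks (register_peft_hooks in hf_utils.py, further below). The following is a condensed, self-contained sketch of that pattern; the output directory, file naming, and module filter are illustrative, not the exact paths used by the scripts in this PR.

import os
import torch
import torch.nn as nn

def make_saving_hook(name, out_dir="./debug_tensors"):
    os.makedirs(out_dir, exist_ok=True)
    step = {"fwd": 0}

    def hook(module, inputs, output):
        # Dump every tensor input/output of this module for the current forward step.
        for i, t in enumerate(inputs):
            if isinstance(t, torch.Tensor):
                torch.save(t, os.path.join(out_dir, f"step_{step['fwd']}_{name}.input_{i}"))
        if isinstance(output, torch.Tensor):
            torch.save(output, os.path.join(out_dir, f"step_{step['fwd']}_{name}.output_0"))
        step["fwd"] += 1

    return hook

def register_saving_hooks(model: nn.Module, keyword="lora"):
    # Attach the hook only to modules whose name matches the keyword,
    # mirroring how the scripts above target the LoRA layers.
    for name, module in model.named_modules():
        if keyword in name:
            module.register_forward_hook(make_saving_hook(name))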
+ model.config.use_cache = False + + # for batch in trainer.get_train_dataloader(): + # print("First batch: ") + # print(batch) + # break + + trainer.train() + + save_finetuned_model(model, args) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_serve.py b/tests/peft/hf_serve.py new file mode 100644 index 0000000000..7bfc560cc2 --- /dev/null +++ b/tests/peft/hf_serve.py @@ -0,0 +1,140 @@ +import argparse +import torch +import os, sys, shutil, json +from peft import PeftModel, PeftConfig +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + AutoConfig, + LlamaTokenizer, + GenerationConfig, +) + + +def peft_pre_forward_hook(module, input): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("base_model.model.model.", "") + print( + f"Pre-forward hook activated on module: {name}, decoding step: {module.decoding_step}" + ) + print("Pre-Input: ", input[0].shape) + torch.save( + input, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.input" + ) + # print("===") + + +def peft_post_forward_hook(module, input, output): + assert module.name is not None and module.decoding_step is not None + name = module.name.replace("base_model.model.model.", "") + print( + f"Post-forward Hook activated for module: {name}, decoding step: {module.decoding_step}" + ) + print("Post-Input/Output: ", input[0].shape, output[0].shape) + torch.save( + output, f"./hf_peft_tensors/decoding_step_{module.decoding_step}_{name}.output" + ) + print("===") + module.decoding_step += 1 + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--peft-model-id", type=str, required=True) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) + parser.add_argument("--max-length", type=int, default=50) + parser.add_argument("--prompt-file", type=str, required=True) + parser.add_argument("--do-sample", action="store_true", help="Use sampling") + parser.add_argument( + "--save-peft-tensors", + action="store_true", + help="Save PEFT hidden states and weights to file", + ) + args = parser.parse_args() + + # Check if prompt-file exists + if not os.path.isfile(args.prompt_file): + print(f"Error: {args.prompt_file} does not exist.") + return + + # Get peft model config + config = PeftConfig.from_pretrained(args.peft_model_id) + + # Load the base model + model = AutoModelForCausalLM.from_pretrained( + config.base_model_name_or_path, + return_dict=True, + # load_in_8bit=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + device_map="auto", + ) + # Load the Lora model + model = PeftModel.from_pretrained(model, args.peft_model_id) + print(model) + + # Get tokenizer + hf_config = AutoConfig.from_pretrained( + config.base_model_name_or_path, trust_remote_code=True + ) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + config.base_model_name_or_path, + use_fast=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + + # Generation config + generation_config = GenerationConfig.from_pretrained(config.base_model_name_or_path) + generation_config.do_sample = args.do_sample + + # Register hooks to save tensors, if needed + if args.save_peft_tensors: + # Change 
working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + # Create output dir + shutil.rmtree("./hf_peft_tensors") + os.makedirs("./hf_peft_tensors", exist_ok=True) + # Save weights + for name, params in model.named_parameters(): + if "lora" in name: + torch.save(params, f"./hf_peft_tensors/{name}") + # params.detach().cpu().numpy().tofile(f"{weights_path}/{name}") + # Save hidden states + for name, layer in dict(model.named_modules()).items(): + if "lora_A.default" in name or "lora_B.default" in name: + layer.name = name + layer.decoding_step = 0 + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_pre_hook(peft_pre_forward_hook) + layer.register_forward_hook(peft_post_forward_hook) + + # Run inference + # Read prompt-file into a list of strings + with open(args.prompt_file, "r") as f: + try: + prompt_list = json.load(f) + except json.JSONDecodeError: + print(f"Error: Unable to parse {args.prompt_file} as JSON.") + sys.exit(1) + + for i, prompt in enumerate(prompt_list): + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + with torch.cuda.amp.autocast(): + output_tokens = model.generate( + **batch, max_new_tokens=args.max_length, generation_config=generation_config + ) + print("\n\n", tokenizer.decode(output_tokens[0], skip_special_tokens=False)) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_train.py b/tests/peft/hf_train.py new file mode 100644 index 0000000000..707fc9d0ae --- /dev/null +++ b/tests/peft/hf_train.py @@ -0,0 +1,161 @@ +import os, sys + +# os.environ["CUDA_VISIBLE_DEVICES"]="0" +import torch +import torch.nn as nn + +# import bitsandbytes as bnb +from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaTokenizer +import argparse +from peft import LoraConfig, get_peft_model +import transformers +from datasets import load_dataset + + +class CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. 
+ """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model-name", type=str, default="meta-llama/Llama-2-7b-hf") + parser.add_argument("--lora-rank", type=int, default=16) + parser.add_argument("--lora-alpha", type=int, default=32) + parser.add_argument( + "--lora-target-modules", + type=str, + default="down_proj", + help="Comma-separated list of layers from the base model to target", + ) + parser.add_argument("--lora-dropout", type=float, default=0.05) + parser.add_argument( + "--use-full-precision", action="store_true", help="Use full precision" + ) + parser.add_argument("--output-dir", type=str, default="") + parser.add_argument("--publish-peft-with-id", type=str, default="") + args = parser.parse_args() + model_name = args.model_name + use_full_precision = args.use_full_precision + lora_rank = args.lora_rank + lora_alpha = args.lora_alpha + lora_target_modules = args.lora_target_modules.split(",") + lora_dropout = args.lora_dropout + output_dir = args.output_dir + publish_peft_with_id = args.publish_peft_with_id + if len(output_dir) == 0 and len(publish_peft_with_id) == 0: + raise ValueError( + "Please pass either a --output-dir or a --publish-peft-with-id to specify where to store the trained model" + ) + + # Change working dir to folder storing this script + abspath = os.path.abspath(__file__) + dname = os.path.dirname(abspath) + os.chdir(dname) + + model = AutoModelForCausalLM.from_pretrained( + model_name, + # load_in_8bit=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + device_map="auto", + ) + + # Get Tokenizer + hf_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + model_name, + use_fast=True, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch.float32 if use_full_precision else torch.float16, + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + + for param in model.parameters(): + param.requires_grad = False # freeze the model - train adapters later + if param.ndim == 1: + # cast the small parameters (e.g. 
layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + + model.lm_head = CastOutputToFloat(model.lm_head) + + config = LoraConfig( + r=lora_rank, + lora_alpha=lora_alpha, + # target_modules=["q_proj", "v_proj"], + # target_modules=["down_proj"], + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM", + ) + print(model) + print(model.named_parameters()) + model = get_peft_model(model, config) + print_trainable_parameters(model) + + data = load_dataset("Abirate/english_quotes") + data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True) + + trainer = transformers.Trainer( + model=model, + train_dataset=data["train"], + args=transformers.TrainingArguments( + per_device_train_batch_size=4, + gradient_accumulation_steps=4, + warmup_steps=100, + max_steps=200, + learning_rate=2e-4, + fp16=True if not use_full_precision else False, + logging_steps=1, + output_dir=os.path.join( + output_dir if len(output_dir) > 0 else "./", "lora_training_logs" + ), + ), + data_collator=transformers.DataCollatorForLanguageModeling( + tokenizer, mlm=False + ), + ) + model.config.use_cache = ( + False + ) # silence the warnings. Please re-enable for inference! + trainer.train() + + if len(output_dir) > 0: + print(f"Done training! Saving the model to {output_dir}...") + model.save_pretrained(output_dir) + + if len(publish_peft_with_id) > 0: + print( + f"Done training! Uploading the model to HF hub with id: {publish_peft_with_id}..." + ) + model.push_to_hub(publish_peft_with_id, use_auth_token=True) + + +if __name__ == "__main__": + main() diff --git a/tests/peft/hf_utils.py b/tests/peft/hf_utils.py new file mode 100644 index 0000000000..9332c803b2 --- /dev/null +++ b/tests/peft/hf_utils.py @@ -0,0 +1,352 @@ +import torch +import torch.nn as nn +import transformers +from transformers import ( + TrainerCallback, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + LlamaTokenizer, +) +import os, shutil +from peft import PeftConfig, PeftModel +from datasets import load_dataset, DatasetDict + +debug_dir = None +debug_subdirs = ["fwd", "bwd", "optim", "weights"] +verbose = False + + +def make_debug_dirs(): + global debug_dir + global debug_subdirs + debug_dir = os.environ.get("FF_CACHE_PATH", os.path.expanduser("~/.cache/flexflow")) + debug_dir = os.path.join(debug_dir, "debug", "huggingface") + shutil.rmtree(debug_dir, ignore_errors=True) + os.makedirs(debug_dir, exist_ok=True) + assert debug_dir is not None + assert os.path.isdir(debug_dir) + for subdir in debug_subdirs: + subdir_path = os.path.join(debug_dir, subdir) + os.makedirs(subdir_path, exist_ok=False) + + +def get_dst_folder(subdir, step_idx=0): + global debug_dir, debug_subdirs + assert subdir in debug_subdirs + dst_folder = os.path.join(debug_dir, subdir, f"step_{step_idx}") + os.makedirs(dst_folder, exist_ok=True) + return dst_folder + + +def simplify_name(name): + return name.replace("base_model.model.model.", "").replace("base_model.model.", "") + + +def get_optim_type(args): + if args.optimizer == "sgd": + return transformers.training_args.OptimizerNames.SGD + elif args.optimizer == "adam": + return transformers.training_args.OptimizerNames.ADAM + elif args.optimizer == "adamw": + return transformers.training_args.OptimizerNames.ADAMW + else: + raise ValueError(f"Optimizer {args.optimizer} not supported") + + +class 
CastOutputToFloat(nn.Sequential): + def forward(self, x): + return super().forward(x).to(torch.float32) + + +def print_trainable_parameters(model): + """ + Prints the number of trainable parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in model.named_parameters(): + all_param += param.numel() + if param.requires_grad: + trainable_params += param.numel() + print( + f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" + ) + + +def peft_backward_hook(module, grad_input, grad_output): + assert type(grad_input) == tuple and type(grad_output) == tuple + if len(grad_input) == 0 or len(grad_output) == 0: + return + assert module.name is not None and module.bwd_step is not None + name = simplify_name(module.name) + if verbose: + print( + f"Backward Hook activated for module: {name}, bwd step: {module.bwd_step}" + ) + print("Backward GRAD Output:") + for i, out_grad in enumerate(grad_output): + if type(out_grad) == torch.Tensor: + dst_folder = get_dst_folder("bwd", module.bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_gradient_{i}") + if verbose: + print("\t", out_grad.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(out_grad, dst_filepath) + else: + if verbose: + print(out_grad) + if verbose: + print("Backward GRAD Input:") + for i, in_grad in enumerate(grad_input): + if type(in_grad) == torch.Tensor: + dst_folder = get_dst_folder("bwd", module.bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_gradient_{i}") + if verbose: + print("\t", in_grad.shape) + print(f"\t\tSaving to {dst_filepath}") + torch.save(in_grad, dst_filepath) + else: + if verbose: + print(in_grad) + if verbose: + print("===") + module.bwd_step += 1 + + +def peft_forward_hook(module, input, output): + if len(input) == 0 or len(output) == 0: + return + assert module.name is not None and module.fwd_step is not None + name = simplify_name(module.name) + if verbose: + print(f"Forward Hook activated for module: {name}, fwd step: {module.fwd_step}") + print("Input:") + if type(input) == torch.Tensor: + if verbose: + print(input.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_0") + torch.save(input, dst_filepath) + elif type(input) == tuple: + for i, inp in enumerate(input): + if type(inp) == torch.Tensor: + if verbose: + print(inp.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.input_{i}") + torch.save(inp, dst_filepath) + else: + if verbose: + print(inp) + else: + assert False + if verbose: + print("Output:") + if type(output) == torch.Tensor: + if verbose: + print(output.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_0") + torch.save(output, dst_filepath) + elif type(output) == tuple: + for i, out in enumerate(output): + if type(out) == torch.Tensor: + if verbose: + print(out.shape) + dst_folder = get_dst_folder("fwd", module.fwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.output_{i}") + torch.save(out, dst_filepath) + else: + if verbose: + print(out) + else: + assert False + if verbose: + print("===") + module.fwd_step += 1 + + +def peft_optimizer_hook(model_, callback_func_handle): + def post_hook(optimizer, args, kwargs): + if verbose: + print("Optimizer Hook activated") + bwd_step = callback_func_handle.step_count + for name_, module in model_.named_modules(): + name = 
simplify_name(name_) + for param_name, param in module.named_parameters(recurse=False): + if param.requires_grad: + if verbose: + print( + f"Step #{bwd_step}: Saving weight gradient for {name} ({param.grad.shape})" + ) + dst_folder = get_dst_folder("weights", bwd_step) + dst_filepath = os.path.join(dst_folder, f"{name}.gradient") + torch.save(param.grad, dst_filepath) + + return post_hook + + +class HFTrainingCallBack(TrainerCallback): + def on_train_begin(self, args, state, control, **kwargs): + if verbose: + print("Starting finetuning") + model_ = kwargs.get("model", None) + optim = kwargs.get("optimizer", None) + assert model_ is not None + assert optim is not None + self.step_count = 0 + optim.optimizer.register_step_post_hook(peft_optimizer_hook(model_, self)) + + def save_lora_weights(self, model, pre_finetuning=False): + lora_weights_handles = [ + (simplify_name(name), params) + for name, params in model.named_parameters() + if "lora" in name + ] + for simplified_name, params in lora_weights_handles: + dst_folder = get_dst_folder("weights", self.step_count) + if pre_finetuning: + dst_filepath = os.path.join(dst_folder, f"{simplified_name}_original") + torch.save(params, dst_filepath) + if verbose: + print( + f"Step #{self.step_count}: Saving ORIGINAL weight {simplified_name} ({params.shape})" + ) + else: + dst_filepath = os.path.join(dst_folder, f"{simplified_name}_finetuned") + torch.save(params, dst_filepath) + if verbose: + print( + f"Step #{self.step_count}: Saving FINETUNED weight {simplified_name} ({params.shape})" + ) + if not pre_finetuning: + self.step_count += 1 + + def on_step_end( + self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs + ): + self.save_lora_weights(model, pre_finetuning=False) + + def on_step_begin( + self, args, state, control, model, tokenizer, optimizer, lr_scheduler, **kwargs + ): + self.save_lora_weights(model, pre_finetuning=True) + + def on_train_end(self, args, state, control, **kwargs): + if verbose: + print(f"Finetuning ended after {self.step_count} steps") + + +def build_peft_config(args, finetuning=False): + peft_config = PeftConfig.from_pretrained(args.peft_model_id) + if peft_config.peft_type != "LORA": + raise ValueError(f"PEFT type {peft_config.peft_type} not supported yet") + if args.lora_alpha > 0.0: + peft_config.lora_alpha = args.lora_alpha + if peft_config.lora_dropout >= 0.0: + peft_config.lora_dropout = args.lora_dropout + # prevent HF from re-inizialing the weights randomly if finetuning + if finetuning: + peft_config.init_lora_weights = False + return peft_config + + +def prepare_model_for_lora_finetuning(model, save_peft_tensors=False): + # Freeze all layers except the LORA ones. Cast small layers to full precision for stability + for name, param in model.named_parameters(): + if "lora" not in name: + param.requires_grad = False # freeze the model - train adapters later + else: + param.requires_grad = True + if param.ndim == 1: + # cast the small parameters (e.g. 
layernorm) to fp32 for stability + param.data = param.data.to(torch.float32) + if not save_peft_tensors: + model.gradient_checkpointing_enable() # reduce number of stored activations + model.enable_input_require_grads() + model.lm_head = CastOutputToFloat(model.lm_head) + return model + + +def build_peft_model(args, peft_config): + # Load base model, and apply the PEFT layer + model = AutoModelForCausalLM.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + device_map="auto", + ) + model = PeftModel.from_pretrained(model, args.peft_model_id, config=peft_config) + model = prepare_model_for_lora_finetuning(model, args.save_peft_tensors) + return model + + +def get_peft_tokenizer(args, peft_config): + # Get Tokenizer + hf_config = AutoConfig.from_pretrained( + peft_config.base_model_name_or_path, trust_remote_code=True + ) + hf_arch = getattr(hf_config, "architectures")[0] + if hf_arch == "LLaMAForCausalLM" or hf_arch == "LlamaForCausalLM": + tokenizer = LlamaTokenizer.from_pretrained( + peft_config.base_model_name_or_path, + use_fast=True, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + else: + tokenizer = AutoTokenizer.from_pretrained( + peft_config.base_model_name_or_path, + torch_dtype=torch.float32 if args.use_full_precision else torch.float16, + ) + if tokenizer.pad_token is None: + tokenizer.pad_token = "[PAD]" + tokenizer.padding_side = "left" + return tokenizer + + +def register_peft_hooks(model): + # Save hidden states and gradients + for name, layer in dict(model.named_modules()).items(): + layer.name = name + layer.fwd_step = 0 + layer.bwd_step = 0 + if verbose: + print(f"Adding hooks to layer {layer.name}") + layer.register_forward_hook(peft_forward_hook) + layer.register_full_backward_hook(peft_backward_hook) + + +def save_peft_weights(model, target_modules=[]): + # Save any weights of interest + for name, params in model.named_parameters(): + simplified_name = simplify_name(name) + for target_module in target_modules: + if target_module in name: + dst_folder = get_dst_folder("weights") + dst_filepath = os.path.join(dst_folder, f"{simplified_name}") + torch.save(params, dst_filepath) + + +def filter_dataset_for_debugging(data, key_to_filter, desired_value): + filtered_dataset_dict = DatasetDict() + for split, dataset in data.items(): + filtered_dataset = dataset.filter( + lambda example: example[key_to_filter] == desired_value + ) + filtered_dataset_dict[split] = filtered_dataset + data = filtered_dataset_dict + return data + + +def save_finetuned_model(model, args): + if len(args.output_dir) > 0: + if verbose: + print(f"Saving the model to {args.output_dir}...") + model.save_pretrained(args.output_dir) + + if len(args.publish_peft_with_id) > 0: + if verbose: + print( + f"Uploading the model to HF hub with id: {args.publish_peft_with_id}..." 
+ ) + model.push_to_hub(args.publish_peft_with_id, use_auth_token=True) diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py new file mode 100644 index 0000000000..266bb64137 --- /dev/null +++ b/tests/peft/peft_alignment_test.py @@ -0,0 +1,730 @@ +import numpy as np +import os, torch, argparse +from alignment.align_test_utils import * +from transformers import AutoConfig +from peft import PeftConfig +from tqdm import tqdm + +class AlignmentTest: + def __init__(self, model_name, tp_degree=1): + raise NotImplementedError() + def check_weights_alignment(self): + raise NotImplementedError() + def check_fwd_pass(self): + raise NotImplementedError() + def check_bwd_pass(self): + raise NotImplementedError() + def check_step(self, step_idx, learning_rate=0.001): + raise NotImplementedError() + +class LllamaAlignmentTest(AlignmentTest): + def __init__(self, model_name, tp_degree=1): + self.model_name = model_name + self.peft_config = PeftConfig.from_pretrained(model_name) + self.hf_config = AutoConfig.from_pretrained(self.peft_config.base_model_name_or_path) + self.num_layers = self.hf_config.num_hidden_layers + self.hidden_size = self.hf_config.hidden_size + self.intermediate_size = self.hf_config.intermediate_size + self.num_attention_heads = self.hf_config.num_attention_heads + self.num_key_value_heads = self.num_attention_heads + self.projsize = self.hidden_size // self.num_attention_heads + self.tp_degree = tp_degree + self.lora_scaling_factor = self.peft_config.lora_alpha / self.peft_config.r + + self.num_tokens = None + self.ff_batch_size = None + + + def check_weights_alignment(self): + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "lm_head.weight": + f_version = f"layers.{self.num_layers-1}.lm_head.weight_0" + elif hf_filename == "norm.weight": + f_version = f"layers.{self.num_layers-1}.norm.weight_0" + else: + f_version = "" + if hf_filename.startswith("layers."): + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version += f"layers.{layernum}." 
+ f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # compute weight index, then rename lora if needed if needed + weight_index="0" + if "lora_A" in f_version: + weight_index="A" + elif "lora_B" in f_version: + weight_index="B" + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + if f_version.endswith(".weight"): + if weight_index == "0": + f_version += f"_{weight_index}" + else: + f_version += f"_{weight_index}.original" + elif f_version.endswith(".gradient"): + prefix = f_version.split(".gradient")[0] + f_version = prefix + f".weight_{weight_index}.gradient" + return f_version + def get_tp_partition_dim(ff_weight_name) -> int: + # MLP layers split the intermediate size dimension + # gate_proj, up_proj: [hidden_size, intermediate_size] + # down_proj: [intermediate_size, hidden_size] + if self.tp_degree == 1: + return -1 + if "lora.weight_B" in ff_weight_name: + return -1 + if "lm_head" in ff_weight_name or "norm" in ff_weight_name: + return 1 + if "gate_proj" in ff_weight_name or "up_proj" in ff_weight_name: + return 1 + elif "down_proj" in ff_weight_name: + return 0 + else: + return -1 + print("-- Weights alignment --") + hf_weights_folder = os.path.join(hf_path, "weights", "step_0") + ff_weights_folder = os.path.join(ff_path, "weights", "step_0", "shard_0") + files_list = os.listdir(hf_weights_folder) + for hf_weight_name in tqdm(sorted(files_list)): + if hf_weight_name.endswith(".weight"): + ff_weight_name = convert_hf_filename_to_ff(hf_weight_name) + # print(hf_weight_name, ff_weight_name) + hf_w_path = os.path.join(hf_weights_folder, hf_weight_name) + ff_w_path = os.path.join(ff_weights_folder, ff_weight_name) + if not os.path.isfile(hf_w_path): + print(f"File '{hf_w_path}' not found") + if not os.path.isfile(ff_w_path): + print(f"File '{ff_w_path}' not found") + assert(os.path.isfile(hf_w_path)) + assert(os.path.isfile(ff_w_path)) + + # 1. get shape of hf weight + hf_weight = torch.load(hf_w_path, map_location='cpu') + hf_weigth_shape = hf_weight.shape + ff_partition_dim = get_tp_partition_dim(ff_weight_name) + ff_weigth_shape = list(hf_weigth_shape)[::-1] + if ff_partition_dim >= 0: + ff_weigth_shape[ff_partition_dim] //= self.tp_degree + + # 2. handle flexflow shards in case of tensor parallelism + ff_weights = [load_ff_tensor(ff_w_path.replace("shard_0", f"shard_{tp_idx}"), ff_weigth_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + if ff_partition_dim >= 0: + ff_weight = np.concatenate(ff_weights, axis=ff_partition_dim) + else: + assert(are_np_arrays_identical(ff_weights)) + ff_weight = ff_weights[0] + else: + ff_weight = ff_weights[0] + ff_weight = torch.from_numpy(ff_weight).to(hf_weight.dtype) + + # check equivalence + try: + torch.testing.assert_close(ff_weight, hf_weight.T) + except Exception as e: + print(f"Error comparing {ff_w_path} weight to {hf_w_path}:\n{e}\n") + raise e + + def check_fwd_pass(self, step_idx=0): + hf_fwd_folder = os.path.join(hf_path, "fwd", f"step_{step_idx}") + ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens": + f_version = f"layers.0.embed_tokens" + elif hf_filename == "lm_head" or hf_filename == "norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." 
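Editor's note: the weights-alignment loop above reverses the HF shape, divides the partitioned dimension by the TP degree, and re-concatenates the per-shard FlexFlow files before comparing against the transposed HF weight. A minimal sketch of that merge-and-compare step, assuming load_ff_tensor has already produced one numpy array per shard; the helper names below are illustrative.

import numpy as np
import torch

def merge_tp_shards(shards, partition_dim):
    # Replicated weights must be identical across shards; partitioned weights
    # are stitched back together along the split dimension.
    if partition_dim < 0:
        assert all(np.array_equal(shards[0], s) for s in shards)
        return shards[0]
    return np.concatenate(shards, axis=partition_dim)

def check_one_weight(hf_weight: torch.Tensor, shards, partition_dim):
    # FlexFlow stores weights transposed w.r.t. HuggingFace, so the HF tensor
    # is transposed before the comparison, as in the loop above.
    ff_weight = torch.from_numpy(merge_tp_shards(shards, partition_dim)).to(hf_weight.dtype)
    torch.testing.assert_close(ff_weight, hf_weight.T)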
+ f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + f_version = f_version.replace(".q_proj", "").replace(".k_proj", "").replace(".v_proj", "").replace(".o_proj", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. + f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_fwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + if hf_tensor_name == "embed_tokens": + self.num_tokens = hf_tensor.shape[1] + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + ff_tensor_path = os.path.join(ff_fwd_folder, ff_tensor_filename) + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + if "layers.0.embed_tokens.input_0" in ff_tensor_path: + # get number of tokens + ff_tensor = np.loadtxt(ff_tensor_path, delimiter=',') + self.ff_batch_size = ff_tensor.shape[0] + + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-2): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=1.3e-6, atol=tolerance) + if not np.allclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor.detach().numpy(), ff_tensor.detach().numpy(), atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + 
print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + print(ff_tensor.squeeze()) + raise e + + print(f"-- FWD pass {step_idx}--") + + # Embedding layer + hf_tensor_name = "embed_tokens" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Embedding output") + + # Transformers blocks + for i in range(self.num_layers): + # Input laye norm + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + if i == 0: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + else: + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} input") + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Input layernorm {i} output") + + # Attention + hf_tensor_name = f"layers.{i}.self_attn.o_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"Attention {i} output") + + # Post-attention layernorm + hf_tensor_name = f"layers.{i}.post_attention_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label=f"Post-attention layernorm {i} output") + + # W1 (gate_proj) + hf_tensor_name = f"layers.{i}.mlp.gate_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W1 
{i} output") + + # W3 (up_proj) + hf_tensor_name = f"layers.{i}.mlp.up_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W3 {i} output") + + # W2 (down_proj) + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_down_proj_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W2 {i} input") + + hf_down_proj_in = hf_tensor.clone() + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_down_proj_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + + # LoRA_A + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_A.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} input") + torch.testing.assert_close(hf_down_proj_in, hf_tensor, rtol=1.3e-6, atol=1e-5) + + # LoRA intermediate + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="low_rank_activation", hf_tensor_idx=0, ff_tensor_idx=None) + hf_lora_A_out = get_hf_tensor(hf_tensor_name, output_comparison) + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + hf_lora_B_in = get_hf_tensor(hf_tensor_name, input_comparison) + torch.testing.assert_close(hf_lora_A_out, hf_lora_B_in, rtol=1.3e-6, atol=1e-5) + ff_tensor_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora" + ff_lora_A_out = get_ff_tensor(ff_tensor_name, output_comparison, hf_lora_A_out.shape, tp_type=TPType.TO_REDUCE) + compare(hf_lora_A_out, ff_lora_A_out, label=f"LoRA_A {i} output") + + # LoRA_B + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) * self.lora_scaling_factor + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_down_proj_out.shape, tp_type=TPType.TO_REDUCE) + compare(hf_down_proj_out, ff_tensor, label=f"W2_out + scaling*LoRA_B_out {i}") + compare(hf_tensor, ff_tensor, additional_ff_tensor=ff_down_proj_out, label=f"LoRA_B {i} output") + + # Norm + hf_tensor_name = "norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = 
TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=1) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Norm output") + + # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input", ff_tensor_type="input", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="LM head input") + output_comparison = TensorComparisonIdxs(hf_tensor_type="output", ff_tensor_type="output", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head output") + + def check_bwd_pass(self, step_idx=0): + if not self.num_tokens or not self.ff_batch_size: + raise ValueError("Number of tokens and batch size must be set before running backward pass check") + hf_bwd_folder = os.path.join(hf_path, "bwd", f"step_{step_idx}") + ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") + + def convert_hf_filename_to_ff(hf_filename): + if hf_filename == "embed_tokens": + f_version = f"layers.0.embed_tokens" + elif hf_filename == "lm_head" or hf_filename == "norm": + f_version = f"layers.{self.num_layers-1}.{hf_filename}" + else: + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # right now, attention in flexflow is done with a single operator, so there is a single output file without the projection suffix + # f_version = f_version.replace(".q_proj", "").replace(".k_proj", "").replace(".v_proj", "").replace(".o_proj", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. 
+ f_version = f_version.replace("lora_A", "lora").replace("lora_B", "lora") + return f_version + + def get_hf_tensor(hf_tensor_name, tensor_comparison_idx): + hf_tensor_filename = f"{hf_tensor_name}.{tensor_comparison_idx.hf_tensor_type}_{tensor_comparison_idx.hf_tensor_idx}" + hf_tensor_path = os.path.join(hf_bwd_folder, hf_tensor_filename) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + return hf_tensor + + def get_ff_tensor(ff_tensor_name, tensor_comparison_idx, hf_shape, tp_type=TPType.REPLICATE, pre=False, shard_axis=0): + ff_tensor_suffix = f".{tensor_comparison_idx.ff_tensor_type}" if len(tensor_comparison_idx.ff_tensor_type) > 0 else "" + ff_tensor_idx_suffix = f"_{tensor_comparison_idx.ff_tensor_idx}" if tensor_comparison_idx.ff_tensor_idx is not None else "" + ff_tensor_filename = f"{ff_tensor_name}{ff_tensor_suffix}{ff_tensor_idx_suffix}" + + ff_tensor_path = os.path.join(ff_bwd_folder, ff_tensor_filename) + if pre: + ff_tensor_path = ff_tensor_path.replace(f"step_{step_idx}", f"step_{step_idx}_pre") + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[shard_axis] //= self.tp_degree + + # exception: intermediate attention tensors + intermediate_attention_tensor = ( + "self_attn" in ff_tensor_name and + not ( + ff_tensor_name.endswith(".self_attn") and + ( + tensor_comparison_idx.ff_tensor_type == "output_gradient" or + tensor_comparison_idx.ff_tensor_type == "input_gradient" + ) + ) + ) + if not intermediate_attention_tensor: + ff_shape = replace_value(ff_shape, self.num_tokens, self.ff_batch_size) + + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=shard_axis) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=shard_axis) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + if not intermediate_attention_tensor: + ff_tensor = truncate_dimension(ff_tensor, self.ff_batch_size, self.num_tokens) + return ff_tensor + + def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance=1e-3): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + if additional_ff_tensor is not None: + additional_ff_tensor = additional_ff_tensor.to(hf_tensor.dtype) + ff_tensor = ff_tensor - additional_ff_tensor + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=rtol, atol=tolerance) + if not np.allclose(hf_tensor.numpy(), ff_tensor.numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .06 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + print(ff_tensor.squeeze()) + raise e + + print(f"-- BWD pass {step_idx}--") + 
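Editor's note: get_ff_tensor above reassembles per-shard activation/gradient dumps in one of three ways depending on how the operator was parallelized. A minimal sketch of those three modes, with an illustrative TPType enum standing in for the one imported from align_test_utils.

from enum import Enum
import numpy as np

class TPType(Enum):
    REPLICATE = 0
    PARTITION = 1
    TO_REDUCE = 2

def combine_shards(shards, tp_type, shard_axis=0):
    if tp_type == TPType.REPLICATE:
        # Every shard holds the full tensor; they must all agree.
        assert all(np.allclose(shards[0], s) for s in shards)
        return shards[0]
    elif tp_type == TPType.PARTITION:
        # Each shard holds a slice; stitch them back along the split axis.
        return np.concatenate(shards, axis=shard_axis)
    else:  # TPType.TO_REDUCE
        # Each shard holds a partial result; sum elementwise across shards.
        return np.sum(shards, axis=0)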
+ # LM head + hf_tensor_name = "lm_head" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label="LM head gradient output") + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label="LM head gradient input") + + # Norm + hf_tensor_name = "norm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label="Norm gradient output") + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape) + compare(hf_tensor, ff_tensor, label="Norm gradient input") + + # Transformers blocks + for i in range(self.num_layers-1, -1, -1): + # W2 (down_proj) output + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient output") + + # LoRA_B + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_B.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) * self.lora_scaling_factor + compare(hf_tensor, ff_tensor, label=f"LoRA_B {i} gradient output") + + # LoRA_A + hf_tensor_name = f"layers.{i}.mlp.down_proj.lora_A.default" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"LoRA_A {i} gradient input") + + # W2 (down_proj) input + hf_tensor_name = f"layers.{i}.mlp.down_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", 
hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W2 {i} gradient input") + + # W2 input (HF) and SigmoidSiluMulti output (FF) + hf_w2_input = hf_tensor.clone() + ff_tensor_name = f"layers.{i}.SigmoidSiluMulti" + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_w2_input, ff_tensor, label=f"HF W2 {i} input and FF SSM output") + + # W1 (gate_proj) output + hf_tensor_name = f"layers.{i}.mlp.gate_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W1 {i} gradient output") + # W1 (gate_proj) input + # HF W1 in = FF W1 in - FF W1 in (pre) + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + ff_tensor_pre = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE, pre=True) + compare(hf_tensor, ff_tensor, additional_ff_tensor=ff_tensor_pre, label=f"W1 {i} gradient input") + + # W3 (up_proj) output + hf_tensor_name = f"layers.{i}.mlp.up_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient output") + # W3 (up_proj) input + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"W3 {i} gradient input") + + # Attn O-proj + hf_tensor_name = f"layers.{i}.self_attn.o_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, output_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient output") + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.o_proj" + input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor =
get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.PARTITION) + compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient input") + + # V-proj grads + # FF shape: [num_tokens, qProjSize*num_heads] + hf_tensor_name = f"layers.{i}.self_attn.v_proj" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + mixed_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, mixed_comparison) + hf_tensor = hf_tensor.squeeze().T + ff_tensor = get_ff_tensor(ff_tensor_name, mixed_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, shard_axis=1) + compare(hf_tensor, ff_tensor, label=f"V-proj {i} gradient input") + + # K-proj grads + # FF shape: (num_tokens, qProjSize, num_heads) + hf_tensor_name = f"layers.{i}.self_attn.k_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + k_proj_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="devkproj", hf_tensor_idx=0, ff_tensor_idx=None) + hf_tensor = get_hf_tensor(hf_tensor_name, k_proj_comparison) + hf_tensor = hf_tensor.squeeze().view(self.num_tokens, self.num_attention_heads, self.projsize).transpose(1, 2).contiguous() + hf_tensor = hf_tensor.T + ff_tensor = get_ff_tensor(ff_tensor_name, k_proj_comparison, hf_tensor.shape, tp_type=TPType.PARTITION, shard_axis=2) + compare(hf_tensor, ff_tensor, label=f"K-proj {i} gradient input") + + # Q-proj grads + # FF shape (devQKVPRojArray): (num_tokens, qProjSize, num_heads, 3) + # Q-proj out grad: devQKVPRojArray[:,:,:,0] + hf_tensor_name = f"layers.{i}.self_attn.q_proj" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.devQKVPRojArray" + q_proj_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="", hf_tensor_idx=0, ff_tensor_idx=None) + hf_tensor = get_hf_tensor(hf_tensor_name, q_proj_comparison) + hf_tensor = hf_tensor.view(self.num_tokens, self.num_attention_heads, self.projsize).transpose(1, 2).contiguous().T + augmented_hf_tensor_shape = torch.Size([3]+list(hf_tensor.size())) + ff_tensor = get_ff_tensor(ff_tensor_name, q_proj_comparison, augmented_hf_tensor_shape, tp_type=TPType.PARTITION, shard_axis=2)[:,:,:,0] + compare(hf_tensor, ff_tensor, label=f"Q-proj {i} gradient input") + + # FF Attn input with HF layernorm out + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" + input_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + ff_tensor = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + compare(hf_tensor, ff_tensor, label=f"Attn input {i} gradient input") + + if i > 0: + # FF attn input with FF layernorm out 1 + attn_input = ff_tensor.clone() + ff_tensor_name = f"layers.{i}.layers.{i}.input_layernorm" + _output_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=1) + input_layernorm_out1 = get_ff_tensor(ff_tensor_name, _output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + torch.testing.assert_close(attn_input, input_layernorm_out1, rtol=1.3e-6, atol=1e-5) + + # Input layernorm + + hf_tensor_name = f"layers.{i}.input_layernorm" + ff_tensor_name = convert_hf_filename_to_ff(hf_tensor_name) + input_comparison = 
TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0) + ff_in1_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=1) + input_layernorm0 = get_ff_tensor(ff_tensor_name, input_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + input_layernorm1 = get_ff_tensor(ff_tensor_name, ff_in1_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) + torch.testing.assert_close(input_layernorm0, input_layernorm1, rtol=1.3e-6, atol=1e-5) + hf_tensor = get_hf_tensor(hf_tensor_name, input_comparison) + # if i > 1: + # compare(hf_tensor, input_layernorm1, label=f"Input layernorm {i} gradient input") + + def check_step(self, step_idx=0, learning_rate=0.001): + hf_weight_folder = os.path.join(hf_path, "weights", f"step_{step_idx}") + ff_weight_folder = os.path.join(ff_path, "weights", f"step_{step_idx}", "shard_0") + def convert_hf_filename_to_ff(hf_filename): + assert hf_filename.startswith("layers.") + layernum = hf_filename.split("layers.")[1].split(".")[0] + f_version = f"layers.{layernum}." + f_version += hf_filename.replace(".base_layer", "").replace(".default", "") + # lora in HuggingFace is split into A and B operators, in FF we use a single operator. + f_version = f_version.replace("lora_A", "lora.weight_A").replace("lora_B", "lora.weight_B") + return f_version + def get_hf_tensor(hf_tensor_name): + hf_tensor_path = os.path.join(hf_weight_folder, hf_tensor_name) + + if not os.path.isfile(hf_tensor_path): + raise FileNotFoundError(f"File '{hf_tensor_path}' not found") + hf_tensor = torch.load(hf_tensor_path, map_location='cpu') + return hf_tensor + def get_ff_tensor(ff_tensor_name, hf_shape, tp_type=TPType.REPLICATE, pre=False): + ff_tensor_path = os.path.join(ff_weight_folder, ff_tensor_name) + if pre: + ff_tensor_path = ff_tensor_path.replace(f"step_{step_idx}", f"step_{step_idx}_pre") + if not os.path.isfile(ff_tensor_path): + raise FileNotFoundError(f"File '{ff_tensor_path}' not found") + + ff_shape = list(hf_shape)[::-1] + if tp_type == TPType.PARTITION: + ff_shape[0] //= self.tp_degree + + ff_tensors = [load_ff_tensor(ff_tensor_path.replace("shard_0", f"shard_{tp_idx}"), ff_shape) for tp_idx in range(self.tp_degree)] + if self.tp_degree > 1: + # if replicate, check that they are identical + if tp_type == TPType.REPLICATE: + assert(are_np_arrays_identical(ff_tensors)) + ff_tensor = ff_tensors[0] + # if partition, concatenate along the partition dimension + elif tp_type == TPType.PARTITION: + ff_tensor = np.concatenate(ff_tensors, axis=0) + # if to_reduce, sum along the partition dimension + elif tp_type == TPType.TO_REDUCE: + ff_tensor = np.sum(ff_tensors, axis=0) + else: + ff_tensor = ff_tensors[0] + ff_tensor = torch.from_numpy(ff_tensor) + return ff_tensor + def compare(hf_tensor, ff_tensor, label="", tolerance=1e-4): + ff_tensor = ff_tensor.to(hf_tensor.dtype) + hf_tensor = hf_tensor.T + try: + # torch.testing.assert_close(hf_tensor, ff_tensor, rtol=rtol, atol=tolerance) + if not np.allclose(hf_tensor.numpy(), ff_tensor.numpy(), atol=tolerance): + mismatches = np.where(~np.isclose(hf_tensor, ff_tensor, atol=tolerance))[0] + print(f"Pct mismatch {label}: {100.0*(np.prod(mismatches.shape) / ff_tensor.numel()):.3f}%") + assert(np.prod(mismatches.shape) <= .05 * ff_tensor.numel()) + except Exception as e: + print(f"Error in comparison {label}:\n{e}\n") + print("HF tensor:") + print(hf_tensor.squeeze()) + print("FF tensor:") + 
print(ff_tensor.squeeze()) + raise e + print(f"-- optimizer pass {step_idx}--") + + for i in range(self.num_layers-1, -1, -1): + # LoRA_B gradient + hf_gradient_name = f"layers.{i}.mlp.down_proj.lora_B.default.gradient" + hf_gradient = get_hf_tensor(hf_gradient_name) + hf_original_weight_name = f"layers.{i}.mlp.down_proj.lora_B.default.weight_original" + hf_original_weight = get_hf_tensor(hf_original_weight_name) + hf_finetuned_weight_name = f"layers.{i}.mlp.down_proj.lora_B.default.weight_finetuned" + hf_finetuned_weight = get_hf_tensor(hf_finetuned_weight_name) + torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.REPLICATE) + compare(hf_gradient, ff_gradient, label=f"LoRA_B {i} gradient") + # ff_out_gradient_name = f"layers.{i}.layers.{i}.mlp.down_proj.lora.output_gradient_0" + # ff_fwd_folder = os.path.join(ff_path, "fwd", f"step_{step_idx}", "shard_0") + # ff_bwd_folder = os.path.join(ff_path, "bwd", f"step_{step_idx}", "shard_0") + # ff_out_gradient = load_ff_tensor(os.path.join(ff_bwd_folder, ff_out_gradient_name), [self.hidden_size, 128])[:,:self.num_tokens] + # ff_out_gradient = torch.from_numpy(ff_out_gradient) + # print("Output gradient shape: ", ff_out_gradient.shape) + # ff_low_rank_activation = f"layers.{i}.layers.{i}.mlp.down_proj.lora.low_rank_activation" + # ff_low_rank_activation = load_ff_tensor(os.path.join(ff_fwd_folder, ff_low_rank_activation), [16, 128])[:,:self.num_tokens] + # ff_low_rank_activation = torch.from_numpy(ff_low_rank_activation) + # print("Low rank activation shape: ", ff_low_rank_activation.shape) + # simulated_weight_grad = ff_low_rank_activation @ ff_out_gradient.T + # print("Simulated weight grad shape: ", simulated_weight_grad.shape) + # print(simulated_weight_grad) + # print(ff_gradient) + # compare(hf_gradient, simulated_weight_grad, label=f"LoRA_B {i} simulated gradient") + + + # LoRA_A gradient + hf_gradient_name = f"layers.{i}.mlp.down_proj.lora_A.default.gradient" + hf_gradient = get_hf_tensor(hf_gradient_name) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + hf_original_weight_name = f"layers.{i}.mlp.down_proj.lora_A.default.weight_original" + hf_original_weight = get_hf_tensor(hf_original_weight_name) + hf_finetuned_weight_name = f"layers.{i}.mlp.down_proj.lora_A.default.weight_finetuned" + hf_finetuned_weight = get_hf_tensor(hf_finetuned_weight_name) + torch.testing.assert_close(hf_gradient, (hf_original_weight-hf_finetuned_weight)/learning_rate, rtol=1.3e-6, atol=1e-5) + ff_gradient_name = convert_hf_filename_to_ff(hf_gradient_name) + ff_gradient = get_ff_tensor(ff_gradient_name, hf_gradient.shape, tp_type=TPType.PARTITION) + compare(hf_gradient, ff_gradient, label=f"LoRA_A {i} gradient") + +parser = argparse.ArgumentParser(description='Argument Parser Example') +# Adding arguments +parser.add_argument('-m', '--model-name', type=str, default="goliaro/llama-160m-lora", help='Name of the model') +parser.add_argument('-n', '--num-steps', type=int, default=1, help='Number of finetuning steps') +parser.add_argument('-tp', '--tensor-parallelism-degree', type=int, default=1, help='The tensor parallelism degree used when running FlexFlow') +parser.add_argument('-lr', '--learning-rate', type=float, default=0.001, help='The learning rate used at finetuning time') + +# Parse the arguments from command line +args = 
parser.parse_args() + +if __name__ == "__main__": + llama_alignment = LllamaAlignmentTest(args.model_name, tp_degree=args.tensor_parallelism_degree) + # llama_alignment.check_weights_alignment() + for i in range(args.num_steps): + llama_alignment.check_fwd_pass(i) + llama_alignment.check_bwd_pass(i) + llama_alignment.check_step(i, args.learning_rate) diff --git a/tests/peft_test.sh b/tests/peft_test.sh new file mode 100755 index 0000000000..5600d57edf --- /dev/null +++ b/tests/peft_test.sh @@ -0,0 +1,66 @@ +#! /usr/bin/env bash +# set -x +set -e + +cleanup() { + rm -rf ~/.cache/flexflow/debug +} + +# Cd into directory holding this script +cd "${BASH_SOURCE[0]%/*}/.." + +# Token to access private huggingface models (e.g. LLAMA-2) +HUGGINGFACE_TOKEN=${HUGGINGFACE_TOKEN:-none} +if [[ "$HUGGINGFACE_TOKEN" != "none" ]]; then + huggingface-cli login --token "$HUGGINGFACE_TOKEN" +fi + +# Clean up before test (just in case) +cleanup + +# Create test prompt file +mkdir -p ./inference/prompt +echo '["Two things are infinite: "]' > ./inference/prompt/peft.json +echo '["“Two things are infinite: the universe and human stupidity; and I'\''m not sure about the universe.”"]' > ./inference/prompt/peft_dataset.json + + +# Create output folder +mkdir -p ./inference/output + +# Enable backtrace in case we run into a segfault or assertion failure +export LEGION_BACKTRACE=1 + +# Download test model +python ./inference/utils/download_peft_model.py goliaro/llama-160m-lora --base_model_name JackFram/llama-160m + +# Run PEFT in Huggingface to get ground truth tensors +python ./tests/peft/hf_finetune.py --peft-model-id goliaro/llama-160m-lora --save-peft-tensors --use-full-precision + +# Python test +echo "Python test" +python ./inference/python/ff_peft.py +# Check alignment +python ./tests/peft/peft_alignment_test.py -tp 2 + +# C++ test +echo "C++ test" +./build/inference/peft/peft \ + -ll:gpu 2 -ll:cpu 4 -ll:util 4 \ + -tensor-parallelism-degree 2 \ + -ll:fsize 8192 -ll:zsize 12000 \ + -llm-model JackFram/llama-160m \ + -finetuning-dataset ./inference/prompt/peft_dataset.json \ + -peft-model goliaro/llama-160m-lora \ + -enable-peft \ + --use-full-precision \ + --inference-debugging +# Check alignment +python ./tests/peft/peft_alignment_test.py -tp 2 + +# Print success message +echo "" +echo "PEFT tests passed!" +echo "" + +# Cleanup after the test +cleanup
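For reference, the optimizer-step checks in check_step reconstruct each LoRA gradient as (weight_original - weight_finetuned) / learning_rate, which holds exactly only for a plain SGD update with no momentum or weight decay. A minimal standalone sketch of that identity, using hypothetical toy tensor shapes rather than the dumped LoRA weights:

    import torch

    lr = 0.001
    # hypothetical LoRA_A-sized weight and gradient (float64 keeps rounding error negligible)
    weight = torch.randn(16, 768, dtype=torch.float64)
    grad = torch.randn_like(weight)
    finetuned = weight - lr * grad  # one vanilla SGD step
    # the same reconstruction used by check_step recovers the gradient
    torch.testing.assert_close(grad, (weight - finetuned) / lr, rtol=1.3e-6, atol=1e-5)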