From 11484c3ebad9f868d0179a46de3d1330d9011822 Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Thu, 16 Nov 2023 06:36:21 +0900 Subject: [PATCH] add dockerfile for llm env setup (#2229) --- docker/Dockerfile.compile | 11 +- docker/Dockerfile.prebuilt | 2 +- examples/cpu/inference/python/llm/Dockerfile | 64 +++++++ examples/cpu/inference/python/llm/README.md | 151 +++++++--------- .../python/llm/tools/env_activate.sh | 26 +++ .../inference/python/llm/tools/env_setup.sh | 143 +++++++++++++++ scripts/compile_bundle.sh | 171 ++++++++++++++---- 7 files changed, 443 insertions(+), 125 deletions(-) create mode 100644 examples/cpu/inference/python/llm/Dockerfile create mode 100644 examples/cpu/inference/python/llm/tools/env_activate.sh create mode 100644 examples/cpu/inference/python/llm/tools/env_setup.sh diff --git a/docker/Dockerfile.compile b/docker/Dockerfile.compile index 279c5461d..30e5acc5a 100644 --- a/docker/Dockerfile.compile +++ b/docker/Dockerfile.compile @@ -17,10 +17,17 @@ RUN apt update && \ vim \ ccache \ numactl \ + gcc-12 \ + g++-12 \ make \ libjpeg-dev \ libpng-dev \ - && rm -rf /var/lib/apt/lists/* + && rm -rf /var/lib/apt/lists/* && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 && \ + update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \ + update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 + RUN useradd -m ubuntu RUN echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers USER ubuntu @@ -31,7 +38,7 @@ RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Minicon rm miniconda.sh && \ echo "source ~/miniconda3/bin/activate" >> ~/.bashrc -RUN curl -fsSL -v -o compile_bundle.sh -O https://github.com/intel/intel-extension-for-pytorch/raw/v2.1.0%2Bcpu/scripts/compile_bundle.sh && \ +RUN curl -fsSL -v -o compile_bundle.sh -O https://github.com/intel/intel-extension-for-pytorch/raw/v2.1.100%2Bcpu/scripts/compile_bundle.sh && \ . 
~/miniconda3/bin/activate && \ conda create -y -n py310 python=3.10 && \ conda activate py310 && \ diff --git a/docker/Dockerfile.prebuilt b/docker/Dockerfile.prebuilt index 8a6b22a75..cf4e64ee8 100644 --- a/docker/Dockerfile.prebuilt +++ b/docker/Dockerfile.prebuilt @@ -27,7 +27,7 @@ RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \ # Some TF tools expect a "python" binary RUN ln -s $(which ${PYTHON}) /usr/local/bin/python -ARG IPEX_VERSION=2.1.0 +ARG IPEX_VERSION=2.1.100 ARG PYTORCH_VERSION=2.1.0 ARG TORCHAUDIO_VERSION=2.1.0 ARG TORCHVISION_VERSION=0.16.0 diff --git a/examples/cpu/inference/python/llm/Dockerfile b/examples/cpu/inference/python/llm/Dockerfile new file mode 100644 index 000000000..b11ef7680 --- /dev/null +++ b/examples/cpu/inference/python/llm/Dockerfile @@ -0,0 +1,64 @@ +# NOTE: To build this you will need a docker version >= 19.03 and DOCKER_BUILDKIT=1 +# +# If you do not use buildkit you are not going to have a good time +# +# For reference: +# https://docs.docker.com/develop/develop-images/build_enhancements/ + +ARG BASE_IMAGE=ubuntu:22.04 +FROM ${BASE_IMAGE} AS base +RUN apt update && \ + apt full-upgrade -y && \ + DEBIAN_FRONTEND=noninteractive apt install -y \ + sudo \ + numactl \ + wget \ + vim \ + git \ + gcc-12 \ + g++-12 \ + make \ + curl && \ + rm -rf /var/lib/apt/lists/* && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 && \ + update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \ + update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 +RUN useradd -m ubuntu && \ + echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers +WORKDIR /home/ubuntu + +FROM base AS dev +COPY . /home/ubuntu/llm +RUN chown -R ubuntu:ubuntu /home/ubuntu/llm && \ + rm /home/ubuntu/llm/Dockerfile + +USER ubuntu + +RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash miniconda.sh -b -p ~/miniconda3 && \ + rm miniconda.sh && \ + echo "source ~/miniconda3/bin/activate" >> ~/.bashrc + +RUN . ~/miniconda3/bin/activate && \ + conda create -y -n compile_py310 python=3.10 && \ + conda activate compile_py310 && \ + cd llm && \ + bash tools/env_setup.sh 2 && \ + conda deactivate && \ + conda remove -y -n compile_py310 --all && \ + conda create -y -n py310 python=3.10 && \ + conda activate py310 && \ + bash tools/env_setup.sh 1 && \ + echo "conda activate py310" >> ~/.bashrc + +FROM base AS deploy +USER ubuntu +COPY --from=dev /home/ubuntu/miniconda3 /home/ubuntu/miniconda3 +COPY --from=dev /home/ubuntu/llm /home/ubuntu/llm +COPY --from=dev /home/ubuntu/.bashrc /home/ubuntu/.bashrc +RUN sudo chown -R ubuntu:ubuntu ~/miniconda3 ~/llm ~/.bashrc && \ + sudo mv ~/llm/oneCCL/build/_install /opt/oneCCL && \ + sudo chown -R root:root /opt/oneCCL && \ + rm -rf ~/llm/oneCCL && \ + sed -i "s|ONECCL_PATH=.*|ONECCL_PATH=/opt/oneCCL|" ~/llm/tools/env_activate.sh diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md index 477448262..9ba7c57cc 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/inference/python/llm/README.md @@ -1,103 +1,80 @@ # Text Generation + We provide the inference benchmarking scripts for large language models text generation.
Supported large language model families include GPT-J, LLaMA, GPT-NeoX, OPT, Falcon, and CodeGen.
The scripts include both single instance and distributed (DeepSpeed) use cases.
The scripts cover model generation inference with low precision cases for different models with the best performance and accuracy (bf16 AMP, static quantization and weight only quantization).
-# Setup
+# Supported Model List
+
+| MODEL FAMILY | Verified < MODEL ID > (Huggingface hub)| FP32/BF16 | Weight only quantization INT8 | Weight only quantization INT4| Static quantization INT8 |
+|---|:---:|:---:|:---:|:---:|:---:|
+|LLAMA| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", "meta-llama/Llama-2-70b-hf" | ✅ | ✅ | ✅ | ✅ |
+|GPT-J| "EleutherAI/gpt-j-6b" | ✅ | ✅ | ✅ | ✅ |
+|GPT-NEOX| "EleutherAI/gpt-neox-20b" | ✅ | ✅ | ✅ | ❎ \*\* |
+|FALCON\*|"tiiuae/falcon-40b" | ✅ | ✅ | ✅ | ❎ \*\*|
+|OPT|"facebook/opt-30b", "facebook/opt-1.3b"| ✅ | ✅ | ✅ | ❎ \*\*|
+|CodeGen|"Salesforce/codegen-2B-multi"| ✅ | ✅ | ✅ | ❎ \*\*|
+
+\* For Falcon models from the remote hub, we need to modify the config.json to use the modeling_falcon.py in transformers. Therefore, in the following scripts, we need to pass an extra configuration file like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but needed for quantizations.
+
+\*\* For GPT-NEOX/FALCON/OPT/CodeGen models, the accuracy recipes of static quantization INT8 are not ready, thus they will be skipped in our coverage.
+
+*Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from the LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, we are working to cover those optimizations, which will expand the model list above.
+
+# Environment Setup
+
+1. Get the Intel® Extension for PyTorch\* source code
+
```bash
-WORK_DIR=$PWD
-# GCC 12.3 is required, please set it firstly
-# Create environment (conda recommended)
-conda create -n llm python=3.9 -y
-# install deps
-conda install cmake ninja mkl mkl-include -y
-conda install gperftools -c conda-forge -y
-
-# Install PyTorch 2.1 release
-python -m pip install torch==2.1 --index-url https://download.pytorch.org/whl/cpu
-
-# Install IPEX 2.1 release
-python -m pip install intel_extension_for_pytorch
-
-# Used for accuracy test only
-git clone https://github.com/EleutherAI/lm-evaluation-harness
-cd lm-evaluation-harness
-pip install -e .
-
-# Install transformers
-pip install transformers==4.31.0
-# Install others deps
-pip install cpuid accelerate datasets sentencepiece protobuf==3.20.3
-
-# Setup environment variables for performance on Xeon
-export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so.6
-export KMP_BLOCKTIME=INF
-export KMP_TPAUSE=0
-export KMP_SETTINGS=1
-export KMP_FORJOIN_BARRIER_PATTERN=dist,dist
-export KMP_PLAIN_BARRIER_PATTERN=dist,dist
-export KMP_REDUCTION_BARRIER_PATTERN=dist,dist
-export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so # Intel OpenMP
-# Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support.
-export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so - -# [Optional] install neural-compressor for GPT-J static quantization and running GPTQ (see below) -pip install neural-compressor==2.3.1 - -# [Optional] The following is only for DeepSpeed case -#Install oneccl-bind-pt(also named torch-ccl) -git clone https://github.com/intel/torch-ccl.git -cd torch-ccl && git checkout v2.1.0+cpu -git submodule sync && git submodule update --init --recursive -python setup.py install -cd ../ -#Install DeepSpeed -git clone https://github.com/delock/DeepSpeedSYCLSupport -cd DeepSpeedSYCLSupport -git checkout gma/run-opt-branch -python -m pip install -r requirements/requirements.txt -python setup.py install -cd ../ -#Install OneCCL -git clone https://github.com/oneapi-src/oneCCL.git -cd oneCCL -mkdir build -cd build -cmake .. -make -j install -source _install/env/setvars.sh -cd ../.. +git clone https://github.com/intel/intel-extension-for-pytorch.git +cd intel-extension-for-pytorch +git checkout v2.1.100+cpu +cd examples/cpu/inference/python/llm +``` -# Get the sample prompt.json -# Make sure the downloaded prompt.json file is under the same directory as that of the python scripts mentioned above. -wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json +2.a. It is highly recommended to build a Docker container from the provided `Dockerfile`. +```bash +# Build an image with the provided Dockerfile +docker build -t ipex-llm:2.1.100 . + +# Run the container with command below +docker run --rm -it --privileged ipex-llm:2.1.100 bash + +# When the command prompt shows inside the docker container, enter llm examples directory +cd llm ``` -# Supported Model List +2.b. Alternatively, you can take advantage of a provided environment configuration script to setup an environment without using a docker container. -| MODEL FAMILY | Verified < MODEL ID > (Huggingface hub)| FP32/BF16 | Weight only quantzation INT8 | Weight only quantization INT4| Static quantization INT8 | -|---|:---:|:---:|:---:|:---:|:---:| -|LLAMA| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", "meta-llama/Llama-2-70b-hf" | ✅ | ✅ | ✅ | ✅ | -|GPT-J| "EleutherAI/gpt-j-6b" | ✅ | ✅ | ✅ | ✅ | -|GPT-NEOX| "EleutherAI/gpt-neox-20b" | ✅ | ✅ | ✅ | ❎ ** | -|FALCON*|"tiiuae/falcon-40b" | ✅ | ✅ | ✅ | ❎ **| -|OPT|"facebook/opt-30b", "facebook/opt-1.3b"| ✅ | ✅ | ✅ | ❎ **| -|CodeGen|"Salesforce/codegen-2B-multi"| ✅ | ✅ | ✅ | ❎ **| +```bash +# GCC 12.3 is required. Installation can be taken care of by the environment configuration script. +# Create a conda environment +conda create -n llm python=3.9 -y +conda activate llm -*For Falcon models from remote hub, we need to modify the config.json to use the modeling_falcon.py in transformers. Therefore, in the following scripts, we need to pass an extra configuration file like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but needed for quantizations. +# Setup the environment with the provided script +bash ./tools/env_setup.sh +``` -** For GPT-NEOX/FALCON/OPT/CodeGen models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage. +3. Once an environment is configured with either method above, set necessary environment variables with an environment variables activation script and download the sample `prompt.json`. 
-*Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, we are working in progress to cover those optimizations, which will expand the model list above. +```bash +# Activate environment variables +source ./tools/env_activate.sh + +# Get the sample prompt.json +wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json + +``` # Run Models Generations -| Benchmark mode | FP32/BF16 | Weight only quantzation INT8 | Weight only quantization INT4 | Static quantization INT8 | +| Benchmark mode | FP32/BF16 | Weight only quantzation INT8 | Weight only quantization INT4 | Static quantization INT8 | |---|:---:|:---:|:---:|:---:| -|Single instance | ✅ | ✅ | ✅ | ✅ | -| Distributed (autotp) | ✅ | ✅ | ❎ | ❎ | +|Single instance | ✅ | ✅ | ✅ | ✅ | +| Distributed (autotp) | ✅ | ✅ | ❎ | ❎ | You can run LLM with a one-click Python script "run.py" for all inference cases. ``` @@ -107,7 +84,7 @@ python run.py --help # for more detailed usages ### Single Instance Performance ```bash # Get prompt file to the path of scripts -mv PATH/TO/prompt.json ./single_instance +cp prompt.json ./single_instance export WORK_DIR=./ # bf16 benchmark @@ -134,7 +111,7 @@ Notes: ### Distributed Performance with DeepSpeed (autoTP) ```bash # Get prompt file to the path of scripts -mv PATH/TO/prompt.json ./distributed +cp prompt.json ./distributed export WORK_DIR=./ unset KMP_AFFINITY @@ -158,13 +135,13 @@ Notes: # Get prompt file to the path of scripts export WORK_DIR=./ cd single_instance -mv PATH/TO/prompt.json ./ +cp PATH/TO/prompt.json ./ # bfloat16 benchmark OMP_NUM_THREADS= numactl -m -C python run_generation.py --benchmark -m --dtype bfloat16 --ipex --deployment-mode # quantization benchmark #To run quantization performance, you need to firstly get the quantized model with the following step (1) and then run the performance benchmark with the following step (2) -## (1) Do quantization to get the quantized model +## (1) Do quantization to get the quantized model ## note: llama/gptj we have both IPEX smooth quant and weight-only-quantization, while for rest models, we recommend weight-only-quantization mkdir saved_results @@ -180,7 +157,7 @@ python run_gpt-neox_quantization.py --ipex-weight-only-quantization --output-dir ## Falcon quantization (example of config-file: utils/model_config/tiiuae_falcon-40b_config.json) python run_falcon_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m --config-file ## OPT quantization -python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m +python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m ## CodeGen quantization python run_codegen_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m @@ -289,7 +266,7 @@ deepspeed --bind_cores_to_rank run_generation_with_deepspeed.py --benchmark -m E ## Distributed Accuracy with DeepSpeed (autoTP) ```bash -# Run distributed accuracy with 2 ranks of one node for bfloat16 with ipex and jit +# Run distributed accuracy with 2 ranks of one node for bfloat16 with ipex and jit source ${ONECCL_DIR}/build/_install/env/setvars.sh export 
LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so:${CONDA_PREFIX}/lib/libtcmalloc.so diff --git a/examples/cpu/inference/python/llm/tools/env_activate.sh b/examples/cpu/inference/python/llm/tools/env_activate.sh new file mode 100644 index 000000000..e37c9967e --- /dev/null +++ b/examples/cpu/inference/python/llm/tools/env_activate.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Setup environment variables for performance on Xeon +export KMP_BLOCKTIME=INF +export KMP_TPAUSE=0 +export KMP_SETTINGS=1 +export KMP_FORJOIN_BARRIER_PATTERN=dist,dist +export KMP_PLAIN_BARRIER_PATTERN=dist,dist +export KMP_REDUCTION_BARRIER_PATTERN=dist,dist + +env | grep CONDA_PREFIX > /dev/null +if [ $? -eq 0 ]; then + export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so.6 + export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so # Intel OpenMP + # Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support. + export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so +else + echo "Conda environment is not available. You need to set environment variable LD_PRELOAD to dynamic libraries of Intel OpenMP and TcMalloc manually." +fi + +ONECCL_PATH=./oneCCL/build/_install +if [ ! -d ${ONECCL_PATH} ]; then + echo "oneCCL is not available." +else + source ${ONECCL_PATH}/env/setvars.sh +fi diff --git a/examples/cpu/inference/python/llm/tools/env_setup.sh b/examples/cpu/inference/python/llm/tools/env_setup.sh new file mode 100644 index 000000000..492c47d69 --- /dev/null +++ b/examples/cpu/inference/python/llm/tools/env_setup.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash +set -e + +LM_EVA_COMMIT=cc9778fbe4fa1a709be2abed9deb6180fd40e7e2 +# gma/run-opt-branch +DS_SYCL_COMMIT=57ff508ea592ff752fd323b383c32177d5bce7b5 +ONECCL_COMMIT=bfc879266e870b732bd165e399897419c44ad13d +VER_TORCH=2.1.0+cpu +VER_IPEX=2.1.100 +VER_TORCHCCL=2.1.0+cpu +VER_GCC=12.3.0 +AUX_INSTALL_SCRIPT=aux_install.sh + +# Mode: Select to compile projects into wheel files or install wheel files compiled. +# High bit: 8 7 6 5 4 3 2 1 :Low bit +# | | | | | | | └- Install wheel files +# | | | | | | └--- Compile wheel files +# | | | | | └----- Undefined +# | | | | └------- Undefined +# | | | └--------- Undefined +# | | └----------- Undefined +# | └------------- Undefined +# └--------------- Undefined +MODE=0x03 +if [ $# -gt 0 ]; then + if [[ ! $1 =~ ^[0-9]+$ ]] && [[ ! $1 =~ ^0x[0-9a-fA-F]+$ ]]; then + echo "Warning: Unexpected argument. Using default value." + else + MODE=$1 + fi +fi +if [ ! -f ${AUX_INSTALL_SCRIPT} ] || + [ ! -d lm-evaluation-harness ] || + [ ! -d DeepSpeedSYCLSupport ]; then + (( MODE |= 0x02 )) +fi + +if [ $((${MODE} & 0x02)) -ne 0 ]; then + # Check existance of required Linux commands + for CMD in conda gcc g++ make git; do + command -v ${CMD} > /dev/null || (echo "Error: Command \"${CMD}\" is required."; exit 1;) + done + echo "#!/usr/bin/env bash" > ${AUX_INSTALL_SCRIPT} + + function ver_compare() { + VER_MAJOR_CUR=$(echo $1 | cut -d "." -f 1) + VER_MINOR_CUR=$(echo $1 | cut -d "." -f 2) + VER_PATCH_CUR=$(echo $1 | cut -d "." -f 3) + VER_MAJOR_REQ=$(echo $2 | cut -d "." -f 1) + VER_MINOR_REQ=$(echo $2 | cut -d "." -f 2) + VER_PATCH_REQ=$(echo $2 | cut -d "." 
-f 3) + RET=0 + if [[ ${VER_MAJOR_CUR} -lt ${VER_MAJOR_REQ} ]]; then + RET=1 + else + if [[ ${VER_MAJOR_CUR} -eq ${VER_MAJOR_REQ} ]] && + [[ ${VER_MINOR_CUR} -lt ${VER_MINOR_REQ} ]]; then + RET=2 + else + if [[ ${VER_MAJOR_CUR} -eq ${VER_MAJOR_REQ} ]] && + [[ ${VER_MINOR_CUR} -eq ${VER_MINOR_REQ} ]] && + [[ ${VER_PATCH_CUR} -lt ${VER_PATCH_REQ} ]]; then + RET=3 + fi + fi + fi + echo ${RET} + } + VER_COMP=$(ver_compare $(gcc -dumpfullversion) ${VER_GCC}) + if [ ${VER_COMP} -ne 0 ]; then + echo -e '\a' + echo "Warning: GCC version equal to or newer than ${VER_GCC} is required." + echo " Found GCC version $(gcc -dumpfullversion)" + echo " Installing gcc and g++ 12.3 with conda" + echo "" + conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge + conda update -y sysroot_linux-64 + export CC=${CONDA_PREFIX}/bin/gcc + export CXX=${CONDA_PREFIX}/bin/g++ + export PATH=${CONDA_PREFIX}/bin:${PATH} + export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH} + fi + + # Install deps + conda install -y cmake ninja mkl mkl-include + + # Install PyTorch and Intel® Extension for PyTorch* + python -m pip install torch==${VER_TORCH} --index-url https://download.pytorch.org/whl/cpu + python -m pip install intel-extension-for-pytorch==${VER_IPEX} + echo "python -m pip install torch==${VER_TORCH} --index-url https://download.pytorch.org/whl/cpu" >> ${AUX_INSTALL_SCRIPT} + echo "python -m pip install intel-extension-for-pytorch==${VER_IPEX}" >> ${AUX_INSTALL_SCRIPT} + + # Used for accuracy test only + if [ -d lm-evaluation-harness ]; then + rm -rf lm-evaluation-harness + fi + git clone https://github.com/EleutherAI/lm-evaluation-harness + cd lm-evaluation-harness + git checkout ${LM_EVA_COMMIT} + python setup.py bdist_wheel + cd .. + + # The following is only for DeepSpeed case + #Install oneccl-bind-pt(also named torch-ccl) + python -m pip install oneccl-bind-pt==${VER_IPEX} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ + echo "python -m pip install oneccl-bind-pt==${VER_IPEX} --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/" >> ${AUX_INSTALL_SCRIPT} + + # Install DeepSpeed + if [ -d DeepSpeedSYCLSupport ]; then + rm -rf DeepSpeedSYCLSupport + fi + git clone https://github.com/delock/DeepSpeedSYCLSupport + cd DeepSpeedSYCLSupport + git checkout ${DS_SYCL_COMMIT} + python -m pip install -r requirements/requirements.txt + python setup.py bdist_wheel + cd .. + + # Install OneCCL + if [ -d oneCCL ]; then + rm -rf oneCCL + fi + git clone https://github.com/oneapi-src/oneCCL.git + cd oneCCL + git checkout ${ONECCL_COMMIT} + mkdir build + cd build + cmake .. + make -j install + cd ../.. 
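+    # At this point the compile phase (MODE bit 0x02) has produced wheel files under
+    # lm-evaluation-harness/dist/ and DeepSpeedSYCLSupport/dist/, plus the oneCCL install
+    # tree at oneCCL/build/_install. The install phase (MODE bit 0x01) below installs the
+    # wheels, and tools/env_activate.sh (or the Dockerfile deploy stage, which relocates
+    # the tree to /opt/oneCCL) sources the oneCCL setvars.sh from that install tree.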
+fi +if [ $((${MODE} & 0x01)) -ne 0 ]; then + conda install -y mkl + conda install -y gperftools -c conda-forge + bash ${AUX_INSTALL_SCRIPT} + python -m pip install cpuid accelerate datasets sentencepiece protobuf==3.20.3 transformers==4.31.0 neural-compressor==2.3.1 + python -m pip install lm-evaluation-harness/dist/*.whl + python -m pip install DeepSpeedSYCLSupport/dist/*.whl + + rm ${AUX_INSTALL_SCRIPT} + rm -rf lm-evaluation-harness + rm -rf DeepSpeedSYCLSupport +fi diff --git a/scripts/compile_bundle.sh b/scripts/compile_bundle.sh index 87237e979..cdb20f023 100644 --- a/scripts/compile_bundle.sh +++ b/scripts/compile_bundle.sh @@ -1,15 +1,74 @@ #!/bin/bash -set -x -set -e +set -eo pipefail -VER_LLVM="llvmorg-16.0.6" -VER_IPEX="v2.1.0+cpu" +VER_TORCH=2.1.0 +VER_TORCHVISION=0.16.0 +VER_TORCHAUDIO=2.1.0 +VER_LLVM=llvmorg-16.0.6 +VER_IPEX=v2.1.100+cpu +VER_GCC=12.3.0 + +# Mode: Select which components to install. PyTorch and Intel® Extension for PyTorch* are always installed. +# High bit: 8 7 6 5 4 3 2 1 :Low bit +# | | | | | | | └- TorchAudio +# | | | | | | └--- TorchVision +# | | | | | └----- Rebuild LLVM +# | | | | └------- Undefined +# | | | └--------- Undefined +# | | └----------- Undefined +# | └------------- Undefined +# └--------------- Undefined +MODE=0x03 +if [ $# -gt 0 ]; then + if [[ ! $1 =~ ^[0-9]+$ ]] && [[ ! $1 =~ ^0x[0-9a-fA-F]+$ ]]; then + echo "Warning: Unexpected argument. Using default value." + else + MODE=$1 + fi +fi # Check existance of required Linux commands -for CMD in conda git nproc make; do - command -v ${CMD} || (echo "Error: Command \"${CMD}\" not found." ; exit 4) +for CMD in conda git nproc gcc g++ make; do + command -v ${CMD} > /dev/null || (echo "Error: Command \"${CMD}\" not found." ; exit 1) done +function ver_compare() { + VER_MAJOR_CUR=$(echo $1 | cut -d "." -f 1) + VER_MINOR_CUR=$(echo $1 | cut -d "." -f 2) + VER_PATCH_CUR=$(echo $1 | cut -d "." -f 3) + VER_MAJOR_REQ=$(echo $2 | cut -d "." -f 1) + VER_MINOR_REQ=$(echo $2 | cut -d "." -f 2) + VER_PATCH_REQ=$(echo $2 | cut -d "." -f 3) + RET=0 + if [[ ${VER_MAJOR_CUR} -lt ${VER_MAJOR_REQ} ]]; then + RET=1 + else + if [[ ${VER_MAJOR_CUR} -eq ${VER_MAJOR_REQ} ]] && + [[ ${VER_MINOR_CUR} -lt ${VER_MINOR_REQ} ]]; then + RET=2 + else + if [[ ${VER_MAJOR_CUR} -eq ${VER_MAJOR_REQ} ]] && + [[ ${VER_MINOR_CUR} -eq ${VER_MINOR_REQ} ]] && + [[ ${VER_PATCH_CUR} -lt ${VER_PATCH_REQ} ]]; then + RET=3 + fi + fi + fi + echo ${RET} +} +VER_COMP=$(ver_compare $(gcc -dumpfullversion) ${VER_GCC}) +GCC_CONDA=0 +if [ ${VER_COMP} -ne 0 ]; then + echo -e '\a' + echo "Warning: GCC version equal to or newer than ${VER_GCC} is required." + echo " Found GCC version $(gcc -dumpfullversion)" + echo " Installing gcc and g++ 12.3 with conda" + echo "" + GCC_CONDA=1 + conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge + conda update -y sysroot_linux-64 +fi + MAX_JOBS_VAR=$(nproc) if [ ! -z "${MAX_JOBS}" ]; then MAX_JOBS_VAR=${MAX_JOBS} @@ -28,14 +87,22 @@ fi # Checkout required branch/commit and update submodules cd llvm-project -if [ ! -z ${VER_LLVM} ]; then +if [ ! -z "${VER_LLVM}" ]; then + git stash > /dev/null + git clean -fd > /dev/null + git checkout main > /dev/null + git pull > /dev/null git checkout ${VER_LLVM} fi git submodule sync git submodule update --init --recursive cd .. cd intel-extension-for-pytorch -if [ ! -z ${VER_IPEX} ]; then +if [ ! 
-z "${VER_IPEX}" ]; then + git stash > /dev/null + git clean -fd > /dev/null + git checkout main > /dev/null + git pull > /dev/null git checkout ${VER_IPEX} fi git submodule sync @@ -43,38 +110,49 @@ git submodule update --init --recursive cd .. # Install dependencies -conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge -conda update -y sysroot_linux-64 python -m pip install cmake -python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +python -m pip uninstall -y torch torchvision torchaudio intel-extension-for-pytorch +python -m pip install torch==${VER_TORCH} --index-url https://download.pytorch.org/whl/cpu +if [ $((${MODE} & 0x02)) -ne 0 ]; then + python -m pip install torchvision==${VER_TORCHVISION} --index-url https://download.pytorch.org/whl/cpu +fi +if [ $((${MODE} & 0x01)) -ne 0 ]; then + python -m pip install torchaudio==${VER_TORCHAUDIO} --index-url https://download.pytorch.org/whl/cpu +fi ABI=$(python -c "import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))") # Compile individual component -export CC=${CONDA_PREFIX}/bin/gcc -export CXX=${CONDA_PREFIX}/bin/g++ -export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH} +if [[ ${GCC_CONDA} -eq 1 ]]; then + export CC=${CONDA_PREFIX}/bin/gcc + export CXX=${CONDA_PREFIX}/bin/g++ + export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH} +fi # LLVM -cd llvm-project -LLVM_ROOT="$(pwd)/release" -if [ -d ${LLVM_ROOT} ]; then - rm -rf ${LLVM_ROOT} +LLVM_ROOT="$(pwd)/llvm-release" +if [ $((${MODE} & 0x04)) -ne 0 ]; then + if [ -d ${LLVM_ROOT} ]; then + rm -rf ${LLVM_ROOT} + fi fi +cd llvm-project if [ -d build ]; then rm -rf build fi -mkdir build -cd build -echo "***************************** cmake *****************************" > ../build.log -cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=${ABI}" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_BENCHMARKS=OFF ../llvm 2>&1 | tee -a ../build.log -echo "***************************** build *****************************" >> ../build.log -cmake --build . -j ${MAX_JOBS_VAR} 2>&1 | tee -a ../build.log -echo "**************************** install ****************************" >> ../build.log -cmake -DCMAKE_INSTALL_PREFIX=${LLVM_ROOT} -P cmake_install.cmake 2>&1 | tee -a ../build.log -#xargs rm -rf < install_manifest.txt -cd .. -rm -rf build -ln -s ${LLVM_ROOT}/bin/llvm-config ${LLVM_ROOT}/bin/llvm-config-13 +if [ ! -d ${LLVM_ROOT} ]; then + mkdir build + cd build + echo "***************************** cmake *****************************" > ../build.log + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=${ABI}" -DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_BENCHMARKS=OFF ../llvm 2>&1 | tee -a ../build.log + echo "***************************** build *****************************" >> ../build.log + cmake --build . -j ${MAX_JOBS_VAR} 2>&1 | tee -a ../build.log + echo "**************************** install ****************************" >> ../build.log + cmake -DCMAKE_INSTALL_PREFIX=${LLVM_ROOT} -P cmake_install.cmake 2>&1 | tee -a ../build.log + #xargs rm -rf < install_manifest.txt + cd .. + rm -rf build + ln -s ${LLVM_ROOT}/bin/llvm-config ${LLVM_ROOT}/bin/llvm-config-13 +fi export PATH=${LLVM_ROOT}/bin:$PATH export LD_LIBRARY_PATH=${LLVM_ROOT}/lib:$LD_LIBRARY_PATH cd .. 
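+# Note: an existing ${LLVM_ROOT} is reused on subsequent runs; set MODE bit 0x04 to delete it and rebuild LLVM from scratch.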
@@ -90,11 +168,34 @@ python setup.py bdist_wheel 2>&1 | tee build.log export CXXFLAGS=${CXXFLAGS_BK} unset DNNL_GRAPH_BUILD_COMPILER_BACKEND unset LLVM_DIR -python -m pip install --force-reinstall dist/*.whl +python -m pip uninstall -y mkl-static mkl-include +python -m pip install dist/*.whl cd .. # Sanity Test -set +x -export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so -echo "Note: Should you experience \"version \`GLIBCXX_N.N.NN' not found\" error, run command \"export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so\" and try again." -python -c "import torch; import torchvision; import torchaudio; import intel_extension_for_pytorch as ipex; print(f'torch_cxx11_abi: {torch._C._GLIBCXX_USE_CXX11_ABI}'); print(f'torch_version: {torch.__version__}'); print(f'torchvision_version: {torchvision.__version__}'); print(f'torchaudio_version: {torchaudio.__version__}'); print(f'ipex_version: {ipex.__version__}');" +if [ ! -z ${CONDA_PREFIX} ]; then + LIBSTDCPP_SYS=$(find /usr -regextype sed -regex ".*libstdc++\.so\.[[:digit:]]*\.[[:digit:]]*\.[[:digit:]]*") + LIBSTDCPP_CONDA=$(find ${CONDA_PREFIX}/lib -regextype sed -regex ".*libstdc++\.so\.[[:digit:]]*\.[[:digit:]]*\.[[:digit:]]*") + LIBSTDCPP_VER_SYS=$(echo ${LIBSTDCPP_SYS} | sed "s/.*libstdc++.so.//") + LIBSTDCPP_VER_CONDA=$(echo ${LIBSTDCPP_CONDA} | sed "s/.*libstdc++.so.//") + VER_COMP=$(ver_compare ${LIBSTDCPP_VER_CONDA} ${LIBSTDCPP_VER_SYS}) + LIBSTDCPP_ACTIVE="" + if [[ ${VER_COMP} -gt 0 ]]; then + LIBSTDCPP_ACTIVE=${LIBSTDCPP_SYS} + else + LIBSTDCPP_ACTIVE=${LIBSTDCPP_CONDA} + fi + export LD_PRELOAD=${LIBSTDCPP_ACTIVE} + echo "======================================================" + echo "Note: Set environment variable \"export LD_PRELOAD=${LIBSTDCPP_ACTIVE}\" to avoid the \"version \`GLIBCXX_N.N.NN' not found\" error." + echo "======================================================" +fi +CMD="import torch; print(f'torch_cxx11_abi: {torch._C._GLIBCXX_USE_CXX11_ABI}'); print(f'torch_version: {torch.__version__}');" +if [ $((${MODE} & 0x02)) -ne 0 ]; then + CMD="${CMD} import torchvision; print(f'torchvision_version: {torchvision.__version__}');" +fi +if [ $((${MODE} & 0x01)) -ne 0 ]; then + CMD="${CMD} import torchaudio; print(f'torchaudio_version: {torchaudio.__version__}');" +fi +CMD="${CMD} import intel_extension_for_pytorch as ipex; print(f'ipex_version: {ipex.__version__}');" +python -c "${CMD}"
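For reference, a minimal usage sketch of the reworked `compile_bundle.sh` modes. The values follow the bit map defined at the top of the script (bit 0x01 TorchAudio, bit 0x02 TorchVision, bit 0x04 rebuild LLVM); the invocation context shown here is illustrative only.

```bash
# Default MODE=0x03: install PyTorch, build Intel® Extension for PyTorch*, and also install TorchVision and TorchAudio.
bash compile_bundle.sh

# MODE=0x07 additionally removes any existing llvm-release directory so LLVM is rebuilt from scratch.
bash compile_bundle.sh 0x07

# MODE=0x00 skips TorchVision and TorchAudio; LLVM is still built once if llvm-release does not exist yet.
bash compile_bundle.sh 0x00
```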