add dockerfile for llm env setup (#2229)
jingxu10 authored Nov 15, 2023
1 parent dae1ec8 commit 11484c3
Showing 7 changed files with 443 additions and 125 deletions.
11 changes: 9 additions & 2 deletions docker/Dockerfile.compile
@@ -17,10 +17,17 @@ RUN apt update && \
vim \
ccache \
numactl \
gcc-12 \
g++-12 \
make \
libjpeg-dev \
libpng-dev \
&& rm -rf /var/lib/apt/lists/*
&& rm -rf /var/lib/apt/lists/* && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 && \
update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \
update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100

RUN useradd -m ubuntu
RUN echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
USER ubuntu
@@ -31,7 +38,7 @@ RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Minicon
rm miniconda.sh && \
echo "source ~/miniconda3/bin/activate" >> ~/.bashrc

RUN curl -fsSL -v -o compile_bundle.sh -O https://github.com/intel/intel-extension-for-pytorch/raw/v2.1.0%2Bcpu/scripts/compile_bundle.sh && \
RUN curl -fsSL -v -o compile_bundle.sh -O https://github.com/intel/intel-extension-for-pytorch/raw/v2.1.100%2Bcpu/scripts/compile_bundle.sh && \
. ~/miniconda3/bin/activate && \
conda create -y -n py310 python=3.10 && \
conda activate py310 && \
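The `update-alternatives` calls above register GCC 12 as the default `gcc`/`g++`/`cc`/`c++` inside the image. A quick way to confirm the selection (a minimal sketch; the image tag below is an assumption):

```bash
# Build the compile image and check which compiler the alternatives point to
docker build -f docker/Dockerfile.compile -t ipex-compile:latest .
docker run --rm ipex-compile:latest gcc --version                    # expected to report a 12.x release
docker run --rm ipex-compile:latest update-alternatives --display g++
```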
2 changes: 1 addition & 1 deletion docker/Dockerfile.prebuilt
@@ -27,7 +27,7 @@ RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \
# Some TF tools expect a "python" binary
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python

ARG IPEX_VERSION=2.1.0
ARG IPEX_VERSION=2.1.100
ARG PYTORCH_VERSION=2.1.0
ARG TORCHAUDIO_VERSION=2.1.0
ARG TORCHVISION_VERSION=0.16.0
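The version bump pairs Intel® Extension for PyTorch\* 2.1.100 with the PyTorch 2.1.0 / torchaudio 2.1.0 / torchvision 0.16.0 stack. A rough manual equivalent of what the prebuilt image installs (a sketch only; the wheel indexes the Dockerfile actually uses may differ):

```bash
# Install CPU wheels matching the ARG versions above
python -m pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 \
    --index-url https://download.pytorch.org/whl/cpu
python -m pip install intel_extension_for_pytorch==2.1.100
```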
64 changes: 64 additions & 0 deletions examples/cpu/inference/python/llm/Dockerfile
@@ -0,0 +1,64 @@
# NOTE: To build this you will need a docker version >= 19.03 and DOCKER_BUILDKIT=1
#
# If you do not use buildkit you are not going to have a good time
#
# For reference:
# https://docs.docker.com/develop/develop-images/build_enhancements/

ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE} AS base
RUN apt update && \
apt full-upgrade -y && \
DEBIAN_FRONTEND=noninteractive apt install -y \
sudo \
numactl \
wget \
vim \
git \
gcc-12 \
g++-12 \
make \
curl && \
rm -rf /var/lib/apt/lists/* && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 && \
update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \
update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100
RUN useradd -m ubuntu && \
echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
WORKDIR /home/ubuntu

FROM base AS dev
COPY . /home/ubuntu/llm
RUN chown -R ubuntu:ubuntu /home/ubuntu/llm && \
rm /home/ubuntu/llm/Dockerfile

USER ubuntu

RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash miniconda.sh -b -p ~/miniconda3 && \
rm miniconda.sh && \
echo "source ~/miniconda3/bin/activate" >> ~/.bashrc

RUN . ~/miniconda3/bin/activate && \
conda create -y -n compile_py310 python=3.10 && \
conda activate compile_py310 && \
cd llm && \
bash tools/env_setup.sh 2 && \
conda deactivate && \
conda remove -y -n compile_py310 --all && \
conda create -y -n py310 python=3.10 && \
conda activate py310 && \
bash tools/env_setup.sh 1 && \
echo "conda activate py310" >> ~/.bashrc

FROM base AS deploy
USER ubuntu
COPY --from=dev /home/ubuntu/miniconda3 /home/ubuntu/miniconda3
COPY --from=dev /home/ubuntu/llm /home/ubuntu/llm
COPY --from=dev /home/ubuntu/.bashrc /home/ubuntu/.bashrc
RUN sudo chown -R ubuntu:ubuntu ~/miniconda3 ~/llm ~/.bashrc && \
sudo mv ~/llm/oneCCL/build/_install /opt/oneCCL && \
sudo chown -R root:root /opt/oneCCL && \
rm -rf ~/llm/oneCCL && \
sed -i "s|ONECCL_PATH=.*|ONECCL_PATH=/opt/oneCCL|" ~/llm/tools/env_activate.sh
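The new Dockerfile is a three-stage build: `base` installs the system packages and the GCC 12 toolchain, `dev` builds the conda environment via `tools/env_setup.sh`, and `deploy` copies only the resulting conda environment, the llm examples, and the oneCCL install into a slimmer runtime image. A build sketch (BuildKit is required per the header comment; the image tags are assumptions):

```bash
cd examples/cpu/inference/python/llm
# The default build produces the final (deploy) stage
DOCKER_BUILDKIT=1 docker build -t ipex-llm:2.1.100 .
# Optionally keep the full development stage as a separate image
DOCKER_BUILDKIT=1 docker build --target dev -t ipex-llm:2.1.100-dev .
```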
151 changes: 64 additions & 87 deletions examples/cpu/inference/python/llm/README.md
@@ -1,103 +1,80 @@
# Text Generation

We provide inference benchmarking scripts for text generation with large language models.<br/>
Supported large language model families include GPT-J, LLaMA, GPT-NeoX, OPT, Falcon, and CodeGen.<br/>
The scripts include both single-instance and distributed (DeepSpeed) use cases.<br/>
The scripts cover model generation inference in low-precision modes for different models with the best performance and accuracy (bf16 AMP, static quantization, and weight-only quantization).<br/>

# Setup
# Supported Model List

| MODEL FAMILY | Verified < MODEL ID > (Huggingface hub) | FP32/BF16 | Weight only quantization INT8 | Weight only quantization INT4 | Static quantization INT8 |
|---|:---:|:---:|:---:|:---:|:---:|
|LLAMA| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", "meta-llama/Llama-2-70b-hf" |||||
|GPT-J| "EleutherAI/gpt-j-6b" |||||
|GPT-NEOX| "EleutherAI/gpt-neox-20b" ||||\*\* |
|FALCON\*|"tiiuae/falcon-40b" ||||\*\*|
|OPT|"facebook/opt-30b", "facebook/opt-1.3b"||||\*\*|
|CodeGen|"Salesforce/codegen-2B-multi"||||\*\*|

\* For Falcon models from the remote hub, we need to modify config.json to use the modeling_falcon.py in transformers. Therefore, in the following scripts, an extra configuration file needs to be passed, like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but required for quantization.

\*\* For GPT-NEOX/FALCON/OPT/CodeGen models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage.

*Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from the LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, work is in progress to cover those optimizations, which will expand the model list above.

# Environment Setup

1. Get the Intel® Extension for PyTorch\* source code

```bash
WORK_DIR=$PWD
# GCC 12.3 is required; please set it up first
# Create environment (conda recommended)
conda create -n llm python=3.9 -y
# install deps
conda install cmake ninja mkl mkl-include -y
conda install gperftools -c conda-forge -y

# Install PyTorch 2.1 release
python -m pip install torch==2.1 --index-url https://download.pytorch.org/whl/cpu

# Install IPEX 2.1 release
python -m pip install intel_extension_for_pytorch

# Used for accuracy test only
git clone https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .

# Install transformers
pip install transformers==4.31.0
# Install other deps
pip install cpuid accelerate datasets sentencepiece protobuf==3.20.3

# Setup environment variables for performance on Xeon
export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so.6
export KMP_BLOCKTIME=INF
export KMP_TPAUSE=0
export KMP_SETTINGS=1
export KMP_FORJOIN_BARRIER_PATTERN=dist,dist
export KMP_PLAIN_BARRIER_PATTERN=dist,dist
export KMP_REDUCTION_BARRIER_PATTERN=dist,dist
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so # Intel OpenMP
# Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support.
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so

# [Optional] install neural-compressor for GPT-J static quantization and running GPTQ (see below)
pip install neural-compressor==2.3.1

# [Optional] The following is only for DeepSpeed case
# Install oneccl-bind-pt (also known as torch-ccl)
git clone https://github.com/intel/torch-ccl.git
cd torch-ccl && git checkout v2.1.0+cpu
git submodule sync && git submodule update --init --recursive
python setup.py install
cd ../
# Install DeepSpeed
git clone https://github.com/delock/DeepSpeedSYCLSupport
cd DeepSpeedSYCLSupport
git checkout gma/run-opt-branch
python -m pip install -r requirements/requirements.txt
python setup.py install
cd ../
# Install oneCCL
git clone https://github.com/oneapi-src/oneCCL.git
cd oneCCL
mkdir build
cd build
cmake ..
make -j install
source _install/env/setvars.sh
cd ../..
git clone https://github.com/intel/intel-extension-for-pytorch.git
cd intel-extension-for-pytorch
git checkout v2.1.100+cpu
cd examples/cpu/inference/python/llm
```

# Get the sample prompt.json
# Make sure the downloaded prompt.json file is in the same directory as the Python scripts mentioned above.
wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json
2.a. It is highly recommended to build a Docker container from the provided `Dockerfile`.

```bash
# Build an image with the provided Dockerfile
docker build -t ipex-llm:2.1.100 .

# Run the container with the command below
docker run --rm -it --privileged ipex-llm:2.1.100 bash

# Once the command prompt shows up inside the docker container, enter the llm examples directory
cd llm
```
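If you plan to pull models from the Hugging Face hub inside the container, mounting the host cache avoids re-downloading weights on every run (optional; the paths below assume the default Hugging Face cache location and the `ubuntu` user created in the Dockerfile):

```bash
docker run --rm -it --privileged \
    -v $HOME/.cache/huggingface:/home/ubuntu/.cache/huggingface \
    ipex-llm:2.1.100 bash
```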

# Supported Model List
2.b. Alternatively, you can use the provided environment configuration script to set up an environment without using a Docker container.

| MODEL FAMILY | Verified < MODEL ID > (Huggingface hub) | FP32/BF16 | Weight only quantization INT8 | Weight only quantization INT4 | Static quantization INT8 |
|---|:---:|:---:|:---:|:---:|:---:|
|LLAMA| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", "meta-llama/Llama-2-70b-hf" |||||
|GPT-J| "EleutherAI/gpt-j-6b" |||||
|GPT-NEOX| "EleutherAI/gpt-neox-20b" ||||** |
|FALCON*|"tiiuae/falcon-40b" ||||**|
|OPT|"facebook/opt-30b", "facebook/opt-1.3b"||||**|
|CodeGen|"Salesforce/codegen-2B-multi"||||**|
```bash
# GCC 12.3 is required. Installation can be taken care of by the environment configuration script.
# Create a conda environment
conda create -n llm python=3.9 -y
conda activate llm

*For Falcon models from the remote hub, we need to modify config.json to use the modeling_falcon.py in transformers. Therefore, in the following scripts, an extra configuration file needs to be passed, like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but required for quantization.
# Setup the environment with the provided script
bash ./tools/env_setup.sh
```

** For GPT-NEOX/FALCON/OPT/CodeGen models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage.
3. Once an environment is configured with either method above, set the necessary environment variables with the environment activation script and download the sample `prompt.json`.

*Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from the LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, work is in progress to cover those optimizations, which will expand the model list above.
```bash
# Activate environment variables
source ./tools/env_activate.sh

# Get the sample prompt.json
wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json

```
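As a quick sanity check that the environment is usable (a sketch; the exact versions printed depend on which setup path you took):

```bash
python -c "import torch, intel_extension_for_pytorch as ipex; print(torch.__version__, ipex.__version__)"
```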

# Run Models Generations

| Benchmark mode | FP32/BF16 | Weight only quantization INT8 | Weight only quantization INT4 | Static quantization INT8 |
|---|:---:|:---:|:---:|:---:|
|Single instance |||||
| Distributed (autotp) |||||

You can run LLM inference with the one-click Python script "run.py" for all inference cases.
```
@@ -107,7 +84,7 @@ python run.py --help # for more detailed usages
### Single Instance Performance
```bash
# Get prompt file to the path of scripts
mv PATH/TO/prompt.json ./single_instance
cp prompt.json ./single_instance
export WORK_DIR=./

# bf16 benchmark
@@ -134,7 +111,7 @@ Notes:
### Distributed Performance with DeepSpeed (autoTP)
```bash
# Get prompt file to the path of scripts
mv PATH/TO/prompt.json ./distributed
cp prompt.json ./distributed
export WORK_DIR=./
unset KMP_AFFINITY

@@ -158,13 +135,13 @@ Notes:
# Get prompt file to the path of scripts
export WORK_DIR=./
cd single_instance
mv PATH/TO/prompt.json ./
cp PATH/TO/prompt.json ./
# bfloat16 benchmark
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run_generation.py --benchmark -m <MODEL_ID> --dtype bfloat16 --ipex --deployment-mode
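# For illustration, the same command with the placeholders filled in
# (the core count, NUMA node, and model ID below are assumptions; adjust them to your machine)
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_generation.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --deployment-mode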

# quantization benchmark
# To run quantization performance benchmarks, first get the quantized model with step (1) below, and then run the performance benchmark with step (2)
## (1) Do quantization to get the quantized model
## note: for llama/gptj we have both IPEX smooth quant and weight-only quantization, while for the rest of the models we recommend weight-only quantization
mkdir saved_results

@@ -180,7 +157,7 @@ python run_gpt-neox_quantization.py --ipex-weight-only-quantization --output-dir
## Falcon quantization (example of config-file: utils/model_config/tiiuae_falcon-40b_config.json)
python run_falcon_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <FALCON MODEL_ID> --config-file <CONFIG_FILE>
## OPT quantization
python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <OPT MODEL_ID>
## CodeGen quantization
python run_codegen_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <CODEGEN MODEL_ID>
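## Example: the OPT command above with its placeholder filled in
## (illustrative only; "facebook/opt-1.3b" is taken from the supported model table)
python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m facebook/opt-1.3b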

@@ -289,7 +266,7 @@ deepspeed --bind_cores_to_rank run_generation_with_deepspeed.py --benchmark -m E
## Distributed Accuracy with DeepSpeed (autoTP)
```bash
# Run distributed accuracy with 2 ranks of one node for bfloat16 with ipex and jit
source ${ONECCL_DIR}/build/_install/env/setvars.sh
export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so:${CONDA_PREFIX}/lib/libtcmalloc.so
26 changes: 26 additions & 0 deletions examples/cpu/inference/python/llm/tools/env_activate.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# Setup environment variables for performance on Xeon
export KMP_BLOCKTIME=INF
export KMP_TPAUSE=0
export KMP_SETTINGS=1
export KMP_FORJOIN_BARRIER_PATTERN=dist,dist
export KMP_PLAIN_BARRIER_PATTERN=dist,dist
export KMP_REDUCTION_BARRIER_PATTERN=dist,dist

env | grep CONDA_PREFIX > /dev/null
if [ $? -eq 0 ]; then
export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so.6
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so # Intel OpenMP
# Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support.
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
else
echo "Conda environment is not available. You need to set environment variable LD_PRELOAD to dynamic libraries of Intel OpenMP and TcMalloc manually."
fi

ONECCL_PATH=./oneCCL/build/_install
if [ ! -d ${ONECCL_PATH} ]; then
echo "oneCCL is not available."
else
source ${ONECCL_PATH}/env/setvars.sh
fi
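The script is meant to be sourced so the exports land in the current shell. A minimal usage sketch from the llm example directory:

```bash
# Load the performance-related environment variables
source ./tools/env_activate.sh
# Verify which libraries ended up in LD_PRELOAD (illustrative check)
echo $LD_PRELOAD
```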