add dockerfile for llm env setup (#2229)
jingxu10 authored Nov 15, 2023
1 parent dae1ec8 commit 11484c3
Showing 7 changed files with 443 additions and 125 deletions.
11 changes: 9 additions & 2 deletions docker/Dockerfile.compile
@@ -17,10 +17,17 @@ RUN apt update && \
vim \
ccache \
numactl \
gcc-12 \
g++-12 \
make \
libjpeg-dev \
libpng-dev \
&& rm -rf /var/lib/apt/lists/*
&& rm -rf /var/lib/apt/lists/* && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 && \
update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \
update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100

RUN useradd -m ubuntu
RUN echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
USER ubuntu
@@ -31,7 +38,7 @@ RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Minicon
rm miniconda.sh && \
echo "source ~/miniconda3/bin/activate" >> ~/.bashrc

RUN curl -fsSL -v -o compile_bundle.sh -O https://github.com/intel/intel-extension-for-pytorch/raw/v2.1.0%2Bcpu/scripts/compile_bundle.sh && \
RUN curl -fsSL -v -o compile_bundle.sh -O https://github.com/intel/intel-extension-for-pytorch/raw/v2.1.100%2Bcpu/scripts/compile_bundle.sh && \
. ~/miniconda3/bin/activate && \
conda create -y -n py310 python=3.10 && \
conda activate py310 && \
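The `update-alternatives` calls above register GCC 12 as the default `gcc`/`g++`/`cc`/`c++` inside the image. A quick way to confirm the selection (a minimal sketch; the image tag below is an assumption):

```bash
# Build the compile image and check which compiler the alternatives point to
docker build -f docker/Dockerfile.compile -t ipex-compile:latest .
docker run --rm ipex-compile:latest gcc --version                    # expected to report a 12.x release
docker run --rm ipex-compile:latest update-alternatives --display g++
```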
2 changes: 1 addition & 1 deletion docker/Dockerfile.prebuilt
@@ -27,7 +27,7 @@ RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \
# Some TF tools expect a "python" binary
RUN ln -s $(which ${PYTHON}) /usr/local/bin/python

ARG IPEX_VERSION=2.1.0
ARG IPEX_VERSION=2.1.100
ARG PYTORCH_VERSION=2.1.0
ARG TORCHAUDIO_VERSION=2.1.0
ARG TORCHVISION_VERSION=0.16.0
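The version bump pairs Intel® Extension for PyTorch\* 2.1.100 with the PyTorch 2.1.0 / torchaudio 2.1.0 / torchvision 0.16.0 stack. A rough manual equivalent of what the prebuilt image installs (a sketch only; the wheel indexes the Dockerfile actually uses may differ):

```bash
# Install CPU wheels matching the ARG versions above
python -m pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 \
    --index-url https://download.pytorch.org/whl/cpu
python -m pip install intel_extension_for_pytorch==2.1.100
```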
64 changes: 64 additions & 0 deletions examples/cpu/inference/python/llm/Dockerfile
@@ -0,0 +1,64 @@
# NOTE: To build this you will need a docker version >= 19.03 and DOCKER_BUILDKIT=1
#
# If you do not use buildkit you are not going to have a good time
#
# For reference:
# https://docs.docker.com/develop/develop-images/build_enhancements/

ARG BASE_IMAGE=ubuntu:22.04
FROM ${BASE_IMAGE} AS base
RUN apt update && \
apt full-upgrade -y && \
DEBIAN_FRONTEND=noninteractive apt install -y \
sudo \
numactl \
wget \
vim \
git \
gcc-12 \
g++-12 \
make \
curl && \
rm -rf /var/lib/apt/lists/* && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 && \
update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \
update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100
RUN useradd -m ubuntu && \
echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers
WORKDIR /home/ubuntu

FROM base AS dev
COPY . /home/ubuntu/llm
RUN chown -R ubuntu:ubuntu /home/ubuntu/llm && \
rm /home/ubuntu/llm/Dockerfile

USER ubuntu

RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
bash miniconda.sh -b -p ~/miniconda3 && \
rm miniconda.sh && \
echo "source ~/miniconda3/bin/activate" >> ~/.bashrc

RUN . ~/miniconda3/bin/activate && \
conda create -y -n compile_py310 python=3.10 && \
conda activate compile_py310 && \
cd llm && \
bash tools/env_setup.sh 2 && \
conda deactivate && \
conda remove -y -n compile_py310 --all && \
conda create -y -n py310 python=3.10 && \
conda activate py310 && \
bash tools/env_setup.sh 1 && \
echo "conda activate py310" >> ~/.bashrc

FROM base AS deploy
USER ubuntu
COPY --from=dev /home/ubuntu/miniconda3 /home/ubuntu/miniconda3
COPY --from=dev /home/ubuntu/llm /home/ubuntu/llm
COPY --from=dev /home/ubuntu/.bashrc /home/ubuntu/.bashrc
RUN sudo chown -R ubuntu:ubuntu ~/miniconda3 ~/llm ~/.bashrc && \
sudo mv ~/llm/oneCCL/build/_install /opt/oneCCL && \
sudo chown -R root:root /opt/oneCCL && \
rm -rf ~/llm/oneCCL && \
sed -i "s|ONECCL_PATH=.*|ONECCL_PATH=/opt/oneCCL|" ~/llm/tools/env_activate.sh
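The new Dockerfile is a three-stage build: `base` installs the system packages and the GCC 12 toolchain, `dev` builds the conda environment via `tools/env_setup.sh`, and `deploy` copies only the resulting conda environment, the llm examples, and the oneCCL install into a slimmer runtime image. A build sketch (BuildKit is required per the header comment; the image tags are assumptions):

```bash
cd examples/cpu/inference/python/llm
# The default build produces the final (deploy) stage
DOCKER_BUILDKIT=1 docker build -t ipex-llm:2.1.100 .
# Optionally keep the full development stage as a separate image
DOCKER_BUILDKIT=1 docker build --target dev -t ipex-llm:2.1.100-dev .
```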
151 changes: 64 additions & 87 deletions examples/cpu/inference/python/llm/README.md
@@ -1,103 +1,80 @@
# Text Generation

We provide inference benchmarking scripts for text generation with large language models.<br/>
Supported large language model families include GPT-J, LLaMA, GPT-NeoX, OPT, Falcon, and CodeGen.<br/>
The scripts include both single-instance and distributed (DeepSpeed) use cases.<br/>
The scripts cover model generation inference in low-precision modes for different models with the best performance and accuracy (bf16 AMP, static quantization, and weight-only quantization).<br/>

# Setup
# Supported Model List

| MODEL FAMILY | Verified < MODEL ID > (Huggingface hub) | FP32/BF16 | Weight only quantization INT8 | Weight only quantization INT4 | Static quantization INT8 |
|---|:---:|:---:|:---:|:---:|:---:|
|LLAMA| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", "meta-llama/Llama-2-70b-hf" |||||
|GPT-J| "EleutherAI/gpt-j-6b" |||||
|GPT-NEOX| "EleutherAI/gpt-neox-20b" ||||\*\* |
|FALCON\*|"tiiuae/falcon-40b" ||||\*\*|
|OPT|"facebook/opt-30b", "facebook/opt-1.3b"||||\*\*|
|CodeGen|"Salesforce/codegen-2B-multi"||||\*\*|

\* For Falcon models from the remote hub, we need to modify config.json to use the modeling_falcon.py in transformers. Therefore, in the following scripts, an extra configuration file needs to be passed, like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but required for quantization.

\*\* For GPT-NEOX/FALCON/OPT/CodeGen models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage.

*Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from the LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, work is in progress to cover those optimizations, which will expand the model list above.

# Environment Setup

1. Get the Intel® Extension for PyTorch\* source code

```bash
WORK_DIR=$PWD
# GCC 12.3 is required; please set it up first
# Create environment (conda recommended)
conda create -n llm python=3.9 -y
# install deps
conda install cmake ninja mkl mkl-include -y
conda install gperftools -c conda-forge -y

# Install PyTorch 2.1 release
python -m pip install torch==2.1 --index-url https://download.pytorch.org/whl/cpu

# Install IPEX 2.1 release
python -m pip install intel_extension_for_pytorch

# Used for accuracy test only
git clone https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .

# Install transformers
pip install transformers==4.31.0
# Install other deps
pip install cpuid accelerate datasets sentencepiece protobuf==3.20.3

# Setup environment variables for performance on Xeon
export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so.6
export KMP_BLOCKTIME=INF
export KMP_TPAUSE=0
export KMP_SETTINGS=1
export KMP_FORJOIN_BARRIER_PATTERN=dist,dist
export KMP_PLAIN_BARRIER_PATTERN=dist,dist
export KMP_REDUCTION_BARRIER_PATTERN=dist,dist
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so # Intel OpenMP
# Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support.
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so

# [Optional] install neural-compressor for GPT-J static quantization and running GPTQ (see below)
pip install neural-compressor==2.3.1

# [Optional] The following is only for DeepSpeed case
# Install oneccl-bind-pt (also known as torch-ccl)
git clone https://github.com/intel/torch-ccl.git
cd torch-ccl && git checkout v2.1.0+cpu
git submodule sync && git submodule update --init --recursive
python setup.py install
cd ../
# Install DeepSpeed
git clone https://github.com/delock/DeepSpeedSYCLSupport
cd DeepSpeedSYCLSupport
git checkout gma/run-opt-branch
python -m pip install -r requirements/requirements.txt
python setup.py install
cd ../
# Install oneCCL
git clone https://github.com/oneapi-src/oneCCL.git
cd oneCCL
mkdir build
cd build
cmake ..
make -j install
source _install/env/setvars.sh
cd ../..
git clone https://github.com/intel/intel-extension-for-pytorch.git
cd intel-extension-for-pytorch
git checkout v2.1.100+cpu
cd examples/cpu/inference/python/llm
```

# Get the sample prompt.json
# Make sure the downloaded prompt.json file is in the same directory as the Python scripts mentioned above.
wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json
2.a. It is highly recommended to build a Docker container from the provided `Dockerfile`.

```bash
# Build an image with the provided Dockerfile
docker build -t ipex-llm:2.1.100 .

# Run the container with the command below
docker run --rm -it --privileged ipex-llm:2.1.100 bash

# Once the command prompt shows up inside the docker container, enter the llm examples directory
cd llm
```
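If you plan to pull models from the Hugging Face hub inside the container, mounting the host cache avoids re-downloading weights on every run (optional; the paths below assume the default Hugging Face cache location and the `ubuntu` user created in the Dockerfile):

```bash
docker run --rm -it --privileged \
    -v $HOME/.cache/huggingface:/home/ubuntu/.cache/huggingface \
    ipex-llm:2.1.100 bash
```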

# Supported Model List
2.b. Alternatively, you can use the provided environment configuration script to set up an environment without using a Docker container.

| MODEL FAMILY | Verified < MODEL ID > (Huggingface hub) | FP32/BF16 | Weight only quantization INT8 | Weight only quantization INT4 | Static quantization INT8 |
|---|:---:|:---:|:---:|:---:|:---:|
|LLAMA| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", "meta-llama/Llama-2-70b-hf" |||||
|GPT-J| "EleutherAI/gpt-j-6b" |||||
|GPT-NEOX| "EleutherAI/gpt-neox-20b" ||||** |
|FALCON*|"tiiuae/falcon-40b" ||||**|
|OPT|"facebook/opt-30b", "facebook/opt-1.3b"||||**|
|CodeGen|"Salesforce/codegen-2B-multi"||||**|
```bash
# GCC 12.3 is required. Installation can be taken care of by the environment configuration script.
# Create a conda environment
conda create -n llm python=3.9 -y
conda activate llm

*For Falcon models from the remote hub, we need to modify config.json to use the modeling_falcon.py in transformers. Therefore, in the following scripts, an extra configuration file needs to be passed, like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but required for quantization.
# Setup the environment with the provided script
bash ./tools/env_setup.sh
```

** For GPT-NEOX/FALCON/OPT/CodeGen models, the accuracy recipes of static quantization INT8 are not ready thus they will be skipped in our coverage.
3. Once an environment is configured with either method above, set the necessary environment variables with the environment activation script and download the sample `prompt.json`.

*Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from the LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). For other LLM model families, work is in progress to cover those optimizations, which will expand the model list above.
```bash
# Activate environment variables
source ./tools/env_activate.sh

# Get the sample prompt.json
wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json

```
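As a quick sanity check that the environment is usable (a sketch; the exact versions printed depend on which setup path you took):

```bash
python -c "import torch, intel_extension_for_pytorch as ipex; print(torch.__version__, ipex.__version__)"
```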

# Run Models Generations

| Benchmark mode | FP32/BF16 | Weight only quantization INT8 | Weight only quantization INT4 | Static quantization INT8 |
|---|:---:|:---:|:---:|:---:|
|Single instance |||||
| Distributed (autotp) |||||

You can run LLM inference with the one-click Python script "run.py" for all inference cases.
```
@@ -107,7 +84,7 @@ python run.py --help # for more detailed usages
### Single Instance Performance
```bash
# Get prompt file to the path of scripts
mv PATH/TO/prompt.json ./single_instance
cp prompt.json ./single_instance
export WORK_DIR=./

# bf16 benchmark
@@ -134,7 +111,7 @@ Notes:
### Distributed Performance with DeepSpeed (autoTP)
```bash
# Get prompt file to the path of scripts
mv PATH/TO/prompt.json ./distributed
cp prompt.json ./distributed
export WORK_DIR=./
unset KMP_AFFINITY

@@ -158,13 +135,13 @@ Notes:
# Get prompt file to the path of scripts
export WORK_DIR=./
cd single_instance
mv PATH/TO/prompt.json ./
cp PATH/TO/prompt.json ./
# bfloat16 benchmark
OMP_NUM_THREADS=<physical cores num> numactl -m <node N> -C <physical cores list> python run_generation.py --benchmark -m <MODEL_ID> --dtype bfloat16 --ipex --deployment-mode
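# For illustration, the same command with the placeholders filled in
# (the core count, NUMA node, and model ID below are assumptions; adjust them to your machine)
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_generation.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --deployment-mode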

# quantization benchmark
# To run quantization performance benchmarks, first get the quantized model with step (1) below, and then run the performance benchmark with step (2)
## (1) Do quantization to get the quantized model
## note: for llama/gptj we have both IPEX smooth quant and weight-only quantization, while for the rest of the models we recommend weight-only quantization
mkdir saved_results

@@ -180,7 +157,7 @@ python run_gpt-neox_quantization.py --ipex-weight-only-quantization --output-dir
## Falcon quantization (example of config-file: utils/model_config/tiiuae_falcon-40b_config.json)
python run_falcon_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <FALCON MODEL_ID> --config-file <CONFIG_FILE>
## OPT quantization
python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <OPT MODEL_ID>
## CodeGen quantization
python run_codegen_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m <CODEGEN MODEL_ID>
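## Example: the OPT command above with its placeholder filled in
## (illustrative only; "facebook/opt-1.3b" is taken from the supported model table)
python run_opt_quantization.py --ipex-weight-only-quantization --output-dir "saved_results" --int8-bf16-mixed -m facebook/opt-1.3b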

@@ -289,7 +266,7 @@ deepspeed --bind_cores_to_rank run_generation_with_deepspeed.py --benchmark -m E
## Distributed Accuracy with DeepSpeed (autoTP)
```bash
# Run distributed accuracy with 2 ranks of one node for bfloat16 with ipex and jit
source ${ONECCL_DIR}/build/_install/env/setvars.sh
export LD_PRELOAD=${CONDA_PREFIX}/lib/libiomp5.so:${CONDA_PREFIX}/lib/libtcmalloc.so
26 changes: 26 additions & 0 deletions examples/cpu/inference/python/llm/tools/env_activate.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# Setup environment variables for performance on Xeon
export KMP_BLOCKTIME=INF
export KMP_TPAUSE=0
export KMP_SETTINGS=1
export KMP_FORJOIN_BARRIER_PATTERN=dist,dist
export KMP_PLAIN_BARRIER_PATTERN=dist,dist
export KMP_REDUCTION_BARRIER_PATTERN=dist,dist

env | grep CONDA_PREFIX > /dev/null
if [ $? -eq 0 ]; then
export LD_PRELOAD=${CONDA_PREFIX}/lib/libstdc++.so.6
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so # Intel OpenMP
# Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support.
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so
else
echo "Conda environment is not available. You need to set environment variable LD_PRELOAD to dynamic libraries of Intel OpenMP and TcMalloc manually."
fi

ONECCL_PATH=./oneCCL/build/_install
if [ ! -d ${ONECCL_PATH} ]; then
echo "oneCCL is not available."
else
source ${ONECCL_PATH}/env/setvars.sh
fi
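The script is meant to be sourced so the exports land in the current shell. A minimal usage sketch from the llm example directory:

```bash
# Load the performance-related environment variables
source ./tools/env_activate.sh
# Verify which libraries ended up in LD_PRELOAD (illustrative check)
echo $LD_PRELOAD
```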