From bee4a423d99b4dea7362d8cb31b1d48e38344a8f Mon Sep 17 00:00:00 2001 From: Zaili Wang <109502517+ZailiWang@users.noreply.github.com> Date: Wed, 17 Jul 2024 18:54:36 +0800 Subject: [PATCH] LLM example path re-structure (release 2.4) (#3080) * LLM example files restructure * update * update path in docs * symlink * cherry-pick the typo fix (#3083) * fix path in quant script --------- Co-authored-by: WeizhuoZhang-intel --- README.md | 4 +- docs/tutorials/examples.md | 2 +- .../features/int8_recipe_tuning_api.md | 2 +- .../features/sq_recipe_tuning_api.md | 5 +- docs/tutorials/getting_started.md | 2 +- docs/tutorials/installation.md | 2 +- docs/tutorials/llm.rst | 4 +- docs/tutorials/llm/llm_optimize.md | 15 +- .../python/llm/tools/env_activate.sh | 27 -- .../python/llm/tools/get_libstdcpp_lib.sh | 1 - .../cpu/{inference/python => }/llm/Dockerfile | 7 +- examples/cpu/llm/README.md | 133 ++++++++++ .../llm => llm/fine-tuning}/README.md | 31 +-- .../llm => llm/fine-tuning}/finetune.py | 0 .../llm => llm/fine-tuning}/requirements.txt | 0 .../fine-tuning}/run_lora_finetune_ddp.sh | 0 .../llm => llm/fine-tuning}/utils/README.md | 0 .../llm => llm/fine-tuning}/utils/__init__.py | 0 .../llm => llm/fine-tuning}/utils/prompter.py | 0 .../python/llm => llm/inference}/README.md | 250 +++++------------- .../run_accuracy_with_deepspeed.py | 0 .../distributed/run_generation_tp.py | 0 .../run_generation_with_deepspeed.py | 0 .../llm => llm/inference}/llm_sq_recipes.md | 51 ++-- .../python/llm => llm/inference}/run.py | 0 .../single_instance/run_accuracy.py | 0 .../single_instance/run_generation.py | 0 .../run_int4_gpt-j_on_cnndailymail.py | 0 .../run_int4_gpt-j_on_cnndailymail.sh | 0 .../single_instance/run_quantization.py | 54 ++-- .../llm => llm/inference}/tools/llava.patch | 0 .../inference}/tools/prepare_llava.sh | 0 .../inference}/tools/run_scaling.sh | 0 .../inference}/utils/create_shard_model.py | 0 .../inference}/utils/model_class/baichuan.py | 0 .../inference}/utils/model_class/bloom.py | 0 .../inference}/utils/model_class/chatglm.py | 0 .../inference}/utils/model_class/codegen.py | 0 .../inference}/utils/model_class/falcon.py | 0 .../inference}/utils/model_class/git.py | 0 .../utils/model_class/gptbigcode.py | 0 .../inference}/utils/model_class/gptj.py | 0 .../inference}/utils/model_class/gptneox.py | 0 .../inference}/utils/model_class/llama.py | 0 .../inference}/utils/model_class/llava.py | 0 .../inference}/utils/model_class/llm.py | 0 .../inference}/utils/model_class/mistral.py | 0 .../inference}/utils/model_class/mixtral.py | 0 .../inference}/utils/model_class/mpt.py | 0 .../inference}/utils/model_class/opt.py | 0 .../inference}/utils/model_class/phi.py | 0 .../inference}/utils/model_class/qwen.py | 0 .../inference}/utils/model_class/qwen2.py | 0 .../inference}/utils/model_class/stablelm.py | 0 .../inference}/utils/model_class/t5.py | 0 .../inference}/utils/model_class/whisper.py | 0 .../inference}/utils/model_class/yuan.py | 0 .../model_config/mosaicml_mpt-7b_config.json | 0 .../tiiuae_falcon-40b_config.json | 0 .../llm => llm/inference}/utils/run_gptq.py | 0 examples/cpu/llm/tools/env_activate.sh | 75 ++++++ .../python => }/llm/tools/env_setup.sh | 23 +- examples/cpu/llm/tools/get_libstdcpp_lib.sh | 1 + scripts/compile_bundle.sh | 4 +- 64 files changed, 370 insertions(+), 323 deletions(-) delete mode 100644 examples/cpu/inference/python/llm/tools/env_activate.sh delete mode 120000 examples/cpu/inference/python/llm/tools/get_libstdcpp_lib.sh rename examples/cpu/{inference/python => 
}/llm/Dockerfile (87%) create mode 100644 examples/cpu/llm/README.md rename examples/cpu/{training/llm => llm/fine-tuning}/README.md (57%) rename examples/cpu/{training/llm => llm/fine-tuning}/finetune.py (100%) rename examples/cpu/{training/llm => llm/fine-tuning}/requirements.txt (100%) rename examples/cpu/{training/llm => llm/fine-tuning}/run_lora_finetune_ddp.sh (100%) rename examples/cpu/{training/llm => llm/fine-tuning}/utils/README.md (100%) rename examples/cpu/{training/llm => llm/fine-tuning}/utils/__init__.py (100%) rename examples/cpu/{training/llm => llm/fine-tuning}/utils/prompter.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/README.md (82%) rename examples/cpu/{inference/python/llm => llm/inference}/distributed/run_accuracy_with_deepspeed.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/distributed/run_generation_tp.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/distributed/run_generation_with_deepspeed.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/llm_sq_recipes.md (84%) rename examples/cpu/{inference/python/llm => llm/inference}/run.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/single_instance/run_accuracy.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/single_instance/run_generation.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/single_instance/run_int4_gpt-j_on_cnndailymail.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/single_instance/run_int4_gpt-j_on_cnndailymail.sh (100%) rename examples/cpu/{inference/python/llm => llm/inference}/single_instance/run_quantization.py (97%) rename examples/cpu/{inference/python/llm => llm/inference}/tools/llava.patch (100%) rename examples/cpu/{inference/python/llm => llm/inference}/tools/prepare_llava.sh (100%) rename examples/cpu/{inference/python/llm => llm/inference}/tools/run_scaling.sh (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/create_shard_model.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/baichuan.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/bloom.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/chatglm.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/codegen.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/falcon.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/git.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/gptbigcode.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/gptj.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/gptneox.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/llama.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/llava.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/llm.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/mistral.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/mixtral.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/mpt.py (100%) rename examples/cpu/{inference/python/llm => 
llm/inference}/utils/model_class/opt.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/phi.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/qwen.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/qwen2.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/stablelm.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/t5.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/whisper.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/yuan.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_config/mosaicml_mpt-7b_config.json (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_config/tiiuae_falcon-40b_config.json (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/run_gptq.py (100%) create mode 100644 examples/cpu/llm/tools/env_activate.sh rename examples/cpu/{inference/python => }/llm/tools/env_setup.sh (94%) create mode 120000 examples/cpu/llm/tools/get_libstdcpp_lib.sh diff --git a/README.md b/README.md index f725aa751..4b13ebd5d 100644 --- a/README.md +++ b/README.md @@ -5,14 +5,14 @@ Intel® Extension for PyTorch\* -**CPU** [💻main branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [🌱Quick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [📖Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   [🏃Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2Bcpu)   |   [💻LLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm)
+**CPU** [💻main branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [🌱Quick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [📖Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   [🏃Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2Bcpu)   |   [💻LLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm)
**GPU** [💻main branch](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main)   |   [🌱Quick Start](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/tutorials/getting_started.html)   |   [📖Documentations](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/)   |   [🏃Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu)   |   [💻LLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main/examples/gpu/inference/python/llm)
Intel® Extension for PyTorch\* extends PyTorch\* with up-to-date features optimizations for an extra performance boost on Intel hardware. Optimizations take advantage of Intel® Advanced Vector Extensions 512 (Intel® AVX-512) Vector Neural Network Instructions (VNNI) and Intel® Advanced Matrix Extensions (Intel® AMX) on Intel CPUs as well as Intel Xe Matrix Extensions (XMX) AI engines on Intel discrete GPUs. Moreover, Intel® Extension for PyTorch* provides easy GPU acceleration for Intel discrete GPUs through the PyTorch* xpu device. ## ipex.llm - Large Language Models (LLMs) Optimization -In the current technological landscape, Generative AI (GenAI) workloads and models have gained widespread attention and popularity. Large Language Models (LLMs) have emerged as the dominant models driving these GenAI applications. Starting from 2.1.0, specific optimizations for certain LLM models are introduced in the Intel® Extension for PyTorch\*. Check [**LLM optimizations**](./examples/cpu/inference/python/llm) for details. +In the current technological landscape, Generative AI (GenAI) workloads and models have gained widespread attention and popularity. Large Language Models (LLMs) have emerged as the dominant models driving these GenAI applications. Starting from 2.1.0, specific optimizations for certain LLM models are introduced in the Intel® Extension for PyTorch\*. Check [**LLM optimizations**](./examples/cpu/llm) for details. ### Optimized Model List diff --git a/docs/tutorials/examples.md b/docs/tutorials/examples.md index baf23c7d3..d0a555b77 100644 --- a/docs/tutorials/examples.md +++ b/docs/tutorials/examples.md @@ -240,7 +240,7 @@ generate results for the input prompt. [//]: # (marker_llm_optimize_woq) [//]: # (marker_llm_optimize_woq) -**Note:** Please check [LLM Best Known Practice Page](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm) +**Note:** Please check [LLM Best Known Practice Page](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm) for detailed environment setup and LLM workload running instructions. ## C++ diff --git a/docs/tutorials/features/int8_recipe_tuning_api.md b/docs/tutorials/features/int8_recipe_tuning_api.md index 8bce1766e..31987838f 100644 --- a/docs/tutorials/features/int8_recipe_tuning_api.md +++ b/docs/tutorials/features/int8_recipe_tuning_api.md @@ -10,7 +10,7 @@ Users need to provide a fp32 model and some parameters required for tuning. The Please refer to [static_quant example](../../../examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py). - Smooth Quantization -Please refer to [llm sq example](../../../examples/cpu/inference/python/llm/single_instance/run_generation.py). +Please refer to [LLM SmoothQuant example](../../../examples/cpu/llm/inference/single_instance/run_generation.py). ## Smooth Quantization Autotune ### Algorithm: Auto-tuning of $\alpha$. diff --git a/docs/tutorials/features/sq_recipe_tuning_api.md b/docs/tutorials/features/sq_recipe_tuning_api.md index 4c19fb625..115548585 100644 --- a/docs/tutorials/features/sq_recipe_tuning_api.md +++ b/docs/tutorials/features/sq_recipe_tuning_api.md @@ -1,7 +1,8 @@ Smooth Quant Recipe Tuning API (Prototype) ============================================= -Smooth Quantization is a popular method to improve the accuracy of int8 quantization. 
The [autotune API](../api_doc.html#ipex.quantization.autotune) allows automatic global alpha tuning, and automatic layer-by-layer alpha tuning provided by Intel® Neural Compressor for the best INT8 accuracy. +Smooth Quantization is a popular method to improve the accuracy of int8 quantization. +The [autotune API](../api_doc.html#ipex.quantization.autotune) allows automatic global alpha tuning, and automatic layer-by-layer alpha tuning provided by Intel® Neural Compressor for the best INT8 accuracy. SmoothQuant will introduce alpha to calculate the ratio of input and weight updates to reduce quantization error. SmoothQuant arguments are as below: @@ -15,6 +16,6 @@ SmoothQuant will introduce alpha to calculate the ratio of input and weight upda | shared_criterion | "mean" | ["min", "mean","max"] | criterion for input LayerNorm op of a transformer block. | | enable_blockwise_loss | False | [True, False] | whether to enable block-wise auto-tuning | -For LLM examples, please refer to [example](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/inference/python/llm). +Please refer to the [LLM examples](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm) for complete examples. **Note**: When defining dataloaders for calibration, please follow INC's dataloader [format](https://github.com/intel/neural-compressor/blob/master/docs/source/dataloader.md). diff --git a/docs/tutorials/getting_started.md b/docs/tutorials/getting_started.md index 3abf1f77e..67874f6d4 100644 --- a/docs/tutorials/getting_started.md +++ b/docs/tutorials/getting_started.md @@ -157,4 +157,4 @@ with torch.inference_mode(), torch.cpu.amp.autocast(enabled=amp_enabled): print(gen_text, total_new_tokens, flush=True) ``` -More LLM examples, including usage of low precision data types are available in the [LLM Examples](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm) section. +More LLM examples, including usage of low precision data types are available in the [LLM Examples](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm) section. diff --git a/docs/tutorials/installation.md b/docs/tutorials/installation.md index a8d3a439a..707a091db 100644 --- a/docs/tutorials/installation.md +++ b/docs/tutorials/installation.md @@ -5,4 +5,4 @@ Select your preferences and follow the installation instructions provided on the After successful installation, refer to the [Quick Start](getting_started.md) and [Examples](examples.md) sections to start using the extension in your code. -**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/inference/python/llm). +**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm). 
diff --git a/docs/tutorials/llm.rst b/docs/tutorials/llm.rst index e1e117e5d..3c2878a72 100644 --- a/docs/tutorials/llm.rst +++ b/docs/tutorials/llm.rst @@ -13,7 +13,7 @@ These LLM-specific optimizations can be automatically applied with a single fron llm/llm_optimize -`ipex.llm` Optimized Model List +`ipex.llm` Optimized Model List for Inference ------------------------------- Verified for single instance mode @@ -30,7 +30,7 @@ Verified for distributed inference mode via DeepSpeed *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). We are working in progress to better support the models in the tables with various data types. In addition, more models will be optimized in the future. -Please check `LLM best known practice `_ for instructions to install/setup environment and example scripts. +Please check `LLM best known practice `_ for instructions to install/setup environment and example scripts. Module Level Optimization API for customized LLM (Prototype) ------------------------------------------------------------ diff --git a/docs/tutorials/llm/llm_optimize.md b/docs/tutorials/llm/llm_optimize.md index ab2c0b06b..7da2706a7 100644 --- a/docs/tutorials/llm/llm_optimize.md +++ b/docs/tutorials/llm/llm_optimize.md @@ -1,15 +1,20 @@ -Transformers Optimization Frontend API +LLM Optimizations Frontend API ====================================== -The new API function, `ipex.llm.optimize`, is designed to optimize transformer-based models within frontend Python modules, with a particular focus on Large Language Models (LLMs). It provides optimizations for both model-wise and content-generation-wise. You just need to invoke the `ipex.llm.optimize` function instead of the `ipex.optimize` function to apply all optimizations transparently. +The new API function, `ipex.llm.optimize`, is designed to optimize transformer-based models within frontend Python modules, with a particular focus on Large Language Models (LLMs). +It provides optimizations for both model-wise and content-generation-wise. +You just need to invoke the `ipex.llm.optimize` function instead of the `ipex.optimize` function to apply all optimizations transparently. -This API currently works for inference workloads. Support for training is undergoing. Currently, this API supports certain models. Supported model list can be found at [Overview](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#ipexllm-optimized-model-list). +This API currently works for inference workloads. +Currently, this API supports certain models. Supported model list can be found at [this page](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#ipexllm-optimized-model-list-for-inference). +For LLM fine-tuning, please check the [LLM fine-tuning tutorial](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm/fine-tuning). API documentation is available at [API Docs page](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/api_doc.html#ipex.llm.optimize). ## Pseudocode of Common Usage Scenarios -The following sections show pseudocode snippets to invoke Intel® Extension for PyTorch\* APIs to work with LLM models. 
Complete examples can be found at [the Example directory](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/inference/python/llm). +The following sections show pseudocode snippets to invoke Intel® Extension for PyTorch\* APIs to work with LLM models. +Complete examples can be found at [the Example directory](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm/inference). ### FP32/BF16 @@ -98,7 +103,7 @@ model = ipex.llm.optimize(model, quantization_config=qconfig, low_precision_chec Distributed inference can be performed with `DeepSpeed`. Based on original Intel® Extension for PyTorch\* scripts, the following code changes are required. -Check [LLM distributed inference examples](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/inference/python/llm/distributed) for complete codes. +Check [LLM distributed inference examples](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm/inference/distributed) for complete codes. ``` python import torch diff --git a/examples/cpu/inference/python/llm/tools/env_activate.sh b/examples/cpu/inference/python/llm/tools/env_activate.sh deleted file mode 100644 index 759c008f7..000000000 --- a/examples/cpu/inference/python/llm/tools/env_activate.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -# Setup environment variables for performance on Xeon -export KMP_BLOCKTIME=1 -export KMP_TPAUSE=0 -export KMP_FORKJOIN_BARRIER_PATTERN=dist,dist -export KMP_PLAIN_BARRIER_PATTERN=dist,dist -export KMP_REDUCTION_BARRIER_PATTERN=dist,dist - -BASEFOLDER=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -export LD_PRELOAD=$(bash ${BASEFOLDER}/get_libstdcpp_lib.sh):${LD_PRELOAD} - -env | grep CONDA_PREFIX > /dev/null -if [ $? -eq 0 ]; then - export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so # Intel OpenMP - # Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support. - export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so -else - echo "Conda environment is not available. You need to set environment variable LD_PRELOAD to dynamic libraries of Intel OpenMP and TcMalloc manually if they are not in library search paths." -fi - -ONECCL_PATH=${BASEFOLDER}/../oneCCL_release -if [ ! -d ${ONECCL_PATH} ]; then - echo "Warning: oneCCL is not available." -else - source ${ONECCL_PATH}/env/setvars.sh -fi diff --git a/examples/cpu/inference/python/llm/tools/get_libstdcpp_lib.sh b/examples/cpu/inference/python/llm/tools/get_libstdcpp_lib.sh deleted file mode 120000 index 52891777e..000000000 --- a/examples/cpu/inference/python/llm/tools/get_libstdcpp_lib.sh +++ /dev/null @@ -1 +0,0 @@ -../../../../../../tools/get_libstdcpp_lib.sh \ No newline at end of file diff --git a/examples/cpu/inference/python/llm/Dockerfile b/examples/cpu/llm/Dockerfile similarity index 87% rename from examples/cpu/inference/python/llm/Dockerfile rename to examples/cpu/llm/Dockerfile index 9f7dd58ec..d56ba2404 100644 --- a/examples/cpu/inference/python/llm/Dockerfile +++ b/examples/cpu/llm/Dockerfile @@ -39,7 +39,7 @@ ENV PATH=/root/.local/bin:${PATH} FROM base AS dev ARG COMPILE COPY . 
./intel-extension-for-pytorch -RUN cd intel-extension-for-pytorch/examples/cpu/inference/python/llm && \ +RUN cd intel-extension-for-pytorch/examples/cpu/llm && \ export CC=gcc && export CXX=g++ && \ if [ -z ${COMPILE} ]; then bash tools/env_setup.sh 6; else bash tools/env_setup.sh 2; fi && \ unset CC && unset CXX @@ -53,7 +53,7 @@ RUN apt update && \ apt clean && \ rm -rf /var/lib/apt/lists/* && \ if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi -COPY --from=dev /root/intel-extension-for-pytorch/examples/cpu/inference/python/llm ./llm +COPY --from=dev /root/intel-extension-for-pytorch/examples/cpu/llm ./llm COPY --from=dev /root/intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh ./llm/tools RUN cd /usr/lib/x86_64-linux-gnu/ && ln -s libtcmalloc.so.4 libtcmalloc.so && cd && \ echo "echo \"**Note:** For better performance, please consider to launch workloads with command 'ipexrun'.\"" >> ./.bashrc && \ @@ -62,8 +62,7 @@ RUN cd /usr/lib/x86_64-linux-gnu/ && ln -s libtcmalloc.so.4 libtcmalloc.so && cd python -m pip cache purge && \ mv ./oneCCL_release /opt/oneCCL && \ chown -R root:root /opt/oneCCL && \ - sed -i "s|ONECCL_PATH=.*|ONECCL_PATH=/opt/oneCCL|" ./tools/env_activate.sh && \ - LN=$(grep "Conda environment is not available." -n ./tools/env_activate.sh | cut -d ":" -f 1) && sed -i "${LN}s|.*| export LD_PRELOAD=\${LD_PRELOAD}:/usr/lib/x86_64-linux-gnu/libtcmalloc.so:/usr/local/lib/libiomp5.so|" ./tools/env_activate.sh + sed -i "s|ONECCL_PATH=.*|ONECCL_PATH=/opt/oneCCL|" ./tools/env_activate.sh ARG PORT_SSH=22 RUN mkdir /var/run/sshd && \ sed -i "s/#Port.*/Port ${PORT_SSH}/" /etc/ssh/sshd_config && \ diff --git a/examples/cpu/llm/README.md b/examples/cpu/llm/README.md new file mode 100644 index 000000000..c99c6ecd5 --- /dev/null +++ b/examples/cpu/llm/README.md @@ -0,0 +1,133 @@ +# 1. LLM Optimization Overview + +`ipex.llm` provides dedicated optimization for running Large Language Models (LLM) faster, including technical points like paged attention, ROPE fusion, etc. +And a set of data types are supported for various scenarios, including FP32, BF16, Smooth Quantization INT8, Weight Only Quantization INT8/INT4 (prototype). + +
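As a quick orientation before the setup steps, the snippet below is a minimal sketch of how these optimizations are applied in code for BF16 inference. The model ID and generation settings are illustrative only; the scripts under `inference/` wrap this flow with full benchmarking, quantization, and distributed options.

```python
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative model ID; models from the optimized model list are used the same way.
model_id = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Apply the LLM-specific optimizations (BF16 shown here; other data types are
# selected through the arguments of the example scripts).
model = ipex.llm.optimize(model, dtype=torch.bfloat16, inplace=True)

inputs = tokenizer("What is AI?", return_tensors="pt")
with torch.inference_mode(), torch.cpu.amp.autocast(enabled=True):
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```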
+ +# 2. Environment Setup + +There are several environment setup methodologies provided. You can choose either of them according to your usage scenario. The Docker-based ones are recommended. + +## 2.1 [RECOMMENDED] Docker-based environment setup with pre-built wheels + +```bash +# Get the Intel® Extension for PyTorch\* source code +git clone https://github.com/intel/intel-extension-for-pytorch.git +cd intel-extension-for-pytorch +git checkout v2.4.0+cpu +git submodule sync +git submodule update --init --recursive + +# Build an image with the provided Dockerfile by installing from Intel® Extension for PyTorch\* prebuilt wheel files +# To have a custom ssh server port for multi-nodes run, please add --build-arg PORT_SSH= ex: 2345, otherwise use the default 22 SSH port +DOCKER_BUILDKIT=1 docker build -f examples/cpu/llm/Dockerfile --build-arg PORT_SSH=2345 -t ipex-llm:2.4.0 . + +# Run the container with command below +docker run --rm -it --privileged -v /dev/shm:/dev/shm ipex-llm:2.4.0 bash + +# When the command prompt shows inside the docker container, enter llm examples directory +cd llm + +# Activate environment variables +# set bash script argument to "inference" or "fine-tuning" for different usages +source ./tools/env_activate.sh [inference|fine-tuning] +``` + +## 2.2 Conda-based environment setup with pre-built wheels + +```bash +# Get the Intel® Extension for PyTorch\* source code +git clone https://github.com/intel/intel-extension-for-pytorch.git +cd intel-extension-for-pytorch +git checkout v2.4.0+cpu +git submodule sync +git submodule update --init --recursive + +# GCC 12.3 is required. Installation can be taken care of by the environment configuration script. +# Create a conda environment +conda create -n llm python=3.10 -y +conda activate llm + +# Setup the environment with the provided script +cd examples/cpu/llm +bash ./tools/env_setup.sh 7 + +# Activate environment variables +# set bash script argument to "inference" or "fine-tuning" for different usages +source ./tools/env_activate.sh [inference|fine-tuning] +``` + +## 2.3 Docker-based environment setup with compilation from source + +```bash +# Get the Intel® Extension for PyTorch\* source code +git clone https://github.com/intel/intel-extension-for-pytorch.git +cd intel-extension-for-pytorch +git checkout v2.4.0+cpu +git submodule sync +git submodule update --init --recursive + +# Build an image with the provided Dockerfile by compiling Intel® Extension for PyTorch\* from source +# To have a custom ssh server port for multi-nodes run, please add --build-arg PORT_SSH= ex: 2345, otherwise use the default 22 SSH port +docker build -f examples/cpu/llm/Dockerfile --build-arg COMPILE=ON --build-arg PORT_SSH=2345 -t ipex-llm:2.4.0 . + +# Run the container with command below +docker run --rm -it --privileged -v /dev/shm:/dev/shm ipex-llm:2.4.0 bash + +# When the command prompt shows inside the docker container, enter llm examples directory +cd llm + +# Activate environment variables +# set bash script argument to "inference" or "fine-tuning" for different usages +source ./tools/env_activate.sh [inference|fine-tuning] +``` + +## 2.4 Conda-based environment setup with compilation from source + +```bash +# Get the Intel® Extension for PyTorch\* source code +git clone https://github.com/intel/intel-extension-for-pytorch.git +cd intel-extension-for-pytorch +git checkout v2.4.0+cpu +git submodule sync +git submodule update --init --recursive + +# GCC 12.3 is required. 
Installation can be taken care of by the environment configuration script. +# Create a conda environment +conda create -n llm python=3.10 -y +conda activate llm + +# Setup the environment with the provided script +cd examples/cpu/llm +bash ./tools/env_setup.sh + +# Activate environment variables +# set bash script argument to "inference" or "fine-tuning" for different usages +source ./tools/env_activate.sh [inference|fine-tuning] +``` + +
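Whichever setup path is chosen, a short import check (a suggested snippet, not part of the setup scripts) can confirm the environment is functional before proceeding:

```python
# Run inside the activated environment as a post-setup sanity check.
import torch
import intel_extension_for_pytorch as ipex

# Both should report a 2.4-series version for this release branch.
print("torch:", torch.__version__)
print("intel_extension_for_pytorch:", ipex.__version__)
```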
+ +*Note*: In the `env_activate.sh` script, a `prompt.json` file is downloaded, which provides prompt samples with pre-defined input token lengths for benchmarking. +For benchmarking **Llama-3 models**, users need to download a specific `prompt.json` file, overwriting the original one. + +```bash +wget -O prompt.json https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt-3.json +``` + +The original `prompt.json` file can be restored from the repository if needed. + +```bash +wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json +``` + +
+ +# 3. How To Run LLM with ipex.llm + +Inference and fine-tuning are supported in respective directories. + +For inference example scripts, visit the [inference](./inference/) directory. + +For fine-tuning example scripts, visit the [fine-tuning](./fine-tuning/) directory. \ No newline at end of file diff --git a/examples/cpu/training/llm/README.md b/examples/cpu/llm/fine-tuning/README.md similarity index 57% rename from examples/cpu/training/llm/README.md rename to examples/cpu/llm/fine-tuning/README.md index 99a655d30..4b1de026a 100644 --- a/examples/cpu/training/llm/README.md +++ b/examples/cpu/llm/fine-tuning/README.md @@ -1,33 +1,26 @@ -# IPEX LLAMA2 7B lora apalca finetuning training on CPUs (distributed) +# IPEX LLAMA2 7B lora alpaca finetuning training on CPUs (distributed) ## Description -This document has instructions for running [LLaMA2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) lora apalca finetuning using Intel-optimized PyTorch (enable the recipes from [apalca-lora](https://github.com/tloen/alpaca-lora/tree/main) on CPUs ). +This document has instructions for running [LLaMA2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) lora alpaca finetuning using Intel-optimized PyTorch (enable the recipes from [alpaca-lora](https://github.com/tloen/alpaca-lora/tree/main) on CPUs ). -## Bare Metal -### General setup +## Distributed Computation Environment Setup -Follow [link](https://github.com/intel/intel-extension-for-pytorch/tree/release/2.4/examples/cpu/inference/python/llm#3-environment-setup) to setup PyTorch/IPEX and some other dependency. +In this case, we use data-parallel distributed training and every rank will hold same model replica. The NNODES is the number of ip in the HOSTFILE. -### Prepare dependency ``` - pip install -r requirements.txt - ``` -### Specific Setup - -* Set ENV to use multi-nodes distributed training (no need for single-node multi-sockets) - -In this case, we use data-parallel distributed training and every rank will hold same model replica. The NNODES is the number of ip in the HOSTFILE. To use multi-nodes distributed training you should firstly setup the passwordless login (you can refer to [link](https://linuxize.com/post/how-to-setup-passwordless-ssh-login/)) between these nodes. -``` -export NNODES=#your_node_number (default using 1 node) +export NNODES=#number_of_nodes (default using 1 node) # create your_ip_list_file, one ip per line, like (or self edit): scontrol show hostname > ./hostfile - export HOSTFILE=hostfile - ``` + +*Note:* To use multi-nodes distributed training you should firstly setup the passwordless login (you can refer to [link](https://linuxize.com/post/how-to-setup-passwordless-ssh-login/)) among computation nodes. If you are using the Dockerfile, you can skip this step. 
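For background only: the launch script below handles distributed initialization for you, but data-parallel training on CPU typically brings up the process group over the oneCCL (`ccl`) backend along the lines of this sketch. It assumes `oneccl_bindings_for_pytorch` is installed by the environment setup and that the launcher exports `RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT`.

```python
import os
import torch
import torch.distributed as dist
import oneccl_bindings_for_pytorch  # noqa: F401  (registers the "ccl" backend)

# Rank/world-size values are normally exported by the launcher (mpirun, torchrun, ipexrun, ...).
rank = int(os.environ.get("RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))
dist.init_process_group(backend="ccl", rank=rank, world_size=world_size)

# Placeholder module standing in for the LoRA-wrapped LLaMA2 model.
model = torch.nn.Linear(16, 16)
ddp_model = torch.nn.parallel.DistributedDataParallel(model)
print(f"rank {dist.get_rank()} / {dist.get_world_size()} ready")
```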
+ # Quick Start Scripts + ## Run the model + ``` # Get the dataset here: https://github.com/tloen/alpaca-lora/blob/main/alpaca_data.json export DATASET="./alpaca_data.json" @@ -46,7 +39,9 @@ Apply the access in this page [LLaMA2 7B](https://huggingface.co/meta-llama/Llam huggingface-cli login {your huggingface token} ``` + ## Launch command + | DataType | Throughput | | ----------- | ----------- | -| BF16 | bash run_lora_finetune_ddp.sh bf16 | +| BF16 | bash run_lora_finetune_ddp.sh bf16 | \ No newline at end of file diff --git a/examples/cpu/training/llm/finetune.py b/examples/cpu/llm/fine-tuning/finetune.py similarity index 100% rename from examples/cpu/training/llm/finetune.py rename to examples/cpu/llm/fine-tuning/finetune.py diff --git a/examples/cpu/training/llm/requirements.txt b/examples/cpu/llm/fine-tuning/requirements.txt similarity index 100% rename from examples/cpu/training/llm/requirements.txt rename to examples/cpu/llm/fine-tuning/requirements.txt diff --git a/examples/cpu/training/llm/run_lora_finetune_ddp.sh b/examples/cpu/llm/fine-tuning/run_lora_finetune_ddp.sh similarity index 100% rename from examples/cpu/training/llm/run_lora_finetune_ddp.sh rename to examples/cpu/llm/fine-tuning/run_lora_finetune_ddp.sh diff --git a/examples/cpu/training/llm/utils/README.md b/examples/cpu/llm/fine-tuning/utils/README.md similarity index 100% rename from examples/cpu/training/llm/utils/README.md rename to examples/cpu/llm/fine-tuning/utils/README.md diff --git a/examples/cpu/training/llm/utils/__init__.py b/examples/cpu/llm/fine-tuning/utils/__init__.py similarity index 100% rename from examples/cpu/training/llm/utils/__init__.py rename to examples/cpu/llm/fine-tuning/utils/__init__.py diff --git a/examples/cpu/training/llm/utils/prompter.py b/examples/cpu/llm/fine-tuning/utils/prompter.py similarity index 100% rename from examples/cpu/training/llm/utils/prompter.py rename to examples/cpu/llm/fine-tuning/utils/prompter.py diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/llm/inference/README.md similarity index 82% rename from examples/cpu/inference/python/llm/README.md rename to examples/cpu/llm/inference/README.md index 493241ba4..2f333f28a 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/llm/inference/README.md @@ -1,13 +1,6 @@ -# 1. LLM Optimization Overview +# 1. ipex.llm Optimized Model List for Inference -`ipex.llm` provides dedicated optimization for running Large Language Models (LLM) faster, including technical points like paged attention, ROPE fusion, etc. -And a set of data types are supported for various scenarios, including FP32, BF16, Smooth Quantization INT8, Weight Only Quantization INT8/INT4 (prototype). - -
- -# 2. ipex.llm Optimized Model List - -## 2.1 Verified for single instance mode +## 1.1 Verified for single instance mode | MODEL FAMILY | MODEL NAME (Huggingface hub) | FP32 | BF16 | Static quantization INT8 | Weight only quantization INT8 | Weight only quantization INT4 | |:---:|:---:|:---:|:---:|:---:|:---:|:---:| @@ -46,7 +39,7 @@ And a set of data types are supported for various scenarios, including FP32, BF1 |Phi| microsoft/Phi-3-medium-4k-instruct | 🟩 | 🟩 | 🟨 | 🟩 | 🟨 | |Phi| microsoft/Phi-3-medium-128k-instruct | 🟩 | 🟩 | 🟨 | 🟩 | 🟨 | -## 2.2 Verified for distributed inference mode via DeepSpeed +## 1.2 Verified for distributed inference mode via DeepSpeed | MODEL FAMILY | MODEL NAME (Huggingface hub) | BF16 | Weight only quantization INT8 | |:---:|:---:|:---:|:---:| @@ -89,119 +82,7 @@ We are working in progress to better support the models in the tables with vario
-# 3. Environment Setup -There are several environment setup methodologies provided. You can choose either of them according to your usage scenario. The Docker-based ones are recommended. - -## 3.1 [RECOMMENDED] Docker-based environment setup with pre-built wheels - -```bash -# Get the Intel® Extension for PyTorch\* source code -git clone https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git checkout v2.4.0+cpu -git submodule sync -git submodule update --init --recursive - -# Build an image with the provided Dockerfile by installing from Intel® Extension for PyTorch\* prebuilt wheel files -DOCKER_BUILDKIT=1 docker build -f examples/cpu/inference/python/llm/Dockerfile -t ipex-llm:2.4.0 . - -# Run the container with command below -docker run --rm -it --privileged ipex-llm:2.4.0 bash - -# When the command prompt shows inside the docker container, enter llm examples directory -cd llm - -# Activate environment variables -source ./tools/env_activate.sh -``` - -## 3.2 Conda-based environment setup with pre-built wheels - -```bash -# Get the Intel® Extension for PyTorch\* source code -git clone https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git checkout v2.4.0+cpu -git submodule sync -git submodule update --init --recursive - -# GCC 12.3 is required. Installation can be taken care of by the environment configuration script. -# Create a conda environment -conda create -n llm python=3.10 -y -conda activate llm - -# Setup the environment with the provided script -# A sample "prompt.json" file for benchmarking is also downloaded -cd examples/cpu/inference/python/llm -bash ./tools/env_setup.sh 7 - -# Activate environment variables -source ./tools/env_activate.sh -``` - -## 3.3 Docker-based environment setup with compilation from source - -```bash -# Get the Intel® Extension for PyTorch\* source code -git clone https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git checkout v2.4.0+cpu -git submodule sync -git submodule update --init --recursive - -# Build an image with the provided Dockerfile by compiling Intel® Extension for PyTorch\* from source -DOCKER_BUILDKIT=1 docker build -f examples/cpu/inference/python/llm/Dockerfile --build-arg COMPILE=ON -t ipex-llm:2.4.0 . - -# Run the container with command below -docker run --rm -it --privileged ipex-llm:2.4.0 bash - -# When the command prompt shows inside the docker container, enter llm examples directory -cd llm - -# Activate environment variables -source ./tools/env_activate.sh -``` - -## 3.4 Conda-based environment setup with compilation from source - -```bash -# Get the Intel® Extension for PyTorch\* source code -git clone https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git checkout v2.4.0+cpu -git submodule sync -git submodule update --init --recursive - -# GCC 12.3 is required. Installation can be taken care of by the environment configuration script. -# Create a conda environment -conda create -n llm python=3.10 -y -conda activate llm - -# Setup the environment with the provided script -# A sample "prompt.json" file for benchmarking is also downloaded -cd examples/cpu/inference/python/llm -bash ./tools/env_setup.sh - -# Activate environment variables -source ./tools/env_activate.sh -``` - -*Note*: In `env_setup.sh` script a `prompt.json` file is downloaded, which provides prompt samples with pre-defined input token lengths for benchmarking. 
-For **Llama-3 models** benchmarking, the users need to download a specific `prompt.json` file, overwriting the original one. - -```bash -wget -O prompt.json https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt-3.json -``` - -The original `prompt.json` file can be restored from the repository if needed. - -```bash -wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json -``` - -
- -# 4. How To Run LLM with ipex.llm +# 2. How To Run LLM with ipex.llm **ipex.llm provides a single script to facilitate running generation tasks as below:** @@ -221,68 +102,68 @@ python run.py --help # for more detailed usages *Note:* You may need to log in your HuggingFace account to access the model files. Please refer to [HuggingFace login](https://huggingface.co/docs/huggingface_hub/quick-start#login). -## 4.1 Quick example for running Llama2-7b +## 2.1 Quick example for running Llama2-7b -### 4.1.1 To run generation task and benchmark performance +### 2.1.1 To run generation task and benchmark performance *Note:* The following "OMP_NUM_THREADS" and "numactl" settings are based on the assumption that the target server has 56 physical cores per numa socket, and we benchmark with 1 socket. Please adjust the settings per your hardware. -#### 4.1.1.1 Run in FP32 with stock PyTorch +#### 2.1.1.1 Run in FP32 with stock PyTorch ```bash OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype float32 ``` -#### 4.1.1.2 Run in FP32 with ipex.llm +#### 2.1.1.2 Run in FP32 with ipex.llm ```bash OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype float32 --ipex ``` -#### 4.1.1.3 Run in BF16 with ipex.llm +#### 2.1.1.3 Run in BF16 with ipex.llm ```bash OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex ``` -#### 4.1.1.4 Run in static quantization INT8 with ipex.llm +#### 2.1.1.4 Run in static quantization INT8 with ipex.llm ```bash wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/cpu/2/llama2-7b_qconfig.json OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-smooth-quant --qconfig-summary-file llama2-7b_qconfig.json --output-dir "saved_results" ``` -#### 4.1.1.5 Run in weight-only quantization INT8 with ipex.llm +#### 2.1.1.5 Run in weight-only quantization INT8 with ipex.llm ```bash OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --output-dir "saved_results" ``` -#### 4.1.1.6 Run in weight-only quantization INT4 with ipex.llm +#### 2.1.1.6 Run in weight-only quantization INT4 with ipex.llm ```bash OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT4 --gptq --quant-with-amp --output-dir "saved_results" ``` -#### 4.1.1.7 Run in BF16 with ipex.llm in distributed way +#### 2.1.1.7 Run in BF16 with ipex.llm in distributed way ```bash deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --autotp --shard-model ``` -#### 4.1.1.8 Run in weight-only quantization INT8 with ipex.llm in distributed way +#### 2.1.1.8 Run in weight-only quantization INT8 with ipex.llm in distributed way ```bash deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --autotp --shard-model --output-dir "saved_results" ``` -### 4.1.2 To run generation task and test accuracy +### 2.1.2 To run generation task and test accuracy For the quantized models used in accuracy tests below, we can reuse the model files that are named "best_model.pt" in the "--output-dir" path ([generated during inference performance tests above](#generation_sq)). 
-Check [Advanced Usage](#52-accuracy-test) for details. +Check [Advanced Usage](#32-accuracy-test) for details. -#### 4.1.2.1 Single instance +#### 2.1.2.1 Single instance ```bash # The following "OMP_NUM_THREADS" and "numactl" settings are based on the assumption that @@ -302,7 +183,7 @@ OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Ll OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Llama-2-7b-hf --quantized-model-path "../saved_results/best_model.pt" --dtype int8 --tasks lambada_openai ``` -#### 4.1.2.2 Distributed inference +#### 2.1.2.2 Distributed inference ```bash # run_accuracy_with_deepspeed.py script is inside distributed directory. @@ -319,23 +200,23 @@ deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --tasks lambada_openai ``` -#### 4.1.2.3 Distributed inference among multiple nodes with TCP +#### 2.1.2.3 Distributed inference among multiple nodes with TCP -A bash script (`tools/run_scaling.sh`) is provided to simplify environment configuration and the command launch. +A [bash script](./tools/run_scaling.sh) is provided to simplify environment configuration and the command launch. Steps: -2. Enter the `llm` directory -3. Create a `hostfile.txt` following [instructions of deepspeed](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) -4. Find out the network interface name used for node communication via `ifconfig` or `ibv_devices` ex : eth0 -5. Open `tools/run_scaling.sh` script to update required information in line 3 to line 11 according to your environment and needs -6. run the command below to run distributed inference among nodes +1. Enter the `llm` directory +2. Create a `hostfile.txt` following [instructions of deepspeed](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) +3. Find out the network interface name used for node communication via `ifconfig` or `ibv_devices` ex : eth0 +4. Open `tools/run_scaling.sh` script to update required information in line 3 to line 11 according to your environment and needs +5. run the command below to run distributed inference among nodes ```bash bash tools/run_scaling.sh ``` -The docker image built in Section 3.1 functions ssh connection for distributed executions across multiple machines via Ethernet. However, it is supposed to be running with 1 single container on each machine. Inside each docker container, multiple inference instances can be launched by the `deepspeed` command. +The docker image built in [the environment setup tutorial](../README.md#2-environment-setup) functions ssh connection for distributed executions across multiple machines via Ethernet. However, it is supposed to be running with 1 single container on each machine. Inside each docker container, multiple inference instances can be launched by the `deepspeed` command. Use the command below on all machines to launch the docker containers. This command uses the host network interfaces inside the docker container. Thus, you need to put the host ip addresses into the `hostfile.txt`. Do NOT launch multiple docker containers on one single machine from the same docker image. These docker containers listen on the same machine on the same port, will result in unpredicable ssh connections. 
@@ -345,11 +226,11 @@ docker run --rm -it --privileged --net host ipex-llm:main bash **Note:** For models on HuggingFace require access privileges, you need to run the `huggingface-cli login` command in each docker container to config a HuggingFace access token. -## 4.2 Detail usage of running LLM models +## 2.2 Detail usage of running LLM models -### 4.2.1 Run generation with one instance +### 2.2.1 Run generation with one instance -#### 4.2.1.1 FP32: +#### 2.2.1.1 FP32: - Command: ```bash @@ -361,7 +242,7 @@ OMP_NUM_THREADS= numactl -m -C numactl -m -C _ specifies the [numa](https://en.wikipedia.org/wiki/Non-uniform_memory_access) node id (e.g., 0 to use the memory from the first numa node). _\_ specifies phsysical cores which you are using from the _\_ numa node (e.g., 0-56 from the first numa node). You can use [_lscpu_](https://man7.org/linux/man-pages/man1/lscpu.1.html) command in Linux to check the numa node information. -(2) The _\_ (e.g., "meta-llama/Llama-2-13b-hf") specifies the model you will run. we provide some verified _\_ in the [Optimized Model List](#2-ipexllm-optimized-model-list). You can also try other models from [HuggingFace Models](https://huggingface.co/models). +(2) The _\_ (e.g., "meta-llama/Llama-2-13b-hf") specifies the model you will run. we provide some verified _\_ in the [Optimized Model List](#1-ipexllm-optimized-model-list-for-inference). You can also try other models from [HuggingFace Models](https://huggingface.co/models). (3) for all quantization benchmarks, both quantization and inference stages will be triggered by default. For quantization stage, it will auto-generate the quantized model named "best_model.pt" in the "--output-dir" path, and for inference stage, it will launch the inference with the quantized model "best_model.pt". For inference-only benchmarks (avoid the repeating quantization stage), you can also reuse these quantized models for by adding "--quantized-model-path " . -### 4.2.2 Run generation in distributed way +### 2.2.2 Run generation in distributed way -#### 4.2.2.1 Prepare: +#### 2.2.2.1 Prepare: ```bash unset KMP_AFFINITY @@ -464,9 +345,9 @@ In the DeepSpeed cases below, we recommend "--shard-model" to shard model weight If using "--shard-model", it will save a copy of the shard model weights file in the path of "--output-dir" (default path is "./saved_results" if not provided). If you have used "--shard-model" and generated such a shard model path (or your model weights files are already well sharded), in further repeated benchmarks, please remove "--shard-model", and replace "-m " with "-m " to skip the repeated shard steps. -Besides, the standalone shard model function/scripts are also provided in the [Advanced Usage](#53-how-to-shard-model-for-distributed-tests-with-deepspeed-autotp) section, in case you would like to generate the shard model weights files in advance before running distributed inference. +Besides, the standalone shard model function/scripts are also provided in the [Advanced Usage](#33-how-to-shard-model-for-distributed-tests-with-deepspeed-autotp) section, in case you would like to generate the shard model weights files in advance before running distributed inference. 
-#### 4.2.2.2 FP32: +#### 2.2.2.2 FP32: - Command: ```bash @@ -478,7 +359,7 @@ deepspeed --bind_cores_to_rank run.py --benchmark -m --dtype float32 deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype float32 --ipex --autotp --shard-model ``` -#### 4.2.2.3 BF16: +#### 2.2.2.3 BF16: - Command: ```bash @@ -490,7 +371,7 @@ deepspeed --bind_cores_to_rank run.py --benchmark -m --dtype bfloat1 deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --autotp --shard-model ``` -#### 4.2.2.4 Weight-only quantization: +#### 2.2.2.4 Weight-only quantization: By default, for weight-only quantization, we use quantization with [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html) inference ("--quant-with-amp") to get peak performance and fair accuracy. For weight-only quantization with deepspeed, we quantize the model then run the benchmark. The quantized model won't be saved. @@ -520,7 +401,7 @@ Similar to single instance usage, we need to update some arguments of the runnin deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --autotp --shard-model --output-dir "saved_results" ``` -### 4.2.3 Additional configuration for specific models +### 2.2.3 Additional configuration for specific models There are some model-specific requirements to be aware of, as follows: @@ -532,17 +413,17 @@ There are some model-specific requirements to be aware of, as follows: - For mistralai/Mistral-7B-v0.1 and mistralai/Mixtral-8x7B-Instruct-v0.1, we use a fixed model version because the latest version is not compatible with transformers 4.38.1 and tokenizers 0.15.2. -## 4.3 Instructions for Running LLM with Intel® Xeon® CPU Max Series +## 2.3 Instructions for Running LLM with Intel® Xeon® CPU Max Series Intel® Xeon® CPU Max Series are equipped with high bandwidth memory (HBM), which further accelerates LLM inference. For the common case that HBM and DDR are both installed in a Xeon® CPU Max Series server, the memory mode can be configured to Flat Mode or Cache Mode. Details about memory modes can be found at Section 3.1 in [the Xeon® CPU Max Series Configuration Guide](https://cdrdv2-public.intel.com/769060/354227-intel-xeon-cpu-max-series-configuration-and-tuning-guide.pdf). -### 4.3.1 Single Instance Inference with Xeon® CPU Max Series +### 2.3.1 Single Instance Inference with Xeon® CPU Max Series -#### 4.3.1.1 Cache Mode HBM +#### 2.3.1.1 Cache Mode HBM -In cache mode, only DDR address space is visible to software and HBM functions as a transparent memory-side cache for DDR. Therefore the usage is the same with [the common usage](#421-run-generation-with-one-instance). +In cache mode, only DDR address space is visible to software and HBM functions as a transparent memory-side cache for DDR. Therefore the usage is the same with [the common usage](#221-run-generation-with-one-instance). -#### 4.3.1.2 Flat Mode HBM +#### 2.3.1.2 Flat Mode HBM In flat mode, HBM and DDR are exposed to software as separate address spaces in this mode. Therefore we need to check the `HBM_NODE_INDEX` of interest with commands like `lscpu`, then the LLM inference invoking command would be like: @@ -567,9 +448,9 @@ OMP_NUM_THREADS= numactl -p -C +# 3. Advanced Usage -# 5. 
Advanced Usage - -## 5.1 Weight-only quantization with low precision checkpoint (Prototype) +## 3.1 Weight-only quantization with low precision checkpoint (Prototype) Using INT4 weights can further improve performance by reducing memory bandwidth. However, direct per-channel quantization of weights to INT4 probably results in poor accuracy. Some algorithms can modify weights through calibration before quantizing weights to minimize accuracy drop. GPTQ is one of such algorithms. You may generate modified weights and quantization info (scales, zero points) for a certain model with a dataset by such algorithms. The low precision checkpoint is saved as a `state_dict` in a `.pt` file and can be loaded later for weight only quantization. We provide an example here to run GPTQ. @@ -618,7 +498,7 @@ python single_instance/run_quantization.py --ipex-weight-only-quantization --qua OMP_NUM_THREADS= numactl -m -C python single_instance/run_quantization.py -m --benchmark --quant-with-amp --quantized-model-path "./saved_results/best_model.pt" ``` -To run accuracy tests, please follow the instructions in the [Accuracy Test](#52-accuracy-test) part +To run accuracy tests, please follow the instructions in the [Accuracy Test](#32-accuracy-test) part If the checkpoint is generated by some other methods and has different keys in the state_dict, you will need to specify the keys for weight, scales, zero points and bias. Bias is optional in the state_dict while others are required. Default keys are: @@ -667,7 +547,7 @@ Please note that 100 GB disk space, 100 GB memory and Internet access are needed IPEX now only supports some certain cases. Weights must be N by K and asymmetrically quantized to UINT4 and then compressed along K axis to `torch.int32`. Data type of scales can be any floating point types. Shape of scales should be [N, number_of_groups] or with additional dimensions whose length is 1. Zero points should have the same shape as scales and stored as `torch.int32` but the true data type is UINT4. Bias is optional in the `state_dict` (checkpoint). If it is present, we read bias in the `state_dict`. Otherwise we read bias from the original model. Bias is `None` if it cannot be found in both cases. -## 5.2 Accuracy test +## 3.2 Accuracy test We leverage [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) for the accuracy test. @@ -675,13 +555,13 @@ We verify and recommend to test accuracy of most models with "lambada_openai" ta For some models, like `Salesforce/codegen-2B-multi` and `mosaicml/mpt-7b`, we verify and recommend to test their accuracy with "hellaswag" task. For more candidate tasks for accuracy validation, please check [lm-evaluation-harness task table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md). 
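Relating back to the low-precision checkpoint flow described in section 3.1 above, the sketch below outlines how such a GPTQ-generated `.pt` file is consumed. Names are assumptions: the qconfig helper and enums follow the ipex weight-only quantization documentation, the checkpoint filename is illustrative, and `single_instance/run_quantization.py` remains the reference implementation.

```python
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf").eval()

# Weight-only quantization recipe: INT4 weights with reduced-precision compute
# (helper/enum names assumed from the ipex weight-only quantization docs).
qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
    weight_dtype=ipex.quantization.WoqWeightDtype.INT4,
    lowp_mode=ipex.quantization.WoqLowpMode.BF16,
)

# State dict holding quantized weights, scales and zero points, e.g. produced by utils/run_gptq.py.
low_precision_checkpoint = torch.load("gptq_checkpoint.pt")  # illustrative filename

model = ipex.llm.optimize(
    model,
    quantization_config=qconfig,
    low_precision_checkpoint=low_precision_checkpoint,
    inplace=True,
)
```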
-### 5.2.1 Run with one instance +### 3.2.1 Run with one instance ```bash cd ./single_instance ``` -#### 5.2.1.1 FP32: +#### 3.2.1.1 FP32: - Command: ```bash @@ -693,7 +573,7 @@ OMP_NUM_THREADS= numactl -m -C numactl -m -C numactl -m -C python ru OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Llama-2-7b-hf --quantized-model-path "../saved_results/best_model.pt" --dtype int8 --tasks lambada_openai ``` -### 5.2.2 Run in distributed way +### 3.2.2 Run in distributed way -#### 5.2.2.1 Prepare: +#### 3.2.2.1 Prepare: ```bash # Run distributed accuracy with 2 ranks of one node @@ -730,7 +610,7 @@ cd ./distributed unset KMP_AFFINITY ``` -#### 5.2.2.2 FP32: +#### 3.2.2.2 FP32: - Command: ```bash @@ -741,7 +621,7 @@ deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype float32 --ipex --tasks lambada_openai ``` -#### 5.2.2.3 BF16: +#### 3.2.2.3 BF16: - Command: ```bash deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --dtype bfloat16 -ipex --tasks @@ -751,7 +631,7 @@ deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --tasks lambada_openai ``` -#### 5.2.2.4 Weight-only quantization (INT8): +#### 3.2.2.4 Weight-only quantization (INT8): - Command: ```bash @@ -776,7 +656,7 @@ Similar to script usage for performance benchmarking, we need to update some arg deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --ipex --tasks ``` -## 5.3 How to Shard model for Distributed tests with DeepSpeed (autoTP) +## 3.3 How to Shard model for Distributed tests with DeepSpeed (autoTP) To save memory usage, we could shard the model weights under the local path before we launch distributed tests with DeepSpeed. @@ -794,12 +674,12 @@ python create_shard_model.py -m meta-llama/Llama-2-7b-hf --save-path ./local_lla # 6. Performance Results -The performance results on AWS instances can be found [here](../../../../../docs/tutorials/performance.md#llm-performance). +The performance results on AWS instances can be found [here](../../../../docs/tutorials/performance.md#llm-performance).
# 7. Miscellaneous Tips -- We can build up LLM services optimized by Intel® Extension for PyTorch\* with Triton Server. Please refer [here](../../../serving/triton/README.md) for best practice. +- We can build LLM services optimized by Intel® Extension for PyTorch\* with Triton Server. Please refer to the guide [here](../../serving/triton/README.md) for best practices. - The LLM inference methods introduced on this page also apply to AWS. Simply follow the instructions above to get the boosted LLM performance of Intel® Extension for PyTorch\* optimizations on AWS instances. diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py similarity index 100% rename from examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py rename to examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py b/examples/cpu/llm/inference/distributed/run_generation_tp.py similarity index 100% rename from examples/cpu/inference/python/llm/distributed/run_generation_tp.py rename to examples/cpu/llm/inference/distributed/run_generation_tp.py diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py similarity index 100% rename from examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py rename to examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py diff --git a/examples/cpu/inference/python/llm/llm_sq_recipes.md b/examples/cpu/llm/inference/llm_sq_recipes.md similarity index 84% rename from examples/cpu/inference/python/llm/llm_sq_recipes.md rename to examples/cpu/llm/inference/llm_sq_recipes.md index 22df336e2..88a80b7d0 100644 --- a/examples/cpu/inference/python/llm/llm_sq_recipes.md +++ b/examples/cpu/llm/inference/llm_sq_recipes.md @@ -1,25 +1,26 @@ -## Smooth Quantization Autotune Feature (Prototype): -SmoothQuant is a popular method to improve the accuracy of int8 quantization. The [autotune API](../../../../../docs/tutorials/features/sq_recipe_tuning_api.md) allows automatic global alpha tuning, and automatic layer-by-layer alpha tuning provided by Intel® Neural Compressor for the best accuracy. Below is the basic command to generate the qconfig summary files (and quantized model ".pt" file) with the SmoothQuant autotune API.
- -```bash -# general command: -OMP_NUM_THREADS= numactl -m -C python run.py --benchmark -m --ipex-smooth-quant --alpha auto --output-dir "saved_results" - -# An example of llama2 7b model: -OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-smooth-quant --alpha auto -``` - -## Example command for model tuning with AutoTune API -| Model ID | Command | -|---|:---:| -| meta-llama/Llama-2-13b-hf | python run.py -m meta-llama/Llama-2-13b-hf --ipex-smooth-quant --alpha auto --init-alpha 0.8 --alpha-min 0.75 --alpha-max 0.99 --alpha-step 0.01 --shared-criterion 'max' --calib-len 1024 --calib-padding --fallback-add | -| meta-llama/Llama-2-70b-hf | python run.py -m meta-llama/Llama-2-70b-hf --ipex-smooth-quant --batch-size 56 --calib-shuffle --fallback-add --alpha 0.8 | -| EleutherAI/gpt-j-6b | python run.py -m EleutherAI/gpt-j-6b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --fallback-add --alpha 0.85 | -| tiiuae/falcon-7b | python run.py -m tiiuae/falcon-7b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.95 | -| tiiuae/falcon-40b | python run.py -m tiiuae/falcon-40b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.9 | -| facebook/opt-30b | python run.py -m facebook/opt-30b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle | -| facebook/opt-1.3b | python run.py -m facebook/opt-1.3b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.85 | -| baichuan-inc/Baichuan2-7B-Chat | python run.py -m baichuan-inc/Baichuan2-7B-Chat --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.95 | -| baichuan-inc/Baichuan2-13B-Chat | python run.py -m baichuan-inc/Baichuan2-13B-Chat --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.65 | - -*Note*: The above examples are validated with good accuracy on the "lamada_openai" dataset. +## Smooth Quantization Autotune Feature (Prototype): + +SmoothQuant is a popular method to improve the accuracy of int8 quantization. The [autotune API](../../../../docs/tutorials/features/sq_recipe_tuning_api.md) allows automatic global alpha tuning, and automatic layer-by-layer alpha tuning provided by Intel® Neural Compressor for the best accuracy. Below is the basic command to generate the qconfig summary files (and quantized model ".pt" file) with the SmoothQuant autotune API. 
+ +```bash +# general command: +OMP_NUM_THREADS= numactl -m -C python run.py --benchmark -m --ipex-smooth-quant --alpha auto --output-dir "saved_results" + +# An example for the llama2 7b model: +OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-smooth-quant --alpha auto +``` + +## Example command for model tuning with AutoTune API +| Model ID | Command | +|---|:---:| +| meta-llama/Llama-2-13b-hf | python run.py -m meta-llama/Llama-2-13b-hf --ipex-smooth-quant --alpha auto --init-alpha 0.8 --alpha-min 0.75 --alpha-max 0.99 --alpha-step 0.01 --shared-criterion 'max' --calib-len 1024 --calib-padding --fallback-add | +| meta-llama/Llama-2-70b-hf | python run.py -m meta-llama/Llama-2-70b-hf --ipex-smooth-quant --batch-size 56 --calib-shuffle --fallback-add --alpha 0.8 | +| EleutherAI/gpt-j-6b | python run.py -m EleutherAI/gpt-j-6b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --fallback-add --alpha 0.85 | +| tiiuae/falcon-7b | python run.py -m tiiuae/falcon-7b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.95 | +| tiiuae/falcon-40b | python run.py -m tiiuae/falcon-40b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.9 | +| facebook/opt-30b | python run.py -m facebook/opt-30b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle | +| facebook/opt-1.3b | python run.py -m facebook/opt-1.3b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.85 | +| baichuan-inc/Baichuan2-7B-Chat | python run.py -m baichuan-inc/Baichuan2-7B-Chat --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.95 | +| baichuan-inc/Baichuan2-13B-Chat | python run.py -m baichuan-inc/Baichuan2-13B-Chat --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.65 | + +*Note*: The above examples are validated with good accuracy on the "lambada_openai" dataset.
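After an autotune run like the ones above finishes, it can be worth double-checking the tuned model. Below is a minimal sketch that assumes the run saved its quantized model as `best_model.pt` under the chosen output directory (`saved_results` in the general command above); the accuracy command itself is the INT8 single-instance command from the inference README.

```bash
# Sketch: validate the SmoothQuant-tuned INT8 model on the lambada_openai task.
# Assumes the autotune run wrote ./saved_results/best_model.pt.
cd ./single_instance
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Llama-2-7b-hf \
  --quantized-model-path "../saved_results/best_model.pt" --dtype int8 --tasks lambada_openai
```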
diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/llm/inference/run.py similarity index 100% rename from examples/cpu/inference/python/llm/run.py rename to examples/cpu/llm/inference/run.py diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/llm/inference/single_instance/run_accuracy.py similarity index 100% rename from examples/cpu/inference/python/llm/single_instance/run_accuracy.py rename to examples/cpu/llm/inference/single_instance/run_accuracy.py diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/llm/inference/single_instance/run_generation.py similarity index 100% rename from examples/cpu/inference/python/llm/single_instance/run_generation.py rename to examples/cpu/llm/inference/single_instance/run_generation.py diff --git a/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py b/examples/cpu/llm/inference/single_instance/run_int4_gpt-j_on_cnndailymail.py similarity index 100% rename from examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py rename to examples/cpu/llm/inference/single_instance/run_int4_gpt-j_on_cnndailymail.py diff --git a/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.sh b/examples/cpu/llm/inference/single_instance/run_int4_gpt-j_on_cnndailymail.sh similarity index 100% rename from examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.sh rename to examples/cpu/llm/inference/single_instance/run_int4_gpt-j_on_cnndailymail.sh diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/llm/inference/single_instance/run_quantization.py similarity index 97% rename from examples/cpu/inference/python/llm/single_instance/run_quantization.py rename to examples/cpu/llm/inference/single_instance/run_quantization.py index a16a277bc..6016198d2 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/llm/inference/single_instance/run_quantization.py @@ -13,33 +13,33 @@ from ast import literal_eval import sys -sys.path.append(sys.path[0] + "/../../") - - -from llm.utils.model_class.llm import EXAMPLE_INPUTS_MODE -from llm.utils.model_class.llama import LLAMAConfig -from llm.utils.model_class.gptj import GPTJConfig -from llm.utils.model_class.gptneox import GPTNEOXConfig -from llm.utils.model_class.falcon import FALCONConfig -from llm.utils.model_class.opt import OPTConfig -from llm.utils.model_class.bloom import BloomConfig -from llm.utils.model_class.codegen import CodeGenConfig -from llm.utils.model_class.baichuan import BaichuanConfig -from llm.utils.model_class.chatglm import ChatGLMConfig -from llm.utils.model_class.gptbigcode import GPTJBigCodeConfig -from llm.utils.model_class.t5 import T5Config -from llm.utils.model_class.mistral import MistralConfig -from llm.utils.model_class.mixtral import MixtralConfig -from llm.utils.model_class.mpt import MPTConfig -from llm.utils.model_class.stablelm import StableLMConfig -from llm.utils.model_class.qwen import QwenConfig -from llm.utils.model_class.qwen2 import Qwen2Config -from llm.utils.model_class.git import GitConfig -from llm.utils.model_class.llava import LlavaConfig -from llm.utils.model_class.phi import PhiConfig -from llm.utils.model_class.phi import Phi3Config -from llm.utils.model_class.yuan import YuanConfig -from llm.utils.model_class.whisper import WhisperConfig +sys.path.append(sys.path[0] + "/../../../") + + 
+from llm.inference.utils.model_class.llm import EXAMPLE_INPUTS_MODE +from llm.inference.utils.model_class.llama import LLAMAConfig +from llm.inference.utils.model_class.gptj import GPTJConfig +from llm.inference.utils.model_class.gptneox import GPTNEOXConfig +from llm.inference.utils.model_class.falcon import FALCONConfig +from llm.inference.utils.model_class.opt import OPTConfig +from llm.inference.utils.model_class.bloom import BloomConfig +from llm.inference.utils.model_class.codegen import CodeGenConfig +from llm.inference.utils.model_class.baichuan import BaichuanConfig +from llm.inference.utils.model_class.chatglm import ChatGLMConfig +from llm.inference.utils.model_class.gptbigcode import GPTJBigCodeConfig +from llm.inference.utils.model_class.t5 import T5Config +from llm.inference.utils.model_class.mistral import MistralConfig +from llm.inference.utils.model_class.mixtral import MixtralConfig +from llm.inference.utils.model_class.mpt import MPTConfig +from llm.inference.utils.model_class.stablelm import StableLMConfig +from llm.inference.utils.model_class.qwen import QwenConfig +from llm.inference.utils.model_class.qwen2 import Qwen2Config +from llm.inference.utils.model_class.git import GitConfig +from llm.inference.utils.model_class.llava import LlavaConfig +from llm.inference.utils.model_class.phi import PhiConfig +from llm.inference.utils.model_class.phi import Phi3Config +from llm.inference.utils.model_class.yuan import YuanConfig +from llm.inference.utils.model_class.whisper import WhisperConfig # The latest model is not compatible with the current transformers/tokenizers, so we specify the revision of the model diff --git a/examples/cpu/inference/python/llm/tools/llava.patch b/examples/cpu/llm/inference/tools/llava.patch similarity index 100% rename from examples/cpu/inference/python/llm/tools/llava.patch rename to examples/cpu/llm/inference/tools/llava.patch diff --git a/examples/cpu/inference/python/llm/tools/prepare_llava.sh b/examples/cpu/llm/inference/tools/prepare_llava.sh similarity index 100% rename from examples/cpu/inference/python/llm/tools/prepare_llava.sh rename to examples/cpu/llm/inference/tools/prepare_llava.sh diff --git a/examples/cpu/inference/python/llm/tools/run_scaling.sh b/examples/cpu/llm/inference/tools/run_scaling.sh similarity index 100% rename from examples/cpu/inference/python/llm/tools/run_scaling.sh rename to examples/cpu/llm/inference/tools/run_scaling.sh diff --git a/examples/cpu/inference/python/llm/utils/create_shard_model.py b/examples/cpu/llm/inference/utils/create_shard_model.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/create_shard_model.py rename to examples/cpu/llm/inference/utils/create_shard_model.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/baichuan.py b/examples/cpu/llm/inference/utils/model_class/baichuan.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/baichuan.py rename to examples/cpu/llm/inference/utils/model_class/baichuan.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/bloom.py b/examples/cpu/llm/inference/utils/model_class/bloom.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/bloom.py rename to examples/cpu/llm/inference/utils/model_class/bloom.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/chatglm.py b/examples/cpu/llm/inference/utils/model_class/chatglm.py similarity index 100% rename from 
examples/cpu/inference/python/llm/utils/model_class/chatglm.py rename to examples/cpu/llm/inference/utils/model_class/chatglm.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/codegen.py b/examples/cpu/llm/inference/utils/model_class/codegen.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/codegen.py rename to examples/cpu/llm/inference/utils/model_class/codegen.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/falcon.py b/examples/cpu/llm/inference/utils/model_class/falcon.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/falcon.py rename to examples/cpu/llm/inference/utils/model_class/falcon.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/git.py b/examples/cpu/llm/inference/utils/model_class/git.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/git.py rename to examples/cpu/llm/inference/utils/model_class/git.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/gptbigcode.py b/examples/cpu/llm/inference/utils/model_class/gptbigcode.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/gptbigcode.py rename to examples/cpu/llm/inference/utils/model_class/gptbigcode.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/gptj.py b/examples/cpu/llm/inference/utils/model_class/gptj.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/gptj.py rename to examples/cpu/llm/inference/utils/model_class/gptj.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/gptneox.py b/examples/cpu/llm/inference/utils/model_class/gptneox.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/gptneox.py rename to examples/cpu/llm/inference/utils/model_class/gptneox.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/llama.py b/examples/cpu/llm/inference/utils/model_class/llama.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/llama.py rename to examples/cpu/llm/inference/utils/model_class/llama.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/llava.py b/examples/cpu/llm/inference/utils/model_class/llava.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/llava.py rename to examples/cpu/llm/inference/utils/model_class/llava.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/llm.py b/examples/cpu/llm/inference/utils/model_class/llm.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/llm.py rename to examples/cpu/llm/inference/utils/model_class/llm.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/mistral.py b/examples/cpu/llm/inference/utils/model_class/mistral.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/mistral.py rename to examples/cpu/llm/inference/utils/model_class/mistral.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/mixtral.py b/examples/cpu/llm/inference/utils/model_class/mixtral.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/mixtral.py rename to examples/cpu/llm/inference/utils/model_class/mixtral.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/mpt.py b/examples/cpu/llm/inference/utils/model_class/mpt.py similarity index 100% rename from 
examples/cpu/inference/python/llm/utils/model_class/mpt.py rename to examples/cpu/llm/inference/utils/model_class/mpt.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/opt.py b/examples/cpu/llm/inference/utils/model_class/opt.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/opt.py rename to examples/cpu/llm/inference/utils/model_class/opt.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/phi.py b/examples/cpu/llm/inference/utils/model_class/phi.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/phi.py rename to examples/cpu/llm/inference/utils/model_class/phi.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/qwen.py b/examples/cpu/llm/inference/utils/model_class/qwen.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/qwen.py rename to examples/cpu/llm/inference/utils/model_class/qwen.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/qwen2.py b/examples/cpu/llm/inference/utils/model_class/qwen2.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/qwen2.py rename to examples/cpu/llm/inference/utils/model_class/qwen2.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/stablelm.py b/examples/cpu/llm/inference/utils/model_class/stablelm.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/stablelm.py rename to examples/cpu/llm/inference/utils/model_class/stablelm.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/t5.py b/examples/cpu/llm/inference/utils/model_class/t5.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/t5.py rename to examples/cpu/llm/inference/utils/model_class/t5.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/whisper.py b/examples/cpu/llm/inference/utils/model_class/whisper.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/whisper.py rename to examples/cpu/llm/inference/utils/model_class/whisper.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/yuan.py b/examples/cpu/llm/inference/utils/model_class/yuan.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/yuan.py rename to examples/cpu/llm/inference/utils/model_class/yuan.py diff --git a/examples/cpu/inference/python/llm/utils/model_config/mosaicml_mpt-7b_config.json b/examples/cpu/llm/inference/utils/model_config/mosaicml_mpt-7b_config.json similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_config/mosaicml_mpt-7b_config.json rename to examples/cpu/llm/inference/utils/model_config/mosaicml_mpt-7b_config.json diff --git a/examples/cpu/inference/python/llm/utils/model_config/tiiuae_falcon-40b_config.json b/examples/cpu/llm/inference/utils/model_config/tiiuae_falcon-40b_config.json similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_config/tiiuae_falcon-40b_config.json rename to examples/cpu/llm/inference/utils/model_config/tiiuae_falcon-40b_config.json diff --git a/examples/cpu/inference/python/llm/utils/run_gptq.py b/examples/cpu/llm/inference/utils/run_gptq.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/run_gptq.py rename to examples/cpu/llm/inference/utils/run_gptq.py diff --git a/examples/cpu/llm/tools/env_activate.sh b/examples/cpu/llm/tools/env_activate.sh new file mode 100644 index 
000000000..c5415df0c --- /dev/null +++ b/examples/cpu/llm/tools/env_activate.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +MSG_USAGE="Usage: source $0 [inference|fine-tuning]" +if [ $# -eq 0 ]; then + echo ${MSG_USAGE} + return 1 +fi +MODE=$1 +if [ ${MODE} != "inference" ] && [ ${MODE} != "fine-tuning" ]; then + echo ${MSG_USAGE} + return 2 +fi + +# Setup environment variables for performance on Xeon +export KMP_BLOCKTIME=1 +export KMP_TPAUSE=0 +export KMP_FORKJOIN_BARRIER_PATTERN=dist,dist +export KMP_PLAIN_BARRIER_PATTERN=dist,dist +export KMP_REDUCTION_BARRIER_PATTERN=dist,dist + +BASEFOLDER=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +export LD_PRELOAD=$(bash ${BASEFOLDER}/get_libstdcpp_lib.sh):${LD_PRELOAD} + +function set_ld_preload() { + LIB_DIR=$1 + LIB_NAME=$2 + LIB= + while read line; do + LIB=${line} + break + done < <(find ${LIB_DIR} -name ${LIB_NAME}) + if [ ! -z ${LIB} ]; then + export LD_PRELOAD=${LD_PRELOAD}:${LIB} + echo "Appending ${LIB} to environment variable LD_PRELOAD." + else + echo "Library ${LIB_NAME} is not found. Please append it manually to environment variable LD_PRELOAD." + fi +} + +env | grep CONDA_PREFIX > /dev/null +if [ $? -eq 0 ]; then + set_ld_preload ${CONDA_PREFIX} libiomp5.so + set_ld_preload ${CONDA_PREFIX} libtcmalloc.so +else + set_ld_preload /usr libiomp5.so + set_ld_preload /usr libtcmalloc.so +fi + +ONECCL_PATH=${BASEFOLDER}/../oneCCL_release +if [ ! -d ${ONECCL_PATH} ]; then + echo "Warning: oneCCL is not available." +else + source ${ONECCL_PATH}/env/setvars.sh +fi + +cd ${BASEFOLDER}/../${MODE} +if [ ${MODE} == "inference" ]; then + if [ -f prompt.json ]; then + rm -f prompt.json + fi + wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json + cd single_instance + if [ -f prompt.json ]; then + rm -f prompt.json + fi + ln -s ../prompt.json + cd ../distributed + if [ -f prompt.json ]; then + rm -f prompt.json + fi + ln -s ../prompt.json + cd .. +elif [ ${MODE} == "fine-tuning" ]; then + python -m pip install -r requirements.txt +fi \ No newline at end of file diff --git a/examples/cpu/inference/python/llm/tools/env_setup.sh b/examples/cpu/llm/tools/env_setup.sh similarity index 94% rename from examples/cpu/inference/python/llm/tools/env_setup.sh rename to examples/cpu/llm/tools/env_setup.sh index 5fa6b1155..b239f530d 100644 --- a/examples/cpu/inference/python/llm/tools/env_setup.sh +++ b/examples/cpu/llm/tools/env_setup.sh @@ -38,7 +38,7 @@ done if [ $((${MODE} & 0x02)) -ne 0 ]; then # Enter IPEX root dir - cd ../../../../.. + cd ../../.. if [ ! -f dependency_version.yml ]; then echo "Please check if `pwd` is a valid Intel® Extension for PyTorch* source code directory." @@ -139,7 +139,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then echo "[Error] Command \"conda\" is not available." exit 5 else - conda install -y sysroot_linux-64 + conda install -y sysroot_linux-64 -c conda-forge conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge if [ -z ${CONDA_BUILD_SYSROOT} ]; then source ${CONDA_PREFIX}/etc/conda/activate.d/activate-gcc_linux-64.sh @@ -212,7 +212,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then cd ../.. 
cp -r oneCCL/build/_install ${CCLFOLDER} rm -rf oneCCL - cd intel-extension-for-pytorch/examples/cpu/inference/python/llm + cd intel-extension-for-pytorch/examples/cpu/llm fi if [ $((${MODE} & 0x01)) -ne 0 ]; then set +e @@ -227,19 +227,4 @@ if [ $((${MODE} & 0x01)) -ne 0 ]; then bash ${AUX_INSTALL_SCRIPT} python -m pip install ${WHEELFOLDER}/*.whl rm -rf ${WHEELFOLDER} - if [ -f prompt.json ]; then - rm -f prompt.json - fi - wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json - cd single_instance - if [ -f prompt.json ]; then - rm -f prompt.json - fi - ln -s ../prompt.json - cd ../distributed - if [ -f prompt.json ]; then - rm -f prompt.json - fi - ln -s ../prompt.json -fi -python -m pip install numpy==1.26.4 --force-reinstall \ No newline at end of file +fi \ No newline at end of file diff --git a/examples/cpu/llm/tools/get_libstdcpp_lib.sh b/examples/cpu/llm/tools/get_libstdcpp_lib.sh new file mode 120000 index 000000000..06ce62d04 --- /dev/null +++ b/examples/cpu/llm/tools/get_libstdcpp_lib.sh @@ -0,0 +1 @@ +../../../../tools/get_libstdcpp_lib.sh \ No newline at end of file diff --git a/scripts/compile_bundle.sh b/scripts/compile_bundle.sh index 0fcd74bcb..0bc2437af 100644 --- a/scripts/compile_bundle.sh +++ b/scripts/compile_bundle.sh @@ -220,8 +220,8 @@ if [ ${GCC_CONDA} -eq 1 ]; then echo "Command \"conda\" not found. Exit." exit 2 fi - conda install -y sysroot_linux-64 - conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge + conda install -y sysroot_linux-64 -c conda-forge + conda install -y gcc==12.3 gxx==12.3 cxx-compiler zstd -c conda-forge fi if [ ${GCC_CONDA} -ge 1 ]; then if [ -z ${CONDA_BUILD_SYSROOT} ]; then