From bee4a423d99b4dea7362d8cb31b1d48e38344a8f Mon Sep 17 00:00:00 2001 From: Zaili Wang <109502517+ZailiWang@users.noreply.github.com> Date: Wed, 17 Jul 2024 18:54:36 +0800 Subject: [PATCH] LLM example path re-structure (release 2.4) (#3080) * LLM example files restructure * update * update path in docs * symlink * cherry-pick the typo fix (#3083) * fix path in quant script --------- Co-authored-by: WeizhuoZhang-intel --- README.md | 4 +- docs/tutorials/examples.md | 2 +- .../features/int8_recipe_tuning_api.md | 2 +- .../features/sq_recipe_tuning_api.md | 5 +- docs/tutorials/getting_started.md | 2 +- docs/tutorials/installation.md | 2 +- docs/tutorials/llm.rst | 4 +- docs/tutorials/llm/llm_optimize.md | 15 +- .../python/llm/tools/env_activate.sh | 27 -- .../python/llm/tools/get_libstdcpp_lib.sh | 1 - .../cpu/{inference/python => }/llm/Dockerfile | 7 +- examples/cpu/llm/README.md | 133 ++++++++++ .../llm => llm/fine-tuning}/README.md | 31 +-- .../llm => llm/fine-tuning}/finetune.py | 0 .../llm => llm/fine-tuning}/requirements.txt | 0 .../fine-tuning}/run_lora_finetune_ddp.sh | 0 .../llm => llm/fine-tuning}/utils/README.md | 0 .../llm => llm/fine-tuning}/utils/__init__.py | 0 .../llm => llm/fine-tuning}/utils/prompter.py | 0 .../python/llm => llm/inference}/README.md | 250 +++++------------- .../run_accuracy_with_deepspeed.py | 0 .../distributed/run_generation_tp.py | 0 .../run_generation_with_deepspeed.py | 0 .../llm => llm/inference}/llm_sq_recipes.md | 51 ++-- .../python/llm => llm/inference}/run.py | 0 .../single_instance/run_accuracy.py | 0 .../single_instance/run_generation.py | 0 .../run_int4_gpt-j_on_cnndailymail.py | 0 .../run_int4_gpt-j_on_cnndailymail.sh | 0 .../single_instance/run_quantization.py | 54 ++-- .../llm => llm/inference}/tools/llava.patch | 0 .../inference}/tools/prepare_llava.sh | 0 .../inference}/tools/run_scaling.sh | 0 .../inference}/utils/create_shard_model.py | 0 .../inference}/utils/model_class/baichuan.py | 0 .../inference}/utils/model_class/bloom.py | 0 .../inference}/utils/model_class/chatglm.py | 0 .../inference}/utils/model_class/codegen.py | 0 .../inference}/utils/model_class/falcon.py | 0 .../inference}/utils/model_class/git.py | 0 .../utils/model_class/gptbigcode.py | 0 .../inference}/utils/model_class/gptj.py | 0 .../inference}/utils/model_class/gptneox.py | 0 .../inference}/utils/model_class/llama.py | 0 .../inference}/utils/model_class/llava.py | 0 .../inference}/utils/model_class/llm.py | 0 .../inference}/utils/model_class/mistral.py | 0 .../inference}/utils/model_class/mixtral.py | 0 .../inference}/utils/model_class/mpt.py | 0 .../inference}/utils/model_class/opt.py | 0 .../inference}/utils/model_class/phi.py | 0 .../inference}/utils/model_class/qwen.py | 0 .../inference}/utils/model_class/qwen2.py | 0 .../inference}/utils/model_class/stablelm.py | 0 .../inference}/utils/model_class/t5.py | 0 .../inference}/utils/model_class/whisper.py | 0 .../inference}/utils/model_class/yuan.py | 0 .../model_config/mosaicml_mpt-7b_config.json | 0 .../tiiuae_falcon-40b_config.json | 0 .../llm => llm/inference}/utils/run_gptq.py | 0 examples/cpu/llm/tools/env_activate.sh | 75 ++++++ .../python => }/llm/tools/env_setup.sh | 23 +- examples/cpu/llm/tools/get_libstdcpp_lib.sh | 1 + scripts/compile_bundle.sh | 4 +- 64 files changed, 370 insertions(+), 323 deletions(-) delete mode 100644 examples/cpu/inference/python/llm/tools/env_activate.sh delete mode 120000 examples/cpu/inference/python/llm/tools/get_libstdcpp_lib.sh rename examples/cpu/{inference/python => 
}/llm/Dockerfile (87%) create mode 100644 examples/cpu/llm/README.md rename examples/cpu/{training/llm => llm/fine-tuning}/README.md (57%) rename examples/cpu/{training/llm => llm/fine-tuning}/finetune.py (100%) rename examples/cpu/{training/llm => llm/fine-tuning}/requirements.txt (100%) rename examples/cpu/{training/llm => llm/fine-tuning}/run_lora_finetune_ddp.sh (100%) rename examples/cpu/{training/llm => llm/fine-tuning}/utils/README.md (100%) rename examples/cpu/{training/llm => llm/fine-tuning}/utils/__init__.py (100%) rename examples/cpu/{training/llm => llm/fine-tuning}/utils/prompter.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/README.md (82%) rename examples/cpu/{inference/python/llm => llm/inference}/distributed/run_accuracy_with_deepspeed.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/distributed/run_generation_tp.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/distributed/run_generation_with_deepspeed.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/llm_sq_recipes.md (84%) rename examples/cpu/{inference/python/llm => llm/inference}/run.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/single_instance/run_accuracy.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/single_instance/run_generation.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/single_instance/run_int4_gpt-j_on_cnndailymail.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/single_instance/run_int4_gpt-j_on_cnndailymail.sh (100%) rename examples/cpu/{inference/python/llm => llm/inference}/single_instance/run_quantization.py (97%) rename examples/cpu/{inference/python/llm => llm/inference}/tools/llava.patch (100%) rename examples/cpu/{inference/python/llm => llm/inference}/tools/prepare_llava.sh (100%) rename examples/cpu/{inference/python/llm => llm/inference}/tools/run_scaling.sh (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/create_shard_model.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/baichuan.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/bloom.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/chatglm.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/codegen.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/falcon.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/git.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/gptbigcode.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/gptj.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/gptneox.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/llama.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/llava.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/llm.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/mistral.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/mixtral.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/mpt.py (100%) rename examples/cpu/{inference/python/llm => 
llm/inference}/utils/model_class/opt.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/phi.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/qwen.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/qwen2.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/stablelm.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/t5.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/whisper.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_class/yuan.py (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_config/mosaicml_mpt-7b_config.json (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/model_config/tiiuae_falcon-40b_config.json (100%) rename examples/cpu/{inference/python/llm => llm/inference}/utils/run_gptq.py (100%) create mode 100644 examples/cpu/llm/tools/env_activate.sh rename examples/cpu/{inference/python => }/llm/tools/env_setup.sh (94%) create mode 120000 examples/cpu/llm/tools/get_libstdcpp_lib.sh diff --git a/README.md b/README.md index f725aa751..4b13ebd5d 100644 --- a/README.md +++ b/README.md @@ -5,14 +5,14 @@ Intel® Extension for PyTorch\* -**CPU** [💻main branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [🌱Quick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [📖Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   [🏃Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2Bcpu)   |   [💻LLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm)
+**CPU** [💻main branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [🌱Quick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [📖Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   [🏃Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.4.0%2Bcpu)   |   [💻LLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm)
**GPU** [💻main branch](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main)   |   [🌱Quick Start](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/tutorials/getting_started.html)   |   [📖Documentations](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/)   |   [🏃Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu)   |   [💻LLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main/examples/gpu/inference/python/llm)
Intel® Extension for PyTorch\* extends PyTorch\* with up-to-date features optimizations for an extra performance boost on Intel hardware. Optimizations take advantage of Intel® Advanced Vector Extensions 512 (Intel® AVX-512) Vector Neural Network Instructions (VNNI) and Intel® Advanced Matrix Extensions (Intel® AMX) on Intel CPUs as well as Intel Xe Matrix Extensions (XMX) AI engines on Intel discrete GPUs. Moreover, Intel® Extension for PyTorch* provides easy GPU acceleration for Intel discrete GPUs through the PyTorch* xpu device. ## ipex.llm - Large Language Models (LLMs) Optimization -In the current technological landscape, Generative AI (GenAI) workloads and models have gained widespread attention and popularity. Large Language Models (LLMs) have emerged as the dominant models driving these GenAI applications. Starting from 2.1.0, specific optimizations for certain LLM models are introduced in the Intel® Extension for PyTorch\*. Check [**LLM optimizations**](./examples/cpu/inference/python/llm) for details. +In the current technological landscape, Generative AI (GenAI) workloads and models have gained widespread attention and popularity. Large Language Models (LLMs) have emerged as the dominant models driving these GenAI applications. Starting from 2.1.0, specific optimizations for certain LLM models are introduced in the Intel® Extension for PyTorch\*. Check [**LLM optimizations**](./examples/cpu/llm) for details. ### Optimized Model List diff --git a/docs/tutorials/examples.md b/docs/tutorials/examples.md index baf23c7d3..d0a555b77 100644 --- a/docs/tutorials/examples.md +++ b/docs/tutorials/examples.md @@ -240,7 +240,7 @@ generate results for the input prompt. [//]: # (marker_llm_optimize_woq) [//]: # (marker_llm_optimize_woq) -**Note:** Please check [LLM Best Known Practice Page](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm) +**Note:** Please check [LLM Best Known Practice Page](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm) for detailed environment setup and LLM workload running instructions. ## C++ diff --git a/docs/tutorials/features/int8_recipe_tuning_api.md b/docs/tutorials/features/int8_recipe_tuning_api.md index 8bce1766e..31987838f 100644 --- a/docs/tutorials/features/int8_recipe_tuning_api.md +++ b/docs/tutorials/features/int8_recipe_tuning_api.md @@ -10,7 +10,7 @@ Users need to provide a fp32 model and some parameters required for tuning. The Please refer to [static_quant example](../../../examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py). - Smooth Quantization -Please refer to [llm sq example](../../../examples/cpu/inference/python/llm/single_instance/run_generation.py). +Please refer to [LLM SmoothQuant example](../../../examples/cpu/llm/inference/single_instance/run_generation.py). ## Smooth Quantization Autotune ### Algorithm: Auto-tuning of $\alpha$. diff --git a/docs/tutorials/features/sq_recipe_tuning_api.md b/docs/tutorials/features/sq_recipe_tuning_api.md index 4c19fb625..115548585 100644 --- a/docs/tutorials/features/sq_recipe_tuning_api.md +++ b/docs/tutorials/features/sq_recipe_tuning_api.md @@ -1,7 +1,8 @@ Smooth Quant Recipe Tuning API (Prototype) ============================================= -Smooth Quantization is a popular method to improve the accuracy of int8 quantization. 
The [autotune API](../api_doc.html#ipex.quantization.autotune) allows automatic global alpha tuning, and automatic layer-by-layer alpha tuning provided by Intel® Neural Compressor for the best INT8 accuracy. +Smooth Quantization is a popular method to improve the accuracy of int8 quantization. +The [autotune API](../api_doc.html#ipex.quantization.autotune) allows automatic global alpha tuning, and automatic layer-by-layer alpha tuning provided by Intel® Neural Compressor for the best INT8 accuracy. SmoothQuant will introduce alpha to calculate the ratio of input and weight updates to reduce quantization error. SmoothQuant arguments are as below: @@ -15,6 +16,6 @@ SmoothQuant will introduce alpha to calculate the ratio of input and weight upda | shared_criterion | "mean" | ["min", "mean","max"] | criterion for input LayerNorm op of a transformer block. | | enable_blockwise_loss | False | [True, False] | whether to enable block-wise auto-tuning | -For LLM examples, please refer to [example](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/inference/python/llm). +Please refer to the [LLM examples](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm) for complete examples. **Note**: When defining dataloaders for calibration, please follow INC's dataloader [format](https://github.com/intel/neural-compressor/blob/master/docs/source/dataloader.md). diff --git a/docs/tutorials/getting_started.md b/docs/tutorials/getting_started.md index 3abf1f77e..67874f6d4 100644 --- a/docs/tutorials/getting_started.md +++ b/docs/tutorials/getting_started.md @@ -157,4 +157,4 @@ with torch.inference_mode(), torch.cpu.amp.autocast(enabled=amp_enabled): print(gen_text, total_new_tokens, flush=True) ``` -More LLM examples, including usage of low precision data types are available in the [LLM Examples](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm) section. +More LLM examples, including usage of low precision data types are available in the [LLM Examples](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm) section. diff --git a/docs/tutorials/installation.md b/docs/tutorials/installation.md index a8d3a439a..707a091db 100644 --- a/docs/tutorials/installation.md +++ b/docs/tutorials/installation.md @@ -5,4 +5,4 @@ Select your preferences and follow the installation instructions provided on the After successful installation, refer to the [Quick Start](getting_started.md) and [Examples](examples.md) sections to start using the extension in your code. -**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/inference/python/llm). +**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm). 
diff --git a/docs/tutorials/llm.rst b/docs/tutorials/llm.rst index e1e117e5d..3c2878a72 100644 --- a/docs/tutorials/llm.rst +++ b/docs/tutorials/llm.rst @@ -13,7 +13,7 @@ These LLM-specific optimizations can be automatically applied with a single fron llm/llm_optimize -`ipex.llm` Optimized Model List +`ipex.llm` Optimized Model List for Inference ------------------------------- Verified for single instance mode @@ -30,7 +30,7 @@ Verified for distributed inference mode via DeepSpeed *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). We are working in progress to better support the models in the tables with various data types. In addition, more models will be optimized in the future. -Please check `LLM best known practice `_ for instructions to install/setup environment and example scripts. +Please check `LLM best known practice `_ for instructions to install/setup environment and example scripts. Module Level Optimization API for customized LLM (Prototype) ------------------------------------------------------------ diff --git a/docs/tutorials/llm/llm_optimize.md b/docs/tutorials/llm/llm_optimize.md index ab2c0b06b..7da2706a7 100644 --- a/docs/tutorials/llm/llm_optimize.md +++ b/docs/tutorials/llm/llm_optimize.md @@ -1,15 +1,20 @@ -Transformers Optimization Frontend API +LLM Optimizations Frontend API ====================================== -The new API function, `ipex.llm.optimize`, is designed to optimize transformer-based models within frontend Python modules, with a particular focus on Large Language Models (LLMs). It provides optimizations for both model-wise and content-generation-wise. You just need to invoke the `ipex.llm.optimize` function instead of the `ipex.optimize` function to apply all optimizations transparently. +The new API function, `ipex.llm.optimize`, is designed to optimize transformer-based models within frontend Python modules, with a particular focus on Large Language Models (LLMs). +It provides optimizations for both model-wise and content-generation-wise. +You just need to invoke the `ipex.llm.optimize` function instead of the `ipex.optimize` function to apply all optimizations transparently. -This API currently works for inference workloads. Support for training is undergoing. Currently, this API supports certain models. Supported model list can be found at [Overview](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#ipexllm-optimized-model-list). +This API currently works for inference workloads. +Currently, this API supports certain models. Supported model list can be found at [this page](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/llm.html#ipexllm-optimized-model-list-for-inference). +For LLM fine-tuning, please check the [LLM fine-tuning tutorial](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm/fine-tuning). API documentation is available at [API Docs page](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/api_doc.html#ipex.llm.optimize). ## Pseudocode of Common Usage Scenarios -The following sections show pseudocode snippets to invoke Intel® Extension for PyTorch\* APIs to work with LLM models. 
Complete examples can be found at [the Example directory](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/inference/python/llm). +The following sections show pseudocode snippets to invoke Intel® Extension for PyTorch\* APIs to work with LLM models. +Complete examples can be found at [the Example directory](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm/inference). ### FP32/BF16 @@ -98,7 +103,7 @@ model = ipex.llm.optimize(model, quantization_config=qconfig, low_precision_chec Distributed inference can be performed with `DeepSpeed`. Based on original Intel® Extension for PyTorch\* scripts, the following code changes are required. -Check [LLM distributed inference examples](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/inference/python/llm/distributed) for complete codes. +Check [LLM distributed inference examples](https://github.com/intel/intel-extension-for-pytorch/tree/v2.4.0%2Bcpu/examples/cpu/llm/inference/distributed) for complete codes. ``` python import torch diff --git a/examples/cpu/inference/python/llm/tools/env_activate.sh b/examples/cpu/inference/python/llm/tools/env_activate.sh deleted file mode 100644 index 759c008f7..000000000 --- a/examples/cpu/inference/python/llm/tools/env_activate.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -# Setup environment variables for performance on Xeon -export KMP_BLOCKTIME=1 -export KMP_TPAUSE=0 -export KMP_FORKJOIN_BARRIER_PATTERN=dist,dist -export KMP_PLAIN_BARRIER_PATTERN=dist,dist -export KMP_REDUCTION_BARRIER_PATTERN=dist,dist - -BASEFOLDER=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -export LD_PRELOAD=$(bash ${BASEFOLDER}/get_libstdcpp_lib.sh):${LD_PRELOAD} - -env | grep CONDA_PREFIX > /dev/null -if [ $? -eq 0 ]; then - export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libiomp5.so # Intel OpenMP - # Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support. - export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so -else - echo "Conda environment is not available. You need to set environment variable LD_PRELOAD to dynamic libraries of Intel OpenMP and TcMalloc manually if they are not in library search paths." -fi - -ONECCL_PATH=${BASEFOLDER}/../oneCCL_release -if [ ! -d ${ONECCL_PATH} ]; then - echo "Warning: oneCCL is not available." -else - source ${ONECCL_PATH}/env/setvars.sh -fi diff --git a/examples/cpu/inference/python/llm/tools/get_libstdcpp_lib.sh b/examples/cpu/inference/python/llm/tools/get_libstdcpp_lib.sh deleted file mode 120000 index 52891777e..000000000 --- a/examples/cpu/inference/python/llm/tools/get_libstdcpp_lib.sh +++ /dev/null @@ -1 +0,0 @@ -../../../../../../tools/get_libstdcpp_lib.sh \ No newline at end of file diff --git a/examples/cpu/inference/python/llm/Dockerfile b/examples/cpu/llm/Dockerfile similarity index 87% rename from examples/cpu/inference/python/llm/Dockerfile rename to examples/cpu/llm/Dockerfile index 9f7dd58ec..d56ba2404 100644 --- a/examples/cpu/inference/python/llm/Dockerfile +++ b/examples/cpu/llm/Dockerfile @@ -39,7 +39,7 @@ ENV PATH=/root/.local/bin:${PATH} FROM base AS dev ARG COMPILE COPY . 
./intel-extension-for-pytorch -RUN cd intel-extension-for-pytorch/examples/cpu/inference/python/llm && \ +RUN cd intel-extension-for-pytorch/examples/cpu/llm && \ export CC=gcc && export CXX=g++ && \ if [ -z ${COMPILE} ]; then bash tools/env_setup.sh 6; else bash tools/env_setup.sh 2; fi && \ unset CC && unset CXX @@ -53,7 +53,7 @@ RUN apt update && \ apt clean && \ rm -rf /var/lib/apt/lists/* && \ if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi -COPY --from=dev /root/intel-extension-for-pytorch/examples/cpu/inference/python/llm ./llm +COPY --from=dev /root/intel-extension-for-pytorch/examples/cpu/llm ./llm COPY --from=dev /root/intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh ./llm/tools RUN cd /usr/lib/x86_64-linux-gnu/ && ln -s libtcmalloc.so.4 libtcmalloc.so && cd && \ echo "echo \"**Note:** For better performance, please consider to launch workloads with command 'ipexrun'.\"" >> ./.bashrc && \ @@ -62,8 +62,7 @@ RUN cd /usr/lib/x86_64-linux-gnu/ && ln -s libtcmalloc.so.4 libtcmalloc.so && cd python -m pip cache purge && \ mv ./oneCCL_release /opt/oneCCL && \ chown -R root:root /opt/oneCCL && \ - sed -i "s|ONECCL_PATH=.*|ONECCL_PATH=/opt/oneCCL|" ./tools/env_activate.sh && \ - LN=$(grep "Conda environment is not available." -n ./tools/env_activate.sh | cut -d ":" -f 1) && sed -i "${LN}s|.*| export LD_PRELOAD=\${LD_PRELOAD}:/usr/lib/x86_64-linux-gnu/libtcmalloc.so:/usr/local/lib/libiomp5.so|" ./tools/env_activate.sh + sed -i "s|ONECCL_PATH=.*|ONECCL_PATH=/opt/oneCCL|" ./tools/env_activate.sh ARG PORT_SSH=22 RUN mkdir /var/run/sshd && \ sed -i "s/#Port.*/Port ${PORT_SSH}/" /etc/ssh/sshd_config && \ diff --git a/examples/cpu/llm/README.md b/examples/cpu/llm/README.md new file mode 100644 index 000000000..c99c6ecd5 --- /dev/null +++ b/examples/cpu/llm/README.md @@ -0,0 +1,133 @@ +# 1. LLM Optimization Overview + +`ipex.llm` provides dedicated optimization for running Large Language Models (LLM) faster, including technical points like paged attention, ROPE fusion, etc. +And a set of data types are supported for various scenarios, including FP32, BF16, Smooth Quantization INT8, Weight Only Quantization INT8/INT4 (prototype). + +
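As a quick orientation before the setup steps, the snippet below is a minimal sketch of how these optimizations are applied in code for BF16 inference. The model ID and generation settings are illustrative only; the scripts under `inference/` wrap this flow with full benchmarking, quantization, and distributed options.

```python
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative model ID; models from the optimized model list are used the same way.
model_id = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Apply the LLM-specific optimizations (BF16 shown here; other data types are
# selected through the arguments of the example scripts).
model = ipex.llm.optimize(model, dtype=torch.bfloat16, inplace=True)

inputs = tokenizer("What is AI?", return_tensors="pt")
with torch.inference_mode(), torch.cpu.amp.autocast(enabled=True):
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```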
+ +# 2. Environment Setup + +There are several environment setup methodologies provided. You can choose either of them according to your usage scenario. The Docker-based ones are recommended. + +## 2.1 [RECOMMENDED] Docker-based environment setup with pre-built wheels + +```bash +# Get the Intel® Extension for PyTorch\* source code +git clone https://github.com/intel/intel-extension-for-pytorch.git +cd intel-extension-for-pytorch +git checkout v2.4.0+cpu +git submodule sync +git submodule update --init --recursive + +# Build an image with the provided Dockerfile by installing from Intel® Extension for PyTorch\* prebuilt wheel files +# To have a custom ssh server port for multi-nodes run, please add --build-arg PORT_SSH= ex: 2345, otherwise use the default 22 SSH port +DOCKER_BUILDKIT=1 docker build -f examples/cpu/llm/Dockerfile --build-arg PORT_SSH=2345 -t ipex-llm:2.4.0 . + +# Run the container with command below +docker run --rm -it --privileged -v /dev/shm:/dev/shm ipex-llm:2.4.0 bash + +# When the command prompt shows inside the docker container, enter llm examples directory +cd llm + +# Activate environment variables +# set bash script argument to "inference" or "fine-tuning" for different usages +source ./tools/env_activate.sh [inference|fine-tuning] +``` + +## 2.2 Conda-based environment setup with pre-built wheels + +```bash +# Get the Intel® Extension for PyTorch\* source code +git clone https://github.com/intel/intel-extension-for-pytorch.git +cd intel-extension-for-pytorch +git checkout v2.4.0+cpu +git submodule sync +git submodule update --init --recursive + +# GCC 12.3 is required. Installation can be taken care of by the environment configuration script. +# Create a conda environment +conda create -n llm python=3.10 -y +conda activate llm + +# Setup the environment with the provided script +cd examples/cpu/llm +bash ./tools/env_setup.sh 7 + +# Activate environment variables +# set bash script argument to "inference" or "fine-tuning" for different usages +source ./tools/env_activate.sh [inference|fine-tuning] +``` + +## 2.3 Docker-based environment setup with compilation from source + +```bash +# Get the Intel® Extension for PyTorch\* source code +git clone https://github.com/intel/intel-extension-for-pytorch.git +cd intel-extension-for-pytorch +git checkout v2.4.0+cpu +git submodule sync +git submodule update --init --recursive + +# Build an image with the provided Dockerfile by compiling Intel® Extension for PyTorch\* from source +# To have a custom ssh server port for multi-nodes run, please add --build-arg PORT_SSH= ex: 2345, otherwise use the default 22 SSH port +docker build -f examples/cpu/llm/Dockerfile --build-arg COMPILE=ON --build-arg PORT_SSH=2345 -t ipex-llm:2.4.0 . + +# Run the container with command below +docker run --rm -it --privileged -v /dev/shm:/dev/shm ipex-llm:2.4.0 bash + +# When the command prompt shows inside the docker container, enter llm examples directory +cd llm + +# Activate environment variables +# set bash script argument to "inference" or "fine-tuning" for different usages +source ./tools/env_activate.sh [inference|fine-tuning] +``` + +## 2.4 Conda-based environment setup with compilation from source + +```bash +# Get the Intel® Extension for PyTorch\* source code +git clone https://github.com/intel/intel-extension-for-pytorch.git +cd intel-extension-for-pytorch +git checkout v2.4.0+cpu +git submodule sync +git submodule update --init --recursive + +# GCC 12.3 is required. 
Installation can be taken care of by the environment configuration script. +# Create a conda environment +conda create -n llm python=3.10 -y +conda activate llm + +# Setup the environment with the provided script +cd examples/cpu/llm +bash ./tools/env_setup.sh + +# Activate environment variables +# set bash script argument to "inference" or "fine-tuning" for different usages +source ./tools/env_activate.sh [inference|fine-tuning] +``` + +
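Whichever setup path is chosen, a short import check (a suggested snippet, not part of the setup scripts) can confirm the environment is functional before proceeding:

```python
# Run inside the activated environment as a post-setup sanity check.
import torch
import intel_extension_for_pytorch as ipex

# Both should report a 2.4-series version for this release branch.
print("torch:", torch.__version__)
print("intel_extension_for_pytorch:", ipex.__version__)
```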
+ +*Note*: In the `env_activate.sh` script, a `prompt.json` file is downloaded, which provides prompt samples with pre-defined input token lengths for benchmarking. +For benchmarking **Llama-3 models**, users need to download a specific `prompt.json` file, overwriting the original one. + +```bash +wget -O prompt.json https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt-3.json +``` + +The original `prompt.json` file can be restored from the repository if needed. + +```bash +wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json +``` + +
+ +# 3. How To Run LLM with ipex.llm + +Inference and fine-tuning are supported in respective directories. + +For inference example scripts, visit the [inference](./inference/) directory. + +For fine-tuning example scripts, visit the [fine-tuning](./fine-tuning/) directory. \ No newline at end of file diff --git a/examples/cpu/training/llm/README.md b/examples/cpu/llm/fine-tuning/README.md similarity index 57% rename from examples/cpu/training/llm/README.md rename to examples/cpu/llm/fine-tuning/README.md index 99a655d30..4b1de026a 100644 --- a/examples/cpu/training/llm/README.md +++ b/examples/cpu/llm/fine-tuning/README.md @@ -1,33 +1,26 @@ -# IPEX LLAMA2 7B lora apalca finetuning training on CPUs (distributed) +# IPEX LLAMA2 7B lora alpaca finetuning training on CPUs (distributed) ## Description -This document has instructions for running [LLaMA2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) lora apalca finetuning using Intel-optimized PyTorch (enable the recipes from [apalca-lora](https://github.com/tloen/alpaca-lora/tree/main) on CPUs ). +This document has instructions for running [LLaMA2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) lora alpaca finetuning using Intel-optimized PyTorch (enable the recipes from [alpaca-lora](https://github.com/tloen/alpaca-lora/tree/main) on CPUs ). -## Bare Metal -### General setup +## Distributed Computation Environment Setup -Follow [link](https://github.com/intel/intel-extension-for-pytorch/tree/release/2.4/examples/cpu/inference/python/llm#3-environment-setup) to setup PyTorch/IPEX and some other dependency. +In this case, we use data-parallel distributed training and every rank will hold same model replica. The NNODES is the number of ip in the HOSTFILE. -### Prepare dependency ``` - pip install -r requirements.txt - ``` -### Specific Setup - -* Set ENV to use multi-nodes distributed training (no need for single-node multi-sockets) - -In this case, we use data-parallel distributed training and every rank will hold same model replica. The NNODES is the number of ip in the HOSTFILE. To use multi-nodes distributed training you should firstly setup the passwordless login (you can refer to [link](https://linuxize.com/post/how-to-setup-passwordless-ssh-login/)) between these nodes. -``` -export NNODES=#your_node_number (default using 1 node) +export NNODES=#number_of_nodes (default using 1 node) # create your_ip_list_file, one ip per line, like (or self edit): scontrol show hostname > ./hostfile - export HOSTFILE=hostfile - ``` + +*Note:* To use multi-nodes distributed training you should firstly setup the passwordless login (you can refer to [link](https://linuxize.com/post/how-to-setup-passwordless-ssh-login/)) among computation nodes. If you are using the Dockerfile, you can skip this step. 
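For background only: the launch script below handles distributed initialization for you, but data-parallel training on CPU typically brings up the process group over the oneCCL (`ccl`) backend along the lines of this sketch. It assumes `oneccl_bindings_for_pytorch` is installed by the environment setup and that the launcher exports `RANK`, `WORLD_SIZE`, `MASTER_ADDR`, and `MASTER_PORT`.

```python
import os
import torch
import torch.distributed as dist
import oneccl_bindings_for_pytorch  # noqa: F401  (registers the "ccl" backend)

# Rank/world-size values are normally exported by the launcher (mpirun, torchrun, ipexrun, ...).
rank = int(os.environ.get("RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))
dist.init_process_group(backend="ccl", rank=rank, world_size=world_size)

# Placeholder module standing in for the LoRA-wrapped LLaMA2 model.
model = torch.nn.Linear(16, 16)
ddp_model = torch.nn.parallel.DistributedDataParallel(model)
print(f"rank {dist.get_rank()} / {dist.get_world_size()} ready")
```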
+ # Quick Start Scripts + ## Run the model + ``` # Get the dataset here: https://github.com/tloen/alpaca-lora/blob/main/alpaca_data.json export DATASET="./alpaca_data.json" @@ -46,7 +39,9 @@ Apply the access in this page [LLaMA2 7B](https://huggingface.co/meta-llama/Llam huggingface-cli login {your huggingface token} ``` + ## Launch command + | DataType | Throughput | | ----------- | ----------- | -| BF16 | bash run_lora_finetune_ddp.sh bf16 | +| BF16 | bash run_lora_finetune_ddp.sh bf16 | \ No newline at end of file diff --git a/examples/cpu/training/llm/finetune.py b/examples/cpu/llm/fine-tuning/finetune.py similarity index 100% rename from examples/cpu/training/llm/finetune.py rename to examples/cpu/llm/fine-tuning/finetune.py diff --git a/examples/cpu/training/llm/requirements.txt b/examples/cpu/llm/fine-tuning/requirements.txt similarity index 100% rename from examples/cpu/training/llm/requirements.txt rename to examples/cpu/llm/fine-tuning/requirements.txt diff --git a/examples/cpu/training/llm/run_lora_finetune_ddp.sh b/examples/cpu/llm/fine-tuning/run_lora_finetune_ddp.sh similarity index 100% rename from examples/cpu/training/llm/run_lora_finetune_ddp.sh rename to examples/cpu/llm/fine-tuning/run_lora_finetune_ddp.sh diff --git a/examples/cpu/training/llm/utils/README.md b/examples/cpu/llm/fine-tuning/utils/README.md similarity index 100% rename from examples/cpu/training/llm/utils/README.md rename to examples/cpu/llm/fine-tuning/utils/README.md diff --git a/examples/cpu/training/llm/utils/__init__.py b/examples/cpu/llm/fine-tuning/utils/__init__.py similarity index 100% rename from examples/cpu/training/llm/utils/__init__.py rename to examples/cpu/llm/fine-tuning/utils/__init__.py diff --git a/examples/cpu/training/llm/utils/prompter.py b/examples/cpu/llm/fine-tuning/utils/prompter.py similarity index 100% rename from examples/cpu/training/llm/utils/prompter.py rename to examples/cpu/llm/fine-tuning/utils/prompter.py diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/llm/inference/README.md similarity index 82% rename from examples/cpu/inference/python/llm/README.md rename to examples/cpu/llm/inference/README.md index 493241ba4..2f333f28a 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/llm/inference/README.md @@ -1,13 +1,6 @@ -# 1. LLM Optimization Overview +# 1. ipex.llm Optimized Model List for Inference -`ipex.llm` provides dedicated optimization for running Large Language Models (LLM) faster, including technical points like paged attention, ROPE fusion, etc. -And a set of data types are supported for various scenarios, including FP32, BF16, Smooth Quantization INT8, Weight Only Quantization INT8/INT4 (prototype). - -
- -# 2. ipex.llm Optimized Model List - -## 2.1 Verified for single instance mode +## 1.1 Verified for single instance mode | MODEL FAMILY | MODEL NAME (Huggingface hub) | FP32 | BF16 | Static quantization INT8 | Weight only quantization INT8 | Weight only quantization INT4 | |:---:|:---:|:---:|:---:|:---:|:---:|:---:| @@ -46,7 +39,7 @@ And a set of data types are supported for various scenarios, including FP32, BF1 |Phi| microsoft/Phi-3-medium-4k-instruct | 🟩 | 🟩 | 🟨 | 🟩 | 🟨 | |Phi| microsoft/Phi-3-medium-128k-instruct | 🟩 | 🟩 | 🟨 | 🟩 | 🟨 | -## 2.2 Verified for distributed inference mode via DeepSpeed +## 1.2 Verified for distributed inference mode via DeepSpeed | MODEL FAMILY | MODEL NAME (Huggingface hub) | BF16 | Weight only quantization INT8 | |:---:|:---:|:---:|:---:| @@ -89,119 +82,7 @@ We are working in progress to better support the models in the tables with vario
-# 3. Environment Setup -There are several environment setup methodologies provided. You can choose either of them according to your usage scenario. The Docker-based ones are recommended. - -## 3.1 [RECOMMENDED] Docker-based environment setup with pre-built wheels - -```bash -# Get the Intel® Extension for PyTorch\* source code -git clone https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git checkout v2.4.0+cpu -git submodule sync -git submodule update --init --recursive - -# Build an image with the provided Dockerfile by installing from Intel® Extension for PyTorch\* prebuilt wheel files -DOCKER_BUILDKIT=1 docker build -f examples/cpu/inference/python/llm/Dockerfile -t ipex-llm:2.4.0 . - -# Run the container with command below -docker run --rm -it --privileged ipex-llm:2.4.0 bash - -# When the command prompt shows inside the docker container, enter llm examples directory -cd llm - -# Activate environment variables -source ./tools/env_activate.sh -``` - -## 3.2 Conda-based environment setup with pre-built wheels - -```bash -# Get the Intel® Extension for PyTorch\* source code -git clone https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git checkout v2.4.0+cpu -git submodule sync -git submodule update --init --recursive - -# GCC 12.3 is required. Installation can be taken care of by the environment configuration script. -# Create a conda environment -conda create -n llm python=3.10 -y -conda activate llm - -# Setup the environment with the provided script -# A sample "prompt.json" file for benchmarking is also downloaded -cd examples/cpu/inference/python/llm -bash ./tools/env_setup.sh 7 - -# Activate environment variables -source ./tools/env_activate.sh -``` - -## 3.3 Docker-based environment setup with compilation from source - -```bash -# Get the Intel® Extension for PyTorch\* source code -git clone https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git checkout v2.4.0+cpu -git submodule sync -git submodule update --init --recursive - -# Build an image with the provided Dockerfile by compiling Intel® Extension for PyTorch\* from source -DOCKER_BUILDKIT=1 docker build -f examples/cpu/inference/python/llm/Dockerfile --build-arg COMPILE=ON -t ipex-llm:2.4.0 . - -# Run the container with command below -docker run --rm -it --privileged ipex-llm:2.4.0 bash - -# When the command prompt shows inside the docker container, enter llm examples directory -cd llm - -# Activate environment variables -source ./tools/env_activate.sh -``` - -## 3.4 Conda-based environment setup with compilation from source - -```bash -# Get the Intel® Extension for PyTorch\* source code -git clone https://github.com/intel/intel-extension-for-pytorch.git -cd intel-extension-for-pytorch -git checkout v2.4.0+cpu -git submodule sync -git submodule update --init --recursive - -# GCC 12.3 is required. Installation can be taken care of by the environment configuration script. -# Create a conda environment -conda create -n llm python=3.10 -y -conda activate llm - -# Setup the environment with the provided script -# A sample "prompt.json" file for benchmarking is also downloaded -cd examples/cpu/inference/python/llm -bash ./tools/env_setup.sh - -# Activate environment variables -source ./tools/env_activate.sh -``` - -*Note*: In `env_setup.sh` script a `prompt.json` file is downloaded, which provides prompt samples with pre-defined input token lengths for benchmarking. 
-For **Llama-3 models** benchmarking, the users need to download a specific `prompt.json` file, overwriting the original one. - -```bash -wget -O prompt.json https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt-3.json -``` - -The original `prompt.json` file can be restored from the repository if needed. - -```bash -wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json -``` - -
- -# 4. How To Run LLM with ipex.llm +# 2. How To Run LLM with ipex.llm **ipex.llm provides a single script to facilitate running generation tasks as below:** @@ -221,68 +102,68 @@ python run.py --help # for more detailed usages *Note:* You may need to log in your HuggingFace account to access the model files. Please refer to [HuggingFace login](https://huggingface.co/docs/huggingface_hub/quick-start#login). -## 4.1 Quick example for running Llama2-7b +## 2.1 Quick example for running Llama2-7b -### 4.1.1 To run generation task and benchmark performance +### 2.1.1 To run generation task and benchmark performance *Note:* The following "OMP_NUM_THREADS" and "numactl" settings are based on the assumption that the target server has 56 physical cores per numa socket, and we benchmark with 1 socket. Please adjust the settings per your hardware. -#### 4.1.1.1 Run in FP32 with stock PyTorch +#### 2.1.1.1 Run in FP32 with stock PyTorch ```bash OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype float32 ``` -#### 4.1.1.2 Run in FP32 with ipex.llm +#### 2.1.1.2 Run in FP32 with ipex.llm ```bash OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype float32 --ipex ``` -#### 4.1.1.3 Run in BF16 with ipex.llm +#### 2.1.1.3 Run in BF16 with ipex.llm ```bash OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex ``` -#### 4.1.1.4 Run in static quantization INT8 with ipex.llm +#### 2.1.1.4 Run in static quantization INT8 with ipex.llm ```bash wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/cpu/2/llama2-7b_qconfig.json OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-smooth-quant --qconfig-summary-file llama2-7b_qconfig.json --output-dir "saved_results" ``` -#### 4.1.1.5 Run in weight-only quantization INT8 with ipex.llm +#### 2.1.1.5 Run in weight-only quantization INT8 with ipex.llm ```bash OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --output-dir "saved_results" ``` -#### 4.1.1.6 Run in weight-only quantization INT4 with ipex.llm +#### 2.1.1.6 Run in weight-only quantization INT4 with ipex.llm ```bash OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT4 --gptq --quant-with-amp --output-dir "saved_results" ``` -#### 4.1.1.7 Run in BF16 with ipex.llm in distributed way +#### 2.1.1.7 Run in BF16 with ipex.llm in distributed way ```bash deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --autotp --shard-model ``` -#### 4.1.1.8 Run in weight-only quantization INT8 with ipex.llm in distributed way +#### 2.1.1.8 Run in weight-only quantization INT8 with ipex.llm in distributed way ```bash deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --autotp --shard-model --output-dir "saved_results" ``` -### 4.1.2 To run generation task and test accuracy +### 2.1.2 To run generation task and test accuracy For the quantized models used in accuracy tests below, we can reuse the model files that are named "best_model.pt" in the "--output-dir" path ([generated during inference performance tests above](#generation_sq)). 
-Check [Advanced Usage](#52-accuracy-test) for details. +Check [Advanced Usage](#32-accuracy-test) for details. -#### 4.1.2.1 Single instance +#### 2.1.2.1 Single instance ```bash # The following "OMP_NUM_THREADS" and "numactl" settings are based on the assumption that @@ -302,7 +183,7 @@ OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Ll OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Llama-2-7b-hf --quantized-model-path "../saved_results/best_model.pt" --dtype int8 --tasks lambada_openai ``` -#### 4.1.2.2 Distributed inference +#### 2.1.2.2 Distributed inference ```bash # run_accuracy_with_deepspeed.py script is inside distributed directory. @@ -319,23 +200,23 @@ deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --tasks lambada_openai ``` -#### 4.1.2.3 Distributed inference among multiple nodes with TCP +#### 2.1.2.3 Distributed inference among multiple nodes with TCP -A bash script (`tools/run_scaling.sh`) is provided to simplify environment configuration and the command launch. +A [bash script](./tools/run_scaling.sh) is provided to simplify environment configuration and the command launch. Steps: -2. Enter the `llm` directory -3. Create a `hostfile.txt` following [instructions of deepspeed](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) -4. Find out the network interface name used for node communication via `ifconfig` or `ibv_devices` ex : eth0 -5. Open `tools/run_scaling.sh` script to update required information in line 3 to line 11 according to your environment and needs -6. run the command below to run distributed inference among nodes +1. Enter the `llm` directory +2. Create a `hostfile.txt` following [instructions of deepspeed](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) +3. Find out the network interface name used for node communication via `ifconfig` or `ibv_devices` ex : eth0 +4. Open `tools/run_scaling.sh` script to update required information in line 3 to line 11 according to your environment and needs +5. run the command below to run distributed inference among nodes ```bash bash tools/run_scaling.sh ``` -The docker image built in Section 3.1 functions ssh connection for distributed executions across multiple machines via Ethernet. However, it is supposed to be running with 1 single container on each machine. Inside each docker container, multiple inference instances can be launched by the `deepspeed` command. +The docker image built in [the environment setup tutorial](../README.md#2-environment-setup) functions ssh connection for distributed executions across multiple machines via Ethernet. However, it is supposed to be running with 1 single container on each machine. Inside each docker container, multiple inference instances can be launched by the `deepspeed` command. Use the command below on all machines to launch the docker containers. This command uses the host network interfaces inside the docker container. Thus, you need to put the host ip addresses into the `hostfile.txt`. Do NOT launch multiple docker containers on one single machine from the same docker image. These docker containers listen on the same machine on the same port, will result in unpredicable ssh connections. 
@@ -345,11 +226,11 @@ docker run --rm -it --privileged --net host ipex-llm:main bash **Note:** For models on HuggingFace require access privileges, you need to run the `huggingface-cli login` command in each docker container to config a HuggingFace access token. -## 4.2 Detail usage of running LLM models +## 2.2 Detail usage of running LLM models -### 4.2.1 Run generation with one instance +### 2.2.1 Run generation with one instance -#### 4.2.1.1 FP32: +#### 2.2.1.1 FP32: - Command: ```bash @@ -361,7 +242,7 @@ OMP_NUM_THREADS= numactl -m -C numactl -m -C _ specifies the [numa](https://en.wikipedia.org/wiki/Non-uniform_memory_access) node id (e.g., 0 to use the memory from the first numa node). _\_ specifies phsysical cores which you are using from the _\_ numa node (e.g., 0-56 from the first numa node). You can use [_lscpu_](https://man7.org/linux/man-pages/man1/lscpu.1.html) command in Linux to check the numa node information. -(2) The _\_ (e.g., "meta-llama/Llama-2-13b-hf") specifies the model you will run. we provide some verified _\_ in the [Optimized Model List](#2-ipexllm-optimized-model-list). You can also try other models from [HuggingFace Models](https://huggingface.co/models). +(2) The _\_ (e.g., "meta-llama/Llama-2-13b-hf") specifies the model you will run. we provide some verified _\_ in the [Optimized Model List](#1-ipexllm-optimized-model-list-for-inference). You can also try other models from [HuggingFace Models](https://huggingface.co/models). (3) for all quantization benchmarks, both quantization and inference stages will be triggered by default. For quantization stage, it will auto-generate the quantized model named "best_model.pt" in the "--output-dir" path, and for inference stage, it will launch the inference with the quantized model "best_model.pt". For inference-only benchmarks (avoid the repeating quantization stage), you can also reuse these quantized models for by adding "--quantized-model-path " . -### 4.2.2 Run generation in distributed way +### 2.2.2 Run generation in distributed way -#### 4.2.2.1 Prepare: +#### 2.2.2.1 Prepare: ```bash unset KMP_AFFINITY @@ -464,9 +345,9 @@ In the DeepSpeed cases below, we recommend "--shard-model" to shard model weight If using "--shard-model", it will save a copy of the shard model weights file in the path of "--output-dir" (default path is "./saved_results" if not provided). If you have used "--shard-model" and generated such a shard model path (or your model weights files are already well sharded), in further repeated benchmarks, please remove "--shard-model", and replace "-m " with "-m " to skip the repeated shard steps. -Besides, the standalone shard model function/scripts are also provided in the [Advanced Usage](#53-how-to-shard-model-for-distributed-tests-with-deepspeed-autotp) section, in case you would like to generate the shard model weights files in advance before running distributed inference. +Besides, the standalone shard model function/scripts are also provided in the [Advanced Usage](#33-how-to-shard-model-for-distributed-tests-with-deepspeed-autotp) section, in case you would like to generate the shard model weights files in advance before running distributed inference. 
-#### 4.2.2.2 FP32: +#### 2.2.2.2 FP32: - Command: ```bash @@ -478,7 +359,7 @@ deepspeed --bind_cores_to_rank run.py --benchmark -m --dtype float32 deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype float32 --ipex --autotp --shard-model ``` -#### 4.2.2.3 BF16: +#### 2.2.2.3 BF16: - Command: ```bash @@ -490,7 +371,7 @@ deepspeed --bind_cores_to_rank run.py --benchmark -m --dtype bfloat1 deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --autotp --shard-model ``` -#### 4.2.2.4 Weight-only quantization: +#### 2.2.2.4 Weight-only quantization: By default, for weight-only quantization, we use quantization with [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html) inference ("--quant-with-amp") to get peak performance and fair accuracy. For weight-only quantization with deepspeed, we quantize the model then run the benchmark. The quantized model won't be saved. @@ -520,7 +401,7 @@ Similar to single instance usage, we need to update some arguments of the runnin deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --autotp --shard-model --output-dir "saved_results" ``` -### 4.2.3 Additional configuration for specific models +### 2.2.3 Additional configuration for specific models There are some model-specific requirements to be aware of, as follows: @@ -532,17 +413,17 @@ There are some model-specific requirements to be aware of, as follows: - For mistralai/Mistral-7B-v0.1 and mistralai/Mixtral-8x7B-Instruct-v0.1, we use a fixed model version because the latest version is not compatible with transformers 4.38.1 and tokenizers 0.15.2. -## 4.3 Instructions for Running LLM with Intel® Xeon® CPU Max Series +## 2.3 Instructions for Running LLM with Intel® Xeon® CPU Max Series Intel® Xeon® CPU Max Series are equipped with high bandwidth memory (HBM), which further accelerates LLM inference. For the common case that HBM and DDR are both installed in a Xeon® CPU Max Series server, the memory mode can be configured to Flat Mode or Cache Mode. Details about memory modes can be found at Section 3.1 in [the Xeon® CPU Max Series Configuration Guide](https://cdrdv2-public.intel.com/769060/354227-intel-xeon-cpu-max-series-configuration-and-tuning-guide.pdf). -### 4.3.1 Single Instance Inference with Xeon® CPU Max Series +### 2.3.1 Single Instance Inference with Xeon® CPU Max Series -#### 4.3.1.1 Cache Mode HBM +#### 2.3.1.1 Cache Mode HBM -In cache mode, only DDR address space is visible to software and HBM functions as a transparent memory-side cache for DDR. Therefore the usage is the same with [the common usage](#421-run-generation-with-one-instance). +In cache mode, only DDR address space is visible to software and HBM functions as a transparent memory-side cache for DDR. Therefore the usage is the same with [the common usage](#221-run-generation-with-one-instance). -#### 4.3.1.2 Flat Mode HBM +#### 2.3.1.2 Flat Mode HBM In flat mode, HBM and DDR are exposed to software as separate address spaces in this mode. Therefore we need to check the `HBM_NODE_INDEX` of interest with commands like `lscpu`, then the LLM inference invoking command would be like: @@ -567,9 +448,9 @@ OMP_NUM_THREADS= numactl -p -C +# 3. Advanced Usage -# 5. 
Advanced Usage - -## 5.1 Weight-only quantization with low precision checkpoint (Prototype) +## 3.1 Weight-only quantization with low precision checkpoint (Prototype) Using INT4 weights can further improve performance by reducing memory bandwidth. However, direct per-channel quantization of weights to INT4 probably results in poor accuracy. Some algorithms can modify weights through calibration before quantizing weights to minimize accuracy drop. GPTQ is one of such algorithms. You may generate modified weights and quantization info (scales, zero points) for a certain model with a dataset by such algorithms. The low precision checkpoint is saved as a `state_dict` in a `.pt` file and can be loaded later for weight only quantization. We provide an example here to run GPTQ. @@ -618,7 +498,7 @@ python single_instance/run_quantization.py --ipex-weight-only-quantization --qua OMP_NUM_THREADS= numactl -m -C python single_instance/run_quantization.py -m --benchmark --quant-with-amp --quantized-model-path "./saved_results/best_model.pt" ``` -To run accuracy tests, please follow the instructions in the [Accuracy Test](#52-accuracy-test) part +To run accuracy tests, please follow the instructions in the [Accuracy Test](#32-accuracy-test) part If the checkpoint is generated by some other methods and has different keys in the state_dict, you will need to specify the keys for weight, scales, zero points and bias. Bias is optional in the state_dict while others are required. Default keys are: @@ -667,7 +547,7 @@ Please note that 100 GB disk space, 100 GB memory and Internet access are needed IPEX now only supports some certain cases. Weights must be N by K and asymmetrically quantized to UINT4 and then compressed along K axis to `torch.int32`. Data type of scales can be any floating point types. Shape of scales should be [N, number_of_groups] or with additional dimensions whose length is 1. Zero points should have the same shape as scales and stored as `torch.int32` but the true data type is UINT4. Bias is optional in the `state_dict` (checkpoint). If it is present, we read bias in the `state_dict`. Otherwise we read bias from the original model. Bias is `None` if it cannot be found in both cases. -## 5.2 Accuracy test +## 3.2 Accuracy test We leverage [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) for the accuracy test. @@ -675,13 +555,13 @@ We verify and recommend to test accuracy of most models with "lambada_openai" ta For some models, like `Salesforce/codegen-2B-multi` and `mosaicml/mpt-7b`, we verify and recommend to test their accuracy with "hellaswag" task. For more candidate tasks for accuracy validation, please check [lm-evaluation-harness task table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md). 
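Relating back to the low-precision checkpoint flow described in section 3.1 above, the sketch below outlines how such a GPTQ-generated `.pt` file is consumed. Names are assumptions: the qconfig helper and enums follow the ipex weight-only quantization documentation, the checkpoint filename is illustrative, and `single_instance/run_quantization.py` remains the reference implementation.

```python
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf").eval()

# Weight-only quantization recipe: INT4 weights with reduced-precision compute
# (helper/enum names assumed from the ipex weight-only quantization docs).
qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
    weight_dtype=ipex.quantization.WoqWeightDtype.INT4,
    lowp_mode=ipex.quantization.WoqLowpMode.BF16,
)

# State dict holding quantized weights, scales and zero points, e.g. produced by utils/run_gptq.py.
low_precision_checkpoint = torch.load("gptq_checkpoint.pt")  # illustrative filename

model = ipex.llm.optimize(
    model,
    quantization_config=qconfig,
    low_precision_checkpoint=low_precision_checkpoint,
    inplace=True,
)
```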
-### 5.2.1 Run with one instance +### 3.2.1 Run with one instance ```bash cd ./single_instance ``` -#### 5.2.1.1 FP32: +#### 3.2.1.1 FP32: - Command: ```bash @@ -693,7 +573,7 @@ OMP_NUM_THREADS= numactl -m -C numactl -m -C numactl -m -C python ru OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Llama-2-7b-hf --quantized-model-path "../saved_results/best_model.pt" --dtype int8 --tasks lambada_openai ``` -### 5.2.2 Run in distributed way +### 3.2.2 Run in distributed way -#### 5.2.2.1 Prepare: +#### 3.2.2.1 Prepare: ```bash # Run distributed accuracy with 2 ranks of one node @@ -730,7 +610,7 @@ cd ./distributed unset KMP_AFFINITY ``` -#### 5.2.2.2 FP32: +#### 3.2.2.2 FP32: - Command: ```bash @@ -741,7 +621,7 @@ deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype float32 --ipex --tasks lambada_openai ``` -#### 5.2.2.3 BF16: +#### 3.2.2.3 BF16: - Command: ```bash deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --dtype bfloat16 -ipex --tasks @@ -751,7 +631,7 @@ deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --tasks lambada_openai ``` -#### 5.2.2.4 Weight-only quantization (INT8): +#### 3.2.2.4 Weight-only quantization (INT8): - Command: ```bash @@ -776,7 +656,7 @@ Similar to script usage for performance benchmarking, we need to update some arg deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --ipex --tasks ``` -## 5.3 How to Shard model for Distributed tests with DeepSpeed (autoTP) +## 3.3 How to Shard model for Distributed tests with DeepSpeed (autoTP) To save memory usage, we could shard the model weights under the local path before we launch distributed tests with DeepSpeed. @@ -794,12 +674,12 @@ python create_shard_model.py -m meta-llama/Llama-2-7b-hf --save-path ./local_lla # 6. Performance Results -The performance results on AWS instances can be found [here](../../../../../docs/tutorials/performance.md#llm-performance). +The performance results on AWS instances can be found [here](../../../../docs/tutorials/performance.md#llm-performance).
# 7. Miscellaneous Tips -- We can build up LLM services optimized by Intel® Extension for PyTorch\* with Triton Server. Please refer [here](../../../serving/triton/README.md) for best practice. +- We can build LLM services optimized by Intel® Extension for PyTorch\* with Triton Server. Please refer to the guide [here](../../serving/triton/README.md) for best practices. - The LLM inference methods introduced on this page also apply to AWS. Simply follow the instructions above to get the boosted LLM performance of Intel® Extension for PyTorch\* optimizations on AWS instances. diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py similarity index 100% rename from examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py rename to examples/cpu/llm/inference/distributed/run_accuracy_with_deepspeed.py diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py b/examples/cpu/llm/inference/distributed/run_generation_tp.py similarity index 100% rename from examples/cpu/inference/python/llm/distributed/run_generation_tp.py rename to examples/cpu/llm/inference/distributed/run_generation_tp.py diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py similarity index 100% rename from examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py rename to examples/cpu/llm/inference/distributed/run_generation_with_deepspeed.py diff --git a/examples/cpu/inference/python/llm/llm_sq_recipes.md b/examples/cpu/llm/inference/llm_sq_recipes.md similarity index 84% rename from examples/cpu/inference/python/llm/llm_sq_recipes.md rename to examples/cpu/llm/inference/llm_sq_recipes.md index 22df336e2..88a80b7d0 100644 --- a/examples/cpu/inference/python/llm/llm_sq_recipes.md +++ b/examples/cpu/llm/inference/llm_sq_recipes.md @@ -1,25 +1,26 @@ -## Smooth Quantization Autotune Feature (Prototype): -SmoothQuant is a popular method to improve the accuracy of int8 quantization. The [autotune API](../../../../../docs/tutorials/features/sq_recipe_tuning_api.md) allows automatic global alpha tuning, and automatic layer-by-layer alpha tuning provided by Intel® Neural Compressor for the best accuracy. Below is the basic command to generate the qconfig summary files (and quantized model ".pt" file) with the SmoothQuant autotune API.
- -```bash -# general command: -OMP_NUM_THREADS= numactl -m -C python run.py --benchmark -m --ipex-smooth-quant --alpha auto --output-dir "saved_results" - -# An example of llama2 7b model: -OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-smooth-quant --alpha auto -``` - -## Example command for model tuning with AutoTune API -| Model ID | Command | -|---|:---:| -| meta-llama/Llama-2-13b-hf | python run.py -m meta-llama/Llama-2-13b-hf --ipex-smooth-quant --alpha auto --init-alpha 0.8 --alpha-min 0.75 --alpha-max 0.99 --alpha-step 0.01 --shared-criterion 'max' --calib-len 1024 --calib-padding --fallback-add | -| meta-llama/Llama-2-70b-hf | python run.py -m meta-llama/Llama-2-70b-hf --ipex-smooth-quant --batch-size 56 --calib-shuffle --fallback-add --alpha 0.8 | -| EleutherAI/gpt-j-6b | python run.py -m EleutherAI/gpt-j-6b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --fallback-add --alpha 0.85 | -| tiiuae/falcon-7b | python run.py -m tiiuae/falcon-7b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.95 | -| tiiuae/falcon-40b | python run.py -m tiiuae/falcon-40b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.9 | -| facebook/opt-30b | python run.py -m facebook/opt-30b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle | -| facebook/opt-1.3b | python run.py -m facebook/opt-1.3b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.85 | -| baichuan-inc/Baichuan2-7B-Chat | python run.py -m baichuan-inc/Baichuan2-7B-Chat --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.95 | -| baichuan-inc/Baichuan2-13B-Chat | python run.py -m baichuan-inc/Baichuan2-13B-Chat --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.65 | - -*Note*: The above examples are validated with good accuracy on the "lamada_openai" dataset. +## Smooth Quantization Autotune Feature (Prototype): + +SmoothQuant is a popular method to improve the accuracy of int8 quantization. The [autotune API](../../../../docs/tutorials/features/sq_recipe_tuning_api.md) allows automatic global alpha tuning, and automatic layer-by-layer alpha tuning provided by Intel® Neural Compressor for the best accuracy. Below is the basic command to generate the qconfig summary files (and quantized model ".pt" file) with the SmoothQuant autotune API. 
+ +```bash +# general command: +OMP_NUM_THREADS= numactl -m -C python run.py --benchmark -m --ipex-smooth-quant --alpha auto --output-dir "saved_results" + +# An example for the llama2 7b model: +OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/Llama-2-7b-hf --ipex-smooth-quant --alpha auto +``` + +## Example command for model tuning with AutoTune API +| Model ID | Command | +|---|:---:| +| meta-llama/Llama-2-13b-hf | python run.py -m meta-llama/Llama-2-13b-hf --ipex-smooth-quant --alpha auto --init-alpha 0.8 --alpha-min 0.75 --alpha-max 0.99 --alpha-step 0.01 --shared-criterion 'max' --calib-len 1024 --calib-padding --fallback-add | +| meta-llama/Llama-2-70b-hf | python run.py -m meta-llama/Llama-2-70b-hf --ipex-smooth-quant --batch-size 56 --calib-shuffle --fallback-add --alpha 0.8 | +| EleutherAI/gpt-j-6b | python run.py -m EleutherAI/gpt-j-6b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --fallback-add --alpha 0.85 | +| tiiuae/falcon-7b | python run.py -m tiiuae/falcon-7b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.95 | +| tiiuae/falcon-40b | python run.py -m tiiuae/falcon-40b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.9 | +| facebook/opt-30b | python run.py -m facebook/opt-30b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle | +| facebook/opt-1.3b | python run.py -m facebook/opt-1.3b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.85 | +| baichuan-inc/Baichuan2-7B-Chat | python run.py -m baichuan-inc/Baichuan2-7B-Chat --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.95 | +| baichuan-inc/Baichuan2-13B-Chat | python run.py -m baichuan-inc/Baichuan2-13B-Chat --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.65 | + +*Note*: The above examples are validated with good accuracy on the "lambada_openai" dataset.
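After an autotune run like the ones above finishes, it can be worth double-checking the tuned model. Below is a minimal sketch that assumes the run saved its quantized model as `best_model.pt` under the chosen output directory (`saved_results` in the general command above); the accuracy command itself is the INT8 single-instance command from the inference README.

```bash
# Sketch: validate the SmoothQuant-tuned INT8 model on the lambada_openai task.
# Assumes the autotune run wrote ./saved_results/best_model.pt.
cd ./single_instance
OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run_accuracy.py -m meta-llama/Llama-2-7b-hf \
  --quantized-model-path "../saved_results/best_model.pt" --dtype int8 --tasks lambada_openai
```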
diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/llm/inference/run.py similarity index 100% rename from examples/cpu/inference/python/llm/run.py rename to examples/cpu/llm/inference/run.py diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/llm/inference/single_instance/run_accuracy.py similarity index 100% rename from examples/cpu/inference/python/llm/single_instance/run_accuracy.py rename to examples/cpu/llm/inference/single_instance/run_accuracy.py diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/llm/inference/single_instance/run_generation.py similarity index 100% rename from examples/cpu/inference/python/llm/single_instance/run_generation.py rename to examples/cpu/llm/inference/single_instance/run_generation.py diff --git a/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py b/examples/cpu/llm/inference/single_instance/run_int4_gpt-j_on_cnndailymail.py similarity index 100% rename from examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py rename to examples/cpu/llm/inference/single_instance/run_int4_gpt-j_on_cnndailymail.py diff --git a/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.sh b/examples/cpu/llm/inference/single_instance/run_int4_gpt-j_on_cnndailymail.sh similarity index 100% rename from examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.sh rename to examples/cpu/llm/inference/single_instance/run_int4_gpt-j_on_cnndailymail.sh diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/llm/inference/single_instance/run_quantization.py similarity index 97% rename from examples/cpu/inference/python/llm/single_instance/run_quantization.py rename to examples/cpu/llm/inference/single_instance/run_quantization.py index a16a277bc..6016198d2 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/llm/inference/single_instance/run_quantization.py @@ -13,33 +13,33 @@ from ast import literal_eval import sys -sys.path.append(sys.path[0] + "/../../") - - -from llm.utils.model_class.llm import EXAMPLE_INPUTS_MODE -from llm.utils.model_class.llama import LLAMAConfig -from llm.utils.model_class.gptj import GPTJConfig -from llm.utils.model_class.gptneox import GPTNEOXConfig -from llm.utils.model_class.falcon import FALCONConfig -from llm.utils.model_class.opt import OPTConfig -from llm.utils.model_class.bloom import BloomConfig -from llm.utils.model_class.codegen import CodeGenConfig -from llm.utils.model_class.baichuan import BaichuanConfig -from llm.utils.model_class.chatglm import ChatGLMConfig -from llm.utils.model_class.gptbigcode import GPTJBigCodeConfig -from llm.utils.model_class.t5 import T5Config -from llm.utils.model_class.mistral import MistralConfig -from llm.utils.model_class.mixtral import MixtralConfig -from llm.utils.model_class.mpt import MPTConfig -from llm.utils.model_class.stablelm import StableLMConfig -from llm.utils.model_class.qwen import QwenConfig -from llm.utils.model_class.qwen2 import Qwen2Config -from llm.utils.model_class.git import GitConfig -from llm.utils.model_class.llava import LlavaConfig -from llm.utils.model_class.phi import PhiConfig -from llm.utils.model_class.phi import Phi3Config -from llm.utils.model_class.yuan import YuanConfig -from llm.utils.model_class.whisper import WhisperConfig +sys.path.append(sys.path[0] + "/../../../") + + 
+from llm.inference.utils.model_class.llm import EXAMPLE_INPUTS_MODE +from llm.inference.utils.model_class.llama import LLAMAConfig +from llm.inference.utils.model_class.gptj import GPTJConfig +from llm.inference.utils.model_class.gptneox import GPTNEOXConfig +from llm.inference.utils.model_class.falcon import FALCONConfig +from llm.inference.utils.model_class.opt import OPTConfig +from llm.inference.utils.model_class.bloom import BloomConfig +from llm.inference.utils.model_class.codegen import CodeGenConfig +from llm.inference.utils.model_class.baichuan import BaichuanConfig +from llm.inference.utils.model_class.chatglm import ChatGLMConfig +from llm.inference.utils.model_class.gptbigcode import GPTJBigCodeConfig +from llm.inference.utils.model_class.t5 import T5Config +from llm.inference.utils.model_class.mistral import MistralConfig +from llm.inference.utils.model_class.mixtral import MixtralConfig +from llm.inference.utils.model_class.mpt import MPTConfig +from llm.inference.utils.model_class.stablelm import StableLMConfig +from llm.inference.utils.model_class.qwen import QwenConfig +from llm.inference.utils.model_class.qwen2 import Qwen2Config +from llm.inference.utils.model_class.git import GitConfig +from llm.inference.utils.model_class.llava import LlavaConfig +from llm.inference.utils.model_class.phi import PhiConfig +from llm.inference.utils.model_class.phi import Phi3Config +from llm.inference.utils.model_class.yuan import YuanConfig +from llm.inference.utils.model_class.whisper import WhisperConfig # The latest model is not compatible with the current transformers/tokenizers, so we specify the revision of the model diff --git a/examples/cpu/inference/python/llm/tools/llava.patch b/examples/cpu/llm/inference/tools/llava.patch similarity index 100% rename from examples/cpu/inference/python/llm/tools/llava.patch rename to examples/cpu/llm/inference/tools/llava.patch diff --git a/examples/cpu/inference/python/llm/tools/prepare_llava.sh b/examples/cpu/llm/inference/tools/prepare_llava.sh similarity index 100% rename from examples/cpu/inference/python/llm/tools/prepare_llava.sh rename to examples/cpu/llm/inference/tools/prepare_llava.sh diff --git a/examples/cpu/inference/python/llm/tools/run_scaling.sh b/examples/cpu/llm/inference/tools/run_scaling.sh similarity index 100% rename from examples/cpu/inference/python/llm/tools/run_scaling.sh rename to examples/cpu/llm/inference/tools/run_scaling.sh diff --git a/examples/cpu/inference/python/llm/utils/create_shard_model.py b/examples/cpu/llm/inference/utils/create_shard_model.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/create_shard_model.py rename to examples/cpu/llm/inference/utils/create_shard_model.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/baichuan.py b/examples/cpu/llm/inference/utils/model_class/baichuan.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/baichuan.py rename to examples/cpu/llm/inference/utils/model_class/baichuan.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/bloom.py b/examples/cpu/llm/inference/utils/model_class/bloom.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/bloom.py rename to examples/cpu/llm/inference/utils/model_class/bloom.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/chatglm.py b/examples/cpu/llm/inference/utils/model_class/chatglm.py similarity index 100% rename from 
examples/cpu/inference/python/llm/utils/model_class/chatglm.py rename to examples/cpu/llm/inference/utils/model_class/chatglm.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/codegen.py b/examples/cpu/llm/inference/utils/model_class/codegen.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/codegen.py rename to examples/cpu/llm/inference/utils/model_class/codegen.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/falcon.py b/examples/cpu/llm/inference/utils/model_class/falcon.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/falcon.py rename to examples/cpu/llm/inference/utils/model_class/falcon.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/git.py b/examples/cpu/llm/inference/utils/model_class/git.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/git.py rename to examples/cpu/llm/inference/utils/model_class/git.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/gptbigcode.py b/examples/cpu/llm/inference/utils/model_class/gptbigcode.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/gptbigcode.py rename to examples/cpu/llm/inference/utils/model_class/gptbigcode.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/gptj.py b/examples/cpu/llm/inference/utils/model_class/gptj.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/gptj.py rename to examples/cpu/llm/inference/utils/model_class/gptj.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/gptneox.py b/examples/cpu/llm/inference/utils/model_class/gptneox.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/gptneox.py rename to examples/cpu/llm/inference/utils/model_class/gptneox.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/llama.py b/examples/cpu/llm/inference/utils/model_class/llama.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/llama.py rename to examples/cpu/llm/inference/utils/model_class/llama.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/llava.py b/examples/cpu/llm/inference/utils/model_class/llava.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/llava.py rename to examples/cpu/llm/inference/utils/model_class/llava.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/llm.py b/examples/cpu/llm/inference/utils/model_class/llm.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/llm.py rename to examples/cpu/llm/inference/utils/model_class/llm.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/mistral.py b/examples/cpu/llm/inference/utils/model_class/mistral.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/mistral.py rename to examples/cpu/llm/inference/utils/model_class/mistral.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/mixtral.py b/examples/cpu/llm/inference/utils/model_class/mixtral.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/mixtral.py rename to examples/cpu/llm/inference/utils/model_class/mixtral.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/mpt.py b/examples/cpu/llm/inference/utils/model_class/mpt.py similarity index 100% rename from 
examples/cpu/inference/python/llm/utils/model_class/mpt.py rename to examples/cpu/llm/inference/utils/model_class/mpt.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/opt.py b/examples/cpu/llm/inference/utils/model_class/opt.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/opt.py rename to examples/cpu/llm/inference/utils/model_class/opt.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/phi.py b/examples/cpu/llm/inference/utils/model_class/phi.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/phi.py rename to examples/cpu/llm/inference/utils/model_class/phi.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/qwen.py b/examples/cpu/llm/inference/utils/model_class/qwen.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/qwen.py rename to examples/cpu/llm/inference/utils/model_class/qwen.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/qwen2.py b/examples/cpu/llm/inference/utils/model_class/qwen2.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/qwen2.py rename to examples/cpu/llm/inference/utils/model_class/qwen2.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/stablelm.py b/examples/cpu/llm/inference/utils/model_class/stablelm.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/stablelm.py rename to examples/cpu/llm/inference/utils/model_class/stablelm.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/t5.py b/examples/cpu/llm/inference/utils/model_class/t5.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/t5.py rename to examples/cpu/llm/inference/utils/model_class/t5.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/whisper.py b/examples/cpu/llm/inference/utils/model_class/whisper.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/whisper.py rename to examples/cpu/llm/inference/utils/model_class/whisper.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/yuan.py b/examples/cpu/llm/inference/utils/model_class/yuan.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_class/yuan.py rename to examples/cpu/llm/inference/utils/model_class/yuan.py diff --git a/examples/cpu/inference/python/llm/utils/model_config/mosaicml_mpt-7b_config.json b/examples/cpu/llm/inference/utils/model_config/mosaicml_mpt-7b_config.json similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_config/mosaicml_mpt-7b_config.json rename to examples/cpu/llm/inference/utils/model_config/mosaicml_mpt-7b_config.json diff --git a/examples/cpu/inference/python/llm/utils/model_config/tiiuae_falcon-40b_config.json b/examples/cpu/llm/inference/utils/model_config/tiiuae_falcon-40b_config.json similarity index 100% rename from examples/cpu/inference/python/llm/utils/model_config/tiiuae_falcon-40b_config.json rename to examples/cpu/llm/inference/utils/model_config/tiiuae_falcon-40b_config.json diff --git a/examples/cpu/inference/python/llm/utils/run_gptq.py b/examples/cpu/llm/inference/utils/run_gptq.py similarity index 100% rename from examples/cpu/inference/python/llm/utils/run_gptq.py rename to examples/cpu/llm/inference/utils/run_gptq.py diff --git a/examples/cpu/llm/tools/env_activate.sh b/examples/cpu/llm/tools/env_activate.sh new file mode 100644 index 
000000000..c5415df0c --- /dev/null +++ b/examples/cpu/llm/tools/env_activate.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +MSG_USAGE="Usage: source $0 [inference|fine-tuning]" +if [ $# -eq 0 ]; then + echo ${MSG_USAGE} + return 1 +fi +MODE=$1 +if [ ${MODE} != "inference" ] && [ ${MODE} != "fine-tuning" ]; then + echo ${MSG_USAGE} + return 2 +fi + +# Setup environment variables for performance on Xeon +export KMP_BLOCKTIME=1 +export KMP_TPAUSE=0 +export KMP_FORKJOIN_BARRIER_PATTERN=dist,dist +export KMP_PLAIN_BARRIER_PATTERN=dist,dist +export KMP_REDUCTION_BARRIER_PATTERN=dist,dist + +BASEFOLDER=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +export LD_PRELOAD=$(bash ${BASEFOLDER}/get_libstdcpp_lib.sh):${LD_PRELOAD} + +function set_ld_preload() { + LIB_DIR=$1 + LIB_NAME=$2 + LIB= + while read line; do + LIB=${line} + break + done < <(find ${LIB_DIR} -name ${LIB_NAME}) + if [ ! -z ${LIB} ]; then + export LD_PRELOAD=${LD_PRELOAD}:${LIB} + echo "Appending ${LIB} to environment variable LD_PRELOAD." + else + echo "Library ${LIB_NAME} is not found. Please append it manually to environment variable LD_PRELOAD." + fi +} + +env | grep CONDA_PREFIX > /dev/null +if [ $? -eq 0 ]; then + set_ld_preload ${CONDA_PREFIX} libiomp5.so + set_ld_preload ${CONDA_PREFIX} libtcmalloc.so +else + set_ld_preload /usr libiomp5.so + set_ld_preload /usr libtcmalloc.so +fi + +ONECCL_PATH=${BASEFOLDER}/../oneCCL_release +if [ ! -d ${ONECCL_PATH} ]; then + echo "Warning: oneCCL is not available." +else + source ${ONECCL_PATH}/env/setvars.sh +fi + +cd ${BASEFOLDER}/../${MODE} +if [ ${MODE} == "inference" ]; then + if [ -f prompt.json ]; then + rm -f prompt.json + fi + wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json + cd single_instance + if [ -f prompt.json ]; then + rm -f prompt.json + fi + ln -s ../prompt.json + cd ../distributed + if [ -f prompt.json ]; then + rm -f prompt.json + fi + ln -s ../prompt.json + cd .. +elif [ ${MODE} == "fine-tuning" ]; then + python -m pip install -r requirements.txt +fi \ No newline at end of file diff --git a/examples/cpu/inference/python/llm/tools/env_setup.sh b/examples/cpu/llm/tools/env_setup.sh similarity index 94% rename from examples/cpu/inference/python/llm/tools/env_setup.sh rename to examples/cpu/llm/tools/env_setup.sh index 5fa6b1155..b239f530d 100644 --- a/examples/cpu/inference/python/llm/tools/env_setup.sh +++ b/examples/cpu/llm/tools/env_setup.sh @@ -38,7 +38,7 @@ done if [ $((${MODE} & 0x02)) -ne 0 ]; then # Enter IPEX root dir - cd ../../../../.. + cd ../../.. if [ ! -f dependency_version.yml ]; then echo "Please check if `pwd` is a valid Intel® Extension for PyTorch* source code directory." @@ -139,7 +139,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then echo "[Error] Command \"conda\" is not available." exit 5 else - conda install -y sysroot_linux-64 + conda install -y sysroot_linux-64 -c conda-forge conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge if [ -z ${CONDA_BUILD_SYSROOT} ]; then source ${CONDA_PREFIX}/etc/conda/activate.d/activate-gcc_linux-64.sh @@ -212,7 +212,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then cd ../.. 
cp -r oneCCL/build/_install ${CCLFOLDER} rm -rf oneCCL - cd intel-extension-for-pytorch/examples/cpu/inference/python/llm + cd intel-extension-for-pytorch/examples/cpu/llm fi if [ $((${MODE} & 0x01)) -ne 0 ]; then set +e @@ -227,19 +227,4 @@ if [ $((${MODE} & 0x01)) -ne 0 ]; then bash ${AUX_INSTALL_SCRIPT} python -m pip install ${WHEELFOLDER}/*.whl rm -rf ${WHEELFOLDER} - if [ -f prompt.json ]; then - rm -f prompt.json - fi - wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json - cd single_instance - if [ -f prompt.json ]; then - rm -f prompt.json - fi - ln -s ../prompt.json - cd ../distributed - if [ -f prompt.json ]; then - rm -f prompt.json - fi - ln -s ../prompt.json -fi -python -m pip install numpy==1.26.4 --force-reinstall \ No newline at end of file +fi \ No newline at end of file diff --git a/examples/cpu/llm/tools/get_libstdcpp_lib.sh b/examples/cpu/llm/tools/get_libstdcpp_lib.sh new file mode 120000 index 000000000..06ce62d04 --- /dev/null +++ b/examples/cpu/llm/tools/get_libstdcpp_lib.sh @@ -0,0 +1 @@ +../../../../tools/get_libstdcpp_lib.sh \ No newline at end of file diff --git a/scripts/compile_bundle.sh b/scripts/compile_bundle.sh index 0fcd74bcb..0bc2437af 100644 --- a/scripts/compile_bundle.sh +++ b/scripts/compile_bundle.sh @@ -220,8 +220,8 @@ if [ ${GCC_CONDA} -eq 1 ]; then echo "Command \"conda\" not found. Exit." exit 2 fi - conda install -y sysroot_linux-64 - conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge + conda install -y sysroot_linux-64 -c conda-forge + conda install -y gcc==12.3 gxx==12.3 cxx-compiler zstd -c conda-forge fi if [ ${GCC_CONDA} -ge 1 ]; then if [ -z ${CONDA_BUILD_SYSROOT} ]; then