From 52f9f74b1a82caabf097a301c83271e79ffbecac Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Fri, 15 Sep 2023 07:23:54 +0800 Subject: [PATCH] Jupyter Notebooks for NeuralChat (#277) * Jupyter Notebooks for NeuralChat Signed-off-by: Lv, Liang1 * update build and deploy chatbot Signed-off-by: Lv, Liang1 * added NeuralChat optimization notebooks. Signed-off-by: Ye, Xinyu * Update tts.py * Add Notebooks for finetuning chatbot on various platforms (#309) * fix config Signed-off-by: XuhuiRen * add notebook Signed-off-by: XuhuiRen --------- Signed-off-by: XuhuiRen * fix as suggestions Signed-off-by: XuhuiRen * Update tts.py * Update build_chatbot_on_spr.ipynb * Update build_chatbot_on_spr.ipynb * Update tts.py * update notebook Signed-off-by: Lv, Liang1 * update notebook Signed-off-by: Lv, Liang1 * fix pylint issue Signed-off-by: Lv, Liang1 --------- Signed-off-by: Lv, Liang1 Signed-off-by: Ye, Xinyu Signed-off-by: XuhuiRen Co-authored-by: Ye, Xinyu Co-authored-by: Liangyx2 <106130696+Liangyx2@users.noreply.github.com> Co-authored-by: Haihao Shen Co-authored-by: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Co-authored-by: XuhuiRen --- .../llm/finetuning/finetuning.py | 2 +- .../neural_chat/README.md | 42 +- .../neural_chat/config.py | 6 +- .../amp_optimization_on_habana_gaudi.ipynb | 61 +++ .../notebooks/amp_optimization_on_spr.ipynb | 94 +++++ .../build_chatbot_on_habana_gaudi.ipynb | 184 ++++++++ .../docs/notebooks/build_chatbot_on_icx.ipynb | 227 ++++++++++ .../notebooks/build_chatbot_on_nv_a100.ipynb | 196 +++++++++ .../docs/notebooks/build_chatbot_on_spr.ipynb | 238 +++++++++++ .../docs/notebooks/build_chatbot_on_xpu.ipynb | 187 ++++++++ .../docs/notebooks/chatbot_on_intel_cpu.ipynb | 370 ---------------- .../chatbot_on_intel_habana_hpu.ipynb | 391 ----------------- .../docs/notebooks/chatbot_on_nv_gpu.ipynb | 399 ------------------ .../deploy_chatbot_on_habana_gaudi.ipynb | 181 ++++++++ .../notebooks/deploy_chatbot_on_icx.ipynb | 193 +++++++++ .../notebooks/deploy_chatbot_on_nv_a100.ipynb | 193 +++++++++ .../notebooks/deploy_chatbot_on_spr.ipynb | 195 +++++++++ .../notebooks/deploy_chatbot_on_xpu.ipynb | 193 +++++++++ .../notebooks/finetuning_on_nv_a100.ipynb | 228 ++++++++++ ...ulti_card_finetuning_on_habana_gaudi.ipynb | 144 +++++++ .../multi_node_finetuning_on_spr.ipynb | 173 ++++++++ ...ngle_card_finetuning_on_habana_gaudi.ipynb | 240 +++++++++++ .../single_node_finetuning_on_spr.ipynb | 228 ++++++++++ .../weight_only_optimization_on_nv_a100.ipynb | 94 +++++ .../neural_chat/pipeline/plugins/audio/tts.py | 9 +- .../neural_chat/requirements.txt | 1 - 26 files changed, 3293 insertions(+), 1176 deletions(-) create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb delete mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_cpu.ipynb delete mode 
100644 intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_habana_hpu.ipynb delete mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_nv_gpu.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/single_node_finetuning_on_spr.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb diff --git a/intel_extension_for_transformers/llm/finetuning/finetuning.py b/intel_extension_for_transformers/llm/finetuning/finetuning.py index 7cadf1b01cc..75ed32ee3fd 100644 --- a/intel_extension_for_transformers/llm/finetuning/finetuning.py +++ b/intel_extension_for_transformers/llm/finetuning/finetuning.py @@ -513,7 +513,7 @@ def concatenate_data(dataset, max_seq_length): data_collator=data_collator, ) else: - from optimum.habana import GaudiConfig, GaudiTrainer # pylint: disable=E0611 + from optimum.habana import GaudiConfig, GaudiTrainer # pylint: disable=E0611 E0401 gaudi_config = GaudiConfig() gaudi_config.use_fused_adam = True diff --git a/intel_extension_for_transformers/neural_chat/README.md b/intel_extension_for_transformers/neural_chat/README.md index 9fc59c44604..3d6c6128b8c 100644 --- a/intel_extension_for_transformers/neural_chat/README.md +++ b/intel_extension_for_transformers/neural_chat/README.md @@ -170,12 +170,38 @@ The table below displays the validated model list in NeuralChat for both inferen ## Jupyter Notebooks -Check out the latest notebooks to know how to build and customize a chatbot on different platforms. - -| **Notebook** | **Description** | -| :----------: | :-------------: | -| [build chatbot on Intel Xeon Platforms](./docs/notebooks/chatbot_on_intel_cpu.ipynb) | create a chatbot on Intel Xeon Platforms| -| [build chatbot on Intel Habana Platforms](./docs/notebooks/chatbot_on_intel_habana_hpu.ipynb) | create a chatbot on Intel Habana Platforms| -| [build chatbot on Nvidia GPU Platforms](./docs/notebooks/chatbot_on_nv_gpu.ipynb) | create a chatbot on Nvidia GPU Platforms| -| [finetune on Nvidia GPU Platforms](./examples/instruction_tuning/finetune_on_Nvidia_GPU.ipynb) | fine-tune LLaMA2 and MPT on Nvidia GPU Platforms| +Welcome to use Jupyter Notebooks to explore how to build and customize chatbots across a wide range of platforms, including Intel Xeon CPU(ICX and SPR), Intel XPU, Intel Habana Gaudi1/Gaudi2, and Nvidia GPU. Dive into our detailed guide to discover how to develop chatbots on these various computing platforms. 
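All of these notebooks build on the same core NeuralChat Python API. As a quick orientation before the chapter table, here is a minimal sketch (it assumes `intel-extension-for-transformers` and the NeuralChat `requirements.txt` are installed; the document path for the retrieval plugin is illustrative):

```python
from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig, plugins

# Plain text chat with the default pipeline.
chatbot = build_chatbot()
print(chatbot.predict("Tell me about Intel Xeon Scalable Processors."))

# Retrieval-augmented chat: point the retrieval plugin at a folder of documents.
plugins.retrieval.enable = True
plugins.retrieval.args["input_path"] = "./assets/docs/"  # illustrative path
chatbot = build_chatbot(PipelineConfig(plugins=plugins))
print(chatbot.predict("How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?"))
```

The notebooks are organized into the following chapters: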
+ +| Chapter | Section | Description | Notebook Link | +| ------- | --------------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------- | +| 1 | Building a Chatbot on different Platforms | | | +| 1.1 | Building a Chatbot on Intel CPU ICX | Learn how to create a chatbot on ICX. | [Notebook](./docs/notebooks/build_chatbot_on_icx.ipynb) | +| 1.2 | Building a Chatbot on Intel CPU SPR | Learn how to create a chatbot on SPR. | [Notebook](./docs/notebooks/build_chatbot_on_spr.ipynb) | +| 1.3 | Building a Chatbot on Intel XPU | Learn how to create a chatbot on XPU. | [Notebook](./docs/notebooks/build_chatbot_on_xpu.ipynb) | +| 1.4 | Building a Chatbot on Habana Gaudi1/Gaudi2 | Instructions for building a chatbot on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/build_chatbot_on_habana_gaudi.ipynb) | +| 1.5 | Building a Chatbot on Nvidia A100 | Learn how to create a chatbot on Nvidia A100 platforms. | [Notebook](./docs/notebooks/build_chatbot_on_nv_a100.ipynb) | +| 2 | Deploying Chatbots as Services on Different Platforms | | | +| 2.1 | Deploying a Chatbot on Intel CPU ICX | Instructions for deploying a chatbot on ICX. | [Notebook](./docs/notebooks/deploy_chatbot_on_icx.ipynb) | +| 2.2 | Deploying a Chatbot on Intel CPU SPR | Instructions for deploying a chatbot on SPR. | [Notebook](./docs/notebooks/deploy_chatbot_on_spr.ipynb) | +| 2.3 | Deploying a Chatbot on Intel XPU | Learn how to deploy a chatbot on Intel XPU. | [Notebook](./docs/notebooks/deploy_chatbot_on_xpu.ipynb) | +| 2.4 | Deploying a Chatbot on Habana Gaudi1/Gaudi2 | Instructions for deploying a chatbot on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb) | +| 2.5 | Deploying a Chatbot on Nvidia A100 | Learn how to deploy a chatbot as a service on Nvidia A100 platforms. | [Notebook](./docs/notebooks/deploy_chatbot_on_nv_a100.ipynb) | +| 2.6 | Deploying Chatbot with load balance | Learn how to deploy a chatbot as a service with load balance. | [Notebook](./docs/notebooks/chatbot_with_load_balance.ipynb) | +| 3 | Optimizing Chatbots on Different Platforms | | | +| 3.1 | AMP Optimization on SPR | Optimize your chatbot using Automatic Mixed Precision (AMP) on SPR platforms. | [Notebook](./docs/notebooks/amp_optimization_on_spr.ipynb) | +| 3.2 | AMP Optimization on Habana Gaudi1/Gaudi2 | Learn how to optimize your chatbot with AMP on Intel Habana Gaudi1/Gaudi2 platforms. | [Notebook](./docs/notebooks/amp_optimization_on_habana_gaudi.ipynb) | +| 3.3 | Weight-Only Optimization on Nvidia A100 | Optimize your chatbot using Weight-Only optimization on Nvidia A100. | [Notebook](./docs/notebooks/weight_only_optimization_on_nv_a100.ipynb) | +| 4 | Fine-Tuning Chatbots on Different Platforms | | | +| 4.1 | Single Node Fine-Tuning on SPR | Fine-tune your chatbot on SPR platforms using single node. | [Notebook](./docs/notebooks/single_node_finetuning_on_spr.ipynb) | +| 4.2 | Multi-Node Fine-Tuning on SPR | Fine-tune your chatbot on SPR platforms using multiple nodes. | [Notebook](./docs/notebooks/multi_node_finetuning_on_spr.ipynb) | +| 4.3 | Single-Card Fine-Tuning on Habana Gaudi1/Gaudi2 | Instructions for single-card fine-tuning on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb) | +| 4.4 | Multi-Card Fine-Tuning on Habana Gaudi1/Gaudi2 | Learn how to perform multi-card fine-tuning on Intel Habana Gaudi1/Gaudi2. 
| [Notebook](./docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb) | +| 4.5 | Fine-Tuning on Nvidia A100 | Fine-tune your chatbot on Nvidia A100 platforms. | [Notebook](./docs/notebooks/finetuning_on_nv_a100.ipynb) | +| 5 | Customizing Chatbots on Different Platforms | | | +| 5.1 | Using Plugins to Customize Chatbots | Customize your chatbot using plugins. | [Notebook](./docs/notebooks/customize_chatbot_with_plugins.ipynb) | +| 5.2 | Registering New Models to Customize Chatbots | | | +| 5.2.1 | Using Fine-Tuned Models to Customize Chatbots | Instructions for using fine-tuned models to customize chatbots. | [Notebook](./docs/notebooks/customize_chatbot_with_finetuned_models.ipynb) | +| 5.2.2 | Using Optimized Models to Customize Chatbots | Customize chatbots using optimized models. | [Notebook](./docs/notebooks/customize_chatbot_with_optimized_models.ipynb) | +| 5.2.3 | Using New LLM Models to Customize Chatbots | Learn how to use new LLM models for chatbot customization. | [Notebook](./docs/notebooks/customize_chatbot_with_new_llm_models.ipynb) | + diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index 441ee840f72..7248dc0bcb7 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -81,7 +81,7 @@ class ModelArguments: }, ) use_fast_tokenizer: bool = field( - default=True, + default=False, metadata={ "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not." }, @@ -312,7 +312,7 @@ class FinetuningArguments: }, ) lora_all_linear: bool = field( - default=False, + default=True, metadata={"help": "if True, will add adaptor for all linear for lora finetuning"}, ) task: Optional[str] = field( @@ -322,7 +322,7 @@ class FinetuningArguments: }, ) do_lm_eval: bool = field( - default=False, + default=True, metadata={"help": "whether to run the LM evaluation with EleutherAI/lm-evaluation-harness"}, ) lm_eval_tasks: Optional[List[str]] = field( diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb new file mode 100644 index 00000000000..699e065d568 --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb @@ -0,0 +1,61 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AMP Optimization of Chatbot on Habana's Gaudi processors(HPU)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**IMPORTANT:** Please note Habana's Gaudi processors(HPU) requires docker environment for running. User needs to manually execute below steps to build docker image and run docker container for inference on Habana HPU. The Jupyter notebook server should be started in the docker container and then run this Jupyter notebook. \n", + "\n", + "```bash\n", + "git clone https://github.com/intel/intel-extension-for-transformers.git\n", + "cd ./intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/docker/\n", + "docker build --build-arg UBUNTU_VER=22.04 -f Dockerfile -t neuralchat . 
--target hpu\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host neuralchat:latest\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BF16 Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n", + "config = PipelineConfig(optimization_config=AMPConfig())\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb new file mode 100644 index 00000000000..7a384788c6b --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AMP Optimization of Chatbot on 4th Generation of Intel® Xeon® Scalable Processors Sapphire Rapids" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BF16 Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n", + "config = PipelineConfig(optimization_config=AMPConfig())\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py39", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb new file mode 100644 index 00000000000..d9e48cb50e4 --- /dev/null +++ 
b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb @@ -0,0 +1,184 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to build a talking chatbot on Habana's Gaudi processors(HPU)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to streamline the process, users can construct a Docker image employing a Dockerfile, initiate the Docker container, and then proceed to execute inference or finetuning operations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**IMPORTANT:** Please note Habana's Gaudi processors(HPU) requires docker environment for running. User needs to manually execute below steps to build docker image and run docker container for inference and finetuning on Habana HPU. The Jupyter notebook server should be started in the docker container and then run this Jupyter notebook. \n", + "\n", + "```bash\n", + "git clone https://github.com/intel/intel-extension-for-transformers.git\n", + "cd ./intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/docker/\n", + "docker build --build-arg UBUNTU_VER=22.04 -f Dockerfile -t neuralchat . --target hpu\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host neuralchat:latest\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inference 💻" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Chat" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Giving NeuralChat the textual instruction, it will respond with the textual response." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "chatbot = build_chatbot()\n", + "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Chat With Retrieval Plugin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "User could also leverage NeuralChat Retrieval plugin to do domain specific chat by feding with some documents like below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.retrieval.enable=True\n", + "plugins.retrieval.args[\"input_path\"]=\"../../assets/docs/\"\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Voice Chat with ASR & TTS Plugin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", + "\n", + "For the Python API code, users have the option to enable different voice chat modes by setting ASR and TTS plugins enable or disable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can display the generated wav file using IPython." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb new file mode 100644 index 00000000000..e847f6ea728 --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb @@ -0,0 +1,227 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to build a talking chatbot on 3rd Generation of Intel® Xeon® Scalable Processors Ice Lake." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build your chatbot 💻" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Chat" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Giving NeuralChat the textual instruction, it will respond with the textual response." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Python Code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "chatbot = build_chatbot()\n", + "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "CLI command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!neuralchat predict --query \"Tell me about Intel Xeon Scalable Processors.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Chat With Retrieval Plugin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "User could also leverage NeuralChat Retrieval plugin to do domain specific chat by feding with some documents like below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.retrieval.enable=True\n", + "plugins.retrieval.args[\"input_path\"]=\"../../assets/docs/\"\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Voice Chat with ASR & TTS Plugin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", + "\n", + "For the Python API code, users have the option to enable different voice chat modes by setting ASR and TTS plugins enable or disable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can display the generated wav file using IPython." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb new file mode 100644 index 00000000000..e31c6ac5d3b --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb @@ -0,0 +1,196 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to build a talking chatbot on Nvidia GPUs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inference 💻" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Chat" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Giving NeuralChat the textual instruction, it will respond with the textual response." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "chatbot = build_chatbot()\n", + "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Chat With Retrieval Plugin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "User could also leverage NeuralChat Retrieval plugin to do domain specific chat by feding with some documents like below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.retrieval.enable=True\n", + "plugins.retrieval.args[\"input_path\"]=\"../../assets/docs/\"\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Voice Chat with ASR & TTS Plugin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", + "\n", + "For the Python API code, users have the option to enable different voice chat modes by setting ASR and TTS plugins enable or disable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can display the generated wav file using IPython." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb new file mode 100644 index 00000000000..5393ee0534b --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb @@ -0,0 +1,238 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to build a talking chatbot on 4th Generation of Intel® Xeon® Scalable Processors Sapphire Rapids.\n", + "\n", + "The 4th Generation of Intel® Xeon® Scalable processor provides two instruction sets viz. AMX_BF16 and AMX_INT8 which provides acceleration for bfloat16 and int8 operations respectively." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inference 💻" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Chat" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Giving NeuralChat the textual instruction, it will respond with the textual response." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# BF16 Optimization\n", + "from intel_extension_for_transformers.neural_chat.config import AMPConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n", + "config = PipelineConfig(optimization_config=AMPConfig(), model_name_or_path='mosaicml/mpt-7b-chat',tokenizer_name_or_path='EleutherAI/gpt-neox-20b')\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Chat With Retrieval Plugin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "User could also leverage NeuralChat Retrieval plugin to do domain specific chat by feding with some documents like below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.retrieval.enable=True\n", + "plugins.retrieval.args[\"input_path\"]=\"../../assets/docs/\"\n", + "config = PipelineConfig(plugins=plugins, model_name_or_path='mosaicml/mpt-7b-chat',tokenizer_name_or_path='EleutherAI/gpt-neox-20b')\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Voice Chat with ASR & TTS Plugin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", + "\n", + "For the Python API code, users have the option to enable different voice chat modes by setting ASR and TTS plugins enable or disable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!curl -OL https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/spk_embed_default.pt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can display the generated wav file using IPython." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Low Precision Optimization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BF16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# BF16 Optimization\n", + "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n", + "config = PipelineConfig(optimization_config=AMPConfig())\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb new file mode 100644 index 00000000000..771d0881154 --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb @@ -0,0 +1,187 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to build a talking chatbot on Intel® Data Center GPU Flex Series 170, Intel® Data Center GPU Max Series and Intel® Arc™ A-Series GPUs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "%pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inference 💻" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Chat" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Giving NeuralChat the textual instruction, it will respond with the textual response." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "chatbot = build_chatbot()\n", + "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Chat With RAG Plugin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "User could also leverage NeuralChat RAG plugin to do domain specific chat by feding with some documents like below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.retrieval.enable=True\n", + "plugins.retrieval.args[\"input_path\"]=\"./assets/docs/\"\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Voice Chat with ATS & TTS Plugin" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", + "\n", + "For the Python API code, users have the option to enable different voice chat modes by setting ASR and TTS plugins enable or disable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot, plugins\n", + "plugins.asr.enable = True\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"]=\"./output_audio.wav\"\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can display the generated wav file using IPython." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_cpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_cpu.ipynb deleted file mode 100644 index ca6c5478c15..00000000000 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_cpu.ipynb +++ /dev/null @@ -1,370 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generative AI: Develop and Optimize Your Own Talking Chatbot on Intel CPU" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to build a talking chatbot on Intel 4th Gen Xeon Scalable Processors." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepare Environment" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "%%bash\n", - "%pip install intel-extension-for-transformers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inference 💻" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Chat" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Giving NeuralChat the textual instruction, it will respond with the textual response." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "chatbot = build_chatbot()\n", - "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Chat With RAG Plugin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "User could also leverage NeuralChat RAG plugin to do domain specific chat by feding with some documents like below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "from intel_extension_for_transformers.neural_chat import plugins\n", - "plugins.retrieval.enable=True\n", - "plugins.retrieval.args[\"input_path\"]=\"./assets/docs/\"\n", - "config = PipelineConfig(plugins=plugins)\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Voice Chat with ATS & TTS Plugin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", - "\n", - "For the Python API code, users have the option to enable different voice chat modes by setting audio_input to True for input or audio_output to True for output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "config = PipelineConfig(audio_input=True, audio_output=True)\n", - "chatbot = build_chatbot(config)\n", - "result = chatbot.predict(query=\"./assets/audio/sample.wav\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Finetuning 🔧" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finetune the pretrained large language model (LLM) with the instruction-following dataset for creating the customized chatbot is very easy for NeuralChat." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Finetuning LLM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import TextGenerationFinetuningConfig\n", - "from intel_extension_for_transformers.neural_chat import finetune_model\n", - "finetune_cfg = TextGenerationFinetuningConfig()\n", - "finetuned_model = finetune_model(finetune_cfg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Finetuning TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import TTSFinetuningConfig\n", - "from intel_extension_for_transformers.neural_chat import finetune_model\n", - "finetune_cfg = TTSFinetuningConfig()\n", - "finetuned_model = finetune_model(finetune_cfg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Low Precision Optimization 🚀" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BF16" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# BF16 Optimization\n", - "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n", - "config = PipelineConfig(optimization_config=AMPConfig())\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Weight-Only Quantization" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Weight-Only Quantization\n", - "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantizationConfig\n", - "config = PipelineConfig(optimization_config=WeightOnlyQuantizationConfig())\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Client-Server Architecture for Performance and Scalability" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Quick Start Local Server" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "❗ Please notice that the server is running on the background. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import multiprocessing\n", - "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", - "import nest_asyncio\n", - "nest_asyncio.apply()\n", - "\n", - "def start_service():\n", - " server_executor = NeuralChatServerExecutor()\n", - " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", - "multiprocessing.Process(target=start_service).start()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Text Chat Service " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import TextChatClientExecutor\n", - "executor = TextChatClientExecutor()\n", - "result = executor(\n", - " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server entry point \n", - " )\n", - "print(result.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Voice Chat Service" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import VoiceChatClientExecutor\n", - "executor = VoiceChatClientExecutor()\n", - "result = executor(\n", - " audio_input_path='./assets/audio/sample.wav',\n", - " audio_output_path='./results.wav',\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server entry point \n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import IPython\n", - "# Play input audio\n", - "print(\" Play Input Audio ......\")\n", - "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", - "# Play output audio\n", - "print(\" Play Output Audio ......\")\n", - "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Finetune Service" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import FinetuingClientExecutor\n", - "executor = FinetuingClientExecutor()\n", - "tuning_status = executor(\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_habana_hpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_habana_hpu.ipynb deleted file mode 100644 index efd2f5b5dc3..00000000000 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_habana_hpu.ipynb +++ /dev/null @@ -1,391 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generative AI: Develop and Optimize Your Own Talking 
Chatbot on Habana HPU" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to build a talking chatbot on Habana's Gaudi processors(HPU)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepare Environment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to streamline the process, users can construct a Docker image employing a Dockerfile, initiate the Docker container, and then proceed to execute inference or finetuning operations." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**IMPORTANT:** Please note Habana's Gaudi processors(HPU) requires docker environment for running. User needs to manually execute below steps to build docker image and run docker container for inference on Habana HPU. The Jupyter notebook server should be started in the docker container and then run this Jupyter notebook. \n", - "\n", - "```bash\n", - "git clone https://github.com/intel/intel-extension-for-transformers.git\n", - "cd intel-extension-for-transformers/docker/inference/\n", - "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbothabana:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n", - "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbothabana:latest\n", - "```\n", - "\n", - "To run finetuning on Habana HPU, please execute below steps\n", - "\n", - "```bash\n", - "git clone https://github.com/intel/intel-extension-for-transformers.git\n", - "cd intel-extension-for-transformers/docker/finetuning/\n", - "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbot_finetuning:latest ./ -f Dockerfile --target hpu\n", - "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -v /dev/shm:/dev/shm -v /absolute/path/to/llama2:/llama2 -v /absolute/path/to/alpaca_data.json:/dataset/alpaca_data.json --cap-add=sys_nice --net=host --ipc=host chatbot_finetuning:latest\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inference 💻" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Chat" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Giving NeuralChat the textual instruction, it will respond with the textual response." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "chatbot = build_chatbot()\n", - "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Chat With RAG Plugin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "User could also leverage NeuralChat RAG plugin to do domain specific chat by feding with some documents like below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "from intel_extension_for_transformers.neural_chat import plugins\n", - "plugins.retrieval.enable=True\n", - "plugins.retrieval.args[\"input_path\"]=\"./assets/docs/\"\n", - "config = PipelineConfig(plugins=plugins)\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Voice Chat with ATS & TTS Plugin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", - "\n", - "For the Python API code, users have the option to enable different voice chat modes by setting audio_input to True for input or audio_output to True for output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "config = PipelineConfig(audio_input=True, audio_output=True)\n", - "chatbot = build_chatbot(config)\n", - "result = chatbot.predict(query=\"./assets/audio/sample.wav\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Finetuning 🔧" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finetune the pretrained large language model (LLM) with the instruction-following dataset for creating the customized chatbot is very easy for NeuralChat." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Finetuning LLM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import TextGenerationFinetuningConfig\n", - "from intel_extension_for_transformers.neural_chat import finetune_model\n", - "finetune_cfg = TextGenerationFinetuningConfig()\n", - "finetuned_model = finetune_model(finetune_cfg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Finetuning TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import TTSFinetuningConfig\n", - "from intel_extension_for_transformers.neural_chat import finetune_model\n", - "finetune_cfg = TTSFinetuningConfig()\n", - "finetuned_model = finetune_model(finetune_cfg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Low Precision Optimization 🚀" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BF16" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# BF16 Optimization\n", - "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n", - "config = PipelineConfig(optimization_config=AMPConfig())\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Weight-Only Quantization" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Weight-Only Quantization\n", - "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantizationConfig\n", - "config = PipelineConfig(optimization_config=WeightOnlyQuantizationConfig())\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Client-Server Architecture for Performance and Scalability" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Quick Start Local Server" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "❗ Please notice that the server is running on the background. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import multiprocessing\n", - "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", - "import nest_asyncio\n", - "nest_asyncio.apply()\n", - "\n", - "def start_service():\n", - " server_executor = NeuralChatServerExecutor()\n", - " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", - "multiprocessing.Process(target=start_service).start()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Text Chat Service " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import TextChatClientExecutor\n", - "executor = TextChatClientExecutor()\n", - "result = executor(\n", - " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server entry point \n", - " )\n", - "print(result.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Voice Chat Service" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import VoiceChatClientExecutor\n", - "executor = VoiceChatClientExecutor()\n", - "result = executor(\n", - " audio_input_path='./assets/audio/sample.wav',\n", - " audio_output_path='./results.wav',\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server entry point \n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import IPython\n", - "# Play input audio\n", - "print(\" Play Input Audio ......\")\n", - "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", - "# Play output audio\n", - "print(\" Play Output Audio ......\")\n", - "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Finetune Service" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import FinetuingClientExecutor\n", - "executor = FinetuingClientExecutor()\n", - "tuning_status = executor(\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_nv_gpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_nv_gpu.ipynb deleted file mode 100644 index c31e0d367cb..00000000000 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_nv_gpu.ipynb +++ /dev/null @@ -1,399 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generative AI: Develop and Optimize Your Own Talking Chatbot on Nvidia GPU" - ] - }, 
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to build a talking chatbot on Nvidia GPUs." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepare Environment" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "%%bash\n", - "%pip install intel-extension-for-transformers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inference 💻" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Chat" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Giving NeuralChat the textual instruction, it will respond with the textual response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "chatbot = build_chatbot()\n", - "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Chat With RAG Plugin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "User could also leverage NeuralChat RAG plugin to do domain specific chat by feding with some documents like below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "from intel_extension_for_transformers.neural_chat import plugins\n", - "plugins.retrieval.enable=True\n", - "plugins.retrieval.args[\"input_path\"]=\"./assets/docs/\"\n", - "config = PipelineConfig(plugins=plugins)\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Voice Chat with ATS & TTS Plugin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", - "\n", - "For the Python API code, users have the option to enable different voice chat modes by setting audio_input to True for input or audio_output to True for output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "config = PipelineConfig(audio_input=True, audio_output=True)\n", - "chatbot = build_chatbot(config)\n", - "result = chatbot.predict(query=\"./assets/audio/sample.wav\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Finetuning 🔧" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finetune the pretrained large language model (LLM) with the instruction-following dataset for creating the customized chatbot is very easy for NeuralChat." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Finetuning LLM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import TextGenerationFinetuningConfig\n", - "from intel_extension_for_transformers.neural_chat import finetune_model\n", - "finetune_cfg = TextGenerationFinetuningConfig()\n", - "finetuned_model = finetune_model(finetune_cfg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Finetuning TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import TTSFinetuningConfig\n", - "from intel_extension_for_transformers.neural_chat import finetune_model\n", - "finetune_cfg = TTSFinetuningConfig()\n", - "finetuned_model = finetune_model(finetune_cfg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Low Precision Optimization 🚀" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## FP16" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# FP16 Optimization\n", - "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n", - "config = PipelineConfig(optimization_config=AMPConfig(dtype=\"float16\"))\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Weight-Only Quantization" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Weight-Only Quantization\n", - "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantizationConfig\n", - "config = PipelineConfig(optimization_config=WeightOnlyQuantizationConfig())\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Bitsandbytes Quantization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Bitsandbytes Quantization\n", - "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, BitsAndBytesConfig\n", - "config = PipelineConfig(\n", - " device='cuda',\n", - " optimization_config=BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_quant_type='nf4',\n", - " bnb_4bit_use_double_quant=True,\n", - " bnb_4bit_compute_dtype=\"bfloat16\"\n", - " )\n", - ")\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Client-Server Architecture for Performance and Scalability" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Quick Start Local Server" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "❗ Please notice that the server is running on the background. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import multiprocessing\n", - "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", - "import nest_asyncio\n", - "nest_asyncio.apply()\n", - "\n", - "def start_service():\n", - " server_executor = NeuralChatServerExecutor()\n", - " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", - "multiprocessing.Process(target=start_service).start()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Text Chat Service " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import TextChatClientExecutor\n", - "executor = TextChatClientExecutor()\n", - "result = executor(\n", - " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server entry point \n", - " )\n", - "print(result.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Voice Chat Service" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import VoiceChatClientExecutor\n", - "executor = VoiceChatClientExecutor()\n", - "result = executor(\n", - " audio_input_path='./assets/audio/sample.wav',\n", - " audio_output_path='./results.wav',\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server entry point \n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import IPython\n", - "# Play input audio\n", - "print(\" Play Input Audio ......\")\n", - "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", - "# Play output audio\n", - "print(\" Play Output Audio ......\")\n", - "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Finetune Service" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import FinetuingClientExecutor\n", - "executor = FinetuingClientExecutor()\n", - "tuning_status = executor(\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb new file mode 100644 index 00000000000..4f86b55ee43 --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework 
designed to create your own chatbot within a few minutes on multiple architectures. This notebook demonstrates how to deploy a talking chatbot as a service on Habana's Gaudi processors (HPU)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to streamline the process, users can construct a Docker image employing a Dockerfile, initiate the Docker container, and then proceed to execute inference or finetuning operations." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**IMPORTANT:** Please note that Habana's Gaudi processors (HPU) require a Docker environment. Users need to manually execute the steps below to build the Docker image and run the Docker container for inference and finetuning on Habana HPU. The Jupyter notebook server should be started inside the Docker container, and this notebook should then be run from it. \n", + "\n", + "```bash\n", + "git clone https://github.com/intel/intel-extension-for-transformers.git\n", + "cd ./intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/docker/\n", + "docker build --build-arg UBUNTU_VER=22.04 -f Dockerfile -t neuralchat . --target hpu\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host neuralchat:latest\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Client-Server Architecture for Performance and Scalability" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quick Start Local Server" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "❗ Please note that the server runs in the background. 
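Because the next cell launches the server as a background process, the endpoint may need a short while before it accepts requests. Below is a minimal sketch (standard library only; the 127.0.0.1:8000 host and port are assumptions matching the client cells later in this notebook) that waits until the port is reachable. Run it after starting the server and before invoking any client.

```python
# Hypothetical readiness check: poll the server port until it accepts TCP
# connections or the timeout expires. Host/port are assumptions that mirror
# the client examples below; adjust them if your neuralchat.yaml differs.
import socket
import time

def wait_for_server(host="127.0.0.1", port=8000, timeout=300):
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with socket.create_connection((host, port), timeout=5):
                return True   # port is open, the server process is up
        except OSError:
            time.sleep(2)     # not ready yet, retry
    return False

print("server ready:", wait_for_server())
```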
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import multiprocessing\n", + "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "def start_service():\n", + " server_executor = NeuralChatServerExecutor()\n", + " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", + "multiprocessing.Process(target=start_service).start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Text Chat Service " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import TextChatClientExecutor\n", + "executor = TextChatClientExecutor()\n", + "result = executor(\n", + " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n", + "print(result.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Voice Chat Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import VoiceChatClientExecutor\n", + "executor = VoiceChatClientExecutor()\n", + "result = executor(\n", + " audio_input_path='./assets/audio/sample.wav',\n", + " audio_output_path='./results.wav',\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import IPython\n", + "# Play input audio\n", + "print(\" Play Input Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", + "# Play output audio\n", + "print(\" Play Output Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Finetune Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import FinetuingClientExecutor\n", + "executor = FinetuingClientExecutor()\n", + "tuning_status = executor(\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb new file mode 100644 index 00000000000..f9749cda5ea --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own 
chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to deploy a talking chatbot as a service on 3rd Generation of Intel® Xeon® Scalable Processors Ice Lake." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Client-Server Architecture for Performance and Scalability" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quick Start Local Server" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "❗ Please notice that the server is running on the background. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import multiprocessing\n", + "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "def start_service():\n", + " server_executor = NeuralChatServerExecutor()\n", + " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", + "multiprocessing.Process(target=start_service).start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Text Chat Service " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import TextChatClientExecutor\n", + "executor = TextChatClientExecutor()\n", + "result = executor(\n", + " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n", + "print(result.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Voice Chat Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import VoiceChatClientExecutor\n", + "executor = VoiceChatClientExecutor()\n", + "result = executor(\n", + " audio_input_path='./assets/audio/sample.wav',\n", + " audio_output_path='./results.wav',\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import IPython\n", + "# Play input audio\n", + "print(\" Play Input Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", + "# Play output audio\n", + "print(\" Play Output Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Finetune Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import 
FinetuingClientExecutor\n", + "executor = FinetuingClientExecutor()\n", + "tuning_status = executor(\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb new file mode 100644 index 00000000000..e58ed93e3de --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to deploy a talking chatbot as a service on Nvidia GPUs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Client-Server Architecture for Performance and Scalability" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quick Start Local Server" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "❗ Please notice that the server is running on the background. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import multiprocessing\n", + "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "def start_service():\n", + " server_executor = NeuralChatServerExecutor()\n", + " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", + "multiprocessing.Process(target=start_service).start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Text Chat Service " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import TextChatClientExecutor\n", + "executor = TextChatClientExecutor()\n", + "result = executor(\n", + " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n", + "print(result.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Voice Chat Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import VoiceChatClientExecutor\n", + "executor = VoiceChatClientExecutor()\n", + "result = executor(\n", + " audio_input_path='./assets/audio/sample.wav',\n", + " audio_output_path='./results.wav',\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import IPython\n", + "# Play input audio\n", + "print(\" Play Input Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", + "# Play output audio\n", + "print(\" Play Output Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Finetune Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import FinetuingClientExecutor\n", + "executor = FinetuingClientExecutor()\n", + "tuning_status = executor(\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb new file mode 100644 index 00000000000..9ab1f885ca4 --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own 
chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to deploy a talking chatbot as a service on 4th Generation of Intel® Xeon® Scalable Processors Sapphire Rapids.\n", + "\n", + "The 4th Generation of Intel® Xeon® Scalable processor provides two instruction sets viz. AMX_BF16 and AMX_INT8 which provides acceleration for bfloat16 and int8 operations respectively." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Client-Server Architecture for Performance and Scalability" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quick Start Local Server" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "❗ Please notice that the server is running on the background. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import multiprocessing\n", + "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "def start_service():\n", + " server_executor = NeuralChatServerExecutor()\n", + " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", + "multiprocessing.Process(target=start_service).start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Text Chat Service " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import TextChatClientExecutor\n", + "executor = TextChatClientExecutor()\n", + "result = executor(\n", + " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n", + "print(result.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Voice Chat Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import VoiceChatClientExecutor\n", + "executor = VoiceChatClientExecutor()\n", + "result = executor(\n", + " audio_input_path='./assets/audio/sample.wav',\n", + " audio_output_path='./results.wav',\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import IPython\n", + "# Play input audio\n", + "print(\" Play Input Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", + "# Play output audio\n", + "print(\" Play Output Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "## Access Finetune Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import FinetuingClientExecutor\n", + "executor = FinetuingClientExecutor()\n", + "tuning_status = executor(\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb new file mode 100644 index 00000000000..b61eb709713 --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to deploy a talking chatbot as a service on Intel® Data Center GPU Flex Series 170, Intel® Data Center GPU Max Series and Intel® Arc™ A-Series GPUs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Client-Server Architecture for Performance and Scalability" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quick Start Local Server" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "❗ Please notice that the server is running on the background. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import multiprocessing\n", + "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "def start_service():\n", + " server_executor = NeuralChatServerExecutor()\n", + " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", + "multiprocessing.Process(target=start_service).start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Text Chat Service " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import TextChatClientExecutor\n", + "executor = TextChatClientExecutor()\n", + "result = executor(\n", + " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n", + "print(result.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Voice Chat Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import VoiceChatClientExecutor\n", + "executor = VoiceChatClientExecutor()\n", + "result = executor(\n", + " audio_input_path='./assets/audio/sample.wav',\n", + " audio_output_path='./results.wav',\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import IPython\n", + "# Play input audio\n", + "print(\" Play Input Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", + "# Play output audio\n", + "print(\" Play Output Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Finetune Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import FinetuingClientExecutor\n", + "executor = FinetuingClientExecutor()\n", + "tuning_status = executor(\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb new file mode 100644 index 00000000000..ed8805283c7 --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finetune Your Chatbot on Nvidia A100 GPU" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook will introduce how to finetune your chatbot on the customized data on Nvidia A100 GPU." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recommend to use Python 3.9 or higher version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare the Dataset\n", + "We select 3 kind of datasets to conduct the finetuning process for different tasks.\n", + "\n", + "1. Text Generation (General domain instruction): We use the [Alpaca dataset](https://github.com/tatsu-lab/stanford_alpaca) from Stanford University as the general domain dataset to fine-tune the model. This dataset is provided in the form of a JSON file, [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). In Alpaca, researchers have manually crafted 175 seed tasks to guide `text-davinci-003` in generating 52K instruction data for diverse tasks.\n", + "\n", + "2. Summarization: An English-language dataset [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail, is used for this task.\n", + "\n", + "3. Code Generation: To enhance code performance of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finetune Your Chatbot\n", + "\n", + "We employ the [LoRA approach](https://arxiv.org/pdf/2106.09685.pdf) to finetune the LLM efficiently." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on Alpaca-format dataset to conduct text generation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(train_file=\"alpaca_data.json\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on the summarization task:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(dataset_name=\"cnn_dailymail\", dataset_config_name=\"3.0.0\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on the code generation task:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = 
DataArguments(dataset_name=\"theblackcat102/evol-codealpaca-v1\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb new file mode 100644 index 00000000000..8d705e82447 --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finetune Your Chatbot on Habana Gaudi \n", + "\n", + "NeuralChat is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook will introduce how to finetune your chatbot with customized data on multiple Intel Habana Gaudi processors.\n", + "\n", + "## Prepare Environment\n", + "In order to streamline the process, users can construct a Docker image employing a Dockerfile, initiate the Docker container, and then proceed to execute inference or finetuning operations.\n", + "\n", + "IMPORTANT: Please note that Habana's Gaudi processors (HPU) require a Docker environment. Users need to manually execute the steps below to build the Docker image and run the Docker container for finetuning on Habana HPU. 
The Jupyter notebook server should be started inside the Docker container, and this notebook should then be run from it.\n", + "\n", + "To run finetuning on Habana HPU, please execute the steps below:\n", + "\n", + "```bash\n", + "git clone https://github.com/intel/intel-extension-for-transformers.git\n", + "cd intel-extension-for-transformers/neural_chat/docker/finetuning/\n", + "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbot_finetuning:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbot_finetuning:latest\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare the Dataset\n", + "We select three kinds of datasets to conduct the finetuning process for different tasks.\n", + "\n", + "1. Text Generation (General domain instruction): We use the [Alpaca dataset](https://github.com/tatsu-lab/stanford_alpaca) from Stanford University as the general domain dataset to fine-tune the model. This dataset is provided in the form of a JSON file, [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). In Alpaca, researchers have manually crafted 175 seed tasks to guide `text-davinci-003` in generating 52K instruction data for diverse tasks.\n", + "\n", + "2. Summarization: An English-language dataset [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail), containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail, is used for this task.\n", + "\n", + "3. Code Generation: To enhance the code performance of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1) dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finetune Your Chatbot\n", + "You can use `gaudi_spawn.py` to automatically complete the multi-card setup on Habana Gaudi. 
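If the Alpaca instruction file referenced above is not yet available on the machine, the following sketch fetches it. The download URL is an assumption derived from the repository link above (the raw GitHub path for alpaca_data.json); adjust it, or point --train_file at your own copy, if the upstream layout changes.

```python
# Hypothetical download helper for the Alpaca instruction data used below.
# The URL is an assumption based on the repository linked above.
import json
import urllib.request

ALPACA_URL = ("https://raw.githubusercontent.com/tatsu-lab/"
              "stanford_alpaca/main/alpaca_data.json")

urllib.request.urlretrieve(ALPACA_URL, "alpaca_data.json")  # large file, may take a while
with open("alpaca_data.json") as f:
    samples = json.load(f)
print(len(samples), "instruction samples")                  # expected: roughly 52K entries
```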
Then, you can train your chatbot with Alpaca dataset.\n", + "```bash\n", + "python gaudi_spawn.py \\\n", + " --world_size 8 --use_mpi finetune_clm.py \\\n", + " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", + " --bf16 True \\\n", + " --train_file \"/path/to/alpaca_data.json\" \\\n", + " --dataset_concatenation \\\n", + " --per_device_train_batch_size 2 \\\n", + " --per_device_eval_batch_size 2 \\\n", + " --gradient_accumulation_steps 4 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --do_train \\\n", + " --num_train_epochs 3 \\\n", + " --overwrite_output_dir \\\n", + " --log_level info \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --peft lora \\\n", + " --use_fast_tokenizer false \\\n", + " --device \"habana\" \\\n", + " --use_habana \\\n", + " --use_lazy_mode \\\n", + "```\n", + "\n", + "Train your chatbot on the summarization task.\n", + "```bash\n", + "python gaudi_spawn.py \\\n", + " --world_size 8 --use_mpi finetune_clm.py \\\n", + " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", + " --bf16 True \\\n", + " --dataset_name \"cnn_dailymail\" \\\n", + " --dataset_config_name \"3.0.0\" \\\n", + " --dataset_concatenation \\\n", + " --per_device_train_batch_size 2 \\\n", + " --per_device_eval_batch_size 2 \\\n", + " --gradient_accumulation_steps 4 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --do_train \\\n", + " --num_train_epochs 3 \\\n", + " --overwrite_output_dir \\\n", + " --log_level info \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --peft lora \\\n", + " --use_fast_tokenizer false \\\n", + " --device \"habana\" \\\n", + " --use_habana \\\n", + " --use_lazy_mode \\\n", + "```\n", + "\n", + "Train your chatbot on the code generation task:\n", + "```bash\n", + "python gaudi_spawn.py \\\n", + " --world_size 8 --use_mpi finetune_clm.py \\\n", + " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", + " --bf16 True \\\n", + " --dataset_name \"theblackcat102/evol-codealpaca-v1\" \\\n", + " --dataset_concatenation \\\n", + " --per_device_train_batch_size 2 \\\n", + " --per_device_eval_batch_size 2 \\\n", + " --gradient_accumulation_steps 4 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --do_train \\\n", + " --num_train_epochs 3 \\\n", + " --overwrite_output_dir \\\n", + " --log_level info \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --peft lora \\\n", + " --use_fast_tokenizer false \\\n", + " --device \"habana\" \\\n", + " --use_habana \\\n", + " --use_lazy_mode \\\n", + "```\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb new file mode 100644 index 00000000000..109627c0696 --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + 
{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finetune Your Chatbot on Multi-node SPR\n", + "\n", + "NeuralChat is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook will introduce how to finetune your chatbot with customized data on a multi-node SPR cluster.\n", + "\n", + "## Prepare Environment\n", + "We support Distributed Data Parallel (DDP) finetuning in both single-node and multi-node settings. Before using DDP to speed up training, we need to configure the environment. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Python 3.9 or a higher version is recommended.\n", + "\n", + "```bash\n", + "pip install -r requirements.txt\n", + "# Using ccl as the distributed backend for CPU training requires the package below.\n", + "python -m pip install oneccl_bind_pt==1.13 -f https://developer.intel.com/ipex-whl-stable-cpu\n", + "```\n", + "\n", + "Then, follow the [Hugging Face guide](https://huggingface.co/docs/transformers/perf_train_cpu_many) to install Intel® oneCCL Bindings for PyTorch and IPEX.\n", + "\n", + "oneccl_bindings_for_pytorch is installed along with the MPI tool set, and its environment must be sourced before use.\n", + "\n", + "For Intel® oneCCL >= 1.12.0:\n", + "``` bash\n", + "oneccl_bindings_for_pytorch_path=$(python -c \"from oneccl_bindings_for_pytorch import cwd; print(cwd)\")\n", + "source $oneccl_bindings_for_pytorch_path/env/setvars.sh\n", + "```\n", + "\n", + "For Intel® oneCCL whose version < 1.12.0:\n", + "``` bash\n", + "torch_ccl_path=$(python -c \"import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))\")\n", + "source $torch_ccl_path/env/setvars.sh\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare the Dataset\n", + "We select three kinds of datasets to conduct the finetuning process for different tasks.\n", + "\n", + "1. Text Generation (General domain instruction): We use the [Alpaca dataset](https://github.com/tatsu-lab/stanford_alpaca) from Stanford University as the general domain dataset to fine-tune the model. This dataset is provided in the form of a JSON file, [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). In Alpaca, researchers have manually crafted 175 seed tasks to guide `text-davinci-003` in generating 52K instruction data for diverse tasks.\n", + "\n", + "2. Summarization: An English-language dataset [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail), containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail, is used for this task.\n", + "\n", + "3. Code Generation: To enhance the code generation performance of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1) dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finetune Your Chatbot\n", + "Before starting the finetuning, you need to create a node configuration file that contains the IP address of each node (for example, hostfile) and pass that file's path as an argument. Here, we take a training run with a total of 16 processes on 4 Xeon SPR nodes as an example. Nodes 0/1/2/3 conduct the finetuning, with node 0 serving as the master node; each node has two sockets. ppn (processes per node) is set to 4, which means each socket runs two processes. 
The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance.\n", + "\n", + "In node 0, you could use the following code to set the node configuration.\n", + "``` bash\n", + " cat hostfile\n", + " xxx.xxx.xxx.xxx #node 0 ip\n", + " xxx.xxx.xxx.xxx #node 1 ip\n", + " xxx.xxx.xxx.xxx #node 2 ip\n", + " xxx.xxx.xxx.xxx #node 3 ip\n", + "```\n", + "\n", + "If you have enabled passwordless SSH in cpu clusters, you could use mpirun in the master node to start the DDP finetune. Run the following command in node0 and **4DDP** will be enabled in node 0/1/2/3 with BF16 auto mixed precision:\n", + "``` bash\n", + "export CCL_WORKER_COUNT=1\n", + "export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip\n", + "## DDP p-tuning for Llama\n", + "mpirun -f hostfile -n 16 -ppn 4 -genv OMP_NUM_THREADS=56 python3 finetune_clm.py \\\n", + " --model_name_or_path decapoda-research/llama-7b-hf \\\n", + " --train_file ./alpaca_data.json \\\n", + " --bf16 True \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --num_train_epochs 3 \\\n", + " --per_device_train_batch_size 4 \\\n", + " --per_device_eval_batch_size 4 \\\n", + " --gradient_accumulation_steps 1 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --peft ptun \\\n", + " --group_by_length True \\\n", + " --dataset_concatenation \\\n", + " --use_fast_tokenizer false \\\n", + " --do_train \\\n", + " --no_cuda \\\n", + " --ddp_backend ccl \\\n", + "```\n", + "you could also indicate `--peft` to switch peft tuning method in ptun (P-tuning), prefix (Prefix tuning), prompt (Prompt tuning), llama_adapter (LLama Adapter), lora (LORA), see https://github.com/huggingface/peft for more detail.\n", + "\n", + "Similarly, you can train you chatbot on the summarization task:\n", + "``` bash\n", + "export CCL_WORKER_COUNT=1\n", + "export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip\n", + "## DDP p-tuning for Llama\n", + "mpirun -f hostfile -n 16 -ppn 4 -genv OMP_NUM_THREADS=56 python3 finetune_clm.py \\\n", + " --model_name_or_path decapoda-research/llama-7b-hf \\\n", + " --dataset_name \"cnn_dailymail\" \\\n", + " --dataset_config_name \"3.0.0\" \\\n", + " --bf16 True \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --num_train_epochs 3 \\\n", + " --per_device_train_batch_size 4 \\\n", + " --per_device_eval_batch_size 4 \\\n", + " --gradient_accumulation_steps 1 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --peft ptun \\\n", + " --group_by_length True \\\n", + " --dataset_concatenation \\\n", + " --use_fast_tokenizer false \\\n", + " --do_train \\\n", + " --no_cuda \\\n", + " --ddp_backend ccl \\\n", + "```\n", + "\n", + "Train your chatbot on the code generation task:\n", + "``` bash\n", + "export CCL_WORKER_COUNT=1\n", + "export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip\n", + "## DDP p-tuning for Llama\n", + "mpirun -f hostfile -n 16 -ppn 4 -genv OMP_NUM_THREADS=56 python3 finetune_clm.py \\\n", + " --model_name_or_path decapoda-research/llama-7b-hf \\\n", + " --dataset_name \"theblackcat102/evol-codealpaca-v1\" \\\n", + " --bf16 True \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --num_train_epochs 3 \\\n", + " --per_device_train_batch_size 4 \\\n", + " --per_device_eval_batch_size 4 \\\n", + " 
--gradient_accumulation_steps 1 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --peft ptun \\\n", + " --group_by_length True \\\n", + " --dataset_concatenation \\\n", + " --use_fast_tokenizer false \\\n", + " --do_train \\\n", + " --no_cuda \\\n", + " --ddp_backend ccl \\\n", + "```\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb new file mode 100644 index 00000000000..c8756f645bc --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finetune Your Chatbot on Habana Gaudi \n", + "\n", + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook will introduce how to finetune your chatbot with the customized data on Intel Habana Gaodi Processor.\n", + "\n", + "## Prepare Environment\n", + "In order to streamline the process, users can construct a Docker image employing a Dockerfile, initiate the Docker container, and then proceed to execute inference or finetuning operations.\n", + "\n", + "IMPORTANT: Please note Habana's Gaudi processors(HPU) requires docker environment for running. User needs to manually execute below steps to build docker image and run docker container for inference on Habana HPU. The Jupyter notebook server should be started in the docker container and then run this Jupyter notebook.\n", + "\n", + "To run finetuning on Habana HPU, please execute below steps\n", + "\n", + "```bash\n", + "git clone https://github.com/intel/intel-extension-for-transformers.git\n", + "cd intel-extension-for-transformers/neural_chat/docker/finetuning/\n", + "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbot_finetuning:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbot_finetuning:latest\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare the Dataset\n", + "We select 3 kind of datasets to conduct the finetuning process for different tasks.\n", + "\n", + "1. Text Generation (General domain instruction): We use the [Alpaca dataset](https://github.com/tatsu-lab/stanford_alpaca) from Stanford University as the general domain dataset to fine-tune the model. This dataset is provided in the form of a JSON file, [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). In Alpaca, researchers have manually crafted 175 seed tasks to guide `text-davinci-003` in generating 52K instruction data for diverse tasks.\n", + "\n", + "2. 
Summarization: An English-language dataset [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail, is used for this task.\n", + "\n", + "3. Code Generation: To enhance code performance of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from optimum.habana import GaudiTrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(train_file=\"alpaca_data.json\")\n", + "training_args = GaudiTrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True,\n", + " use_habana=True,\n", + " use_lazy_mode=True,\n", + ")\n", + "finetune_args = FinetuningArguments(device=\"habana\")\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finetune Your Chatbot\n", + "You can train your chatbot with Alpaca dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from optimum.habana import GaudiTrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(train_file=\"alpaca_data.json\")\n", + "training_args = GaudiTrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True,\n", + " use_habana=True,\n", + " use_lazy_mode=True,\n", + ")\n", + "finetune_args = FinetuningArguments(device=\"habana\")\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train your chatbot on the summarization task." 
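Before launching the summarization run below, it can be useful to look at the data itself. The following sketch is an optional, illustrative step (not part of the original notebook flow): it loads the same `cnn_dailymail` config that is passed to `DataArguments` in the next cell, using the Hugging Face `datasets` library.

```python
from datasets import load_dataset

# Download (or reuse a cached copy of) the CNN/DailyMail summarization dataset, config 3.0.0.
cnn = load_dataset("cnn_dailymail", "3.0.0")

# Each record pairs a news article with its reference highlights (the target summary).
sample = cnn["train"][0]
print(sample["article"][:300])
print("---")
print(sample["highlights"])
```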
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from optimum.habana import GaudiTrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(dataset_name=\"cnn_dailymail\", dataset_config_name=\"3.0.0\")\n", + "training_args = GaudiTrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True,\n", + " use_habana=True,\n", + " use_lazy_mode=True,\n", + ")\n", + "finetune_args = FinetuningArguments(device=\"habana\")\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train your chatbot on the code completion task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from optimum.habana import GaudiTrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(dataset_name=\"theblackcat102/evol-codealpaca-v1\")\n", + "training_args = GaudiTrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True,\n", + " use_habana=True,\n", + " use_lazy_mode=True,\n", + ")\n", + "finetune_args = FinetuningArguments(device=\"habana\")\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/single_node_finetuning_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_node_finetuning_on_spr.ipynb new file mode 100644 index 00000000000..5dd22709d4b --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_node_finetuning_on_spr.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finetune Your Chatbot on a Single Node Xeon 
SPR " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook will introduce how to finetune your chatbot with customized data on a single-node Xeon SPR system." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Python 3.9 or a higher version is recommended." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare the Dataset\n", + "We select three kinds of datasets to conduct the finetuning process for different tasks.\n", + "\n", + "1. Text Generation (General domain instruction): We use the [Alpaca dataset](https://github.com/tatsu-lab/stanford_alpaca) from Stanford University as the general domain dataset to fine-tune the model. This dataset is provided in the form of a JSON file, [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). In Alpaca, researchers have manually crafted 175 seed tasks to guide `text-davinci-003` in generating 52K instruction data for diverse tasks.\n", + "\n", + "2. Summarization: An English-language dataset [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail), containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail, is used for this task.\n", + "\n", + "3. Code Generation: To enhance the code generation performance of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1) dataset.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finetune Your Chatbot\n", + "\n", + "We employ the [LoRA approach](https://arxiv.org/pdf/2106.09685.pdf) to finetune the LLM efficiently."
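For readers who have not used LoRA before, the sketch below shows, independently of NeuralChat's own wiring, how the `peft` library attaches low-rank adapters to a causal language model. The hyperparameter values and the model path are illustrative assumptions, not the exact settings applied by `FinetuningArguments`; in the cells that follow, NeuralChat handles this step internally.

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, TaskType, get_peft_model

# Illustrative checkpoint path; any causal LM would be wrapped the same way.
model = AutoModelForCausalLM.from_pretrained("/models/llama-7b-hf/")

# Example LoRA settings: rank-8 adapters injected into the attention projections.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)

model = get_peft_model(model, lora_config)
# Only a small fraction of the parameters remains trainable, which is what makes LoRA cheap.
model.print_trainable_parameters()
```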
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on Alpaca-format dataset to conduct text generation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(train_file=\"alpaca_data.json\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=2,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True,\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on the summarization task:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(dataset_name=\"cnn_dailymail\", dataset_config_name=\"3.0.0\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=2,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on the code generation task:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = 
DataArguments(dataset_name=\"theblackcat102/evol-codealpaca-v1\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=2,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb new file mode 100644 index 00000000000..d9ca61b12e3 --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Weight Only Quantization Optimization of Chatbot on Nvidia's A100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Weight Only Quantization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantizationConfig\n", + "config = PipelineConfig(optimization_config=WeightOnlyQuantizationConfig())\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py39", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py index 76f1441c1a6..2e4135caa09 100644 --- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py +++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py @@ -53,11 +53,14 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device) self.vocoder.eval() script_dir = os.path.dirname(os.path.abspath(__file__)) - if os.path.exists(os.path.join(script_dir, '../../../assets/speaker_embeddings/spk_embed_default.pt')): - default_speaker_embedding_path = os.path.join(script_dir, - '../../../assets/speaker_embeddings/spk_embed_default.pt') + if os.path.exists(os.path.split(os.path.split(os.path.split(script_dir)[0])[0])[0] \ + + '/assets/speaker_embeddings/spk_embed_default.pt'): + default_speaker_embedding_path = os.path.split(os.path.split(os.path.split(script_dir)[0])[0])[0] \ + + '/assets/speaker_embeddings/spk_embed_default.pt' elif os.path.exists(os.path.join(asset_path, 'speaker_embeddings/spk_embed_default.pt')): default_speaker_embedding_path = os.path.join(asset_path, 'speaker_embeddings/spk_embed_default.pt') + elif os.path.exists('spk_embed_default.pt'): + default_speaker_embedding_path = 'spk_embed_default.pt' else: print("Warning! Need to prepare speaker_embeddings") # load the default speaker embedding diff --git a/intel_extension_for_transformers/neural_chat/requirements.txt b/intel_extension_for_transformers/neural_chat/requirements.txt index 9b038c98ea5..9942b9fe885 100644 --- a/intel_extension_for_transformers/neural_chat/requirements.txt +++ b/intel_extension_for_transformers/neural_chat/requirements.txt @@ -28,7 +28,6 @@ starlette yacs uvicorn optimum -optimum[habana] sentence_transformers unstructured markdown