From 980deee3007a5ed508f1bfca4fd83ec5dae6f38f Mon Sep 17 00:00:00 2001 From: "Lv, Liang1" Date: Sun, 10 Sep 2023 22:03:26 +0800 Subject: [PATCH 01/13] Jupyter Notebooks for NeuralChat Signed-off-by: Lv, Liang1 --- .../neural_chat/README.md | 40 +++++++++++++++---- .../amp_optimization_on_habana_gaudi.ipynb | 0 .../notebooks/amp_optimization_on_spr.ipynb | 0 .../build_chatbot_on_habana_gaudi.ipynb | 0 .../docs/notebooks/build_chatbot_on_icx.ipynb | 0 .../notebooks/build_chatbot_on_nv_a100.ipynb | 0 .../docs/notebooks/build_chatbot_on_spr.ipynb | 0 .../docs/notebooks/build_chatbot_on_xpu.ipynb | 0 ...tomize_chatbot_with_finetuned_models.ipynb | 0 ...ustomize_chatbot_with_new_llm_models.ipynb | 0 ...tomize_chatbot_with_optimized_models.ipynb | 0 .../customize_chatbot_with_plugins.ipynb | 0 .../deploy_chatbot_on_habana_gaudi.ipynb | 0 .../notebooks/deploy_chatbot_on_icx.ipynb | 0 .../notebooks/deploy_chatbot_on_nv_a100.ipynb | 0 .../notebooks/deploy_chatbot_on_spr.ipynb | 0 .../notebooks/deploy_chatbot_on_xpu.ipynb | 0 .../notebooks/finetuning_on_nv_a100.ipynb | 0 ...ulti_card_finetuning_on_habana_gaudi.ipynb | 0 .../multi_node_finetuning_on_spr.ipynb | 0 ...ngle_card_finetuning_on_habana_gaudi.ipynb | 0 .../weight_only_optimization_on_nv_a100.ipynb | 0 22 files changed, 32 insertions(+), 8 deletions(-) create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_finetuned_models.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_new_llm_models.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_optimized_models.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_plugins.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb create mode 100644 
intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb
 create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb

diff --git a/intel_extension_for_transformers/neural_chat/README.md b/intel_extension_for_transformers/neural_chat/README.md
index 9fc59c44604..17790583b8f 100644
--- a/intel_extension_for_transformers/neural_chat/README.md
+++ b/intel_extension_for_transformers/neural_chat/README.md
@@ -170,12 +170,36 @@ The table below displays the validated model list in NeuralChat for both inferen
 ## Jupyter Notebooks
 
-Check out the latest notebooks to know how to build and customize a chatbot on different platforms.
-
-| **Notebook** | **Description** |
-| :----------: | :-------------: |
-| [build chatbot on Intel Xeon Platforms](./docs/notebooks/chatbot_on_intel_cpu.ipynb) | create a chatbot on Intel Xeon Platforms|
-| [build chatbot on Intel Habana Platforms](./docs/notebooks/chatbot_on_intel_habana_hpu.ipynb) | create a chatbot on Intel Habana Platforms|
-| [build chatbot on Nvidia GPU Platforms](./docs/notebooks/chatbot_on_nv_gpu.ipynb) | create a chatbot on Nvidia GPU Platforms|
-| [finetune on Nvidia GPU Platforms](./examples/instruction_tuning/finetune_on_Nvidia_GPU.ipynb) | fine-tune LLaMA2 and MPT on Nvidia GPU Platforms|
+Use these Jupyter Notebooks to explore how to build and customize chatbots across a wide range of platforms, including Intel Xeon CPUs (ICX and SPR), Intel XPU, Intel Habana Gaudi1/Gaudi2, and Nvidia GPU. The chapters below walk through building, deploying, optimizing, fine-tuning, and customizing chatbots on each of these computing platforms.
+
+| Chapter | Section | Description | Notebook Link |
+| ------- | --------------------------------------------- | ---------------------------------------------------------- | ------------------------------------------------------- |
+| 1 | Building a Chatbot on Different Platforms | | |
+| 1.1 | Building a Chatbot on Intel CPU ICX | Learn how to create a chatbot on ICX. | [Notebook](./docs/notebooks/build_chatbot_on_icx.ipynb) |
+| 1.2 | Building a Chatbot on Intel CPU SPR | Learn how to create a chatbot on SPR. | [Notebook](./docs/notebooks/build_chatbot_on_spr.ipynb) |
+| 1.3 | Building a Chatbot on Intel XPU | Learn how to create a chatbot on XPU. | [Notebook](./docs/notebooks/build_chatbot_on_xpu.ipynb) |
+| 1.4 | Building a Chatbot on Habana Gaudi1/Gaudi2 | Instructions for building a chatbot on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/build_chatbot_on_habana_gaudi.ipynb) |
+| 1.5 | Building a Chatbot on Nvidia A100 | Learn how to create a chatbot on Nvidia A100 platforms. | [Notebook](./docs/notebooks/build_chatbot_on_nv_a100.ipynb) |
+| 2 | Deploying Chatbots as Services on Different Platforms | | |
+| 2.1 | Deploying a Chatbot on Intel CPU ICX | Instructions for deploying a chatbot on ICX. | [Notebook](./docs/notebooks/deploy_chatbot_on_icx.ipynb) |
+| 2.2 | Deploying a Chatbot on Intel CPU SPR | Instructions for deploying a chatbot on SPR. | [Notebook](./docs/notebooks/deploy_chatbot_on_spr.ipynb) |
+| 2.3 | Deploying a Chatbot on Intel XPU | Learn how to deploy a chatbot on Intel XPU. | [Notebook](./docs/notebooks/deploy_chatbot_on_xpu.ipynb) |
+| 2.4 | Deploying a Chatbot on Habana Gaudi1/Gaudi2 | Instructions for deploying a chatbot on Intel Habana Gaudi1/Gaudi2. 
| [Notebook](./docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb) | +| 2.5 | Deploying a Chatbot on Nvidia A100 | Learn how to deploy a chatbot as a service on Nvidia A100 platforms. | [Notebook](./docs/notebooks/deploy_chatbot_on_nv_a100.ipynb) | +| 3 | Optimizing Chatbots on Different Platforms | | | +| 3.1 | AMP Optimization on SPR | Optimize your chatbot using Automatic Mixed Precision (AMP) on SPR platforms. | [Notebook](./docs/notebooks/amp_optimization_on_spr.ipynb) | +| 3.2 | AMP Optimization on Habana Gaudi1/Gaudi2 | Learn how to optimize your chatbot with AMP on Intel Habana Gaudi1/Gaudi2 platforms. | [Notebook](./docs/notebooks/amp_optimization_on_habana_gaudi.ipynb) | +| 3.3 | Weight-Only Optimization on Nvidia A100 | Optimize your chatbot using Weight-Only optimization on Nvidia A100. | [Notebook](./docs/notebooks/weight_only_optimization_on_nv_a100.ipynb) | +| 4 | Fine-Tuning Chatbots on Different Platforms | | | +| 4.1 | Multi-Node Fine-Tuning on SPR | Fine-tune your chatbot on SPR platforms using multiple nodes. | [Notebook](./docs/notebooks/multi_node_finetuning_on_spr.ipynb) | +| 4.2 | Single-Card Fine-Tuning on Habana Gaudi1/Gaudi2 | Instructions for single-card fine-tuning on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb) | +| 4.3 | Multi-Card Fine-Tuning on Habana Gaudi1/Gaudi2 | Learn how to perform multi-card fine-tuning on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb) | +| 4.4 | Fine-Tuning on Nvidia A100 | Fine-tune your chatbot on Nvidia A100 platforms. | [Notebook](./docs/notebooks/finetuning_on_nv_a100.ipynb) | +| 5 | Customizing Chatbots on Different Platforms | | | +| 5.1 | Using Plugins to Customize Chatbots | Customize your chatbot using plugins. | [Notebook](./docs/notebooks/customize_chatbot_with_plugins.ipynb) | +| 5.2 | Registering New Models to Customize Chatbots | | | +| 5.2.1 | Using Fine-Tuned Models to Customize Chatbots | Instructions for using fine-tuned models to customize chatbots. | [Notebook](./docs/notebooks/customize_chatbot_with_finetuned_models.ipynb) | +| 5.2.2 | Using Optimized Models to Customize Chatbots | Customize chatbots using optimized models. | [Notebook](./docs/notebooks/customize_chatbot_with_optimized_models.ipynb) | +| 5.2.3 | Using New LLM Models to Customize Chatbots | Learn how to use new LLM models for chatbot customization. 
| [Notebook](./docs/notebooks/customize_chatbot_with_new_llm_models.ipynb) | + diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_finetuned_models.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_finetuned_models.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_new_llm_models.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_new_llm_models.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_optimized_models.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_optimized_models.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_plugins.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/customize_chatbot_with_plugins.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb new file mode 100644 index 00000000000..e69de29bb2d diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb 
b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb
new file mode 100644
index 00000000000..e69de29bb2d

From ad0d7105c2fd895de0ea9a37022a7054a8985280 Mon Sep 17 00:00:00 2001
From: "Lv, Liang1"
Date: Sun, 10 Sep 2023 23:26:17 +0800
Subject: [PATCH 02/13] update build and deploy chatbot

Signed-off-by: Lv, Liang1
---
 .../build_chatbot_on_habana_gaudi.ipynb | 165 +++++++++++++++
 .../docs/notebooks/build_chatbot_on_icx.ipynb | 198 ++++++++++++++++++
 .../notebooks/build_chatbot_on_nv_a100.ipynb | 164 +++++++++++++++
 .../docs/notebooks/build_chatbot_on_spr.ipynb | 172 +++++++++++++++
 .../docs/notebooks/build_chatbot_on_xpu.ipynb | 141 +++++++++++++
 .../deploy_chatbot_on_habana_gaudi.ipynb | 191 +++++++++++++++++
 .../notebooks/deploy_chatbot_on_icx.ipynb | 193 +++++++++++++++++
 .../notebooks/deploy_chatbot_on_nv_a100.ipynb | 170 +++++++++++++++
 .../notebooks/deploy_chatbot_on_spr.ipynb | 195 +++++++++++++++++
 .../notebooks/deploy_chatbot_on_xpu.ipynb | 170 +++++++++++++++
 10 files changed, 1759 insertions(+)

diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb
index e69de29bb2d..de20eb0c52a 100644
--- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb
+++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb
@@ -0,0 +1,165 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NeuralChat is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook demonstrates how to build a talking chatbot on Habana's Gaudi processors (HPU)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Environment"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To streamline the process, users can build a Docker image from the provided Dockerfile, start the Docker container, and then run inference or fine-tuning inside it."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**IMPORTANT:** Please note that Habana's Gaudi processors (HPU) require a Docker environment. Manually execute the steps below to build the Docker image and start the container for inference on Habana HPU. Start the Jupyter notebook server inside the container, then run this notebook from there. \n",
+ "\n",
+ "```bash\n",
+ "git clone https://github.com/intel/intel-extension-for-transformers.git\n",
+ "cd intel-extension-for-transformers/docker/inference/\n",
+ "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbothabana:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n",
+ "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbothabana:latest\n",
+ "```\n",
+ "\n",
+ "To run fine-tuning on Habana HPU, execute the steps below:\n",
+ "\n",
+ "```bash\n",
+ "git clone https://github.com/intel/intel-extension-for-transformers.git\n",
+ "cd intel-extension-for-transformers/docker/finetuning/\n",
+ "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbot_finetuning:latest ./ -f Dockerfile --target hpu\n",
+ "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -v /dev/shm:/dev/shm -v /absolute/path/to/llama2:/llama2 -v /absolute/path/to/alpaca_data.json:/dataset/alpaca_data.json --cap-add=sys_nice --net=host --ipc=host chatbot_finetuning:latest\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inference 💻"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Chat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Given a textual instruction, NeuralChat will respond with a textual response.\n",
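+ "\n",
+ "If you prefer to pin a specific model rather than rely on the default, `build_chatbot` also accepts a `PipelineConfig`. A minimal sketch — assuming your installed version exposes the `model_name_or_path` field; the model name below is only an example, substitute any validated model from the README table:\n",
+ "\n",
+ "```python\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n",
+ "# choose an explicit model instead of the framework default (example name)\n",
+ "config = PipelineConfig(model_name_or_path=\"Intel/neural-chat-7b-v1-1\")\n",
+ "chatbot = build_chatbot(config)\n",
+ "```"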
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "chatbot = build_chatbot()\n",
+ "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n",
+ "print(response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Chat With Retrieval Plugin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Users can also leverage the NeuralChat Retrieval plugin to do domain-specific chat by feeding it some documents, as shown below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "from intel_extension_for_transformers.neural_chat import plugins\n",
+ "plugins.retrieval.enable=True\n",
+ "plugins.retrieval.args[\"input_path\"]=\"../../assets/docs/\"\n",
+ "config = PipelineConfig(plugins=plugins)\n",
+ "chatbot = build_chatbot(config)\n",
+ "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Voice Chat with ASR & TTS Plugin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n",
+ "\n",
+ "For the Python API code, users can enable different voice chat modes by enabling or disabling the ASR and TTS plugins."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot, plugins\n",
+ "plugins.asr.enable = True\n",
+ "plugins.tts.enable = True\n",
+ "plugins.tts.args[\"output_audio_path\"]=\"./output_audio.wav\"\n",
+ "config = PipelineConfig(plugins=plugins)\n",
+ "chatbot = build_chatbot(config)\n",
+ "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb
index e69de29bb2d..19edff13f5c 100644
--- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb
+++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb
@@ -0,0 +1,198 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NeuralChat is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook demonstrates how to build a talking chatbot on 3rd Generation Intel® Xeon® Scalable Processors (Ice Lake)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Environment"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install Intel Extension for Transformers:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "shellscript"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!pip install intel-extension-for-transformers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install Requirements:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "shellscript"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%cd ../../\n",
+ "!pip install -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Build your chatbot 💻"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Chat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Given a textual instruction, NeuralChat will respond with a textual response."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Python Code:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "chatbot = build_chatbot()\n",
+ "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n",
+ "print(response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "CLI command:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!neuralchat predict --query \"Tell me about Intel Xeon Scalable Processors.\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Chat With Retrieval Plugin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Users can also leverage the NeuralChat Retrieval plugin to do domain-specific chat by feeding it some documents, as shown below:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "from intel_extension_for_transformers.neural_chat import plugins\n",
+ "plugins.retrieval.enable=True\n",
+ "plugins.retrieval.args[\"input_path\"]=\"../../assets/docs/\"\n",
+ "config = PipelineConfig(plugins=plugins)\n",
+ "chatbot = build_chatbot(config)\n",
+ "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Voice Chat with ASR & TTS Plugin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n",
+ "\n",
+ "For the Python API code, users can enable different voice chat modes by enabling or disabling the ASR and TTS plugins."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot, plugins\n",
+ "plugins.asr.enable = True\n",
+ "plugins.tts.enable = True\n",
+ "plugins.tts.args[\"output_audio_path\"]=\"./output_audio.wav\"\n",
+ "config = PipelineConfig(plugins=plugins)\n",
+ "chatbot = build_chatbot(config)\n",
+ "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb
index e69de29bb2d..513616097c8 100644
--- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb
+++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb
@@ -0,0 +1,164 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NeuralChat is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook demonstrates how to build a talking chatbot on Nvidia GPUs."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Environment"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install Intel Extension for Transformers:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install intel-extension-for-transformers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install Requirements:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%cd ../../\n",
+ "!pip install -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inference 💻"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Chat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Given a textual instruction, NeuralChat will respond with a textual response.\n",
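+ "\n",
+ "Before building the chatbot, you may want to confirm that the GPU is actually visible to PyTorch — a quick, optional sanity check:\n",
+ "\n",
+ "```python\n",
+ "import torch\n",
+ "print(torch.cuda.is_available())      # expect True on an A100 machine\n",
+ "print(torch.cuda.get_device_name(0))  # e.g. 'NVIDIA A100-SXM4-40GB'\n",
+ "```"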
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "chatbot = build_chatbot()\n",
+ "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n",
+ "print(response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Chat With Retrieval Plugin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Users can also leverage the NeuralChat Retrieval plugin to do domain-specific chat by feeding it some documents, as shown below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "from intel_extension_for_transformers.neural_chat import plugins\n",
+ "plugins.retrieval.enable=True\n",
+ "plugins.retrieval.args[\"input_path\"]=\"../../assets/docs/\"\n",
+ "config = PipelineConfig(plugins=plugins)\n",
+ "chatbot = build_chatbot(config)\n",
+ "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Voice Chat with ASR & TTS Plugin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n",
+ "\n",
+ "For the Python API code, users can enable different voice chat modes by setting audio_input to True for input or audio_output to True for output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "config = PipelineConfig(audio_input=True, audio_output=True)\n",
+ "chatbot = build_chatbot(config)\n",
+ "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb
index e69de29bb2d..d95e4ef3dff 100644
--- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb
+++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb
@@ -0,0 +1,172 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NeuralChat is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook demonstrates how to build a talking chatbot on 4th Generation Intel® Xeon® Scalable Processors (Sapphire Rapids).\n",
+ "\n",
+ "The 4th Generation of Intel® Xeon® Scalable processors provides two instruction sets, AMX_BF16 and AMX_INT8, which accelerate bfloat16 and int8 operations respectively."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Environment"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install Intel Extension for Transformers:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install intel-extension-for-transformers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install Requirements:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%cd ../../\n",
+ "!pip install -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inference 💻"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Chat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Given a textual instruction, NeuralChat will respond with a textual response."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# BF16 Optimization\n",
+ "from intel_extension_for_transformers.neural_chat.config import AMPConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n",
+ "config = PipelineConfig(optimization_config=AMPConfig())\n",
+ "chatbot = build_chatbot(config)\n",
+ "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
+ "print(response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Chat With Retrieval Plugin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Users can also leverage the NeuralChat Retrieval plugin to do domain-specific chat by feeding it some documents, as shown below:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "from intel_extension_for_transformers.neural_chat import plugins\n",
+ "plugins.retrieval.enable=True\n",
+ "plugins.retrieval.args[\"input_path\"]=\"../../assets/docs/\"\n",
+ "config = PipelineConfig(plugins=plugins)\n",
+ "chatbot = build_chatbot(config)\n",
+ "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Voice Chat with ASR & TTS Plugin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n",
+ "\n",
+ "For the Python API code, users can enable different voice chat modes by enabling or disabling the ASR and TTS plugins."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot, plugins\n",
+ "plugins.asr.enable = True\n",
+ "plugins.tts.enable = True\n",
+ "plugins.tts.args[\"output_audio_path\"]=\"./output_audio.wav\"\n",
+ "config = PipelineConfig(plugins=plugins)\n",
+ "chatbot = build_chatbot(config)\n",
+ "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb
index e69de29bb2d..d24f5627b9c 100644
--- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb
+++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb
@@ -0,0 +1,141 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NeuralChat is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook demonstrates how to build a talking chatbot on Intel® Data Center GPU Flex Series 170, Intel® Data Center GPU Max Series and Intel® Arc™ A-Series GPUs."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install intel-extension-for-transformers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inference 💻"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Chat"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Given a textual instruction, NeuralChat will respond with a textual response.\n",
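+ "\n",
+ "If Intel® Extension for PyTorch* with XPU support is installed (an assumption — it is a separate install from this package), you can verify that the GPU is visible before building the chatbot:\n",
+ "\n",
+ "```python\n",
+ "import torch\n",
+ "import intel_extension_for_pytorch as ipex  # registers the 'xpu' device with PyTorch\n",
+ "print(torch.xpu.is_available())  # expect True on Flex/Max/Arc GPUs\n",
+ "```"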
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "chatbot = build_chatbot()\n",
+ "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n",
+ "print(response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Chat With RAG Plugin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Users can also leverage the NeuralChat RAG plugin to do domain-specific chat by feeding it some documents, as shown below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "from intel_extension_for_transformers.neural_chat import plugins\n",
+ "plugins.retrieval.enable=True\n",
+ "plugins.retrieval.args[\"input_path\"]=\"./assets/docs/\"\n",
+ "config = PipelineConfig(plugins=plugins)\n",
+ "chatbot = build_chatbot(config)\n",
+ "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Voice Chat with ASR & TTS Plugin"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n",
+ "\n",
+ "For the Python API code, users can enable different voice chat modes by setting audio_input to True for input or audio_output to True for output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
+ "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
+ "config = PipelineConfig(audio_input=True, audio_output=True)\n",
+ "chatbot = build_chatbot(config)\n",
+ "result = chatbot.predict(query=\"./assets/audio/sample.wav\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb
index e69de29bb2d..c0028e40afc 100644
--- a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb
+++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb
@@ -0,0 +1,191 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NeuralChat is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook demonstrates how to deploy a talking chatbot as a service on Habana's Gaudi processors (HPU).\n",
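+ "\n",
+ "A practical note on the client-server flow used below: the server is launched in a background process and needs time to load the model before it can answer its first request. A simple guard, with the wait time as a rough assumption you should adjust to your model size:\n",
+ "\n",
+ "```python\n",
+ "import time\n",
+ "# give the background server process time to finish loading the model\n",
+ "time.sleep(60)  # illustrative value, not a measured one\n",
+ "```"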
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Environment"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To streamline the process, users can build a Docker image from the provided Dockerfile, start the Docker container, and then run inference or fine-tuning inside it."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**IMPORTANT:** Please note that Habana's Gaudi processors (HPU) require a Docker environment. Manually execute the steps below to build the Docker image and start the container for inference on Habana HPU. Start the Jupyter notebook server inside the container, then run this notebook from there. \n",
+ "\n",
+ "```bash\n",
+ "git clone https://github.com/intel/intel-extension-for-transformers.git\n",
+ "cd intel-extension-for-transformers/docker/inference/\n",
+ "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbothabana:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n",
+ "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbothabana:latest\n",
+ "```\n",
+ "\n",
+ "To run fine-tuning on Habana HPU, execute the steps below:\n",
+ "\n",
+ "```bash\n",
+ "git clone https://github.com/intel/intel-extension-for-transformers.git\n",
+ "cd intel-extension-for-transformers/docker/finetuning/\n",
+ "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbot_finetuning:latest ./ -f Dockerfile --target hpu\n",
+ "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -v /dev/shm:/dev/shm -v /absolute/path/to/llama2:/llama2 -v /absolute/path/to/alpaca_data.json:/dataset/alpaca_data.json --cap-add=sys_nice --net=host --ipc=host chatbot_finetuning:latest\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Client-Server Architecture for Performance and Scalability"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Quick Start Local Server"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "❗ Please note that the server runs in the background. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import multiprocessing\n", + "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "def start_service():\n", + " server_executor = NeuralChatServerExecutor()\n", + " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", + "multiprocessing.Process(target=start_service).start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Text Chat Service " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import TextChatClientExecutor\n", + "executor = TextChatClientExecutor()\n", + "result = executor(\n", + " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n", + "print(result.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Voice Chat Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import VoiceChatClientExecutor\n", + "executor = VoiceChatClientExecutor()\n", + "result = executor(\n", + " audio_input_path='./assets/audio/sample.wav',\n", + " audio_output_path='./results.wav',\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import IPython\n", + "# Play input audio\n", + "print(\" Play Input Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", + "# Play output audio\n", + "print(\" Play Output Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Finetune Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import FinetuingClientExecutor\n", + "executor = FinetuingClientExecutor()\n", + "tuning_status = executor(\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb index e69de29bb2d..f9749cda5ea 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_icx.ipynb @@ -0,0 +1,193 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat 
is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook demonstrates how to deploy a talking chatbot as a service on 3rd Generation Intel® Xeon® Scalable Processors (Ice Lake)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Environment"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install Intel Extension for Transformers:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install intel-extension-for-transformers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Install Requirements:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%cd ../../\n",
+ "!pip install -r requirements.txt"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Client-Server Architecture for Performance and Scalability"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Quick Start Local Server"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "❗ Please note that the server runs in the background. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "import multiprocessing\n",
+ "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n",
+ "import nest_asyncio\n",
+ "nest_asyncio.apply()\n",
+ "\n",
+ "def start_service():\n",
+ "    server_executor = NeuralChatServerExecutor()\n",
+ "    server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n",
+ "multiprocessing.Process(target=start_service).start()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Access Text Chat Service "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from neural_chat import TextChatClientExecutor\n",
+ "executor = TextChatClientExecutor()\n",
+ "result = executor(\n",
+ "    prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n",
+ "    server_ip=\"127.0.0.1\", # master server ip\n",
+ "    port=8000 # master server entry point \n",
+ "    )\n",
+ "print(result.text)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Access Voice Chat Service"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from neural_chat import VoiceChatClientExecutor\n",
+ "executor = VoiceChatClientExecutor()\n",
+ "result = executor(\n",
+ "    audio_input_path='./assets/audio/sample.wav',\n",
+ "    audio_output_path='./results.wav',\n",
+ "    server_ip=\"127.0.0.1\", # master server ip\n",
+ "    port=8000 # master server entry point \n",
+ "    )\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import IPython\n",
+ "# Play input audio\n",
+ "print(\" Play Input Audio ......\")\n",
+ "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n",
+ "# Play output audio\n",
+ "print(\" Play Output Audio ......\")\n",
+ "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Access Finetune Service"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from neural_chat import FinetuingClientExecutor\n",
+ "executor = FinetuingClientExecutor()\n",
+ "tuning_status = executor(\n",
+ "    server_ip=\"127.0.0.1\", # master server ip\n",
+ "    port=8000 # master server port (port on socket 0, if both sockets are deployed)\n",
+ "    )"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb
index e69de29bb2d..c72f0b1589f 100644
--- a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb
+++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb
@@ -0,0 +1,170 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NeuralChat is a customizable chat framework designed to create your own chatbot within a few minutes on multiple architectures. This notebook demonstrates how to deploy a talking chatbot as a service on Nvidia GPUs."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prepare Environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install intel-extension-for-transformers"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Client-Server Architecture for Performance and Scalability"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Quick Start Local Server"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "❗ Please note that the server runs in the background. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import multiprocessing\n", + "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "def start_service():\n", + " server_executor = NeuralChatServerExecutor()\n", + " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", + "multiprocessing.Process(target=start_service).start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Text Chat Service " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import TextChatClientExecutor\n", + "executor = TextChatClientExecutor()\n", + "result = executor(\n", + " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n", + "print(result.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Voice Chat Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import VoiceChatClientExecutor\n", + "executor = VoiceChatClientExecutor()\n", + "result = executor(\n", + " audio_input_path='./assets/audio/sample.wav',\n", + " audio_output_path='./results.wav',\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import IPython\n", + "# Play input audio\n", + "print(\" Play Input Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", + "# Play output audio\n", + "print(\" Play Output Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Finetune Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import FinetuingClientExecutor\n", + "executor = FinetuingClientExecutor()\n", + "tuning_status = executor(\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb index e69de29bb2d..9ab1f885ca4 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_spr.ipynb @@ -0,0 +1,195 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat 
is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to deploy a talking chatbot as a service on 4th Generation of Intel® Xeon® Scalable Processors Sapphire Rapids.\n", + "\n", + "The 4th Generation of Intel® Xeon® Scalable processor provides two instruction sets viz. AMX_BF16 and AMX_INT8 which provides acceleration for bfloat16 and int8 operations respectively." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Client-Server Architecture for Performance and Scalability" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quick Start Local Server" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "❗ Please notice that the server is running on the background. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import multiprocessing\n", + "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "def start_service():\n", + " server_executor = NeuralChatServerExecutor()\n", + " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", + "multiprocessing.Process(target=start_service).start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Text Chat Service " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import TextChatClientExecutor\n", + "executor = TextChatClientExecutor()\n", + "result = executor(\n", + " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n", + "print(result.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Voice Chat Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import VoiceChatClientExecutor\n", + "executor = VoiceChatClientExecutor()\n", + "result = executor(\n", + " audio_input_path='./assets/audio/sample.wav',\n", + " audio_output_path='./results.wav',\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import IPython\n", + "# Play input audio\n", + "print(\" Play Input Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", + "# Play output audio\n", + "print(\" Play Output Audio ......\")\n", + 
"IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Finetune Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import FinetuingClientExecutor\n", + "executor = FinetuingClientExecutor()\n", + "tuning_status = executor(\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb index e69de29bb2d..d3a299bc43c 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb @@ -0,0 +1,170 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to deploy a talking chatbot as a service on Intel® Data Center GPU Flex Series 170, Intel® Data Center GPU Max Series and Intel® Arc™ A-Series GPUs." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Prepare Environment" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "%pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Client-Server Architecture for Performance and Scalability" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quick Start Local Server" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "❗ Please notice that the server is running on the background. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import multiprocessing\n", + "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "def start_service():\n", + " server_executor = NeuralChatServerExecutor()\n", + " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", + "multiprocessing.Process(target=start_service).start()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Text Chat Service " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import TextChatClientExecutor\n", + "executor = TextChatClientExecutor()\n", + "result = executor(\n", + " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n", + "print(result.text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Voice Chat Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import VoiceChatClientExecutor\n", + "executor = VoiceChatClientExecutor()\n", + "result = executor(\n", + " audio_input_path='./assets/audio/sample.wav',\n", + " audio_output_path='./results.wav',\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server entry point \n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import IPython\n", + "# Play input audio\n", + "print(\" Play Input Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", + "# Play output audio\n", + "print(\" Play Output Audio ......\")\n", + "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Access Finetune Service" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from neural_chat import FinetuingClientExecutor\n", + "executor = FinetuingClientExecutor()\n", + "tuning_status = executor(\n", + " server_ip=\"127.0.0.1\", # master server ip\n", + " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 6c8c3a0a7d53061e1ab1dbc961db62d967af088e Mon Sep 17 00:00:00 2001 From: "Ye, Xinyu" Date: Tue, 12 Sep 2023 04:33:12 -0400 Subject: [PATCH 03/13] added NeuralChat optimization notebooks. 
Signed-off-by: Ye, Xinyu 
---
 .../llm/quantization/optimization.py          |  2 +-
 .../amp_optimization_on_habana_gaudi.ipynb    | 61 ++++++++++++++++
 .../notebooks/amp_optimization_on_spr.ipynb   | 70 +++++++++++++++++++
 .../weight_only_optimization_on_nv_a100.ipynb | 70 +++++++++++++++++++
 4 files changed, 202 insertions(+), 1 deletion(-)

diff --git a/intel_extension_for_transformers/llm/quantization/optimization.py b/intel_extension_for_transformers/llm/quantization/optimization.py
index d5e38aea9b3..5db84390df2 100644
--- a/intel_extension_for_transformers/llm/quantization/optimization.py
+++ b/intel_extension_for_transformers/llm/quantization/optimization.py
@@ -45,7 +45,7 @@ def optimize(self, model):
                 },
             },
         }
-        recipes = {"rtn_args": {"enable_full_range": config.weight_only_quant_config.enable_full_range}}
+        recipes = {"rtn_args": {"enable_full_range": config.enable_full_range}}
         conf = PostTrainingQuantConfig(
             approach='weight_only',
             op_type_dict=op_type_dict,
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb
index e69de29bb2d..386df18d4a6 100644
--- a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb
+++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb
@@ -0,0 +1,61 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# AMP Optimization of Chatbot on Habana's Gaudi Processors (HPU)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prepare Environment"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**IMPORTANT:** Please note that Habana's Gaudi processors (HPU) require a Docker environment. Users need to manually execute the steps below to build the Docker image and run a Docker container for inference on Habana HPU. The Jupyter notebook server should be started inside the Docker container before running this notebook. 
\n", + "\n", + "```bash\n", + "git clone https://github.com/intel/intel-extension-for-transformers.git\n", + "cd intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/docker/inference/\n", + "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbothabana:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbothabana:latest\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BF16 Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n", + "config = PipelineConfig(optimization_config=AMPConfig())\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb index e69de29bb2d..e5ab6d2d8c9 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb @@ -0,0 +1,70 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AMP Optimization of Chatbot on 4th Generation of Intel® Xeon® Scalable Processors Sapphire Rapids" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BF16 Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n", + "config = PipelineConfig(optimization_config=AMPConfig())\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py39", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb 
b/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb index e69de29bb2d..09c28df6e5c 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb @@ -0,0 +1,70 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Weight Only Quantization Optimization of Chatbot on Nvidia's A100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Weight Only Quantization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantizationConfig\n", + "config = PipelineConfig(optimization_config=WeightOnlyQuantizationConfig())\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py39", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 45b342927500c126ede71ea5bb5f5ea42c304dd0 Mon Sep 17 00:00:00 2001 From: Liangyx2 <106130696+Liangyx2@users.noreply.github.com> Date: Wed, 13 Sep 2023 16:42:36 +0800 Subject: [PATCH 04/13] Update tts.py --- .../neural_chat/pipeline/plugins/audio/tts.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py index fb0c0cec4f3..c6e1dded166 100644 --- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py +++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py @@ -57,9 +57,8 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") self.vocoder.eval() script_dir = os.path.dirname(os.path.abspath(__file__)) - if os.path.exists(os.path.join(script_dir, '../../../assets/speaker_embeddings/spk_embed_default.pt')): - default_speaker_embedding_path = os.path.join(script_dir, - '../../../assets/speaker_embeddings/spk_embed_default.pt') + if os.path.exists(os.path.split(os.path.split(os.path.split(script_dir)[0])[0])[0]+ '/assets/speaker_embeddings/spk_embed_default.pt'): + default_speaker_embedding_path = os.path.split(os.path.split(os.path.split(script_dir)[0])[0])[0]+ '/assets/speaker_embeddings/spk_embed_default.pt' elif os.path.exists(os.path.join(asset_path, 'speaker_embeddings/spk_embed_default.pt')): default_speaker_embedding_path = os.path.join(asset_path, 'speaker_embeddings/spk_embed_default.pt') else: From 
2a42491129c825181c52d65e2e608e7c9b6c624b Mon Sep 17 00:00:00 2001 From: XuhuiRen <44249229+XuhuiRen@users.noreply.github.com> Date: Thu, 14 Sep 2023 13:01:36 +0800 Subject: [PATCH 05/13] Add Notebooks for finetuning chatbot on various platforms (#309) * fix config Signed-off-by: XuhuiRen * add notebook Signed-off-by: XuhuiRen --------- Signed-off-by: XuhuiRen --- .../neural_chat/config.py | 6 +- .../notebooks/finetuning_on_nv_a100.ipynb | 228 ++++++++++++++++++ ...ulti_card_finetuning_on_habana_gaudi.ipynb | 144 +++++++++++ .../multi_node_finetuning_on_spr.ipynb | 184 ++++++++++++++ ...ngle_card_finetuning_on_habana_gaudi.ipynb | 141 +++++++++++ .../single_node_finetuning_on_spr.ipynb | 228 ++++++++++++++++++ .../neural_chat/requirements.txt | 1 - 7 files changed, 928 insertions(+), 4 deletions(-) create mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/single_node_finetuning_on_spr.ipynb diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index c5f08eb3d35..8e227b43d01 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -81,7 +81,7 @@ class ModelArguments: }, ) use_fast_tokenizer: bool = field( - default=True, + default=False, metadata={ "help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not." }, @@ -312,7 +312,7 @@ class FinetuningArguments: }, ) lora_all_linear: bool = field( - default=False, + default=True, metadata={"help": "if True, will add adaptor for all linear for lora finetuning"}, ) task: Optional[str] = field( @@ -322,7 +322,7 @@ class FinetuningArguments: }, ) do_lm_eval: bool = field( - default=False, + default=True, metadata={"help": "whether to run the LM evaluation with EleutherAI/lm-evaluation-harness"}, ) lm_eval_tasks: Optional[List[str]] = field( diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb index e69de29bb2d..ed8805283c7 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/finetuning_on_nv_a100.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finetune Your Chatbot on Nvidia A100 GPU" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook will introduce how to finetune your chatbot on the customized data on Nvidia A100 GPU." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recommend to use Python 3.9 or higher version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare the Dataset\n", + "We select 3 kind of datasets to conduct the finetuning process for different tasks.\n", + "\n", + "1. 
Text Generation (General domain instruction): We use the [Alpaca dataset](https://github.com/tatsu-lab/stanford_alpaca) from Stanford University as the general domain dataset to fine-tune the model. This dataset is provided in the form of a JSON file, [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). In Alpaca, researchers have manually crafted 175 seed tasks to guide `text-davinci-003` in generating 52K instruction data for diverse tasks.\n", + "\n", + "2. Summarization: An English-language dataset [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail, is used for this task.\n", + "\n", + "3. Code Generation: To enhance code performance of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finetune Your Chatbot\n", + "\n", + "We employ the [LoRA approach](https://arxiv.org/pdf/2106.09685.pdf) to finetune the LLM efficiently." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on Alpaca-format dataset to conduct text generation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(train_file=\"alpaca_data.json\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on the summarization task:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(dataset_name=\"cnn_dailymail\", dataset_config_name=\"3.0.0\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " 
per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on the code generation task:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(dataset_name=\"theblackcat102/evol-codealpaca-v1\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb index e69de29bb2d..8d705e82447 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finetune Your Chatbot on Habana Gaudi \n", + "\n", + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. 
This notebook will introduce how to finetune your chatbot with customized data on multiple Intel Habana Gaudi processors.\n",
+ "\n",
+ "## Prepare Environment\n",
+ "In order to streamline the process, users can construct a Docker image employing a Dockerfile, initiate the Docker container, and then proceed to execute inference or finetuning operations.\n",
+ "\n",
+ "IMPORTANT: Please note that Habana's Gaudi processors (HPU) require a Docker environment. Users need to manually execute the steps below to build the Docker image and run a Docker container for finetuning on Habana HPU. The Jupyter notebook server should be started inside the Docker container before running this notebook.\n",
+ "\n",
+ "To run finetuning on Habana HPU, please execute the steps below:\n",
+ "\n",
+ "```bash\n",
+ "git clone https://github.com/intel/intel-extension-for-transformers.git\n",
+ "cd intel-extension-for-transformers/neural_chat/docker/finetuning/\n",
+ "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbot_finetuning:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n",
+ "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbot_finetuning:latest\n",
+ "\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prepare the Dataset\n",
+ "We select three kinds of datasets to conduct the finetuning process for different tasks.\n",
+ "\n",
+ "1. Text Generation (General domain instruction): We use the [Alpaca dataset](https://github.com/tatsu-lab/stanford_alpaca) from Stanford University as the general domain dataset to fine-tune the model. This dataset is provided in the form of a JSON file, [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). In Alpaca, researchers have manually crafted 175 seed tasks to guide `text-davinci-003` in generating 52K instruction data for diverse tasks.\n",
+ "\n",
+ "2. Summarization: The English-language dataset [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail), containing just over 300k unique news articles written by journalists at CNN and the Daily Mail, is used for this task.\n",
+ "\n",
+ "3. Code Generation: To enhance the coding capability of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1) dataset."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Finetune Your Chatbot\n",
+ "You can use `gaudi_spawn.py` to automatically complete the multi-card setup on Habana Gaudi. 
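Before launching multi-card training, you can first confirm how many HPU cards are visible (a sketch; it assumes the `habana_frameworks` PyTorch plugin shipped in the container above):\n",
+ "\n",
+ "```python\n",
+ "# a sketch: query the Habana PyTorch plugin for available HPU devices\n",
+ "import habana_frameworks.torch.hpu as hthpu\n",
+ "\n",
+ "print(hthpu.is_available())  # True when an HPU device is usable\n",
+ "print(hthpu.device_count())  # should match the --world_size passed below\n",
+ "```\n",
+ "\n",
+ "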
Then, you can train your chatbot with Alpaca dataset.\n", + "```bash\n", + "python gaudi_spawn.py \\\n", + " --world_size 8 --use_mpi finetune_clm.py \\\n", + " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", + " --bf16 True \\\n", + " --train_file \"/path/to/alpaca_data.json\" \\\n", + " --dataset_concatenation \\\n", + " --per_device_train_batch_size 2 \\\n", + " --per_device_eval_batch_size 2 \\\n", + " --gradient_accumulation_steps 4 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --do_train \\\n", + " --num_train_epochs 3 \\\n", + " --overwrite_output_dir \\\n", + " --log_level info \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --peft lora \\\n", + " --use_fast_tokenizer false \\\n", + " --device \"habana\" \\\n", + " --use_habana \\\n", + " --use_lazy_mode \\\n", + "```\n", + "\n", + "Train your chatbot on the summarization task.\n", + "```bash\n", + "python gaudi_spawn.py \\\n", + " --world_size 8 --use_mpi finetune_clm.py \\\n", + " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", + " --bf16 True \\\n", + " --dataset_name \"cnn_dailymail\" \\\n", + " --dataset_config_name \"3.0.0\" \\\n", + " --dataset_concatenation \\\n", + " --per_device_train_batch_size 2 \\\n", + " --per_device_eval_batch_size 2 \\\n", + " --gradient_accumulation_steps 4 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --do_train \\\n", + " --num_train_epochs 3 \\\n", + " --overwrite_output_dir \\\n", + " --log_level info \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --peft lora \\\n", + " --use_fast_tokenizer false \\\n", + " --device \"habana\" \\\n", + " --use_habana \\\n", + " --use_lazy_mode \\\n", + "```\n", + "\n", + "Train your chatbot on the code generation task:\n", + "```bash\n", + "python gaudi_spawn.py \\\n", + " --world_size 8 --use_mpi finetune_clm.py \\\n", + " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", + " --bf16 True \\\n", + " --dataset_name \"theblackcat102/evol-codealpaca-v1\" \\\n", + " --dataset_concatenation \\\n", + " --per_device_train_batch_size 2 \\\n", + " --per_device_eval_batch_size 2 \\\n", + " --gradient_accumulation_steps 4 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --do_train \\\n", + " --num_train_epochs 3 \\\n", + " --overwrite_output_dir \\\n", + " --log_level info \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --peft lora \\\n", + " --use_fast_tokenizer false \\\n", + " --device \"habana\" \\\n", + " --use_habana \\\n", + " --use_lazy_mode \\\n", + "```\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb index e69de29bb2d..b7ed8817081 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb +++ 
b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb
@@ -0,0 +1,184 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Finetune Your Chatbot on Multi-node SPR\n",
+ "\n",
+ "NeuralChat is a customizable chat framework designed to create your own chatbot within minutes on multiple architectures. This notebook will introduce how to finetune your chatbot with customized data on a multi-node SPR server.\n",
+ "\n",
+ "## Prepare Environment\n",
+ "We support Distributed Data Parallel (DDP) finetuning on both single node and multi-node settings. To use DDP to speed up training, the bash command needs a few adjustments.\n",
+ "For example, to finetune Llama-7b through DDP training, the bash command will look like the following, where\n",
+ "
\n",
+ "*`<MASTER_ADDRESS>`* is the address of the master node; it is not needed in the single-node case,\n",
+ "
\n",
+ "*`<NUM_PROCESSES_PER_NODE>`* is the number of processes to use on the current node; for a node with GPUs, it is usually set to the number of GPUs in that node, and for a node without GPUs that uses CPU for training, setting it to 1 is recommended,\n",
+ "
\n",
+ "*`<NUM_NODES>`* is the number of nodes to use,\n",
+ "
\n",
+ "*`<NODE_RANK>`* is the rank of the current node; ranks run from 0 to *`<NUM_NODES>`*`-1`.\n",
+ "
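\n",
+ "For intuition, the quantities relate as follows (a toy sketch; the concrete values are assumptions matching the 4-node example later in this notebook):\n",
+ "\n",
+ "```python\n",
+ "# illustrative DDP bookkeeping only; not part of the launch command\n",
+ "num_nodes = 4                # <NUM_NODES>\n",
+ "procs_per_node = 4           # <NUM_PROCESSES_PER_NODE>\n",
+ "world_size = num_nodes * procs_per_node   # 16 workers in total\n",
+ "# each worker's global rank is derived from its node's <NODE_RANK>:\n",
+ "node_rank, local_rank = 2, 1\n",
+ "global_rank = node_rank * procs_per_node + local_rank   # -> 9\n",
+ "```\n",
+ "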
\n",
+ "> Also please note that to use CPU for training on each node in the multi-node setting, the argument `--no_cuda` is mandatory, and `--ddp_backend ccl` is required if ccl is used as the distributed backend. In the multi-node setting, the following command needs to be launched on each node, and all the commands should be the same except for *`<NODE_RANK>`*, which should be an integer from 0 to *`<NUM_NODES>`*`-1` assigned to each node."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Python 3.9 or a higher version is recommended.\n",
+ "\n",
+ "```bash\n",
+ "pip install -r requirements.txt\n",
+ "# Using ccl as the distributed backend for distributed training on CPU requires the package below.\n",
+ "python -m pip install oneccl_bind_pt==1.13 -f https://developer.intel.com/ipex-whl-stable-cpu\n",
+ "```\n",
+ "\n",
+ "Then, follow the [Hugging Face guide](https://huggingface.co/docs/transformers/perf_train_cpu_many) to install Intel® oneCCL Bindings for PyTorch and IPEX.\n",
+ "\n",
+ "oneccl_bindings_for_pytorch is installed along with the MPI tool set. The environment needs to be set up before using it.\n",
+ "\n",
+ "For Intel® oneCCL >= 1.12.0:\n",
+ "``` bash\n",
+ "oneccl_bindings_for_pytorch_path=$(python -c \"from oneccl_bindings_for_pytorch import cwd; print(cwd)\")\n",
+ "source $oneccl_bindings_for_pytorch_path/env/setvars.sh\n",
+ "```\n",
+ "\n",
+ "For Intel® oneCCL versions < 1.12.0:\n",
+ "``` bash\n",
+ "torch_ccl_path=$(python -c \"import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))\")\n",
+ "source $torch_ccl_path/env/setvars.sh\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prepare the Dataset\n",
+ "We select three kinds of datasets to conduct the finetuning process for different tasks.\n",
+ "\n",
+ "1. Text Generation (General domain instruction): We use the [Alpaca dataset](https://github.com/tatsu-lab/stanford_alpaca) from Stanford University as the general domain dataset to fine-tune the model. This dataset is provided in the form of a JSON file, [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). In Alpaca, researchers have manually crafted 175 seed tasks to guide `text-davinci-003` in generating 52K instruction data for diverse tasks.\n",
+ "\n",
+ "2. Summarization: The English-language dataset [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail), containing just over 300k unique news articles written by journalists at CNN and the Daily Mail, is used for this task.\n",
+ "\n",
+ "3. Code Generation: To enhance the coding capability of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1) dataset."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Finetune Your Chatbot\n",
+ "Before starting the finetuning, you need to create a node configuration file that contains the IP addresses of each node (for example, hostfile) and pass that configuration file path as an argument. Here, we take a training run with a total of 16 processes on 4 Xeon SPR nodes as an example. We use nodes 0/1/2/3 to conduct the finetuning, where node 0 serves as the master node and each node has two sockets. ppn (processes per node) is set to 4, meaning each socket runs two processes. 
The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance.\n", + "\n", + "In node 0, you could use the following code to set the node configuration.\n", + "``` bash\n", + " cat hostfile\n", + " xxx.xxx.xxx.xxx #node 0 ip\n", + " xxx.xxx.xxx.xxx #node 1 ip\n", + " xxx.xxx.xxx.xxx #node 2 ip\n", + " xxx.xxx.xxx.xxx #node 3 ip\n", + "```\n", + "\n", + "If you have enabled passwordless SSH in cpu clusters, you could use mpirun in the master node to start the DDP finetune. Run the following command in node0 and **4DDP** will be enabled in node 0/1/2/3 with BF16 auto mixed precision:\n", + "``` bash\n", + "export CCL_WORKER_COUNT=1\n", + "export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip\n", + "## DDP p-tuning for Llama\n", + "mpirun -f hostfile -n 16 -ppn 4 -genv OMP_NUM_THREADS=56 python3 finetune_clm.py \\\n", + " --model_name_or_path decapoda-research/llama-7b-hf \\\n", + " --train_file ./alpaca_data.json \\\n", + " --bf16 True \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --num_train_epochs 3 \\\n", + " --per_device_train_batch_size 4 \\\n", + " --per_device_eval_batch_size 4 \\\n", + " --gradient_accumulation_steps 1 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --peft ptun \\\n", + " --group_by_length True \\\n", + " --dataset_concatenation \\\n", + " --use_fast_tokenizer false \\\n", + " --do_train \\\n", + " --no_cuda \\\n", + " --ddp_backend ccl \\\n", + "```\n", + "you could also indicate `--peft` to switch peft tuning method in ptun (P-tuning), prefix (Prefix tuning), prompt (Prompt tuning), llama_adapter (LLama Adapter), lora (LORA), see https://github.com/huggingface/peft for more detail.\n", + "\n", + "Similarly, you can train you chatbot on the summarization task:\n", + "``` bash\n", + "export CCL_WORKER_COUNT=1\n", + "export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip\n", + "## DDP p-tuning for Llama\n", + "mpirun -f hostfile -n 16 -ppn 4 -genv OMP_NUM_THREADS=56 python3 finetune_clm.py \\\n", + " --model_name_or_path decapoda-research/llama-7b-hf \\\n", + " --dataset_name \"cnn_dailymail\" \\\n", + " --dataset_config_name \"3.0.0\" \\\n", + " --bf16 True \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --num_train_epochs 3 \\\n", + " --per_device_train_batch_size 4 \\\n", + " --per_device_eval_batch_size 4 \\\n", + " --gradient_accumulation_steps 1 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --peft ptun \\\n", + " --group_by_length True \\\n", + " --dataset_concatenation \\\n", + " --use_fast_tokenizer false \\\n", + " --do_train \\\n", + " --no_cuda \\\n", + " --ddp_backend ccl \\\n", + "```\n", + "\n", + "Train your chatbot on the code generation task:\n", + "``` bash\n", + "export CCL_WORKER_COUNT=1\n", + "export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip\n", + "## DDP p-tuning for Llama\n", + "mpirun -f hostfile -n 16 -ppn 4 -genv OMP_NUM_THREADS=56 python3 finetune_clm.py \\\n", + " --model_name_or_path decapoda-research/llama-7b-hf \\\n", + " --dataset_name \"theblackcat102/evol-codealpaca-v1\" \\\n", + " --bf16 True \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --num_train_epochs 3 \\\n", + " --per_device_train_batch_size 4 \\\n", + " --per_device_eval_batch_size 4 \\\n", + " 
--gradient_accumulation_steps 1 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --peft ptun \\\n", + " --group_by_length True \\\n", + " --dataset_concatenation \\\n", + " --use_fast_tokenizer false \\\n", + " --do_train \\\n", + " --no_cuda \\\n", + " --ddp_backend ccl \\\n", + "```\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb index e69de29bb2d..feb169a9a4e 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb @@ -0,0 +1,141 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finetune Your Chatbot on Habana Gaudi \n", + "\n", + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook will introduce how to finetune your chatbot with the customized data on Intel Habana Gaodi Processor.\n", + "\n", + "## Prepare Environment\n", + "In order to streamline the process, users can construct a Docker image employing a Dockerfile, initiate the Docker container, and then proceed to execute inference or finetuning operations.\n", + "\n", + "IMPORTANT: Please note Habana's Gaudi processors(HPU) requires docker environment for running. User needs to manually execute below steps to build docker image and run docker container for inference on Habana HPU. The Jupyter notebook server should be started in the docker container and then run this Jupyter notebook.\n", + "\n", + "To run finetuning on Habana HPU, please execute below steps\n", + "\n", + "```bash\n", + "git clone https://github.com/intel/intel-extension-for-transformers.git\n", + "cd intel-extension-for-transformers/neural_chat/docker/finetuning/\n", + "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbot_finetuning:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbot_finetuning:latest\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare the Dataset\n", + "We select 3 kind of datasets to conduct the finetuning process for different tasks.\n", + "\n", + "1. Text Generation (General domain instruction): We use the [Alpaca dataset](https://github.com/tatsu-lab/stanford_alpaca) from Stanford University as the general domain dataset to fine-tune the model. This dataset is provided in the form of a JSON file, [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). In Alpaca, researchers have manually crafted 175 seed tasks to guide `text-davinci-003` in generating 52K instruction data for diverse tasks.\n", + "\n", + "2. 
Summarization: An English-language dataset [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail, is used for this task.\n", + "\n", + "3. Code Generation: To enhance code performance of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finetune Your Chatbot\n", + "You can train your chatbot with Alpaca dataset.\n", + "```bash\n", + "python finetune_clm.py \\\n", + " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", + " --bf16 True \\\n", + " --train_file \"/path/to/alpaca_data.json\" \\\n", + " --dataset_concatenation \\\n", + " --per_device_train_batch_size 2 \\\n", + " --per_device_eval_batch_size 2 \\\n", + " --gradient_accumulation_steps 4 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --do_train \\\n", + " --num_train_epochs 3 \\\n", + " --overwrite_output_dir \\\n", + " --log_level info \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --peft lora \\\n", + " --use_fast_tokenizer false \\\n", + " --device \"habana\" \\\n", + " --use_habana \\\n", + " --use_lazy_mode \\\n", + "```\n", + "\n", + "Train your chatbot on the summarization task.\n", + "```bash\n", + "python finetune_clm.py \\\n", + " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", + " --bf16 True \\\n", + " --dataset_name \"cnn_dailymail\" \\\n", + " --dataset_config_name \"3.0.0\" \\\n", + " --dataset_concatenation \\\n", + " --per_device_train_batch_size 2 \\\n", + " --per_device_eval_batch_size 2 \\\n", + " --gradient_accumulation_steps 4 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --do_train \\\n", + " --num_train_epochs 3 \\\n", + " --overwrite_output_dir \\\n", + " --log_level info \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --peft lora \\\n", + " --use_fast_tokenizer false \\\n", + " --device \"habana\" \\\n", + " --use_habana \\\n", + " --use_lazy_mode \\\n", + "```\n", + "\n", + "Train your chatbot on the code generation task:\n", + "```bash\n", + "python finetune_clm.py \\\n", + " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", + " --bf16 True \\\n", + " --dataset_name \"theblackcat102/evol-codealpaca-v1\" \\\n", + " --dataset_concatenation \\\n", + " --per_device_train_batch_size 2 \\\n", + " --per_device_eval_batch_size 2 \\\n", + " --gradient_accumulation_steps 4 \\\n", + " --evaluation_strategy \"no\" \\\n", + " --save_strategy \"steps\" \\\n", + " --save_steps 2000 \\\n", + " --save_total_limit 1 \\\n", + " --learning_rate 1e-4 \\\n", + " --logging_steps 1 \\\n", + " --do_train \\\n", + " --num_train_epochs 3 \\\n", + " --overwrite_output_dir \\\n", + " --log_level info \\\n", + " --output_dir ./llama_peft_finetuned_model \\\n", + " --peft lora \\\n", + " --use_fast_tokenizer false \\\n", + " --device \"habana\" \\\n", + " --use_habana \\\n", + " --use_lazy_mode \\\n", + "```\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/intel_extension_for_transformers/neural_chat/docs/notebooks/single_node_finetuning_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_node_finetuning_on_spr.ipynb new file mode 100644 index 00000000000..5dd22709d4b --- /dev/null +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_node_finetuning_on_spr.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Finetune Your Chatbot on a Single Node Xeon SPR " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook will introduce how to finetune your chatbot on the customized data on a single node Xeon SPR." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Recommend to use Python 3.9 or higher version." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare the Dataset\n", + "We select 3 kind of datasets to conduct the finetuning process for different tasks.\n", + "\n", + "1. Text Generation (General domain instruction): We use the [Alpaca dataset](https://github.com/tatsu-lab/stanford_alpaca) from Stanford University as the general domain dataset to fine-tune the model. This dataset is provided in the form of a JSON file, [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json). In Alpaca, researchers have manually crafted 175 seed tasks to guide `text-davinci-003` in generating 52K instruction data for diverse tasks.\n", + "\n", + "2. Summarization: An English-language dataset [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) containing just over 300k unique news articles as written by journalists at CNN and the Daily Mail, is used for this task.\n", + "\n", + "3. Code Generation: To enhance code performance of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Finetune Your Chatbot\n", + "\n", + "We employ the [LoRA approach](https://arxiv.org/pdf/2106.09685.pdf) to finetune the LLM efficiently." 
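+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Intuitively, LoRA freezes the base model weights and learns a small low-rank update for selected layers, so only a tiny fraction of the parameters is trained. A minimal sketch of the idea with the `peft` library (illustrative hyperparameters; not the exact NeuralChat internals):\n",
+ "\n",
+ "```python\n",
+ "# a sketch; assumes transformers and peft are installed\n",
+ "from transformers import AutoModelForCausalLM\n",
+ "from peft import LoraConfig, get_peft_model\n",
+ "\n",
+ "base = AutoModelForCausalLM.from_pretrained(\"/models/llama-7b-hf/\")\n",
+ "# the learned update has rank r and is scaled by lora_alpha / r\n",
+ "lora = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05, task_type=\"CAUSAL_LM\")\n",
+ "model = get_peft_model(base, lora)\n",
+ "model.print_trainable_parameters()  # typically well under 1% of the weights\n",
+ "```"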
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on Alpaca-format dataset to conduct text generation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(train_file=\"alpaca_data.json\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=2,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True,\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on the summarization task:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(dataset_name=\"cnn_dailymail\", dataset_config_name=\"3.0.0\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=2,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finetune the model on the code generation task:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import TrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = 
DataArguments(dataset_name=\"theblackcat102/evol-codealpaca-v1\")\n", + "training_args = TrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=2,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True\n", + ")\n", + "finetune_args = FinetuningArguments()\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/intel_extension_for_transformers/neural_chat/requirements.txt b/intel_extension_for_transformers/neural_chat/requirements.txt index 9b038c98ea5..9942b9fe885 100644 --- a/intel_extension_for_transformers/neural_chat/requirements.txt +++ b/intel_extension_for_transformers/neural_chat/requirements.txt @@ -28,7 +28,6 @@ starlette yacs uvicorn optimum -optimum[habana] sentence_transformers unstructured markdown From ef3fcbbb31e28a1bf981b8315f3d7eaec5dafb04 Mon Sep 17 00:00:00 2001 From: XuhuiRen Date: Thu, 14 Sep 2023 14:50:07 +0800 Subject: [PATCH 06/13] fix as suggestions Signed-off-by: XuhuiRen --- .../neural_chat/config.py | 2 +- .../multi_node_finetuning_on_spr.ipynb | 13 +- ...ngle_card_finetuning_on_habana_gaudi.ipynb | 267 ++++++++++++------ 3 files changed, 185 insertions(+), 97 deletions(-) diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py index 8e227b43d01..b8266550a89 100644 --- a/intel_extension_for_transformers/neural_chat/config.py +++ b/intel_extension_for_transformers/neural_chat/config.py @@ -420,7 +420,7 @@ class AMPConfig: class PipelineConfig: def __init__(self, - model_name_or_path="meta-llama/Llama-2-7b-hf", + model_name_or_path="meta-llama/Llama-2-7b-chat-hf", tokenizer_name_or_path=None, hf_access_token=None, device="auto", diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb index b7ed8817081..109627c0696 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/multi_node_finetuning_on_spr.ipynb @@ -9,18 +9,7 @@ "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook will introduce how to finetune your chatbot with the customized data on multi-node SPR server.\n", "\n", "## Prepare Environment\n", - "We support Distributed Data Parallel (DDP) finetuning on both single node and multi-node settings. 
To use DDP to speed up training, the bash command needs a few adjustments.\n",
- "For example, to finetune Llama-7b through DDP training, the bash command will look like the following, where\n",
- "
\n",
- "*`<MASTER_ADDRESS>`* is the address of the master node; it is not needed in the single-node case,\n",
- "
\n",
- "*`<NUM_PROCESSES_PER_NODE>`* is the number of processes to use on the current node; for a node with GPUs, it is usually set to the number of GPUs in that node, and for a node without GPUs that uses CPU for training, setting it to 1 is recommended,\n",
- "
\n",
- "*`<NUM_NODES>`* is the number of nodes to use,\n",
- "
\n",
- "*`<NODE_RANK>`* is the rank of the current node; ranks run from 0 to *`<NUM_NODES>`*`-1`.\n",
- "
\n", - "> Also please note that to use CPU for training in each node with multi-node settings, argument `--no_cuda` is mandatory, and `--ddp_backend ccl` is required if to use ccl as the distributed backend. In multi-node setting, following command needs to be launched in each node, and all the commands should be the same except for *``*, which should be integer from 0 to *``*`-1` assigned to each node." + "We support Distributed Data Parallel (DDP) finetuning on both single node and multi-node settings. Before using DDP to speedup training, we need to configure the environment. " ] }, { diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb index feb169a9a4e..c8756f645bc 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb @@ -38,95 +38,194 @@ "3. Code Generation: To enhance code performance of LLMs (Large Language Models), we use the [theblackcat102/evol-codealpaca-v1](https://huggingface.co/datasets/theblackcat102/evol-codealpaca-v1)." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from optimum.habana import GaudiTrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(train_file=\"alpaca_data.json\")\n", + "training_args = GaudiTrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True,\n", + " use_habana=True,\n", + " use_lazy_mode=True,\n", + ")\n", + "finetune_args = FinetuningArguments(device=\"habana\")\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Finetune Your Chatbot\n", - "You can train your chatbot with Alpaca dataset.\n", - "```bash\n", - "python finetune_clm.py \\\n", - " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", - " --bf16 True \\\n", - " --train_file \"/path/to/alpaca_data.json\" \\\n", - " --dataset_concatenation \\\n", - " --per_device_train_batch_size 2 \\\n", - " --per_device_eval_batch_size 2 \\\n", - " --gradient_accumulation_steps 4 \\\n", - " --evaluation_strategy \"no\" \\\n", - " --save_strategy \"steps\" \\\n", - " --save_steps 2000 \\\n", - " --save_total_limit 1 \\\n", - " --learning_rate 1e-4 \\\n", - " --logging_steps 1 \\\n", - " --do_train \\\n", - " --num_train_epochs 3 \\\n", - " --overwrite_output_dir \\\n", - " --log_level info \\\n", - " --output_dir ./llama_peft_finetuned_model \\\n", - " --peft lora \\\n", - " 
--use_fast_tokenizer false \\\n", - " --device \"habana\" \\\n", - " --use_habana \\\n", - " --use_lazy_mode \\\n", - "```\n", - "\n", - "Train your chatbot on the summarization task.\n", - "```bash\n", - "python finetune_clm.py \\\n", - " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", - " --bf16 True \\\n", - " --dataset_name \"cnn_dailymail\" \\\n", - " --dataset_config_name \"3.0.0\" \\\n", - " --dataset_concatenation \\\n", - " --per_device_train_batch_size 2 \\\n", - " --per_device_eval_batch_size 2 \\\n", - " --gradient_accumulation_steps 4 \\\n", - " --evaluation_strategy \"no\" \\\n", - " --save_strategy \"steps\" \\\n", - " --save_steps 2000 \\\n", - " --save_total_limit 1 \\\n", - " --learning_rate 1e-4 \\\n", - " --logging_steps 1 \\\n", - " --do_train \\\n", - " --num_train_epochs 3 \\\n", - " --overwrite_output_dir \\\n", - " --log_level info \\\n", - " --output_dir ./llama_peft_finetuned_model \\\n", - " --peft lora \\\n", - " --use_fast_tokenizer false \\\n", - " --device \"habana\" \\\n", - " --use_habana \\\n", - " --use_lazy_mode \\\n", - "```\n", - "\n", - "Train your chatbot on the code generation task:\n", - "```bash\n", - "python finetune_clm.py \\\n", - " --model_name_or_path \"decapoda-research/llama-7b-hf\" \\\n", - " --bf16 True \\\n", - " --dataset_name \"theblackcat102/evol-codealpaca-v1\" \\\n", - " --dataset_concatenation \\\n", - " --per_device_train_batch_size 2 \\\n", - " --per_device_eval_batch_size 2 \\\n", - " --gradient_accumulation_steps 4 \\\n", - " --evaluation_strategy \"no\" \\\n", - " --save_strategy \"steps\" \\\n", - " --save_steps 2000 \\\n", - " --save_total_limit 1 \\\n", - " --learning_rate 1e-4 \\\n", - " --logging_steps 1 \\\n", - " --do_train \\\n", - " --num_train_epochs 3 \\\n", - " --overwrite_output_dir \\\n", - " --log_level info \\\n", - " --output_dir ./llama_peft_finetuned_model \\\n", - " --peft lora \\\n", - " --use_fast_tokenizer false \\\n", - " --device \"habana\" \\\n", - " --use_habana \\\n", - " --use_lazy_mode \\\n", - "```\n" + "You can train your chatbot with Alpaca dataset." 
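If `alpaca_data.json` is not yet present locally, it can be fetched first. A minimal sketch, assuming the commonly used raw JSON from the public `tatsu-lab/stanford_alpaca` repository (the URL is an assumption and worth verifying):

```python
# Hypothetical download step for the file referenced by DataArguments(train_file="alpaca_data.json").
# URL assumed from the public stanford_alpaca repository; verify before relying on it.
!curl -OL https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json
```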
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from optimum.habana import GaudiTrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(train_file=\"alpaca_data.json\")\n", + "training_args = GaudiTrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True,\n", + " use_habana=True,\n", + " use_lazy_mode=True,\n", + ")\n", + "finetune_args = FinetuningArguments(device=\"habana\")\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train your chatbot on the summarization task." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from optimum.habana import GaudiTrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(dataset_name=\"cnn_dailymail\", dataset_config_name=\"3.0.0\")\n", + "training_args = GaudiTrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True,\n", + " use_habana=True,\n", + " use_lazy_mode=True,\n", + ")\n", + "finetune_args = FinetuningArguments(device=\"habana\")\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train your chatbot on the code completion task." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from optimum.habana import GaudiTrainingArguments\n", + "from intel_extension_for_transformers.neural_chat.config import (\n", + " ModelArguments,\n", + " DataArguments,\n", + " FinetuningArguments,\n", + " TextGenerationFinetuningConfig,\n", + ")\n", + "from intel_extension_for_transformers.neural_chat.chatbot import finetune_model\n", + "model_args = ModelArguments(model_name_or_path=\"/models/llama-7b-hf/\")\n", + "data_args = DataArguments(dataset_name=\"theblackcat102/evol-codealpaca-v1\")\n", + "training_args = GaudiTrainingArguments(\n", + " output_dir='./tmp',\n", + " do_train=True,\n", + " do_eval=True,\n", + " num_train_epochs=3,\n", + " overwrite_output_dir=True,\n", + " per_device_train_batch_size=4,\n", + " per_device_eval_batch_size=4,\n", + " gradient_accumulation_steps=1,\n", + " save_strategy=\"no\",\n", + " log_level=\"info\",\n", + " save_total_limit=2,\n", + " bf16=True,\n", + " use_habana=True,\n", + " use_lazy_mode=True,\n", + ")\n", + "finetune_args = FinetuningArguments(device=\"habana\")\n", + "finetune_cfg = TextGenerationFinetuningConfig(\n", + " model_args=model_args,\n", + " data_args=data_args,\n", + " training_args=training_args,\n", + " finetune_args=finetune_args,\n", + " )\n", + "finetune_model(finetune_cfg)" ] } ], From ac6f3eb131ca2838f0a96558b777ff6580c96d4e Mon Sep 17 00:00:00 2001 From: Liangyx2 <106130696+Liangyx2@users.noreply.github.com> Date: Thu, 14 Sep 2023 15:33:44 +0800 Subject: [PATCH 07/13] Update tts.py --- .../neural_chat/pipeline/plugins/audio/tts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py index 63f81b8ed3d..98fa86760d9 100644 --- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py +++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py @@ -59,6 +59,8 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m default_speaker_embedding_path = os.path.split(os.path.split(os.path.split(script_dir)[0])[0])[0]+ '/assets/speaker_embeddings/spk_embed_default.pt' elif os.path.exists(os.path.join(asset_path, 'speaker_embeddings/spk_embed_default.pt')): default_speaker_embedding_path = os.path.join(asset_path, 'speaker_embeddings/spk_embed_default.pt') + elif os.path.exists('spk_embed_default.pt'): + default_speaker_embedding_path = 'spk_embed_default.pt' else: print("Warning! 
Need to prepare speaker_embeddings") # load the default speaker embedding From 2d00ec2540eb06077b67f34c14ca55e2c9e0bfc9 Mon Sep 17 00:00:00 2001 From: Liangyx2 <106130696+Liangyx2@users.noreply.github.com> Date: Thu, 14 Sep 2023 15:41:27 +0800 Subject: [PATCH 08/13] Update build_chatbot_on_spr.ipynb --- .../docs/notebooks/build_chatbot_on_spr.ipynb | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb index d95e4ef3dff..6a23ffd428f 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb @@ -79,7 +79,7 @@ "# BF16 Optimization\n", "from intel_extension_for_transformers.neural_chat.config import AMPConfig\n", "from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig\n", - "config = PipelineConfig(optimization_config=AMPConfig())\n", + "config = PipelineConfig(optimization_config=AMPConfig(), model_name_or_path='mosaicml/mpt-7b-chat',tokenizer_name_or_path='EleutherAI/gpt-neox-20b')\n", "chatbot = build_chatbot(config)\n", "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", "print(response)" @@ -110,7 +110,7 @@ "from intel_extension_for_transformers.neural_chat import plugins\n", "plugins.retrieval.enable=True\n", "plugins.retrieval.args[\"input_path\"]=\"../../assets/docs/\"\n", - "config = PipelineConfig(plugins=plugins)\n", + "config = PipelineConfig(plugins=plugins, model_name_or_path='mosaicml/mpt-7b-chat',tokenizer_name_or_path='EleutherAI/gpt-neox-20b')\n", "chatbot = build_chatbot(config)\n", "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" ] @@ -131,6 +131,15 @@ "For the Python API code, users have the option to enable different voice chat modes by setting ASR and TTS plugins enable or disable." 
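For instance, a text-in/audio-out mode can be sketched by enabling only the TTS plugin. This is a minimal sketch reusing the same API as the cells below; the query string is illustrative:

```python
from intel_extension_for_transformers.neural_chat import PipelineConfig, build_chatbot, plugins

# Text input, audio output: leave ASR disabled and enable only TTS.
plugins.asr.enable = False
plugins.tts.enable = True
plugins.tts.args["output_audio_path"] = "./response.wav"

config = PipelineConfig(plugins=plugins)
chatbot = build_chatbot(config)
# The textual response is returned; the spoken version is written to ./response.wav.
response = chatbot.predict(query="Tell me about Intel Xeon Scalable Processors.")
```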
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!!curl -OL https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/spk_embed_default.pt" + ] + }, { "cell_type": "code", "execution_count": null, @@ -142,7 +151,7 @@ "plugins.asr.enable = True\n", "plugins.tts.enable = True\n", "plugins.tts.args[\"output_audio_path\"]=\"./output_audio.wav\"\n", - "config = PipelineConfig(plugins=plugins)\n", + "config = PipelineConfig(plugins=plugins, model_name_or_path='mosaicml/mpt-7b-chat',tokenizer_name_or_path='EleutherAI/gpt-neox-20b')\n", "chatbot = build_chatbot(config)\n", "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")" ] From 4cb490b3ab977e4325e3b648c578290ce56f6bcd Mon Sep 17 00:00:00 2001 From: Liangyx2 <106130696+Liangyx2@users.noreply.github.com> Date: Thu, 14 Sep 2023 15:41:56 +0800 Subject: [PATCH 09/13] Update build_chatbot_on_spr.ipynb --- .../neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb index 6a23ffd428f..5418a6cc155 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb @@ -137,7 +137,7 @@ "metadata": {}, "outputs": [], "source": [ - "!!curl -OL https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/spk_embed_default.pt" + "!curl -OL https://raw.githubusercontent.com/intel/intel-extension-for-transformers/main/intel_extension_for_transformers/neural_chat/assets/speaker_embeddings/spk_embed_default.pt" ] }, { From cd4da922cd2f16137d677bbc2d5ca8e739c2ca76 Mon Sep 17 00:00:00 2001 From: Liangyx2 <106130696+Liangyx2@users.noreply.github.com> Date: Thu, 14 Sep 2023 16:09:34 +0800 Subject: [PATCH 10/13] Update tts.py --- .../neural_chat/pipeline/plugins/audio/tts.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py index 98fa86760d9..28e2a725e2c 100644 --- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py +++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py @@ -55,8 +55,10 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") self.vocoder.eval() script_dir = os.path.dirname(os.path.abspath(__file__)) - if os.path.exists(os.path.split(os.path.split(os.path.split(script_dir)[0])[0])[0]+ '/assets/speaker_embeddings/spk_embed_default.pt'): - default_speaker_embedding_path = os.path.split(os.path.split(os.path.split(script_dir)[0])[0])[0]+ '/assets/speaker_embeddings/spk_embed_default.pt' + if os.path.exists(os.path.split(os.path.split(os.path.split(script_dir)[0])[0])[0] \ + + '/assets/speaker_embeddings/spk_embed_default.pt'): + default_speaker_embedding_path = os.path.split(os.path.split(os.path.split(script_dir)[0])[0])[0] \ + + '/assets/speaker_embeddings/spk_embed_default.pt' elif 
os.path.exists(os.path.join(asset_path, 'speaker_embeddings/spk_embed_default.pt')): default_speaker_embedding_path = os.path.join(asset_path, 'speaker_embeddings/spk_embed_default.pt') elif os.path.exists('spk_embed_default.pt'): From 9a02d0f4b57d852482189de4293205ae6cd399c4 Mon Sep 17 00:00:00 2001 From: "Lv, Liang1" Date: Thu, 14 Sep 2023 21:46:06 +0800 Subject: [PATCH 11/13] update notebook Signed-off-by: Lv, Liang1 --- .../build_chatbot_on_habana_gaudi.ipynb | 55 ++++++++++------ .../docs/notebooks/build_chatbot_on_icx.ipynb | 35 ++++++++++- .../notebooks/build_chatbot_on_nv_a100.ipynb | 36 ++++++++++- .../docs/notebooks/build_chatbot_on_spr.ipynb | 63 ++++++++++++++++++- .../docs/notebooks/build_chatbot_on_xpu.ipynb | 52 ++++++++++++++- .../deploy_chatbot_on_habana_gaudi.ipynb | 18 ++---- .../notebooks/deploy_chatbot_on_nv_a100.ipynb | 27 +++++++- .../notebooks/deploy_chatbot_on_xpu.ipynb | 27 +++++++- 8 files changed, 266 insertions(+), 47 deletions(-) diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb index de20eb0c52a..d9e48cb50e4 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_habana_gaudi.ipynb @@ -25,24 +25,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**IMPORTANT:** Please note Habana's Gaudi processors(HPU) requires docker environment for running. User needs to manually execute below steps to build docker image and run docker container for inference on Habana HPU. The Jupyter notebook server should be started in the docker container and then run this Jupyter notebook. \n", + "**IMPORTANT:** Please note Habana's Gaudi processors(HPU) requires docker environment for running. User needs to manually execute below steps to build docker image and run docker container for inference and finetuning on Habana HPU. The Jupyter notebook server should be started in the docker container and then run this Jupyter notebook. 
\n", "\n", "```bash\n", "git clone https://github.com/intel/intel-extension-for-transformers.git\n", - "cd intel-extension-for-transformers/docker/inference/\n", - "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbothabana:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n", - "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbothabana:latest\n", - "```\n", - "\n", - "To run finetuning on Habana HPU, please execute below steps\n", - "\n", - "```bash\n", - "git clone https://github.com/intel/intel-extension-for-transformers.git\n", - "cd intel-extension-for-transformers/docker/finetuning/\n", - "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbot_finetuning:latest ./ -f Dockerfile --target hpu\n", - "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -v /dev/shm:/dev/shm -v /absolute/path/to/llama2:/llama2 -v /absolute/path/to/alpaca_data.json:/dataset/alpaca_data.json --cap-add=sys_nice --net=host --ipc=host chatbot_finetuning:latest\n", - "\n", - "```" + "cd ./intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/docker/\n", + "docker build --build-arg UBUNTU_VER=22.04 -f Dockerfile -t neuralchat . --target hpu\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host neuralchat:latest\n", + "```\n" ] }, { @@ -131,13 +121,42 @@ "outputs": [], "source": [ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot, plugins\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can display the generated wav file using IPython." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", "plugins.tts.enable = True\n", - "plugins.tts.args[\"output_audio_path\"]=\"./output_audio.wav\"\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", "config = PipelineConfig(plugins=plugins)\n", "chatbot = build_chatbot(config)\n", - "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")" + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" ] } ], diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb index 19edff13f5c..e847f6ea728 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_icx.ipynb @@ -164,13 +164,42 @@ "outputs": [], "source": [ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot, plugins\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can display the generated wav file using IPython." 
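A minimal sketch of that display step, assuming the response audio was written to `./response.wav` (the path set via `plugins.tts.args["output_audio_path"]` above):

```python
from IPython.display import Audio, display

# Play the synthesized response audio inline in the notebook.
display(Audio("./response.wav"))
```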
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", "plugins.tts.enable = True\n", - "plugins.tts.args[\"output_audio_path\"]=\"./output_audio.wav\"\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", "config = PipelineConfig(plugins=plugins)\n", "chatbot = build_chatbot(config)\n", - "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")" + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" ] } ], diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb index 513616097c8..e31c6ac5d3b 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_nv_a100.ipynb @@ -134,9 +134,41 @@ "source": [ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "config = PipelineConfig(audio_input=True, audio_output=True)\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can display the generated wav file using IPython." 
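A minimal sketch of that display step, assuming the response audio was written to `./response.wav` (the path set via `plugins.tts.args["output_audio_path"]` above):

```python
from IPython.display import Audio, display

# Play the synthesized response audio inline in the notebook.
display(Audio("./response.wav"))
```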
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", "chatbot = build_chatbot(config)\n", - "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")" + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" ] } ], diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb index d95e4ef3dff..f8cb5cff1b0 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_spr.ipynb @@ -138,13 +138,70 @@ "outputs": [], "source": [ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot, plugins\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can display the generated wav file using IPython." 
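A minimal sketch of that display step, assuming the response audio was written to `./response.wav` (the path set via `plugins.tts.args["output_audio_path"]` above):

```python
from IPython.display import Audio, display

# Play the synthesized response audio inline in the notebook.
display(Audio("./response.wav"))
```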
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", "plugins.tts.enable = True\n", - "plugins.tts.args[\"output_audio_path\"]=\"./output_audio.wav\"\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", "config = PipelineConfig(plugins=plugins)\n", "chatbot = build_chatbot(config)\n", - "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")" + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Low Precision Optimization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## BF16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# BF16 Optimization\n", + "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n", + "config = PipelineConfig(optimization_config=AMPConfig())\n", + "chatbot = build_chatbot(config)\n", + "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", + "print(response)" ] } ], diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb index d24f5627b9c..771d0881154 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/build_chatbot_on_xpu.ipynb @@ -24,6 +24,23 @@ "%pip install intel-extension-for-transformers" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -100,7 +117,30 @@ "source": [ "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", "\n", - "For the Python API code, users have the option to enable different voice chat modes by setting audio_input to True for input or audio_output to True for output." + "For the Python API code, users have the option to enable different voice chat modes by setting ASR and TTS plugins enable or disable." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", + "from intel_extension_for_transformers.neural_chat import build_chatbot, plugins\n", + "plugins.asr.enable = True\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"]=\"./output_audio.wav\"\n", + "config = PipelineConfig(plugins=plugins)\n", + "chatbot = build_chatbot(config)\n", + "result = chatbot.predict(query=\"../../assets/audio/sample.wav\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can display the generated wav file using IPython." 
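A minimal sketch of that display step, assuming the response audio was written to `./response.wav` (the path set via `plugins.tts.args["output_audio_path"]` above):

```python
from IPython.display import Audio, display

# Play the synthesized response audio inline in the notebook.
display(Audio("./response.wav"))
```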
] }, { @@ -111,9 +151,15 @@ "source": [ "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "config = PipelineConfig(audio_input=True, audio_output=True)\n", + "from intel_extension_for_transformers.neural_chat import plugins\n", + "plugins.tts.enable = True\n", + "plugins.tts.args[\"output_audio_path\"] = \"./response.wav\"\n", + "plugins.asr.enable = True\n", + "\n", + "config = PipelineConfig(plugins=plugins)\n", "chatbot = build_chatbot(config)\n", - "result = chatbot.predict(query=\"./assets/audio/sample.wav\")" + "result = chatbot.predict(query=\"./sample.wav\")\n", + "print(result)" ] } ], diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb index c0028e40afc..4f86b55ee43 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb @@ -25,23 +25,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**IMPORTANT:** Please note Habana's Gaudi processors(HPU) requires docker environment for running. User needs to manually execute below steps to build docker image and run docker container for inference on Habana HPU. The Jupyter notebook server should be started in the docker container and then run this Jupyter notebook. \n", + "**IMPORTANT:** Please note Habana's Gaudi processors(HPU) requires docker environment for running. User needs to manually execute below steps to build docker image and run docker container for inference and finetuning on Habana HPU. The Jupyter notebook server should be started in the docker container and then run this Jupyter notebook. \n", "\n", "```bash\n", "git clone https://github.com/intel/intel-extension-for-transformers.git\n", - "cd intel-extension-for-transformers/docker/inference/\n", - "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbothabana:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n", - "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbothabana:latest\n", - "```\n", - "\n", - "To run finetuning on Habana HPU, please execute below steps\n", - "\n", - "```bash\n", - "git clone https://github.com/intel/intel-extension-for-transformers.git\n", - "cd intel-extension-for-transformers/docker/finetuning/\n", - "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbot_finetuning:latest ./ -f Dockerfile --target hpu\n", - "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -v /dev/shm:/dev/shm -v /absolute/path/to/llama2:/llama2 -v /absolute/path/to/alpaca_data.json:/dataset/alpaca_data.json --cap-add=sys_nice --net=host --ipc=host chatbot_finetuning:latest\n", - "\n", + "cd ./intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/docker/\n", + "docker build --build-arg UBUNTU_VER=22.04 -f Dockerfile -t neuralchat . 
--target hpu\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host neuralchat:latest\n", "```" ] }, diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb index c72f0b1589f..e58ed93e3de 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_nv_a100.ipynb @@ -14,14 +14,37 @@ "# Prepare Environment" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "%%bash\n", - "%pip install intel-extension-for-transformers" + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" ] }, { diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb index d3a299bc43c..b61eb709713 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/deploy_chatbot_on_xpu.ipynb @@ -14,14 +14,37 @@ "# Prepare Environment" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "%%bash\n", - "%pip install intel-extension-for-transformers" + "!pip install intel-extension-for-transformers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" ] }, { From 6738c0cac65201ffac908c3ecb8416b73c020539 Mon Sep 17 00:00:00 2001 From: "Lv, Liang1" Date: Thu, 14 Sep 2023 22:02:55 +0800 Subject: [PATCH 12/13] update notebook Signed-off-by: Lv, Liang1 --- .../neural_chat/README.md | 10 +- .../amp_optimization_on_habana_gaudi.ipynb | 8 +- .../notebooks/amp_optimization_on_spr.ipynb | 24 ++ .../docs/notebooks/chatbot_on_intel_cpu.ipynb | 370 ---------------- .../chatbot_on_intel_habana_hpu.ipynb | 391 ----------------- .../docs/notebooks/chatbot_on_nv_gpu.ipynb | 399 ------------------ .../weight_only_optimization_on_nv_a100.ipynb | 24 ++ 7 files changed, 58 insertions(+), 1168 deletions(-) delete mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_cpu.ipynb delete mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_habana_hpu.ipynb delete mode 100644 intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_nv_gpu.ipynb diff --git a/intel_extension_for_transformers/neural_chat/README.md b/intel_extension_for_transformers/neural_chat/README.md index 17790583b8f..3d6c6128b8c 100644 --- a/intel_extension_for_transformers/neural_chat/README.md +++ 
b/intel_extension_for_transformers/neural_chat/README.md @@ -186,15 +186,17 @@ Welcome to use Jupyter Notebooks to explore how to build and customize chatbots | 2.3 | Deploying a Chatbot on Intel XPU | Learn how to deploy a chatbot on Intel XPU. | [Notebook](./docs/notebooks/deploy_chatbot_on_xpu.ipynb) | | 2.4 | Deploying a Chatbot on Habana Gaudi1/Gaudi2 | Instructions for deploying a chatbot on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/deploy_chatbot_on_habana_gaudi.ipynb) | | 2.5 | Deploying a Chatbot on Nvidia A100 | Learn how to deploy a chatbot as a service on Nvidia A100 platforms. | [Notebook](./docs/notebooks/deploy_chatbot_on_nv_a100.ipynb) | +| 2.6 | Deploying Chatbot with load balance | Learn how to deploy a chatbot as a service with load balance. | [Notebook](./docs/notebooks/chatbot_with_load_balance.ipynb) | | 3 | Optimizing Chatbots on Different Platforms | | | | 3.1 | AMP Optimization on SPR | Optimize your chatbot using Automatic Mixed Precision (AMP) on SPR platforms. | [Notebook](./docs/notebooks/amp_optimization_on_spr.ipynb) | | 3.2 | AMP Optimization on Habana Gaudi1/Gaudi2 | Learn how to optimize your chatbot with AMP on Intel Habana Gaudi1/Gaudi2 platforms. | [Notebook](./docs/notebooks/amp_optimization_on_habana_gaudi.ipynb) | | 3.3 | Weight-Only Optimization on Nvidia A100 | Optimize your chatbot using Weight-Only optimization on Nvidia A100. | [Notebook](./docs/notebooks/weight_only_optimization_on_nv_a100.ipynb) | | 4 | Fine-Tuning Chatbots on Different Platforms | | | -| 4.1 | Multi-Node Fine-Tuning on SPR | Fine-tune your chatbot on SPR platforms using multiple nodes. | [Notebook](./docs/notebooks/multi_node_finetuning_on_spr.ipynb) | -| 4.2 | Single-Card Fine-Tuning on Habana Gaudi1/Gaudi2 | Instructions for single-card fine-tuning on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb) | -| 4.3 | Multi-Card Fine-Tuning on Habana Gaudi1/Gaudi2 | Learn how to perform multi-card fine-tuning on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb) | -| 4.4 | Fine-Tuning on Nvidia A100 | Fine-tune your chatbot on Nvidia A100 platforms. | [Notebook](./docs/notebooks/finetuning_on_nv_a100.ipynb) | +| 4.1 | Single Node Fine-Tuning on SPR | Fine-tune your chatbot on SPR platforms using single node. | [Notebook](./docs/notebooks/single_node_finetuning_on_spr.ipynb) | +| 4.2 | Multi-Node Fine-Tuning on SPR | Fine-tune your chatbot on SPR platforms using multiple nodes. | [Notebook](./docs/notebooks/multi_node_finetuning_on_spr.ipynb) | +| 4.3 | Single-Card Fine-Tuning on Habana Gaudi1/Gaudi2 | Instructions for single-card fine-tuning on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/single_card_finetuning_on_habana_gaudi.ipynb) | +| 4.4 | Multi-Card Fine-Tuning on Habana Gaudi1/Gaudi2 | Learn how to perform multi-card fine-tuning on Intel Habana Gaudi1/Gaudi2. | [Notebook](./docs/notebooks/multi_card_finetuning_on_habana_gaudi.ipynb) | +| 4.5 | Fine-Tuning on Nvidia A100 | Fine-tune your chatbot on Nvidia A100 platforms. | [Notebook](./docs/notebooks/finetuning_on_nv_a100.ipynb) | | 5 | Customizing Chatbots on Different Platforms | | | | 5.1 | Using Plugins to Customize Chatbots | Customize your chatbot using plugins. 
| [Notebook](./docs/notebooks/customize_chatbot_with_plugins.ipynb) | | 5.2 | Registering New Models to Customize Chatbots | | | diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb index 386df18d4a6..699e065d568 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_habana_gaudi.ipynb @@ -22,10 +22,10 @@ "\n", "```bash\n", "git clone https://github.com/intel/intel-extension-for-transformers.git\n", - "cd intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/docker/inference/\n", - "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbothabana:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n", - "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbothabana:latest\n", - "```" + "cd ./intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/docker/\n", + "docker build --build-arg UBUNTU_VER=22.04 -f Dockerfile -t neuralchat . --target hpu\n", + "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host neuralchat:latest\n", + "```\n" ] }, { diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb index e5ab6d2d8c9..7a384788c6b 100644 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb +++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/amp_optimization_on_spr.ipynb @@ -14,6 +14,13 @@ "## Prepare Environment" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install intel extension for transformers:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -23,6 +30,23 @@ "!pip install intel-extension-for-transformers" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install Requirements:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%cd ../../\n", + "!pip install -r requirements.txt" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_cpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_cpu.ipynb deleted file mode 100644 index ca6c5478c15..00000000000 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_cpu.ipynb +++ /dev/null @@ -1,370 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generative AI: Develop and Optimize Your Own Talking Chatbot on Intel CPU" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. 
This notebook is used to demostrate how to build a talking chatbot on Intel 4th Gen Xeon Scalable Processors." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepare Environment" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "%%bash\n", - "%pip install intel-extension-for-transformers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inference 💻" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Chat" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Giving NeuralChat the textual instruction, it will respond with the textual response." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "chatbot = build_chatbot()\n", - "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Chat With RAG Plugin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "User could also leverage NeuralChat RAG plugin to do domain specific chat by feding with some documents like below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "from intel_extension_for_transformers.neural_chat import plugins\n", - "plugins.retrieval.enable=True\n", - "plugins.retrieval.args[\"input_path\"]=\"./assets/docs/\"\n", - "config = PipelineConfig(plugins=plugins)\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Voice Chat with ATS & TTS Plugin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", - "\n", - "For the Python API code, users have the option to enable different voice chat modes by setting audio_input to True for input or audio_output to True for output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "config = PipelineConfig(audio_input=True, audio_output=True)\n", - "chatbot = build_chatbot(config)\n", - "result = chatbot.predict(query=\"./assets/audio/sample.wav\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Finetuning 🔧" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finetune the pretrained large language model (LLM) with the instruction-following dataset for creating the customized chatbot is very easy for NeuralChat." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Finetuning LLM" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import TextGenerationFinetuningConfig\n", - "from intel_extension_for_transformers.neural_chat import finetune_model\n", - "finetune_cfg = TextGenerationFinetuningConfig()\n", - "finetuned_model = finetune_model(finetune_cfg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Finetuning TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import TTSFinetuningConfig\n", - "from intel_extension_for_transformers.neural_chat import finetune_model\n", - "finetune_cfg = TTSFinetuningConfig()\n", - "finetuned_model = finetune_model(finetune_cfg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Low Precision Optimization 🚀" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## BF16" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# BF16 Optimization\n", - "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n", - "config = PipelineConfig(optimization_config=AMPConfig())\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Weight-Only Quantization" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Weight-Only Quantization\n", - "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantizationConfig\n", - "config = PipelineConfig(optimization_config=WeightOnlyQuantizationConfig())\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Client-Server Architecture for Performance and Scalability" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Quick Start Local Server" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "❗ Please notice that the server is running on the background. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import multiprocessing\n", - "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n", - "import nest_asyncio\n", - "nest_asyncio.apply()\n", - "\n", - "def start_service():\n", - " server_executor = NeuralChatServerExecutor()\n", - " server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n", - "multiprocessing.Process(target=start_service).start()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Text Chat Service " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import TextChatClientExecutor\n", - "executor = TextChatClientExecutor()\n", - "result = executor(\n", - " prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server entry point \n", - " )\n", - "print(result.text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Voice Chat Service" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import VoiceChatClientExecutor\n", - "executor = VoiceChatClientExecutor()\n", - "result = executor(\n", - " audio_input_path='./assets/audio/sample.wav',\n", - " audio_output_path='./results.wav',\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server entry point \n", - " )\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import IPython\n", - "# Play input audio\n", - "print(\" Play Input Audio ......\")\n", - "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n", - "# Play output audio\n", - "print(\" Play Output Audio ......\")\n", - "IPython.display.display(IPython.display.Audio(\"./assets/audio/welcome.wav\"))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Access Finetune Service" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_chat import FinetuingClientExecutor\n", - "executor = FinetuingClientExecutor()\n", - "tuning_status = executor(\n", - " server_ip=\"127.0.0.1\", # master server ip\n", - " port=8000 # master server port (port on socket 0, if both sockets are deployed)\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_habana_hpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_habana_hpu.ipynb deleted file mode 100644 index efd2f5b5dc3..00000000000 --- a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_intel_habana_hpu.ipynb +++ /dev/null @@ -1,391 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generative AI: Develop and Optimize Your Own Talking 
Chatbot on Habana HPU" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "NeuralChat is a customizable chat framework designed to create user own chatbot within few minutes on multiple architectures. This notebook is used to demostrate how to build a talking chatbot on Habana's Gaudi processors(HPU)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prepare Environment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In order to streamline the process, users can construct a Docker image employing a Dockerfile, initiate the Docker container, and then proceed to execute inference or finetuning operations." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**IMPORTANT:** Please note Habana's Gaudi processors(HPU) requires docker environment for running. User needs to manually execute below steps to build docker image and run docker container for inference on Habana HPU. The Jupyter notebook server should be started in the docker container and then run this Jupyter notebook. \n", - "\n", - "```bash\n", - "git clone https://github.com/intel/intel-extension-for-transformers.git\n", - "cd intel-extension-for-transformers/docker/inference/\n", - "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbothabana:latest ./ -f Dockerfile --target hpu --build-arg BASE_NAME=\"base-installer-ubuntu22.04\" --build-arg ARTIFACTORY_URL=\"vault.habana.ai\" --build-arg VERSION=\"1.10.0\" --build-arg REVISION=\"494\" --build-arg PT_VERSION=\"2.0.1\" --build-arg OS_NUMBER=\"2204\"\n", - "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host chatbothabana:latest\n", - "```\n", - "\n", - "To run finetuning on Habana HPU, please execute below steps\n", - "\n", - "```bash\n", - "git clone https://github.com/intel/intel-extension-for-transformers.git\n", - "cd intel-extension-for-transformers/docker/finetuning/\n", - "DOCKER_BUILDKIT=1 docker build --network=host --tag chatbot_finetuning:latest ./ -f Dockerfile --target hpu\n", - "docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none -v /dev/shm:/dev/shm -v /absolute/path/to/llama2:/llama2 -v /absolute/path/to/alpaca_data.json:/dataset/alpaca_data.json --cap-add=sys_nice --net=host --ipc=host chatbot_finetuning:latest\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inference 💻" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Chat" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Giving NeuralChat the textual instruction, it will respond with the textual response." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "chatbot = build_chatbot()\n", - "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Text Chat With RAG Plugin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "User could also leverage NeuralChat RAG plugin to do domain specific chat by feding with some documents like below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "from intel_extension_for_transformers.neural_chat import plugins\n", - "plugins.retrieval.enable=True\n", - "plugins.retrieval.args[\"input_path\"]=\"./assets/docs/\"\n", - "config = PipelineConfig(plugins=plugins)\n", - "chatbot = build_chatbot(config)\n", - "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Voice Chat with ATS & TTS Plugin" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the context of voice chat, users have the option to engage in various modes: utilizing input audio and receiving output audio, employing input audio and receiving textual output, or providing input in textual form and receiving audio output.\n", - "\n", - "For the Python API code, users have the option to enable different voice chat modes by setting audio_input to True for input or audio_output to True for output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_transformers.neural_chat import PipelineConfig\n", - "from intel_extension_for_transformers.neural_chat import build_chatbot\n", - "config = PipelineConfig(audio_input=True, audio_output=True)\n", - "chatbot = build_chatbot(config)\n", - "result = chatbot.predict(query=\"./assets/audio/sample.wav\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Finetuning 🔧" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finetune the pretrained large language model (LLM) with the instruction-following dataset for creating the customized chatbot is very easy for NeuralChat." 
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Finetuning LLM"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import TextGenerationFinetuningConfig\n",
-    "from intel_extension_for_transformers.neural_chat import finetune_model\n",
-    "finetune_cfg = TextGenerationFinetuningConfig()\n",
-    "finetuned_model = finetune_model(finetune_cfg)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Finetuning TTS"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import TTSFinetuningConfig\n",
-    "from intel_extension_for_transformers.neural_chat import finetune_model\n",
-    "finetune_cfg = TTSFinetuningConfig()\n",
-    "finetuned_model = finetune_model(finetune_cfg)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Low Precision Optimization 🚀"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## BF16"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# BF16 Optimization\n",
-    "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
-    "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n",
-    "config = PipelineConfig(optimization_config=AMPConfig())\n",
-    "chatbot = build_chatbot(config)\n",
-    "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
-    "print(response)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Weight-Only Quantization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Weight-Only Quantization\n",
-    "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
-    "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantizationConfig\n",
-    "config = PipelineConfig(optimization_config=WeightOnlyQuantizationConfig())\n",
-    "chatbot = build_chatbot(config)\n",
-    "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
-    "print(response)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Client-Server Architecture for Performance and Scalability"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Quick Start Local Server"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "❗ Please note that the server runs in the background."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import multiprocessing\n",
-    "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n",
-    "import nest_asyncio\n",
-    "nest_asyncio.apply()\n",
-    "\n",
-    "def start_service():\n",
-    "    server_executor = NeuralChatServerExecutor()\n",
-    "    server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n",
-    "multiprocessing.Process(target=start_service).start()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Access Text Chat Service"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import TextChatClientExecutor\n",
-    "executor = TextChatClientExecutor()\n",
-    "result = executor(\n",
-    "    prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n",
-    "    server_ip=\"127.0.0.1\",  # master server ip\n",
-    "    port=8000  # master server entry point\n",
-    ")\n",
-    "print(result.text)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Access Voice Chat Service"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import VoiceChatClientExecutor\n",
-    "executor = VoiceChatClientExecutor()\n",
-    "result = executor(\n",
-    "    audio_input_path='./assets/audio/sample.wav',\n",
-    "    audio_output_path='./results.wav',\n",
-    "    server_ip=\"127.0.0.1\",  # master server ip\n",
-    "    port=8000  # master server entry point\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import IPython\n",
-    "# Play input audio\n",
-    "print(\"Play Input Audio ......\")\n",
-    "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n",
-    "# Play output audio\n",
-    "print(\"Play Output Audio ......\")\n",
-    "IPython.display.display(IPython.display.Audio(\"./results.wav\"))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Access Finetuning Service"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import FinetuningClientExecutor\n",
-    "executor = FinetuningClientExecutor()\n",
-    "tuning_status = executor(\n",
-    "    server_ip=\"127.0.0.1\",  # master server ip\n",
-    "    port=8000  # master server port (port on socket 0, if both sockets are deployed)\n",
-    ")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
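Because the server in the notebook above starts in a background process, a client may try to connect before it is ready. Here is a small, library-agnostic sketch that waits until the port accepts connections; the host and port mirror the examples above and are assumptions for illustration.

```python
import socket
import time

def wait_for_server(host: str = "127.0.0.1", port: int = 8000, timeout: float = 120.0) -> bool:
    """Poll until host:port accepts TCP connections or timeout elapses."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=2):
                return True
        except OSError:
            time.sleep(1)  # server not up yet; retry shortly
    return False

if wait_for_server():
    print("NeuralChat server is accepting connections.")
else:
    print("Timed out waiting for the server to come up.")
```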
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_nv_gpu.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_nv_gpu.ipynb
deleted file mode 100644
index c31e0d367cb..00000000000
--- a/intel_extension_for_transformers/neural_chat/docs/notebooks/chatbot_on_nv_gpu.ipynb
+++ /dev/null
@@ -1,399 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Generative AI: Develop and Optimize Your Own Talking Chatbot on Nvidia GPU"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "NeuralChat is a customizable chat framework that lets users create their own chatbot within minutes on multiple architectures. This notebook demonstrates how to build a talking chatbot on Nvidia GPUs."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Prepare Environment"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%pip install intel-extension-for-transformers"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Inference 💻"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Text Chat"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Given a textual instruction, NeuralChat responds with a textual reply."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
-    "chatbot = build_chatbot()\n",
-    "response = chatbot.predict(\"Tell me about Intel Xeon Scalable Processors.\")\n",
-    "print(response)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Text Chat With RAG Plugin"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Users can also leverage the NeuralChat RAG plugin for domain-specific chat by feeding it documents, as shown below."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
-    "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
-    "from intel_extension_for_transformers.neural_chat import plugins\n",
-    "plugins.retrieval.enable = True\n",
-    "plugins.retrieval.args[\"input_path\"] = \"./assets/docs/\"\n",
-    "config = PipelineConfig(plugins=plugins)\n",
-    "chatbot = build_chatbot(config)\n",
-    "response = chatbot.predict(\"How many cores does the Intel® Xeon® Platinum 8480+ Processor have in total?\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Voice Chat with ASR & TTS Plugins"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "In voice chat, users can engage in several modes: audio input with audio output, audio input with textual output, or textual input with audio output.\n",
-    "\n",
-    "In the Python API, enable the desired mode by setting audio_input to True for input and/or audio_output to True for output."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import PipelineConfig\n",
-    "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
-    "config = PipelineConfig(audio_input=True, audio_output=True)\n",
-    "chatbot = build_chatbot(config)\n",
-    "result = chatbot.predict(query=\"./assets/audio/sample.wav\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Finetuning 🔧"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Finetuning a pretrained large language model (LLM) on an instruction-following dataset to create a customized chatbot is very easy with NeuralChat."
-   ]
-  },
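For readers who want to see what parameter-efficient finetuning looks like at the code level, here is a LoRA sketch using the `peft` library, with a small open model (facebook/opt-125m) chosen purely for illustration; whether NeuralChat's `finetune_model` uses this exact configuration internally is an assumption, not something the notebook specifies.

```python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# A minimal LoRA sketch, assuming facebook/opt-125m purely for illustration;
# NeuralChat's finetune_model may configure adapters differently.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
lora_config = LoraConfig(
    r=8,                                  # rank of the low-rank update
    lora_alpha=16,                        # scaling applied to the update
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # attention projections in OPT
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only a small fraction of weights are trainable
```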
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Finetuning LLM"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import TextGenerationFinetuningConfig\n",
-    "from intel_extension_for_transformers.neural_chat import finetune_model\n",
-    "finetune_cfg = TextGenerationFinetuningConfig()\n",
-    "finetuned_model = finetune_model(finetune_cfg)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Finetuning TTS"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import TTSFinetuningConfig\n",
-    "from intel_extension_for_transformers.neural_chat import finetune_model\n",
-    "finetune_cfg = TTSFinetuningConfig()\n",
-    "finetuned_model = finetune_model(finetune_cfg)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Low Precision Optimization 🚀"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## FP16"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# FP16 Optimization\n",
-    "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
-    "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, AMPConfig\n",
-    "config = PipelineConfig(optimization_config=AMPConfig(dtype=\"float16\"))\n",
-    "chatbot = build_chatbot(config)\n",
-    "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
-    "print(response)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Weight-Only Quantization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Weight-Only Quantization\n",
-    "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
-    "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, WeightOnlyQuantizationConfig\n",
-    "config = PipelineConfig(optimization_config=WeightOnlyQuantizationConfig())\n",
-    "chatbot = build_chatbot(config)\n",
-    "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
-    "print(response)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Bitsandbytes Quantization"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Bitsandbytes Quantization\n",
-    "from intel_extension_for_transformers.neural_chat import build_chatbot\n",
-    "from intel_extension_for_transformers.neural_chat.config import PipelineConfig, BitsAndBytesConfig\n",
-    "config = PipelineConfig(\n",
-    "    device='cuda',\n",
-    "    optimization_config=BitsAndBytesConfig(\n",
-    "        load_in_4bit=True,\n",
-    "        bnb_4bit_quant_type='nf4',\n",
-    "        bnb_4bit_use_double_quant=True,\n",
-    "        bnb_4bit_compute_dtype=\"bfloat16\"\n",
-    "    )\n",
-    ")\n",
-    "chatbot = build_chatbot(config)\n",
-    "response = chatbot.predict(query=\"Tell me about Intel Xeon Scalable Processors.\")\n",
-    "print(response)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Client-Server Architecture for Performance and Scalability"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Quick Start Local Server"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "❗ Please note that the server runs in the background."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import multiprocessing\n",
-    "from intel_extension_for_transformers.neural_chat import NeuralChatServerExecutor\n",
-    "import nest_asyncio\n",
-    "nest_asyncio.apply()\n",
-    "\n",
-    "def start_service():\n",
-    "    server_executor = NeuralChatServerExecutor()\n",
-    "    server_executor(config_file=\"./server/config/neuralchat.yaml\", log_file=\"./log/neuralchat.log\")\n",
-    "multiprocessing.Process(target=start_service).start()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Access Text Chat Service"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import TextChatClientExecutor\n",
-    "executor = TextChatClientExecutor()\n",
-    "result = executor(\n",
-    "    prompt=\"Tell me about Intel Xeon Scalable Processors.\",\n",
-    "    server_ip=\"127.0.0.1\",  # master server ip\n",
-    "    port=8000  # master server entry point\n",
-    ")\n",
-    "print(result.text)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Access Voice Chat Service"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import VoiceChatClientExecutor\n",
-    "executor = VoiceChatClientExecutor()\n",
-    "result = executor(\n",
-    "    audio_input_path='./assets/audio/sample.wav',\n",
-    "    audio_output_path='./results.wav',\n",
-    "    server_ip=\"127.0.0.1\",  # master server ip\n",
-    "    port=8000  # master server entry point\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import IPython\n",
-    "# Play input audio\n",
-    "print(\"Play Input Audio ......\")\n",
-    "IPython.display.display(IPython.display.Audio(\"./assets/audio/sample.wav\"))\n",
-    "# Play output audio\n",
-    "print(\"Play Output Audio ......\")\n",
-    "IPython.display.display(IPython.display.Audio(\"./results.wav\"))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Access Finetuning Service"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from intel_extension_for_transformers.neural_chat import FinetuningClientExecutor\n",
-    "executor = FinetuningClientExecutor()\n",
-    "tuning_status = executor(\n",
-    "    server_ip=\"127.0.0.1\",  # master server ip\n",
-    "    port=8000  # master server port (port on socket 0, if both sockets are deployed)\n",
-    ")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb b/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb
index 09c28df6e5c..d9ca61b12e3 100644
--- a/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb
+++ b/intel_extension_for_transformers/neural_chat/docs/notebooks/weight_only_optimization_on_nv_a100.ipynb
@@ -14,6 +14,13 @@
     "## Prepare Environment"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install Intel Extension for Transformers:"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -23,6 +30,23 @@
     "!pip install intel-extension-for-transformers"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install the requirements:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%cd ../../\n",
+    "!pip install -r requirements.txt"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

From b67a5cc9f4a6197129c8af18a0afc5d21438665a Mon Sep 17 00:00:00 2001
From: "Lv, Liang1"
Date: Fri, 15 Sep 2023 00:22:29 +0800
Subject: [PATCH 13/13] fix pylint issue

Signed-off-by: Lv, Liang1
---
 intel_extension_for_transformers/llm/finetuning/finetuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/intel_extension_for_transformers/llm/finetuning/finetuning.py b/intel_extension_for_transformers/llm/finetuning/finetuning.py
index 7cadf1b01cc..75ed32ee3fd 100644
--- a/intel_extension_for_transformers/llm/finetuning/finetuning.py
+++ b/intel_extension_for_transformers/llm/finetuning/finetuning.py
@@ -513,7 +513,7 @@ def concatenate_data(dataset, max_seq_length):
                 data_collator=data_collator,
             )
         else:
-            from optimum.habana import GaudiConfig, GaudiTrainer  # pylint: disable=E0611
+            from optimum.habana import GaudiConfig, GaudiTrainer  # pylint: disable=E0611,E0401
             gaudi_config = GaudiConfig()
             gaudi_config.use_fused_adam = True
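For context on the one-line fix above: pylint reads the IDs in a `disable` pragma as a comma-separated list, so suppressing two checks on one line is written as below. This is a schematic snippet restating the corrected line, not additional project code.

```python
# Comma-separated IDs suppress both checks for this line only; the symbolic
# names (no-name-in-module, import-error) are equivalent and more readable.
from optimum.habana import GaudiConfig, GaudiTrainer  # pylint: disable=E0611,E0401
```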