diff --git a/ChatQnA/Dockerfile b/ChatQnA/Dockerfile index 4ece0783a..ee84069a2 100644 --- a/ChatQnA/Dockerfile +++ b/ChatQnA/Dockerfile @@ -19,7 +19,8 @@ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt + pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt && \ + pip install --no-cache-dir langchain_core COPY ./chatqna.py /home/user/chatqna.py diff --git a/ChatQnA/Dockerfile.guardrails b/ChatQnA/Dockerfile.guardrails index 12f8ce957..168dfb138 100644 --- a/ChatQnA/Dockerfile.guardrails +++ b/ChatQnA/Dockerfile.guardrails @@ -19,9 +19,10 @@ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt + pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt && \ + pip install --no-cache-dir langchain_core -COPY ./chatqna_guardrails.py /home/user/chatqna_guardrails.py +COPY ./chatqna.py /home/user/chatqna.py ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps @@ -31,4 +32,4 @@ WORKDIR /home/user RUN echo 'ulimit -S -n 999999' >> ~/.bashrc -ENTRYPOINT ["python", "chatqna_guardrails.py"] +ENTRYPOINT ["python", "chatqna.py", "--with-guardrails"] diff --git a/ChatQnA/Dockerfile.no_wrapper b/ChatQnA/Dockerfile.no_wrapper deleted file mode 100644 index 7e4cd5381..000000000 --- a/ChatQnA/Dockerfile.no_wrapper +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM python:3.11-slim - -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - git \ - libgl1-mesa-glx \ - libjemalloc-dev - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -WORKDIR /home/user/ -RUN git clone https://github.com/opea-project/GenAIComps.git - -WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt && \ - pip install --no-cache-dir langchain_core - -COPY ./chatqna_no_wrapper.py /home/user/chatqna_no_wrapper.py - -ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps - -USER user - -WORKDIR /home/user - -ENTRYPOINT ["python", "chatqna_no_wrapper.py"] diff --git a/ChatQnA/Dockerfile.no_wrapper_without_rerank b/ChatQnA/Dockerfile.no_wrapper_without_rerank deleted file mode 100644 index 9f28f27b6..000000000 --- a/ChatQnA/Dockerfile.no_wrapper_without_rerank +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -FROM python:3.11-slim - -RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ - git \ - libgl1-mesa-glx \ - libjemalloc-dev - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -WORKDIR /home/user/ -RUN git clone https://github.com/opea-project/GenAIComps.git - -WORKDIR /home/user/GenAIComps -RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt && \ - pip install --no-cache-dir langchain_core - -COPY ./chatqna_no_wrapper.py /home/user/chatqna_no_wrapper.py - -ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps - -USER user - -WORKDIR /home/user - -ENTRYPOINT ["python", "chatqna_no_wrapper.py", "--without-rerank"] diff --git a/ChatQnA/Dockerfile.without_rerank 
b/ChatQnA/Dockerfile.without_rerank index 52fcf31d9..030aef159 100644 --- a/ChatQnA/Dockerfile.without_rerank +++ b/ChatQnA/Dockerfile.without_rerank @@ -6,9 +6,9 @@ FROM python:3.11-slim RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \ + git \ libgl1-mesa-glx \ - libjemalloc-dev \ - git + libjemalloc-dev RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ @@ -19,9 +19,10 @@ RUN git clone https://github.com/opea-project/GenAIComps.git WORKDIR /home/user/GenAIComps RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt + pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt && \ + pip install --no-cache-dir langchain_core -COPY ./chatqna_without_rerank.py /home/user/chatqna_without_rerank.py +COPY ./chatqna.py /home/user/chatqna.py ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps @@ -31,4 +32,4 @@ WORKDIR /home/user RUN echo 'ulimit -S -n 999999' >> ~/.bashrc -ENTRYPOINT ["python", "chatqna_without_rerank.py"] +ENTRYPOINT ["python", "chatqna.py", "--without-rerank"] diff --git a/ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/oob_eight_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/oob_eight_gaudi_with_rerank.yaml index f67d53e05..c6d419dea 100644 --- a/ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance/oob/with_rerank/eight_gaudi/oob_eight_gaudi_with_rerank.yaml @@ -327,7 +327,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: opea/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:latest imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/no_wrapper_oob_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/no_wrapper_oob_four_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml index 1f2ce5fba..0cfc9ed07 100644 --- a/ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/no_wrapper_oob_four_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance/oob/with_rerank/four_gaudi/oob_four_gaudi_with_rerank.yaml @@ -327,7 +327,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: opea/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:latest imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/no_wrapper_oob_single_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/no_wrapper_oob_single_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml index 1a446b60c..902d6e052 100644 --- a/ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/no_wrapper_oob_single_gaudi_with_rerank.yaml +++ 
b/ChatQnA/benchmark/performance/oob/with_rerank/single_gaudi/oob_single_gaudi_with_rerank.yaml @@ -327,7 +327,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: opea/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:latest imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/no_wrapper_oob_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/no_wrapper_oob_two_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml index d96774381..57ce7d582 100644 --- a/ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/no_wrapper_oob_two_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance/oob/with_rerank/two_gaudi/oob_two_gaudi_with_rerank.yaml @@ -327,7 +327,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: opea/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:latest imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/oob/without_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/oob/without_rerank/eight_gaudi/oob_eight_gaudi_without_rerank.yaml similarity index 100% rename from ChatQnA/benchmark/performance/oob/without_rerank/eight_gaudi/no_wrapper_oob_eight_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance/oob/without_rerank/eight_gaudi/oob_eight_gaudi_without_rerank.yaml diff --git a/ChatQnA/benchmark/performance/oob/without_rerank/four_gaudi/no_wrapper_oob_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml similarity index 100% rename from ChatQnA/benchmark/performance/oob/without_rerank/four_gaudi/no_wrapper_oob_four_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance/oob/without_rerank/four_gaudi/oob_four_gaudi_without_rerank.yaml diff --git a/ChatQnA/benchmark/performance/oob/without_rerank/single_gaudi/no_wrapper_oob_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml similarity index 100% rename from ChatQnA/benchmark/performance/oob/without_rerank/single_gaudi/no_wrapper_oob_single_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance/oob/without_rerank/single_gaudi/oob_single_gaudi_without_rerank.yaml diff --git a/ChatQnA/benchmark/performance/oob/without_rerank/two_gaudi/no_wrapper_oob_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml similarity index 100% rename from ChatQnA/benchmark/performance/oob/without_rerank/two_gaudi/no_wrapper_oob_two_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance/oob/without_rerank/two_gaudi/oob_two_gaudi_without_rerank.yaml diff --git a/ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml index 556572eca..52cd258f6 100644 --- 
a/ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance/tuned/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml @@ -345,7 +345,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: opea/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:latest imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml index d17b6d485..551d811fc 100644 --- a/ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance/tuned/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml @@ -345,7 +345,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: opea/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:latest imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml index fc414df89..d3fd9e6a0 100644 --- a/ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance/tuned/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml @@ -345,7 +345,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: opea/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:latest imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml similarity index 99% rename from ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_with_rerank.yaml rename to ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml index b5771ee7a..99970eaca 100644 --- a/ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_with_rerank.yaml +++ b/ChatQnA/benchmark/performance/tuned/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml @@ -345,7 +345,7 @@ spec: envFrom: - configMapRef: name: qna-config - image: opea/tei-gaudi:latest + image: ghcr.io/huggingface/tei-gaudi:latest imagePullPolicy: IfNotPresent name: reranking-dependency-deploy ports: diff --git a/ChatQnA/benchmark/performance/tuned/without_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/tuned/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml similarity index 100% rename from ChatQnA/benchmark/performance/tuned/without_rerank/eight_gaudi/no_wrapper_tuned_eight_gaudi_without_rerank.yaml rename to 
ChatQnA/benchmark/performance/tuned/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml diff --git a/ChatQnA/benchmark/performance/tuned/without_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml similarity index 100% rename from ChatQnA/benchmark/performance/tuned/without_rerank/four_gaudi/no_wrapper_tuned_four_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance/tuned/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml diff --git a/ChatQnA/benchmark/performance/tuned/without_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml similarity index 100% rename from ChatQnA/benchmark/performance/tuned/without_rerank/single_gaudi/no_wrapper_tuned_single_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance/tuned/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml diff --git a/ChatQnA/benchmark/performance/tuned/without_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml similarity index 100% rename from ChatQnA/benchmark/performance/tuned/without_rerank/two_gaudi/no_wrapper_tuned_two_gaudi_without_rerank.yaml rename to ChatQnA/benchmark/performance/tuned/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py index 09062b5d2..5fe6000f6 100644 --- a/ChatQnA/chatqna.py +++ b/ChatQnA/chatqna.py @@ -1,37 +1,197 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import argparse +import json import os +import re from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType +from langchain_core.prompts import PromptTemplate + + +class ChatTemplate: + @staticmethod + def generate_rag_prompt(question, documents): + context_str = "\n".join(documents) + if context_str and len(re.findall("[\u4E00-\u9FFF]", context_str)) / len(context_str) >= 0.3: + # chinese context + template = """ +### 你将扮演一个乐于助人、尊重他人并诚实的助手,你的目标是帮助用户解答问题。有效地利用来自本地知识库的搜索结果。确保你的回答中只包含相关信息。如果你不确定问题的答案,请避免分享不准确的信息。 +### 搜索结果:{context} +### 问题:{question} +### 回答: +""" + else: + template = """ +### You are a helpful, respectful and honest assistant to help the user with questions. \ +Please refer to the search results obtained from the local knowledge base. \ +But be careful to not incorporate the information that you think is not relevant to the question. \ +If you don't know the answer to a question, please don't share false information. 
\n +### Search results: {context} \n +### Question: {question} \n +### Answer: +""" + return template.format(context=context_str, question=question) + MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0") MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888)) -EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0") -EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000)) +GUARDRAIL_SERVICE_HOST_IP = os.getenv("GUARDRAIL_SERVICE_HOST_IP", "0.0.0.0") +GUARDRAIL_SERVICE_PORT = int(os.getenv("GUARDRAIL_SERVICE_PORT", 9090)) +EMBEDDING_SERVER_HOST_IP = os.getenv("EMBEDDING_SERVER_HOST_IP", "0.0.0.0") +EMBEDDING_SERVER_PORT = int(os.getenv("EMBEDDING_SERVER_PORT", 6006)) RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) -RERANK_SERVICE_HOST_IP = os.getenv("RERANK_SERVICE_HOST_IP", "0.0.0.0") -RERANK_SERVICE_PORT = int(os.getenv("RERANK_SERVICE_PORT", 8000)) -LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0") -LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000)) +RERANK_SERVER_HOST_IP = os.getenv("RERANK_SERVER_HOST_IP", "0.0.0.0") +RERANK_SERVER_PORT = int(os.getenv("RERANK_SERVER_PORT", 8808)) +LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0") +LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 9009)) + + +def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs): + if self.services[cur_node].service_type == ServiceType.EMBEDDING: + inputs["inputs"] = inputs["text"] + del inputs["text"] + elif self.services[cur_node].service_type == ServiceType.RETRIEVER: + # prepare the retriever params + retriever_parameters = kwargs.get("retriever_parameters", None) + if retriever_parameters: + inputs.update(retriever_parameters.dict()) + elif self.services[cur_node].service_type == ServiceType.LLM: + # convert TGI/vLLM to unified OpenAI /v1/chat/completions format + next_inputs = {} + next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified + next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}] + next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"] + next_inputs["top_p"] = llm_parameters_dict["top_p"] + next_inputs["stream"] = inputs["streaming"] + next_inputs["frequency_penalty"] = inputs["frequency_penalty"] + next_inputs["presence_penalty"] = inputs["presence_penalty"] + next_inputs["repetition_penalty"] = inputs["repetition_penalty"] + next_inputs["temperature"] = inputs["temperature"] + inputs = next_inputs + return inputs + + +def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_dict, **kwargs): + next_data = {} + if self.services[cur_node].service_type == ServiceType.EMBEDDING: + assert isinstance(data, list) + next_data = {"text": inputs["inputs"], "embedding": data[0]} + elif self.services[cur_node].service_type == ServiceType.RETRIEVER: + + docs = [doc["text"] for doc in data["retrieved_docs"]] + + with_rerank = runtime_graph.downstream(cur_node)[0].startswith("rerank") + if with_rerank and docs: + # forward to rerank + # prepare inputs for rerank + next_data["query"] = data["initial_query"] + next_data["texts"] = [doc["text"] for doc in data["retrieved_docs"]] + else: + # forward to llm + if not docs and with_rerank: + # delete the rerank from retriever -> rerank -> llm + for ds in reversed(runtime_graph.downstream(cur_node)): + for nds in runtime_graph.downstream(ds): + 
runtime_graph.add_edge(cur_node, nds) + runtime_graph.delete_node_if_exists(ds) + + # handle template + # if user provides template, then format the prompt with it + # otherwise, use the default template + prompt = data["initial_query"] + chat_template = llm_parameters_dict["chat_template"] + if chat_template: + prompt_template = PromptTemplate.from_template(chat_template) + input_variables = prompt_template.input_variables + if sorted(input_variables) == ["context", "question"]: + prompt = prompt_template.format(question=data["initial_query"], context="\n".join(docs)) + elif input_variables == ["question"]: + prompt = prompt_template.format(question=data["initial_query"]) + else: + print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") + prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs) + else: + prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs) + + next_data["inputs"] = prompt + + elif self.services[cur_node].service_type == ServiceType.RERANK: + # rerank the inputs with the scores + reranker_parameters = kwargs.get("reranker_parameters", None) + top_n = reranker_parameters.top_n if reranker_parameters else 1 + docs = inputs["texts"] + reranked_docs = [] + for best_response in data[:top_n]: + reranked_docs.append(docs[best_response["index"]]) + + # handle template + # if user provides template, then format the prompt with it + # otherwise, use the default template + prompt = inputs["query"] + chat_template = llm_parameters_dict["chat_template"] + if chat_template: + prompt_template = PromptTemplate.from_template(chat_template) + input_variables = prompt_template.input_variables + if sorted(input_variables) == ["context", "question"]: + prompt = prompt_template.format(question=prompt, context="\n".join(docs)) + elif input_variables == ["question"]: + prompt = prompt_template.format(question=prompt) + else: + print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") + prompt = ChatTemplate.generate_rag_prompt(prompt, docs) + else: + prompt = ChatTemplate.generate_rag_prompt(prompt, docs) + + next_data["inputs"] = prompt + + else: + next_data = data + + return next_data + + +def align_generator(self, gen, **kwargs): + # openai reaponse format + # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n' + for line in gen: + line = line.decode("utf-8") + start = line.find("{") + end = line.rfind("}") + 1 + + json_str = line[start:end] + try: + # sometimes yield empty chunk, do a fallback here + json_data = json.loads(json_str) + if json_data["choices"][0]["finish_reason"] != "eos_token": + yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n" + except Exception as e: + yield f"data: {repr(json_str.encode('utf-8'))}\n\n" + yield "data: [DONE]\n\n" class ChatQnAService: def __init__(self, host="0.0.0.0", port=8000): self.host = host self.port = port + ServiceOrchestrator.align_inputs = align_inputs + ServiceOrchestrator.align_outputs = align_outputs + ServiceOrchestrator.align_generator = align_generator self.megaservice = ServiceOrchestrator() def add_remote_service(self): + embedding = MicroService( name="embedding", - host=EMBEDDING_SERVICE_HOST_IP, - port=EMBEDDING_SERVICE_PORT, - endpoint="/v1/embeddings", + host=EMBEDDING_SERVER_HOST_IP, + 
port=EMBEDDING_SERVER_PORT, + endpoint="/embed", use_remote_service=True, service_type=ServiceType.EMBEDDING, ) + retriever = MicroService( name="retriever", host=RETRIEVER_SERVICE_HOST_IP, @@ -40,18 +200,20 @@ def add_remote_service(self): use_remote_service=True, service_type=ServiceType.RETRIEVER, ) + rerank = MicroService( name="rerank", - host=RERANK_SERVICE_HOST_IP, - port=RERANK_SERVICE_PORT, - endpoint="/v1/reranking", + host=RERANK_SERVER_HOST_IP, + port=RERANK_SERVER_PORT, + endpoint="/rerank", use_remote_service=True, service_type=ServiceType.RERANK, ) + llm = MicroService( name="llm", - host=LLM_SERVICE_HOST_IP, - port=LLM_SERVICE_PORT, + host=LLM_SERVER_HOST_IP, + port=LLM_SERVER_PORT, endpoint="/v1/chat/completions", use_remote_service=True, service_type=ServiceType.LLM, @@ -62,7 +224,109 @@ def add_remote_service(self): self.megaservice.flow_to(rerank, llm) self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) + def add_remote_service_without_rerank(self): + + embedding = MicroService( + name="embedding", + host=EMBEDDING_SERVER_HOST_IP, + port=EMBEDDING_SERVER_PORT, + endpoint="/embed", + use_remote_service=True, + service_type=ServiceType.EMBEDDING, + ) + + retriever = MicroService( + name="retriever", + host=RETRIEVER_SERVICE_HOST_IP, + port=RETRIEVER_SERVICE_PORT, + endpoint="/v1/retrieval", + use_remote_service=True, + service_type=ServiceType.RETRIEVER, + ) + + llm = MicroService( + name="llm", + host=LLM_SERVER_HOST_IP, + port=LLM_SERVER_PORT, + endpoint="/v1/chat/completions", + use_remote_service=True, + service_type=ServiceType.LLM, + ) + self.megaservice.add(embedding).add(retriever).add(llm) + self.megaservice.flow_to(embedding, retriever) + self.megaservice.flow_to(retriever, llm) + self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) + + def add_remote_service_with_guardrails(self): + guardrail_in = MicroService( + name="guardrail_in", + host=GUARDRAIL_SERVICE_HOST_IP, + port=GUARDRAIL_SERVICE_PORT, + endpoint="/v1/guardrails", + use_remote_service=True, + service_type=ServiceType.GUARDRAIL, + ) + embedding = MicroService( + name="embedding", + host=EMBEDDING_SERVER_HOST_IP, + port=EMBEDDING_SERVER_PORT, + endpoint="/embed", + use_remote_service=True, + service_type=ServiceType.EMBEDDING, + ) + retriever = MicroService( + name="retriever", + host=RETRIEVER_SERVICE_HOST_IP, + port=RETRIEVER_SERVICE_PORT, + endpoint="/v1/retrieval", + use_remote_service=True, + service_type=ServiceType.RETRIEVER, + ) + rerank = MicroService( + name="rerank", + host=RERANK_SERVER_HOST_IP, + port=RERANK_SERVER_PORT, + endpoint="/rerank", + use_remote_service=True, + service_type=ServiceType.RERANK, + ) + llm = MicroService( + name="llm", + host=LLM_SERVER_HOST_IP, + port=LLM_SERVER_PORT, + endpoint="/v1/chat/completions", + use_remote_service=True, + service_type=ServiceType.LLM, + ) + # guardrail_out = MicroService( + # name="guardrail_out", + # host=GUARDRAIL_SERVICE_HOST_IP, + # port=GUARDRAIL_SERVICE_PORT, + # endpoint="/v1/guardrails", + # use_remote_service=True, + # service_type=ServiceType.GUARDRAIL, + # ) + # self.megaservice.add(guardrail_in).add(embedding).add(retriever).add(rerank).add(llm).add(guardrail_out) + self.megaservice.add(guardrail_in).add(embedding).add(retriever).add(rerank).add(llm) + self.megaservice.flow_to(guardrail_in, embedding) + self.megaservice.flow_to(embedding, retriever) + self.megaservice.flow_to(retriever, rerank) + self.megaservice.flow_to(rerank, llm) + # 
self.megaservice.flow_to(llm, guardrail_out) + self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) + if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--without-rerank", action="store_true") + parser.add_argument("--with-guardrails", action="store_true") + + args = parser.parse_args() + chatqna = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT) - chatqna.add_remote_service() + if args.without_rerank: + chatqna.add_remote_service_without_rerank() + elif args.with_guardrails: + chatqna.add_remote_service_with_guardrails() + else: + chatqna.add_remote_service() diff --git a/ChatQnA/chatqna.yaml b/ChatQnA/chatqna.yaml index 78a996a7c..e8a2d2735 100644 --- a/ChatQnA/chatqna.yaml +++ b/ChatQnA/chatqna.yaml @@ -30,21 +30,11 @@ opea_micro_services: HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none model-id: ${EMBEDDING_MODEL_ID} - embedding: - host: ${EMBEDDING_SERVICE_HOST_IP} - ports: ${EMBEDDING_SERVICE_PORT} - image: opea/embedding-tei:latest - endpoint: /v1/embeddings retrieval: host: ${RETRIEVER_SERVICE_HOST_IP} ports: ${RETRIEVER_SERVICE_PORT} image: opea/retriever-redis:latest endpoint: /v1/retrieval - reranking: - host: ${RERANK_SERVICE_HOST_IP} - ports: ${RERANK_SERVICE_PORT} - image: opea/reranking-tei:latest - endpoint: /v1/reranking tgi-service: host: ${TGI_SERVICE_IP} ports: ${TGI_SERVICE_PORT} @@ -64,11 +54,6 @@ opea_micro_services: USE_FLASH_ATTENTION: true FLASH_ATTENTION_RECOMPUTE: true model-id: ${LLM_MODEL_ID} - llm: - host: ${LLM_SERVICE_HOST_IP} - ports: ${LLM_SERVICE_PORT} - image: opea/llm-tgi:latest - endpoint: /v1/chat/completions ui: host: ${UI_SERVICE_HOST_IP} ports: diff --git a/ChatQnA/chatqna_guardrails.py b/ChatQnA/chatqna_guardrails.py deleted file mode 100644 index 7f78d80da..000000000 --- a/ChatQnA/chatqna_guardrails.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType - -MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0") -MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888)) -GUARDRAIL_SERVICE_HOST_IP = os.getenv("GUARDRAIL_SERVICE_HOST_IP", "0.0.0.0") -GUARDRAIL_SERVICE_PORT = int(os.getenv("GUARDRAIL_SERVICE_PORT", 9090)) -EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0") -EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000)) -RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") -RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) -RERANK_SERVICE_HOST_IP = os.getenv("RERANK_SERVICE_HOST_IP", "0.0.0.0") -RERANK_SERVICE_PORT = int(os.getenv("RERANK_SERVICE_PORT", 8000)) -LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0") -LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000)) - - -class ChatQnAService: - def __init__(self, host="0.0.0.0", port=8000): - self.host = host - self.port = port - self.megaservice = ServiceOrchestrator() - - def add_remote_service(self): - guardrail_in = MicroService( - name="guardrail_in", - host=GUARDRAIL_SERVICE_HOST_IP, - port=GUARDRAIL_SERVICE_PORT, - endpoint="/v1/guardrails", - use_remote_service=True, - service_type=ServiceType.GUARDRAIL, - ) - embedding = MicroService( - name="embedding", - host=EMBEDDING_SERVICE_HOST_IP, - port=EMBEDDING_SERVICE_PORT, - endpoint="/v1/embeddings", - use_remote_service=True, - 
service_type=ServiceType.EMBEDDING, - ) - retriever = MicroService( - name="retriever", - host=RETRIEVER_SERVICE_HOST_IP, - port=RETRIEVER_SERVICE_PORT, - endpoint="/v1/retrieval", - use_remote_service=True, - service_type=ServiceType.RETRIEVER, - ) - rerank = MicroService( - name="rerank", - host=RERANK_SERVICE_HOST_IP, - port=RERANK_SERVICE_PORT, - endpoint="/v1/reranking", - use_remote_service=True, - service_type=ServiceType.RERANK, - ) - llm = MicroService( - name="llm", - host=LLM_SERVICE_HOST_IP, - port=LLM_SERVICE_PORT, - endpoint="/v1/chat/completions", - use_remote_service=True, - service_type=ServiceType.LLM, - ) - # guardrail_out = MicroService( - # name="guardrail_out", - # host=GUARDRAIL_SERVICE_HOST_IP, - # port=GUARDRAIL_SERVICE_PORT, - # endpoint="/v1/guardrails", - # use_remote_service=True, - # service_type=ServiceType.GUARDRAIL, - # ) - # self.megaservice.add(guardrail_in).add(embedding).add(retriever).add(rerank).add(llm).add(guardrail_out) - self.megaservice.add(guardrail_in).add(embedding).add(retriever).add(rerank).add(llm) - self.megaservice.flow_to(guardrail_in, embedding) - self.megaservice.flow_to(embedding, retriever) - self.megaservice.flow_to(retriever, rerank) - self.megaservice.flow_to(rerank, llm) - # self.megaservice.flow_to(llm, guardrail_out) - self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) - - -if __name__ == "__main__": - chatqna = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT) - chatqna.add_remote_service() diff --git a/ChatQnA/chatqna_no_wrapper.py b/ChatQnA/chatqna_no_wrapper.py deleted file mode 100644 index c08c6a2f3..000000000 --- a/ChatQnA/chatqna_no_wrapper.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import json -import os -import re - -from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType -from langchain_core.prompts import PromptTemplate - - -class ChatTemplate: - @staticmethod - def generate_rag_prompt(question, documents): - context_str = "\n".join(documents) - if context_str and len(re.findall("[\u4E00-\u9FFF]", context_str)) / len(context_str) >= 0.3: - # chinese context - template = """ -### 你将扮演一个乐于助人、尊重他人并诚实的助手,你的目标是帮助用户解答问题。有效地利用来自本地知识库的搜索结果。确保你的回答中只包含相关信息。如果你不确定问题的答案,请避免分享不准确的信息。 -### 搜索结果:{context} -### 问题:{question} -### 回答: -""" - else: - template = """ -### You are a helpful, respectful and honest assistant to help the user with questions. \ -Please refer to the search results obtained from the local knowledge base. \ -But be careful to not incorporate the information that you think is not relevant to the question. \ -If you don't know the answer to a question, please don't share false information. 
\n -### Search results: {context} \n -### Question: {question} \n -### Answer: -""" - return template.format(context=context_str, question=question) - - -MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0") -MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888)) -# EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0") -# EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000)) -# RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") -# RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) -# RERANK_SERVICE_HOST_IP = os.getenv("RERANK_SERVICE_HOST_IP", "0.0.0.0") -# RERANK_SERVICE_PORT = int(os.getenv("RERANK_SERVICE_PORT", 8000)) -# LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0") -# LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000)) -EMBEDDING_SERVER_HOST_IP = os.getenv("EMBEDDING_SERVER_HOST_IP", "0.0.0.0") -EMBEDDING_SERVER_PORT = int(os.getenv("EMBEDDING_SERVER_PORT", 6006)) -RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") -RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) -RERANK_SERVER_HOST_IP = os.getenv("RERANK_SERVER_HOST_IP", "0.0.0.0") -RERANK_SERVER_PORT = int(os.getenv("RERANK_SERVER_PORT", 8808)) -LLM_SERVER_HOST_IP = os.getenv("LLM_SERVER_HOST_IP", "0.0.0.0") -LLM_SERVER_PORT = int(os.getenv("LLM_SERVER_PORT", 9009)) - - -def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs): - if self.services[cur_node].service_type == ServiceType.EMBEDDING: - inputs["inputs"] = inputs["text"] - del inputs["text"] - elif self.services[cur_node].service_type == ServiceType.RETRIEVER: - # prepare the retriever params - retriever_parameters = kwargs.get("retriever_parameters", None) - if retriever_parameters: - inputs.update(retriever_parameters.dict()) - elif self.services[cur_node].service_type == ServiceType.LLM: - # convert TGI/vLLM to unified OpenAI /v1/chat/completions format - next_inputs = {} - next_inputs["model"] = "tgi" # specifically clarify the fake model to make the format unified - next_inputs["messages"] = [{"role": "user", "content": inputs["inputs"]}] - next_inputs["max_tokens"] = llm_parameters_dict["max_tokens"] - next_inputs["top_p"] = llm_parameters_dict["top_p"] - next_inputs["stream"] = inputs["streaming"] - next_inputs["frequency_penalty"] = inputs["frequency_penalty"] - next_inputs["presence_penalty"] = inputs["presence_penalty"] - next_inputs["repetition_penalty"] = inputs["repetition_penalty"] - next_inputs["temperature"] = inputs["temperature"] - inputs = next_inputs - - return inputs - - -def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_dict, **kwargs): - next_data = {} - if self.services[cur_node].service_type == ServiceType.EMBEDDING: - assert isinstance(data, list) - next_data = {"text": inputs["inputs"], "embedding": data[0]} - elif self.services[cur_node].service_type == ServiceType.RETRIEVER: - - docs = [doc["text"] for doc in data["retrieved_docs"]] - - with_rerank = runtime_graph.downstream(cur_node)[0].startswith("rerank") - if with_rerank and docs: - # forward to rerank - # prepare inputs for rerank - next_data["query"] = data["initial_query"] - next_data["texts"] = [doc["text"] for doc in data["retrieved_docs"]] - else: - # forward to llm - if not docs and with_rerank: - # delete the rerank from retriever -> rerank -> llm - for ds in reversed(runtime_graph.downstream(cur_node)): - for nds in 
runtime_graph.downstream(ds): - runtime_graph.add_edge(cur_node, nds) - runtime_graph.delete_node_if_exists(ds) - - # handle template - # if user provides template, then format the prompt with it - # otherwise, use the default template - prompt = data["initial_query"] - chat_template = llm_parameters_dict["chat_template"] - if chat_template: - prompt_template = PromptTemplate.from_template(chat_template) - input_variables = prompt_template.input_variables - if sorted(input_variables) == ["context", "question"]: - prompt = prompt_template.format(question=data["initial_query"], context="\n".join(docs)) - elif input_variables == ["question"]: - prompt = prompt_template.format(question=data["initial_query"]) - else: - print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") - prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs) - else: - prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs) - - next_data["inputs"] = prompt - - elif self.services[cur_node].service_type == ServiceType.RERANK: - # rerank the inputs with the scores - reranker_parameters = kwargs.get("reranker_parameters", None) - top_n = reranker_parameters.top_n if reranker_parameters else 1 - docs = inputs["texts"] - reranked_docs = [] - for best_response in data[:top_n]: - reranked_docs.append(docs[best_response["index"]]) - - # handle template - # if user provides template, then format the prompt with it - # otherwise, use the default template - prompt = inputs["query"] - chat_template = llm_parameters_dict["chat_template"] - if chat_template: - prompt_template = PromptTemplate.from_template(chat_template) - input_variables = prompt_template.input_variables - if sorted(input_variables) == ["context", "question"]: - prompt = prompt_template.format(question=prompt, context="\n".join(docs)) - elif input_variables == ["question"]: - prompt = prompt_template.format(question=prompt) - else: - print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']") - prompt = ChatTemplate.generate_rag_prompt(prompt, docs) - else: - prompt = ChatTemplate.generate_rag_prompt(prompt, docs) - - next_data["inputs"] = prompt - - return next_data - - -def align_generator(self, gen, **kwargs): - # openai reaponse format - # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n' - for line in gen: - line = line.decode("utf-8") - start = line.find("{") - end = line.rfind("}") + 1 - - json_str = line[start:end] - try: - # sometimes yield empty chunk, do a fallback here - json_data = json.loads(json_str) - if json_data["choices"][0]["finish_reason"] != "eos_token": - yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n" - except Exception as e: - yield f"data: {repr(json_str.encode('utf-8'))}\n\n" - yield "data: [DONE]\n\n" - - -class ChatQnAService: - def __init__(self, host="0.0.0.0", port=8000): - self.host = host - self.port = port - ServiceOrchestrator.align_inputs = align_inputs - ServiceOrchestrator.align_outputs = align_outputs - ServiceOrchestrator.align_generator = align_generator - self.megaservice = ServiceOrchestrator() - - def add_remote_service(self): - - embedding = MicroService( - name="embedding", - host=EMBEDDING_SERVER_HOST_IP, - port=EMBEDDING_SERVER_PORT, - endpoint="/embed", - 
use_remote_service=True, - service_type=ServiceType.EMBEDDING, - ) - - retriever = MicroService( - name="retriever", - host=RETRIEVER_SERVICE_HOST_IP, - port=RETRIEVER_SERVICE_PORT, - endpoint="/v1/retrieval", - use_remote_service=True, - service_type=ServiceType.RETRIEVER, - ) - - rerank = MicroService( - name="rerank", - host=RERANK_SERVER_HOST_IP, - port=RERANK_SERVER_PORT, - endpoint="/rerank", - use_remote_service=True, - service_type=ServiceType.RERANK, - ) - - llm = MicroService( - name="llm", - host=LLM_SERVER_HOST_IP, - port=LLM_SERVER_PORT, - endpoint="/v1/chat/completions", - use_remote_service=True, - service_type=ServiceType.LLM, - ) - self.megaservice.add(embedding).add(retriever).add(rerank).add(llm) - self.megaservice.flow_to(embedding, retriever) - self.megaservice.flow_to(retriever, rerank) - self.megaservice.flow_to(rerank, llm) - self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) - - def add_remote_service_without_rerank(self): - - embedding = MicroService( - name="embedding", - host=EMBEDDING_SERVER_HOST_IP, - port=EMBEDDING_SERVER_PORT, - endpoint="/embed", - use_remote_service=True, - service_type=ServiceType.EMBEDDING, - ) - - retriever = MicroService( - name="retriever", - host=RETRIEVER_SERVICE_HOST_IP, - port=RETRIEVER_SERVICE_PORT, - endpoint="/v1/retrieval", - use_remote_service=True, - service_type=ServiceType.RETRIEVER, - ) - - llm = MicroService( - name="llm", - host=LLM_SERVER_HOST_IP, - port=LLM_SERVER_PORT, - endpoint="/v1/chat/completions", - use_remote_service=True, - service_type=ServiceType.LLM, - ) - self.megaservice.add(embedding).add(retriever).add(llm) - self.megaservice.flow_to(embedding, retriever) - self.megaservice.flow_to(retriever, llm) - self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--without-rerank", action="store_true") - - args = parser.parse_args() - - chatqna = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT) - if args.without_rerank: - chatqna.add_remote_service_without_rerank() - else: - chatqna.add_remote_service() diff --git a/ChatQnA/chatqna_without_rerank.py b/ChatQnA/chatqna_without_rerank.py deleted file mode 100644 index eb6cca4b4..000000000 --- a/ChatQnA/chatqna_without_rerank.py +++ /dev/null @@ -1,57 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os - -from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType - -MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0") -MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888)) -EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0") -EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000)) -RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0") -RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000)) -LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0") -LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000)) - - -class ChatQnAService: - def __init__(self, host="0.0.0.0", port=8000): - self.host = host - self.port = port - self.megaservice = ServiceOrchestrator() - - def add_remote_service(self): - embedding = MicroService( - name="embedding", - host=EMBEDDING_SERVICE_HOST_IP, - port=EMBEDDING_SERVICE_PORT, - endpoint="/v1/embeddings", - use_remote_service=True, - service_type=ServiceType.EMBEDDING, - ) - 
retriever = MicroService( - name="retriever", - host=RETRIEVER_SERVICE_HOST_IP, - port=RETRIEVER_SERVICE_PORT, - endpoint="/v1/retrieval", - use_remote_service=True, - service_type=ServiceType.RETRIEVER, - ) - llm = MicroService( - name="llm", - host=LLM_SERVICE_HOST_IP, - port=LLM_SERVICE_PORT, - endpoint="/v1/chat/completions", - use_remote_service=True, - service_type=ServiceType.LLM, - ) - self.megaservice.add(embedding).add(retriever).add(llm) - self.megaservice.flow_to(embedding, retriever) - self.megaservice.flow_to(retriever, llm) - self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port) - - -if __name__ == "__main__": - chatqna = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT) - chatqna.add_remote_service() diff --git a/ChatQnA/docker_compose/intel/cpu/aipc/README.md b/ChatQnA/docker_compose/intel/cpu/aipc/README.md index 283dc9888..d075b9843 100644 --- a/ChatQnA/docker_compose/intel/cpu/aipc/README.md +++ b/ChatQnA/docker_compose/intel/cpu/aipc/README.md @@ -16,31 +16,19 @@ If you are in a proxy environment, set the proxy-related environment variables: export http_proxy="Your_HTTP_Proxy" export https_proxy="Your_HTTPs_Proxy" -### 1. Build Embedding Image - -```bash -docker build --no-cache -t opea/embedding-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/tei/langchain/Dockerfile . -``` - -### 2. Build Retriever Image +### 1. Build Retriever Image ```bash docker build --no-cache -t opea/retriever-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/redis/langchain/Dockerfile . ``` -### 3. Build Rerank Image - -```bash -docker build --no-cache -t opea/reranking-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/tei/Dockerfile . -``` - -### 4. Set up Ollama Service and Build LLM Image +### 2. Set up Ollama Service and Build LLM Image We use [Ollama](https://ollama.com/) as our LLM service for AIPC. Please set up Ollama on your PC follow the instructions. This will set the entrypoint needed for the Ollama to suit the ChatQnA examples. -#### 4.1 Set Up Ollama LLM Service +#### 2.1 Set Up Ollama LLM Service Install Ollama service with one command @@ -79,20 +67,20 @@ NAME ID SIZE MODIFIED llama3:latest 365c0bd3c000 4.7 GB 5 days ago ``` -#### 4.2 Build LLM Image +#### 2.2 Build LLM Image ```bash docker build --no-cache -t opea/llm-ollama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/ollama/langchain/Dockerfile . ``` -### 5. Build Dataprep Image +### 3. Build Dataprep Image ```bash docker build --no-cache -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain/Dockerfile . cd .. ``` -### 6. Build MegaService Docker Image +### 4. Build MegaService Docker Image To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build MegaService Docker image via below command: @@ -103,7 +91,7 @@ docker build --no-cache -t opea/chatqna:latest -f Dockerfile . cd ../../.. ``` -### 7. Build UI Docker Image +### 5. Build UI Docker Image Build frontend Docker image via below command: @@ -113,15 +101,13 @@ docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https cd ../../../.. 
``` -Then run the command `docker images`, you will have the following 7 Docker Images: +Then run the command `docker images`, you will have the following 5 Docker Images: 1. `opea/dataprep-redis:latest` -2. `opea/embedding-tei:latest` -3. `opea/retriever-redis:latest` -4. `opea/reranking-tei:latest` -5. `opea/llm-ollama:latest` -6. `opea/chatqna:latest` -7. `opea/chatqna-ui:latest` +2. `opea/retriever-redis:latest` +3. `opea/llm-ollama:latest` +4. `opea/chatqna:latest` +5. `opea/chatqna-ui:latest` ## 🚀 Start Microservices @@ -162,15 +148,14 @@ export https_proxy=${your_http_proxy} export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006" -export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808" export REDIS_URL="redis://${host_ip}:6379" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} export MEGA_SERVICE_HOST_IP=${host_ip} -export EMBEDDING_SERVICE_HOST_IP=${host_ip} +export EMBEDDING_SERVER_HOST_IP=${host_ip} export RETRIEVER_SERVICE_HOST_IP=${host_ip} -export RERANK_SERVICE_HOST_IP=${host_ip} -export LLM_SERVICE_HOST_IP=${host_ip} +export RERANK_SERVER_HOST_IP=${host_ip} +export LLM_SERVER_HOST_IP=${host_ip} export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep" @@ -184,15 +169,14 @@ export OLLAMA_MODEL="llama3" set EMBEDDING_MODEL_ID=BAAI/bge-base-en-v1.5 set RERANK_MODEL_ID=BAAI/bge-reranker-base set TEI_EMBEDDING_ENDPOINT=http://%host_ip%:6006 -set TEI_RERANKING_ENDPOINT=http://%host_ip%:8808 set REDIS_URL=redis://%host_ip%:6379 set INDEX_NAME=rag-redis set HUGGINGFACEHUB_API_TOKEN=%your_hf_api_token% set MEGA_SERVICE_HOST_IP=%host_ip% -set EMBEDDING_SERVICE_HOST_IP=%host_ip% +set EMBEDDING_SERVER_HOST_IP=%host_ip% set RETRIEVER_SERVICE_HOST_IP=%host_ip% -set RERANK_SERVICE_HOST_IP=%host_ip% -set LLM_SERVICE_HOST_IP=%host_ip% +set RERANK_SERVER_HOST_IP=%host_ip% +set LLM_SERVER_HOST_IP=%host_ip% set BACKEND_SERVICE_ENDPOINT=http://%host_ip%:8888/v1/chatqna set DATAPREP_SERVICE_ENDPOINT=http://%host_ip%:6007/v1/dataprep @@ -231,16 +215,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -2. Embedding Microservice - - ```bash - curl http://${host_ip}:6000/v1/embeddings\ - -X POST \ - -d '{"text":"hello"}' \ - -H 'Content-Type: application/json' - ``` - -3. Retriever Microservice +2. Retriever Microservice To validate the retriever microservice, you need to generate a mock embedding vector of length 768 in Python script: ```bash @@ -251,7 +226,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -4. TEI Reranking Service +3. TEI Reranking Service ```bash curl http://${host_ip}:8808/rerank \ @@ -260,22 +235,13 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -5. Reranking Microservice - - ```bash - curl http://${host_ip}:8000/v1/reranking\ - -X POST \ - -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \ - -H 'Content-Type: application/json' - ``` - -6. Ollama Service +4. Ollama Service ```bash curl http://${host_ip}:11434/api/generate -d '{"model": "llama3", "prompt":"What is Deep Learning?"}' ``` -7. LLM Microservice +5. 
LLM Microservice ```bash curl http://${host_ip}:9000/v1/chat/completions\ @@ -284,7 +250,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -8. MegaService +6. MegaService ```bash curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{ @@ -292,7 +258,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v }' ``` -9. Dataprep Microservice(Optional) +7. Dataprep Microservice(Optional) If you want to update the default knowledge base, you can use the following commands: diff --git a/ChatQnA/docker_compose/intel/cpu/aipc/compose.yaml b/ChatQnA/docker_compose/intel/cpu/aipc/compose.yaml index fedbe12a8..3d70480e6 100644 --- a/ChatQnA/docker_compose/intel/cpu/aipc/compose.yaml +++ b/ChatQnA/docker_compose/intel/cpu/aipc/compose.yaml @@ -36,20 +36,6 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6000:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -82,23 +68,6 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - reranking: - image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - container_name: reranking-tei-aipc-server - depends_on: - - tei-reranking-service - ports: - - "8000:8000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped llm: image: ${REGISTRY:-opea}/llm-ollama container_name: llm-ollama @@ -121,10 +90,8 @@ services: depends_on: - redis-vector-db - tei-embedding-service - - embedding - retriever - tei-reranking-service - - reranking - llm ports: - "8888:8888" @@ -133,10 +100,14 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} + - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVICE_PORT:-6006} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - RERANK_SERVER_HOST_IP=${RERANK_SERVICE_HOST_IP} + - RERANK_SERVER_PORT=${RERANK_SERVICE_PORT:-8808} + - LLM_SERVER_HOST_IP=${LLM_SERVICE_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVICE_PORT:-9000} + - LOGFLAG=${LOGFLAG} ipc: host restart: always chaqna-aipc-ui-server: diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README.md b/ChatQnA/docker_compose/intel/cpu/xeon/README.md index ff636ea2c..b14bf7381 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README.md @@ -97,61 +97,20 @@ After launching your instance, you can connect to it using SSH (for Linux instan First of all, you need to build Docker Images locally and install the python package of it. -### 1. 
Build Embedding Image - -```bash -git clone https://github.com/opea-project/GenAIComps.git -cd GenAIComps -docker build --no-cache -t opea/embedding-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/tei/langchain/Dockerfile . -``` - -### 2. Build Retriever Image +### 1. Build Retriever Image ```bash docker build --no-cache -t opea/retriever-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/redis/langchain/Dockerfile . ``` -### 3. Build Rerank Image - -> Skip for ChatQnA without Rerank pipeline - -```bash -docker build --no-cache -t opea/reranking-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/tei/Dockerfile . -``` - -### 4. Build LLM Image - -#### Use TGI as backend - -```bash -docker build --no-cache -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/tgi/Dockerfile . -``` - -#### Use vLLM as backend - -Build vLLM docker. - -```bash -git clone https://github.com/vllm-project/vllm.git -cd ./vllm/ -docker build --no-cache -t opea/vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.cpu . -cd .. -``` - -Build microservice. - -```bash -docker build --no-cache -t opea/llm-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/langchain/Dockerfile . -``` - -### 5. Build Dataprep Image +### 2. Build Dataprep Image ```bash docker build --no-cache -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain/Dockerfile . cd .. ``` -### 6. Build MegaService Docker Image +### 3. Build MegaService Docker Image 1. MegaService with Rerank @@ -173,7 +132,7 @@ cd .. docker build --no-cache -t opea/chatqna-without-rerank:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.without_rerank . ``` -### 7. Build UI Docker Image +### 4. Build UI Docker Image Build frontend Docker image via below command: @@ -182,7 +141,7 @@ cd GenAIExamples/ChatQnA/ui docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile . ``` -### 8. Build Conversational React UI Docker Image (Optional) +### 5. Build Conversational React UI Docker Image (Optional) Build frontend Docker image that enables Conversational experience with ChatQnA megaservice via below command: @@ -193,23 +152,20 @@ cd GenAIExamples/ChatQnA/ui docker build --no-cache -t opea/chatqna-conversation-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . ``` -### 9. Build Nginx Docker Image +### 6. Build Nginx Docker Image ```bash cd GenAIComps docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/nginx/Dockerfile . ``` -Then run the command `docker images`, you will have the following 8 Docker Images: +Then run the command `docker images`, you will have the following 5 Docker Images: 1. `opea/dataprep-redis:latest` -2. `opea/embedding-tei:latest` -3. `opea/retriever-redis:latest` -4. `opea/reranking-tei:latest` -5. `opea/llm-tgi:latest` or `opea/llm-vllm:latest` -6. `opea/chatqna:latest` or `opea/chatqna-without-rerank:latest` -7. `opea/chatqna-ui:latest` -8. `opea/nginx:latest` +2. 
`opea/retriever-redis:latest` +3. `opea/chatqna:latest` or `opea/chatqna-without-rerank:latest` +4. `opea/chatqna-ui:latest` +5. `opea/nginx:latest` ## 🚀 Start Microservices @@ -315,16 +271,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -2. Embedding Microservice - - ```bash - curl http://${host_ip}:6000/v1/embeddings\ - -X POST \ - -d '{"text":"hello"}' \ - -H 'Content-Type: application/json' - ``` - -3. Retriever Microservice +2. Retriever Microservice To consume the retriever microservice, you need to generate a mock embedding vector by Python script. The length of embedding vector is determined by the embedding model. @@ -340,7 +287,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -4. TEI Reranking Service +3. TEI Reranking Service > Skip for ChatQnA without Rerank pipeline @@ -351,18 +298,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -5. Reranking Microservice - - > Skip for ChatQnA without Rerank pipeline - - ```bash - curl http://${host_ip}:8000/v1/reranking\ - -X POST \ - -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \ - -H 'Content-Type: application/json' - ``` - -6. LLM backend Service +4. LLM backend Service In first startup, this service will take more time to download the model files. After it's finished, the service will be ready. @@ -395,31 +331,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}' ``` -7. LLM Microservice - - This service depends on above LLM backend service startup. It will be ready after long time, to wait for them being ready in first startup. - - ```bash - # TGI service - curl http://${host_ip}:9000/v1/chat/completions\ - -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ - -H 'Content-Type: application/json' - ``` - - For parameters in TGI modes, please refer to [HuggingFace InferenceClient API](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation) (except we rename "max_new_tokens" to "max_tokens".) - - ```bash - # vLLM Service - curl http://${host_ip}:9000/v1/chat/completions \ - -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ - -H 'Content-Type: application/json' - ``` - - For parameters in vLLM modes, can refer to [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html) - -8. MegaService +5. MegaService ```bash curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{ @@ -427,7 +339,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v }' ``` -9. Nginx Service +6. Nginx Service ```bash curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \ @@ -435,7 +347,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -d '{"messages": "What is the revenue of Nike in 2023?"}' ``` -10. Dataprep Microservice(Optional) +7. 
Dataprep Microservice(Optional) If you want to update the default knowledge base, you can use the following commands: diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md index bcdf7b3b8..a9f0f4e47 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md +++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_qdrant.md @@ -70,38 +70,20 @@ git clone https://github.com/opea-project/GenAIComps.git cd GenAIComps ``` -### 1. Build Embedding Image - -```bash -docker build --no-cache -t opea/embedding-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/tei/langchain/Dockerfile . -``` - -### 2. Build Retriever Image +### 1. Build Retriever Image ```bash docker build --no-cache -t opea/retriever-qdrant:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/qdrant/haystack/Dockerfile . ``` -### 3. Build Rerank Image - -```bash -docker build --no-cache -t opea/reranking-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/tei/Dockerfile .` -``` - -### 4. Build LLM Image - -```bash -docker build --no-cache -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/tgi/Dockerfile . -``` - -### 5. Build Dataprep Image +### 2. Build Dataprep Image ```bash docker build --no-cache -t opea/dataprep-qdrant:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/qdrant/langchain/Dockerfile . cd .. ``` -### 6. Build MegaService Docker Image +### 3. Build MegaService Docker Image To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build MegaService Docker image via below command: @@ -112,7 +94,7 @@ docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_pr cd ../../.. ``` -### 7. Build UI Docker Image +### 4. Build UI Docker Image Build frontend Docker image via below command: @@ -122,7 +104,7 @@ docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https cd ../../../.. ``` -### 8. Build Conversational React UI Docker Image (Optional) +### 5. Build Conversational React UI Docker Image (Optional) Build frontend Docker image that enables Conversational experience with ChatQnA megaservice via below command: @@ -136,15 +118,12 @@ docker build --no-cache -t opea/chatqna-conversation-ui:latest --build-arg https cd ../../../.. ``` -Then run the command `docker images`, you will have the following 7 Docker Images: +Then run the command `docker images`, you will have the following 4 Docker Images: 1. `opea/dataprep-qdrant:latest` -2. `opea/embedding-tei:latest` -3. `opea/retriever-qdrant:latest` -4. `opea/reranking-tei:latest` -5. `opea/llm-tgi:latest` -6. `opea/chatqna:latest` -7. `opea/chatqna-ui:latest` +2. `opea/retriever-qdrant:latest` +3. `opea/chatqna:latest` +4. 
`opea/chatqna-ui:latest` ## 🚀 Start Microservices @@ -194,17 +173,15 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6040" -export TEI_RERANKING_ENDPOINT="http://${host_ip}:6041" -export TGI_LLM_ENDPOINT="http://${host_ip}:6042" export QDRANT_HOST=${host_ip} export QDRANT_PORT=6333 export INDEX_NAME="rag-qdrant" export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token} +export EMBEDDING_SERVER_HOST_IP=${host_ip} export MEGA_SERVICE_HOST_IP=${host_ip} -export EMBEDDING_SERVICE_HOST_IP=${host_ip} export RETRIEVER_SERVICE_HOST_IP=${host_ip} -export RERANK_SERVICE_HOST_IP=${host_ip} -export LLM_SERVICE_HOST_IP=${host_ip} +export RERANK_SERVER_HOST_IP=${host_ip} +export LLM_SERVER_HOST_IP=${host_ip} export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8912/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6043/v1/dataprep" ``` @@ -234,16 +211,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -2. Embedding Microservice - - ```bash - curl http://${host_ip}:6044/v1/embeddings\ - -X POST \ - -d '{"text":"hello"}' \ - -H 'Content-Type: application/json' - ``` - -3. Retriever Microservice +2. Retriever Microservice To consume the retriever microservice, you need to generate a mock embedding vector by Python script. The length of embedding vector is determined by the embedding model. @@ -259,7 +227,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -4. TEI Reranking Service +3. TEI Reranking Service ```bash curl http://${host_ip}:6041/rerank \ @@ -268,16 +236,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -5. Reranking Microservice - - ```bash - curl http://${host_ip}:6046/v1/reranking\ - -X POST \ - -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \ - -H 'Content-Type: application/json' - ``` - -6. TGI Service +4. TGI Service In first startup, this service will take more time to download the model files. After it's finished, the service will be ready. @@ -302,16 +261,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v -H 'Content-Type: application/json' ``` -7. LLM Microservice - - ```bash - curl http://${host_ip}:6047/v1/chat/completions\ - -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ - -H 'Content-Type: application/json' - ``` - -8. MegaService +5. MegaService ```bash curl http://${host_ip}:8912/v1/chatqna -H "Content-Type: application/json" -d '{ @@ -319,7 +269,7 @@ For details on how to verify the correctness of the response, refer to [how-to-v }' ``` -9. Dataprep Microservice(Optional) +6. 
Dataprep Microservice(Optional) If you want to update the default knowledge base, you can use the following commands: diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml index 6f253093d..128642c99 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose.yaml @@ -38,20 +38,6 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6000:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -85,23 +71,6 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - reranking: - image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - container_name: reranking-tei-xeon-server - depends_on: - - tei-reranking-service - ports: - - "8000:8000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped tgi-service: image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu container_name: tgi-service @@ -118,36 +87,16 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 - llm: - image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - container_name: llm-tgi-server - depends_on: - - tgi-service - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped chaqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-xeon-backend-server depends_on: - redis-vector-db - tei-embedding-service - - embedding - dataprep-redis-service - retriever - tei-reranking-service - - reranking - tgi-service - - llm ports: - "8888:8888" environment: @@ -155,10 +104,14 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} + - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-6006} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} + - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-8808} + - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-9009} + - LOGFLAG=${LOGFLAG} ipc: host restart: always chaqna-xeon-ui-server: @@ -178,25 +131,6 @@ services: - DELETE_FILE=${DATAPREP_DELETE_FILE_ENDPOINT} ipc: host restart: always - chaqna-xeon-nginx-server: - image: ${REGISTRY:-opea}/nginx:${TAG:-latest} - 
container_name: chaqna-xeon-nginx-server - depends_on: - - chaqna-xeon-backend-server - - chaqna-xeon-ui-server - ports: - - "${NGINX_PORT:-80}:80" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP} - - FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT} - - BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME} - - BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP} - - BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT} - ipc: host - restart: always networks: default: diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_no_wrapper.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_no_wrapper.yaml deleted file mode 100644 index 317a206fb..000000000 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_no_wrapper.yaml +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -services: - redis-vector-db: - image: redis/redis-stack:7.2.0-v9 - container_name: redis-vector-db - ports: - - "6379:6379" - - "8001:8001" - dataprep-redis-service: - image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest} - container_name: dataprep-redis-server - depends_on: - - redis-vector-db - - tei-embedding-service - ports: - - "6007:6007" - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - REDIS_URL: ${REDIS_URL} - REDIS_HOST: ${REDIS_HOST} - INDEX_NAME: ${INDEX_NAME} - TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - tei-embedding-service: - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - container_name: tei-embedding-server - ports: - - "6006:80" - volumes: - - "./data:/data" - shm_size: 1g - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - # embedding: - # image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - # container_name: embedding-tei-server - # depends_on: - # - tei-embedding-service - # ports: - # - "6000:6000" - # ipc: host - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - # restart: unless-stopped - retriever: - image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} - container_name: retriever-redis-server - depends_on: - - redis-vector-db - ports: - - "7000:7000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - REDIS_URL: ${REDIS_URL} - INDEX_NAME: ${INDEX_NAME} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - restart: unless-stopped - tei-reranking-service: - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - container_name: tei-reranking-server - ports: - - "8808:80" - volumes: - - "./data:/data" - shm_size: 1g - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - command: --model-id ${RERANK_MODEL_ID} --auto-truncate - # reranking: - # image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - # container_name: reranking-tei-xeon-server - # depends_on: - # - tei-reranking-service - # ports: - # - "8000:8000" - # ipc: host - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # TEI_RERANKING_ENDPOINT: 
${TEI_RERANKING_ENDPOINT} - # HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - # HF_HUB_DISABLE_PROGRESS_BARS: 1 - # HF_HUB_ENABLE_HF_TRANSFER: 0 - # restart: unless-stopped - tgi-service: - image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu - container_name: tgi-service - ports: - - "9009:80" - volumes: - - "./data:/data" - shm_size: 1g - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 - # llm: - # image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - # container_name: llm-tgi-server - # depends_on: - # - tgi-service - # ports: - # - "9000:9000" - # ipc: host - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - # HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - # HF_HUB_DISABLE_PROGRESS_BARS: 1 - # HF_HUB_ENABLE_HF_TRANSFER: 0 - # restart: unless-stopped - chaqna-xeon-backend-server: - image: ${REGISTRY:-opea}/chatqna-no-wrapper:${TAG:-latest} - container_name: chatqna-xeon-backend-server - depends_on: - - redis-vector-db - - tei-embedding-service - # - embedding - - dataprep-redis-service - - retriever - - tei-reranking-service - # - reranking - - tgi-service - # - llm - ports: - - "8888:8888" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} - - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} - - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} - ipc: host - restart: always - chaqna-xeon-ui-server: - image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest} - container_name: chatqna-xeon-ui-server - depends_on: - - chaqna-xeon-backend-server - ports: - - "5173:5173" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - CHAT_BASE_URL=${BACKEND_SERVICE_ENDPOINT} - - UPLOAD_FILE_BASE_URL=${DATAPREP_SERVICE_ENDPOINT} - - GET_FILE=${DATAPREP_GET_FILE_ENDPOINT} - - DELETE_FILE=${DATAPREP_DELETE_FILE_ENDPOINT} - ipc: host - restart: always - -networks: - default: - driver: bridge diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml index 6d30a6c23..06bf7e534 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_qdrant.yaml @@ -38,20 +38,6 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6044:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-qdrant:${TAG:-latest} container_name: retriever-qdrant-server @@ -84,23 +70,6 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - reranking: - image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - container_name: reranking-tei-xeon-server - 
depends_on: - - tei-reranking-service - ports: - - "6046:8000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped tgi-service: image: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu container_name: tgi-service @@ -117,35 +86,15 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 - llm: - image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - container_name: llm-tgi-server - depends_on: - - tgi-service - ports: - - "6047:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped chaqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-xeon-backend-server depends_on: - qdrant-vector-db - tei-embedding-service - - embedding - retriever - tei-reranking-service - - reranking - tgi-service - - llm ports: - "8912:8888" environment: @@ -153,14 +102,15 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_PORT=${EMBEDDING_SERVICE_PORT} + - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-6040} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RETRIEVER_SERVICE_PORT=${RETRIEVER_SERVICE_PORT} - - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP} - - RERANK_SERVICE_PORT=${RERANK_SERVICE_PORT} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} - - LLM_SERVICE_PORT=${LLM_SERVICE_PORT} + - RETRIEVER_SERVICE_PORT=${RETRIEVER_SERVICE_PORT:-6045} + - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} + - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-6041} + - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-6042} + - LOGFLAG=${LOGFLAG} ipc: host restart: always chaqna-xeon-ui-server: diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml index 9852ca77e..bb5921f74 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_vllm.yaml @@ -37,23 +37,6 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6000:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} - LANGCHAIN_TRACING_V2: ${LANGCHAIN_TRACING_V2} - LANGCHAIN_PROJECT: "opea-embedding-service" - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -90,26 +73,6 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - reranking: - image: 
${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - container_name: reranking-tei-xeon-server - depends_on: - - tei-reranking-service - ports: - - "8000:8000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} - LANGCHAIN_TRACING_V2: ${LANGCHAIN_TRACING_V2} - LANGCHAIN_PROJECT: "opea-reranking-service" - restart: unless-stopped vllm_service: image: ${REGISTRY:-opea}/vllm:${TAG:-latest} container_name: vllm-service @@ -125,39 +88,15 @@ services: HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} LLM_MODEL_ID: ${LLM_MODEL_ID} command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 - llm: - image: ${REGISTRY:-opea}/llm-vllm:${TAG:-latest} - container_name: llm-vllm-server - depends_on: - - vllm_service - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - vLLM_ENDPOINT: ${vLLM_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - LLM_MODEL: ${LLM_MODEL_ID} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - LANGCHAIN_API_KEY: ${LANGCHAIN_API_KEY} - LANGCHAIN_TRACING_V2: ${LANGCHAIN_TRACING_V2} - LANGCHAIN_PROJECT: "opea-llm-service" - restart: unless-stopped chaqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-xeon-backend-server depends_on: - redis-vector-db - tei-embedding-service - - embedding - retriever - tei-reranking-service - - reranking - vllm_service - - llm ports: - "8888:8888" environment: @@ -165,10 +104,14 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} + - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-6006} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} + - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-8808} + - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-9009} + - LOGFLAG=${LOGFLAG} ipc: host restart: always chaqna-xeon-ui-server: diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml index aac8f03c7..b109af394 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml +++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_without_rerank.yaml @@ -38,20 +38,6 @@ services: http_proxy: ${http_proxy} https_proxy: ${https_proxy} command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6000:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -85,34 +71,15 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0 - llm: - 
image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - container_name: llm-tgi-server - depends_on: - - tgi-service - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped chaqna-xeon-backend-server: image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest} container_name: chatqna-xeon-backend-server depends_on: - redis-vector-db - tei-embedding-service - - embedding - dataprep-redis-service - retriever - tgi-service - - llm ports: - "8888:8888" environment: @@ -120,9 +87,12 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} + - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-6006} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-9009} + - LOGFLAG=${LOGFLAG} ipc: host restart: always chaqna-xeon-ui-server: diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh b/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh index 381191133..25c2971d5 100644 --- a/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh +++ b/ChatQnA/docker_compose/intel/cpu/xeon/set_env.sh @@ -8,17 +8,14 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006" -export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808" -export TGI_LLM_ENDPOINT="http://${host_ip}:9009" -export vLLM_LLM_ENDPOINT="http://${host_ip}:9009" export REDIS_URL="redis://${host_ip}:6379" export INDEX_NAME="rag-redis" export REDIS_HOST=${host_ip} export MEGA_SERVICE_HOST_IP=${host_ip} -export EMBEDDING_SERVICE_HOST_IP=${host_ip} +export EMBEDDING_SERVER_HOST_IP=${host_ip} export RETRIEVER_SERVICE_HOST_IP=${host_ip} -export RERANK_SERVICE_HOST_IP=${host_ip} -export LLM_SERVICE_HOST_IP=${host_ip} +export RERANK_SERVER_HOST_IP=${host_ip} +export LLM_SERVER_HOST_IP=${host_ip} export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep" export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_file" diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md index b3ad9318e..3b63a48c2 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md @@ -70,73 +70,19 @@ curl http://${host_ip}:8888/v1/chatqna \ First of all, you need to build Docker Images locally. This step can be ignored after the Docker images published to Docker hub. -### 1. Build Embedding Image - -```bash -git clone https://github.com/opea-project/GenAIComps.git -cd GenAIComps -docker build --no-cache -t opea/embedding-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/tei/langchain/Dockerfile . -``` - -### 2. Build Retriever Image +### 1. 
Build Retriever Image ```bash docker build --no-cache -t opea/retriever-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/redis/langchain/Dockerfile . ``` -### 3. Build Rerank Image - -> Skip for ChatQnA without Rerank pipeline - -```bash -docker build --no-cache -t opea/reranking-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/tei/Dockerfile . -``` - -### 4. Build LLM Image - -You can use different LLM serving solutions, choose one of following four options. - -#### 4.1 Use TGI - -```bash -docker build --no-cache -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/tgi/Dockerfile . -``` - -#### 4.2 Use VLLM - -Build vllm docker. - -```bash -docker build --no-cache -t opea/llm-vllm-hpu:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu . -``` - -Build microservice docker. - -```bash -docker build --no-cache -t opea/llm-vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/langchain/Dockerfile . -``` - -#### 4.3 Use VLLM-on-Ray - -Build vllm-on-ray docker. - -```bash -docker build --no-cache -t opea/llm-vllm-ray-hpu:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/ray/dependency/Dockerfile . -``` - -Build microservice docker. - -```bash -docker build --no-cache -t opea/llm-vllm-ray:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/vllm/ray/Dockerfile . -``` - -### 5. Build Dataprep Image +### 2. Build Dataprep Image ```bash docker build --no-cache -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain/Dockerfile . ``` -### 6. Build Guardrails Docker Image (Optional) +### 3. Build Guardrails Docker Image (Optional) To fortify AI initiatives in production, Guardrails microservice can secure model inputs and outputs, building Trustworthy, Safe, and Secure LLM-based Applications. @@ -144,7 +90,7 @@ To fortify AI initiatives in production, Guardrails microservice can secure mode docker build -t opea/guardrails-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/llama_guard/langchain/Dockerfile . ``` -### 7. Build MegaService Docker Image +### 4. Build MegaService Docker Image 1. MegaService with Rerank @@ -176,7 +122,7 @@ docker build -t opea/guardrails-tgi:latest --build-arg https_proxy=$https_proxy docker build --no-cache -t opea/chatqna-without-rerank:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile.without_rerank . ``` -### 8. Build UI Docker Image +### 5. Build UI Docker Image Construct the frontend Docker image using the command below: @@ -185,7 +131,7 @@ cd GenAIExamples/ChatQnA/ui docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile . ``` -### 9. Build Conversational React UI Docker Image (Optional) +### 6. 
Build Conversational React UI Docker Image (Optional) Build frontend Docker image that enables Conversational experience with ChatQnA megaservice via below command: @@ -196,21 +142,18 @@ cd GenAIExamples/ChatQnA/ui docker build --no-cache -t opea/chatqna-conversation-ui:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f ./docker/Dockerfile.react . ``` -### 10. Build Nginx Docker Image +### 7. Build Nginx Docker Image ```bash cd GenAIComps docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/nginx/Dockerfile . ``` -Then run the command `docker images`, you will have the following 8 Docker Images: +Then run the command `docker images`, you will have the following 5 Docker Images: -- `opea/embedding-tei:latest` - `opea/retriever-redis:latest` -- `opea/reranking-tei:latest` -- `opea/llm-tgi:latest` or `opea/llm-vllm:latest` or `opea/llm-vllm-ray:latest` - `opea/dataprep-redis:latest` -- `opea/chatqna:latest` or `opea/chatqna-guardrails:latest` or `opea/chatqna-without-rerank:latest` +- `opea/chatqna:latest` - `opea/chatqna-ui:latest` - `opea/nginx:latest` @@ -338,16 +281,7 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid -H 'Content-Type: application/json' ``` -2. Embedding Microservice - - ```bash - curl http://${host_ip}:6000/v1/embeddings \ - -X POST \ - -d '{"text":"hello"}' \ - -H 'Content-Type: application/json' - ``` - -3. Retriever Microservice +2. Retriever Microservice To consume the retriever microservice, you need to generate a mock embedding vector by Python script. The length of embedding vector is determined by the embedding model. @@ -363,7 +297,7 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid -H 'Content-Type: application/json' ``` -4. TEI Reranking Service +3. TEI Reranking Service > Skip for ChatQnA without Rerank pipeline @@ -374,18 +308,7 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid -H 'Content-Type: application/json' ``` -5. Reranking Microservice - - > Skip for ChatQnA without Rerank pipeline - - ```bash - curl http://${host_ip}:8000/v1/reranking \ - -X POST \ - -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \ - -H 'Content-Type: application/json' - ``` - -6. LLM backend Service +4. LLM backend Service In first startup, this service will take more time to download the model files. After it's finished, the service will be ready. @@ -430,39 +353,7 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid -d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' ``` -7. LLM Microservice - - ```bash - # TGI service - curl http://${host_ip}:9000/v1/chat/completions\ - -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ - -H 'Content-Type: application/json' - ``` - - For parameters in TGI mode, please refer to [HuggingFace InferenceClient API](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.text_generation) (except we rename "max_new_tokens" to "max_tokens".) 
- - ```bash - # vLLM Service - curl http://${host_ip}:9000/v1/chat/completions \ - -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_p":1,"temperature":0.7,"frequency_penalty":0,"presence_penalty":0, "streaming":false}' \ - -H 'Content-Type: application/json' - ``` - - For parameters in vLLM Mode, can refer to [LangChain VLLMOpenAI API](https://api.python.langchain.com/en/latest/llms/langchain_community.llms.vllm.VLLMOpenAI.html) - - ```bash - # vLLM-on-Ray Service - curl http://${host_ip}:9000/v1/chat/completions \ - -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"presence_penalty":1.03","streaming":false}' \ - -H 'Content-Type: application/json' - ``` - - For parameters in vLLM-on-Ray mode, can refer to [LangChain ChatOpenAI API](https://python.langchain.com/v0.2/api_reference/openai/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html) - -8. MegaService +5. MegaService ```bash curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{ @@ -470,7 +361,7 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid }' ``` -9. Nginx Service +6. Nginx Service ```bash curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \ @@ -478,7 +369,7 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid -d '{"messages": "What is the revenue of Nike in 2023?"}' ``` -10. Dataprep Microservice(Optional) +7. Dataprep Microservice(Optional) If you want to update the default knowledge base, you can use the following commands: @@ -547,7 +438,7 @@ curl -X POST "http://${host_ip}:6007/v1/dataprep/delete_file" \ -H "Content-Type: application/json" ``` -10. Guardrails (Optional) +8. Guardrails (Optional) ```bash curl http://${host_ip}:9090/v1/guardrails\ diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml index 6c8c0191f..bb839671a 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml @@ -39,26 +39,12 @@ services: no_proxy: ${no_proxy} http_proxy: ${http_proxy} https_proxy: ${https_proxy} - HABANA_VISIBLE_DEVICES: ${tei_embedding_devices} + HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none MAX_WARMUP_SEQUENCE_LENGTH: 512 INIT_HCCL_ON_ACQUIRE: 0 ENABLE_EXPERIMENTAL_FLAGS: true command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6000:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -90,23 +76,6 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - reranking: - image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - container_name: reranking-tei-gaudi-server - depends_on: - - tei-reranking-service - ports: - - "8000:8000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped tgi-service: image: 
ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server @@ -121,7 +90,7 @@ services: HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: ${llm_service_devices} + HABANA_VISIBLE_DEVICES: all OMPI_MCA_btl_vader_single_copy_mechanism: none ENABLE_HPU_GRAPH: true LIMIT_HPU_GRAPH: true @@ -131,36 +100,16 @@ services: cap_add: - SYS_NICE ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048 - llm: - image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - container_name: llm-tgi-gaudi-server - depends_on: - - tgi-service - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped + command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 chaqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-gaudi-backend-server depends_on: - redis-vector-db - tei-embedding-service - - embedding - retriever - tei-reranking-service - - reranking - tgi-service - - llm ports: - "8888:8888" environment: @@ -168,10 +117,14 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} + - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-8090} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} + - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-8808} + - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8005} + - LOGFLAG=${LOGFLAG} ipc: host restart: always chaqna-gaudi-ui-server: @@ -191,25 +144,6 @@ services: - DELETE_FILE=${DATAPREP_DELETE_FILE_ENDPOINT} ipc: host restart: always - chaqna-gaudi-nginx-server: - image: ${REGISTRY:-opea}/nginx:${TAG:-latest} - container_name: chaqna-gaudi-nginx-server - depends_on: - - chaqna-gaudi-backend-server - - chaqna-gaudi-ui-server - ports: - - "${NGINX_PORT:-80}:80" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - FRONTEND_SERVICE_IP=${FRONTEND_SERVICE_IP} - - FRONTEND_SERVICE_PORT=${FRONTEND_SERVICE_PORT} - - BACKEND_SERVICE_NAME=${BACKEND_SERVICE_NAME} - - BACKEND_SERVICE_IP=${BACKEND_SERVICE_IP} - - BACKEND_SERVICE_PORT=${BACKEND_SERVICE_PORT} - ipc: host - restart: always networks: default: diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml index 1132cf44e..72be7ce0b 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_guardrails.yaml @@ -82,20 +82,6 @@ services: OMPI_MCA_btl_vader_single_copy_mechanism: none MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6000:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - 
https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -127,23 +113,6 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - reranking: - image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - container_name: reranking-tei-gaudi-server - depends_on: - - tei-reranking-service - ports: - - "8000:8000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped tgi-service: image: ghcr.io/huggingface/tgi-gaudi:2.0.5 container_name: tgi-gaudi-server @@ -169,23 +138,6 @@ services: - SYS_NICE ipc: host command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048 - llm: - image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - container_name: llm-tgi-gaudi-server - depends_on: - - tgi-service - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped chaqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna-guardrails:${TAG:-latest} container_name: chatqna-gaudi-guardrails-server @@ -194,12 +146,9 @@ services: - tgi-guardrails-service - guardrails - tei-embedding-service - - embedding - retriever - tei-reranking-service - - reranking - tgi-service - - llm ports: - "8888:8888" environment: @@ -208,10 +157,15 @@ services: - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - GUARDRAIL_SERVICE_HOST_IP=${GUARDRAIL_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} + - GUARDRAIL_SERVICE_PORT=${GUARDRAIL_SERVICE_PORT:-9090} + - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-8090} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} + - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-8808} + - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8005} + - LOGFLAG=${LOGFLAG} ipc: host restart: always chaqna-gaudi-ui-server: diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper.yaml deleted file mode 100644 index 157c6e43e..000000000 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_no_wrapper.yaml +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -services: - redis-vector-db: - image: redis/redis-stack:7.2.0-v9 - container_name: redis-vector-db - ports: - - "6379:6379" - - "8001:8001" - dataprep-redis-service: - image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest} - container_name: dataprep-redis-server - depends_on: - - redis-vector-db - - tei-embedding-service - ports: - - "6007:6007" - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - REDIS_URL: ${REDIS_URL} - INDEX_NAME: 
${INDEX_NAME} - TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - tei-embedding-service: - image: ghcr.io/huggingface/tei-gaudi:latest - container_name: tei-embedding-gaudi-server - ports: - - "8090:80" - volumes: - - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - MAX_WARMUP_SEQUENCE_LENGTH: 512 - INIT_HCCL_ON_ACQUIRE: 0 - ENABLE_EXPERIMENTAL_FLAGS: true - command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - # embedding: - # image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - # container_name: embedding-tei-server - # depends_on: - # - tei-embedding-service - # ports: - # - "6000:6000" - # ipc: host - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - # restart: unless-stopped - retriever: - image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} - container_name: retriever-redis-server - depends_on: - - redis-vector-db - ports: - - "7000:7000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - REDIS_URL: ${REDIS_URL} - INDEX_NAME: ${INDEX_NAME} - restart: unless-stopped - tei-reranking-service: - image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - container_name: tei-reranking-gaudi-server - ports: - - "8808:80" - volumes: - - "./data:/data" - shm_size: 1g - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - command: --model-id ${RERANK_MODEL_ID} --auto-truncate - # reranking: - # image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - # container_name: reranking-tei-gaudi-server - # depends_on: - # - tei-reranking-service - # ports: - # - "8000:8000" - # ipc: host - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - # HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - # HF_HUB_DISABLE_PROGRESS_BARS: 1 - # HF_HUB_ENABLE_HF_TRANSFER: 0 - # restart: unless-stopped - tgi-service: - image: ghcr.io/huggingface/tgi-gaudi:2.0.5 - container_name: tgi-gaudi-server - ports: - - "8005:80" - volumes: - - "./data:/data" - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - ENABLE_HPU_GRAPH: true - LIMIT_HPU_GRAPH: true - USE_FLASH_ATTENTION: true - FLASH_ATTENTION_RECOMPUTE: true - runtime: habana - cap_add: - - SYS_NICE - ipc: host - command: --model-id ${LLM_MODEL_ID} --max-input-length 2048 --max-total-tokens 4096 - # llm: - # image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - # container_name: llm-tgi-gaudi-server - # depends_on: - # - tgi-service - # ports: - # - "9000:9000" - # ipc: host - # environment: - # no_proxy: ${no_proxy} - # http_proxy: ${http_proxy} - # https_proxy: ${https_proxy} - # TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - # HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - # HF_HUB_DISABLE_PROGRESS_BARS: 1 - # HF_HUB_ENABLE_HF_TRANSFER: 
0 - # restart: unless-stopped - chaqna-gaudi-backend-server: - image: ${REGISTRY:-opea}/chatqna-no-wrapper:${TAG:-latest} - container_name: chatqna-gaudi-backend-server - depends_on: - - redis-vector-db - - tei-embedding-service - # - embedding - - retriever - - tei-reranking-service - # - reranking - - tgi-service - # - llm - ports: - - "8888:8888" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} - - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-8090} - - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} - - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-8808} - - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} - - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8005} - - LOGFLAG=${LOGFLAG} - ipc: host - restart: always - chaqna-gaudi-ui-server: - image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest} - container_name: chatqna-gaudi-ui-server - depends_on: - - chaqna-gaudi-backend-server - ports: - - "5173:5173" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - CHAT_BASE_URL=${BACKEND_SERVICE_ENDPOINT} - - UPLOAD_FILE_BASE_URL=${DATAPREP_SERVICE_ENDPOINT} - - GET_FILE=${DATAPREP_GET_FILE_ENDPOINT} - - DELETE_FILE=${DATAPREP_DELETE_FILE_ENDPOINT} - ipc: host - restart: always - -networks: - default: - driver: bridge diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml index b221ebe06..f47868856 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm.yaml @@ -43,20 +43,6 @@ services: OMPI_MCA_btl_vader_single_copy_mechanism: none MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${EMBEDDING_MODEL_ID} - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6000:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -88,23 +74,6 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - reranking: - image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - container_name: reranking-tei-gaudi-server - depends_on: - - tei-reranking-service - ports: - - "8000:8000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped vllm-service: image: ${REGISTRY:-opea}/llm-vllm-hpu:${TAG:-latest} container_name: vllm-gaudi-server @@ -125,34 +94,15 @@ services: - SYS_NICE ipc: host command: /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL_ID --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048" - llm: - image: ${REGISTRY:-opea}/llm-vllm:${TAG:-latest} - container_name: llm-vllm-gaudi-server - depends_on: - - vllm-service - 
ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - vLLM_ENDPOINT: ${vLLM_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - LLM_MODEL: ${LLM_MODEL_ID} - restart: unless-stopped chaqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-gaudi-backend-server depends_on: - redis-vector-db - tei-embedding-service - - embedding - retriever - tei-reranking-service - - reranking - vllm-service - - llm ports: - "8888:8888" environment: @@ -160,11 +110,14 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} + - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-8090} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} - - LLM_SERVICE_PORT=${LLM_SERVICE_PORT} + - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} + - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-8808} + - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8007} + - LOGFLAG=${LOGFLAG} ipc: host restart: always chaqna-gaudi-ui-server: diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml index 2552d7a41..e8c572dcb 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml @@ -43,20 +43,6 @@ services: OMPI_MCA_btl_vader_single_copy_mechanism: none MAX_WARMUP_SEQUENCE_LENGTH: 512 command: --model-id ${EMBEDDING_MODEL_ID} - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6000:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -88,23 +74,6 @@ services: HF_HUB_DISABLE_PROGRESS_BARS: 1 HF_HUB_ENABLE_HF_TRANSFER: 0 command: --model-id ${RERANK_MODEL_ID} --auto-truncate - reranking: - image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - container_name: reranking-tei-gaudi-server - depends_on: - - tei-reranking-service - ports: - - "8000:8000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped vllm-ray-service: image: ${REGISTRY:-opea}/llm-vllm-ray-hpu:${TAG:-latest} container_name: vllm-ray-gaudi-server @@ -125,34 +94,15 @@ services: - SYS_NICE ipc: host command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL_ID --tensor_parallel_size 2 --enforce_eager True" - llm: - image: ${REGISTRY:-opea}/llm-vllm-ray:${TAG:-latest} - container_name: llm-vllm-ray-gaudi-server - depends_on: - - vllm-ray-service - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - vLLM_RAY_ENDPOINT: 
${vLLM_RAY_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - LLM_MODEL: ${LLM_MODEL_ID} - restart: unless-stopped chaqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-gaudi-backend-server depends_on: - redis-vector-db - tei-embedding-service - - embedding - retriever - tei-reranking-service - - reranking - vllm-ray-service - - llm ports: - "8888:8888" environment: @@ -160,11 +110,14 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} + - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-8090} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - RERANK_SERVICE_HOST_IP=${RERANK_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} - - LLM_SERVICE_PORT=${LLM_SERVICE_PORT} + - RERANK_SERVER_HOST_IP=${RERANK_SERVER_HOST_IP} + - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-8808} + - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8006} + - LOGFLAG=${LOGFLAG} ipc: host restart: always chaqna-gaudi-ui-server: diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml index dfa0ad46a..0858ef460 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_without_rerank.yaml @@ -45,20 +45,6 @@ services: INIT_HCCL_ON_ACQUIRE: 0 ENABLE_EXPERIMENTAL_FLAGS: true command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6000:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -99,33 +85,14 @@ services: - SYS_NICE ipc: host command: --model-id ${LLM_MODEL_ID} --max-input-length 1024 --max-total-tokens 2048 - llm: - image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - container_name: llm-tgi-gaudi-server - depends_on: - - tgi-service - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped chaqna-gaudi-backend-server: image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest} container_name: chatqna-gaudi-backend-server depends_on: - redis-vector-db - tei-embedding-service - - embedding - retriever - tgi-service - - llm ports: - "8888:8888" environment: @@ -133,9 +100,12 @@ services: - https_proxy=${https_proxy} - http_proxy=${http_proxy} - MEGA_SERVICE_HOST_IP=${MEGA_SERVICE_HOST_IP} - - EMBEDDING_SERVICE_HOST_IP=${EMBEDDING_SERVICE_HOST_IP} + - EMBEDDING_SERVER_HOST_IP=${EMBEDDING_SERVER_HOST_IP} + - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-8090} - RETRIEVER_SERVICE_HOST_IP=${RETRIEVER_SERVICE_HOST_IP} - - LLM_SERVICE_HOST_IP=${LLM_SERVICE_HOST_IP} + - LLM_SERVER_HOST_IP=${LLM_SERVER_HOST_IP} + - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8005} + - LOGFLAG=${LOGFLAG} ipc: host restart: always 
chaqna-gaudi-ui-server: diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md index 6ffda97fa..7448ae625 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/how_to_validate_service.md @@ -26,14 +26,6 @@ The warning messages point out the veriabls are **NOT** set. ``` ubuntu@gaudi-vm:~/GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi$ docker compose -f ./compose.yaml up -d -WARN[0000] The "LANGCHAIN_API_KEY" variable is not set. Defaulting to a blank string. -WARN[0000] The "LANGCHAIN_TRACING_V2" variable is not set. Defaulting to a blank string. -WARN[0000] The "LANGCHAIN_API_KEY" variable is not set. Defaulting to a blank string. -WARN[0000] The "LANGCHAIN_TRACING_V2" variable is not set. Defaulting to a blank string. -WARN[0000] The "LANGCHAIN_API_KEY" variable is not set. Defaulting to a blank string. -WARN[0000] The "LANGCHAIN_TRACING_V2" variable is not set. Defaulting to a blank string. -WARN[0000] The "LANGCHAIN_API_KEY" variable is not set. Defaulting to a blank string. -WARN[0000] The "LANGCHAIN_TRACING_V2" variable is not set. Defaulting to a blank string. WARN[0000] /home/ubuntu/GenAIExamples/ChatQnA/docker_compose/intel/hpu/gaudi/compose.yaml: `version` is obsolete ``` @@ -172,24 +164,7 @@ This test the embedding service. It sends "What is Deep Learning?" to the embedd **Note**: The vector dimension are decided by the embedding model and the output value is dependent on model and input data. -### 2 Embedding Microservice - -``` -curl http://${host_ip}:6000/v1/embeddings\ - -X POST \ - -d '{"text":"What is Deep Learning?"}' \ - -H 'Content-Type: application/json' -``` - -This test the embedding microservice. In this test, it sends out `What is Deep Learning?` to embedding. -Embedding microservice get input data, call embedding service to embedding data. -Embedding server are with NO state, but microservice keep the state. There is `id` in the output of `Embedding Microservice`. - -``` -{"id":"e8c85e588a235a4bc4747a23b3a71d8f","text":"What is Deep Learning?","embedding":[0.00030903306,-0.06356524,0.0025720573,-0.012404448,0.050649878, ..., 0.02776986,-0.0246678,0.03999176,0.037477136,-0.006806653,0.02261455,-0.04570737,-0.033122733,0.022785513,0.0160026,-0.021343587,-0.029969815,-0.0049176104]} -``` - -### 3 Retriever Microservice +### 2 Retriever Microservice To consume the retriever microservice, you need to generate a mock embedding vector by Python script. The length of embedding vector is determined by the embedding model. 
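The remaining retriever check in this validation guide depends on a mock embedding whose length matches the embedding model; for BAAI/bge-base-en-v1.5 (the EMBEDDING_MODEL_ID exported elsewhere in this change) that is 768 dimensions. A minimal sketch of the request follows, assuming the retriever microservice is published on port 7000 at /v1/retrieval as in other ChatQnA validation guides; adjust both to match your compose file:

```bash
# Sketch only: build a 768-dim mock embedding and query the retriever.
# Port 7000 and the /v1/retrieval path are assumptions, not taken from this diff.
your_embedding=$(python3 -c "import random; print([random.uniform(-1, 1) for _ in range(768)])")
curl http://${host_ip}:7000/v1/retrieval \
  -X POST \
  -d "{\"text\":\"What is Deep Learning?\",\"embedding\":${your_embedding}}" \
  -H 'Content-Type: application/json'
```

A successful response returns the stored chunks most similar to the query text, which is what the next step of the guide inspects.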
@@ -212,7 +187,7 @@ The output is retrieved text that relevant to the input data: ``` -### 4 TEI Reranking Service +### 3 TEI Reranking Service Reranking service @@ -228,24 +203,7 @@ Output is: It scores the input -### 5 Reranking Microservice - -``` -curl http://${host_ip}:8000/v1/reranking\ - -X POST \ - -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \ - -H 'Content-Type: application/json' -``` - -Here is the output: - -``` -{"id":"e1eb0e44f56059fc01aa0334b1dac313","query":"Human: Answer the question based only on the following context:\n Deep learning is...\n Question: What is Deep Learning?","max_new_tokens":1024,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true} -``` - -You may notice reranking microservice are with state ('ID' and other meta data), while reranking service are not. - -### 6 TGI Service +### 4 TGI Service ``` curl http://${host_ip}:8008/generate \ @@ -277,56 +235,7 @@ and the log shows model warm up, please wait for a while and try it later. 2024-06-05T05:45:27.867833811Z 2024-06-05T05:45:27.867759Z INFO text_generation_router: router/src/main.rs:221: Warming up model ``` -### 7 LLM Microservice - -``` -curl http://${host_ip}:9000/v1/chat/completions\ - -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ - -H 'Content-Type: application/json' -``` - -You will get generated text from LLM: - -``` -data: b'\n' - -data: b'\n' - -data: b'Deep' - -data: b' learning' - -data: b' is' - -data: b' a' - -data: b' subset' - -data: b' of' - -data: b' machine' - -data: b' learning' - -data: b' that' - -data: b' uses' - -data: b' algorithms' - -data: b' to' - -data: b' learn' - -data: b' from' - -data: b' data' - -data: [DONE] -``` - -### 8 MegaService +### 5 MegaService ``` curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{ diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh index d3b805953..1612f88f4 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/set_env.sh @@ -8,15 +8,13 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:8090" -export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808" -export TGI_LLM_ENDPOINT="http://${host_ip}:8005" export REDIS_URL="redis://${host_ip}:6379" export INDEX_NAME="rag-redis" export MEGA_SERVICE_HOST_IP=${host_ip} -export EMBEDDING_SERVICE_HOST_IP=${host_ip} +export EMBEDDING_SERVER_HOST_IP=${host_ip} export RETRIEVER_SERVICE_HOST_IP=${host_ip} -export RERANK_SERVICE_HOST_IP=${host_ip} -export LLM_SERVICE_HOST_IP=${host_ip} +export RERANK_SERVER_HOST_IP=${host_ip} +export LLM_SERVER_HOST_IP=${host_ip} export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep" export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_file" diff --git a/ChatQnA/docker_compose/nvidia/gpu/README.md b/ChatQnA/docker_compose/nvidia/gpu/README.md index 62c7c9151..5cd8d3ef0 100644 --- a/ChatQnA/docker_compose/nvidia/gpu/README.md +++ b/ChatQnA/docker_compose/nvidia/gpu/README.md @@ -77,37 +77,19 @@ git clone 
https://github.com/opea-project/GenAIComps.git cd GenAIComps ``` -### 2. Build Embedding Image - -```bash -docker build --no-cache -t opea/embedding-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/embeddings/tei/langchain/Dockerfile . -``` - -### 3. Build Retriever Image +### 2. Build Retriever Image ```bash docker build --no-cache -t opea/retriever-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/redis/langchain/Dockerfile . ``` -### 4. Build Rerank Image - -```bash -docker build --no-cache -t opea/reranking-tei:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/reranks/tei/Dockerfile . -``` - -### 5. Build LLM Image - -```bash -docker build --no-cache -t opea/llm-tgi:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/llms/text-generation/tgi/Dockerfile . -``` - -### 6. Build Dataprep Image +### 3. Build Dataprep Image ```bash docker build --no-cache -t opea/dataprep-redis:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/redis/langchain/Dockerfile . ``` -### 7. Build MegaService Docker Image +### 4. Build MegaService Docker Image To construct the Mega Service, we utilize the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservice pipeline within the `chatqna.py` Python script. Build the MegaService Docker image using the command below: @@ -118,7 +100,7 @@ docker build --no-cache -t opea/chatqna:latest --build-arg https_proxy=$https_pr cd ../../.. ``` -### 8. Build UI Docker Image +### 5. Build UI Docker Image Construct the frontend Docker image using the command below: @@ -128,7 +110,7 @@ docker build --no-cache -t opea/chatqna-ui:latest --build-arg https_proxy=$https cd ../../../.. ``` -### 9. Build React UI Docker Image (Optional) +### 6. Build React UI Docker Image (Optional) Construct the frontend Docker image using the command below: @@ -138,23 +120,20 @@ docker build --no-cache -t opea/chatqna-react-ui:latest --build-arg https_proxy= cd ../../../.. ``` -### 10. Build Nginx Docker Image +### 7. Build Nginx Docker Image ```bash cd GenAIComps docker build -t opea/nginx:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/nginx/Dockerfile . ``` -Then run the command `docker images`, you will have the following 8 Docker Images: +Then run the command `docker images`, you will have the following 5 Docker Images: -1. `opea/embedding-tei:latest` -2. `opea/retriever-redis:latest` -3. `opea/reranking-tei:latest` -4. `opea/llm-tgi:latest` -5. `opea/dataprep-redis:latest` -6. `opea/chatqna:latest` -7. `opea/chatqna-ui:latest` or `opea/chatqna-react-ui:latest` -8. `opea/nginx:latest` +1. `opea/retriever-redis:latest` +2. `opea/dataprep-redis:latest` +3. `opea/chatqna:latest` +4. `opea/chatqna-ui:latest` or `opea/chatqna-react-ui:latest` +5. `opea/nginx:latest` ## 🚀 Start MicroServices and MegaService @@ -215,16 +194,7 @@ docker compose up -d -H 'Content-Type: application/json' ``` -2. Embedding Microservice - - ```bash - curl http://${host_ip}:6000/v1/embeddings \ - -X POST \ - -d '{"text":"hello"}' \ - -H 'Content-Type: application/json' - ``` - -3. Retriever Microservice +2. Retriever Microservice To consume the retriever microservice, you need to generate a mock embedding vector by Python script. The length of embedding vector is determined by the embedding model. 
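Because the NVIDIA GPU build list shrinks from eight images to five in this change, a quick sanity check after the builds helps confirm that only the consolidated images are present before starting the stack. This is a sketch, with repository names taken from the list above; include or drop `opea/chatqna-react-ui` and `opea/nginx` to match what you actually built. The retriever step that follows can reuse the same mock-embedding snippet sketched for the Gaudi validation guide earlier in this change.

```bash
# Sketch: list locally built images and filter for the ones this README expects.
docker images --format '{{.Repository}}:{{.Tag}}' | \
  grep -E 'opea/(retriever-redis|dataprep-redis|chatqna|chatqna-ui|chatqna-react-ui|nginx):latest'
```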
@@ -240,7 +210,7 @@ docker compose up -d -H 'Content-Type: application/json' ``` -4. TEI Reranking Service +3. TEI Reranking Service ```bash curl http://${host_ip}:8808/rerank \ @@ -249,16 +219,7 @@ docker compose up -d -H 'Content-Type: application/json' ``` -5. Reranking Microservice - - ```bash - curl http://${host_ip}:8000/v1/reranking \ - -X POST \ - -d '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' \ - -H 'Content-Type: application/json' - ``` - -6. TGI Service +4. TGI Service In first startup, this service will take more time to download the model files. After it's finished, the service will be ready. @@ -283,16 +244,7 @@ docker compose up -d -H 'Content-Type: application/json' ``` -7. LLM Microservice - - ```bash - curl http://${host_ip}:9000/v1/chat/completions \ - -X POST \ - -d '{"query":"What is Deep Learning?","max_tokens":17,"top_k":10,"top_p":0.95,"typical_p":0.95,"temperature":0.01,"repetition_penalty":1.03,"streaming":true}' \ - -H 'Content-Type: application/json' - ``` - -8. MegaService +5. MegaService ```bash curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{ @@ -300,7 +252,7 @@ docker compose up -d }' ``` -9. Nginx Service +6. Nginx Service ```bash curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \ @@ -308,7 +260,7 @@ docker compose up -d -d '{"messages": "What is the revenue of Nike in 2023?"}' ``` -10. Dataprep Microservice(Optional) +7. Dataprep Microservice(Optional) If you want to update the default knowledge base, you can use the following commands: diff --git a/ChatQnA/docker_compose/nvidia/gpu/compose.yaml b/ChatQnA/docker_compose/nvidia/gpu/compose.yaml index 218e11bec..723f35082 100644 --- a/ChatQnA/docker_compose/nvidia/gpu/compose.yaml +++ b/ChatQnA/docker_compose/nvidia/gpu/compose.yaml @@ -46,20 +46,6 @@ services: - driver: nvidia count: 1 capabilities: [gpu] - embedding: - image: ${REGISTRY:-opea}/embedding-tei:${TAG:-latest} - container_name: embedding-tei-server - depends_on: - - tei-embedding-service - ports: - - "6000:6000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT} - restart: unless-stopped retriever: image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} container_name: retriever-redis-server @@ -98,23 +84,6 @@ services: - driver: nvidia count: 1 capabilities: [gpu] - reranking: - image: ${REGISTRY:-opea}/reranking-tei:${TAG:-latest} - container_name: reranking-tei-server - depends_on: - - tei-reranking-service - ports: - - "8000:8000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TEI_RERANKING_ENDPOINT: ${TEI_RERANKING_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - restart: unless-stopped tgi-service: image: ghcr.io/huggingface/text-generation-inference:2.2.0 container_name: tgi-server @@ -138,35 +107,15 @@ services: - driver: nvidia count: 1 capabilities: [gpu] - llm: - image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest} - container_name: llm-tgi-server - depends_on: - - tgi-service - ports: - - "9000:9000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - TGI_LLM_ENDPOINT: ${TGI_LLM_ENDPOINT} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - 
restart: unless-stopped chaqna-backend-server: image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} container_name: chatqna-backend-server depends_on: - redis-vector-db - tei-embedding-service - - embedding - retriever - tei-reranking-service - - reranking - tgi-service - - llm ports: - "8888:8888" environment: diff --git a/ChatQnA/docker_compose/nvidia/gpu/set_env.sh b/ChatQnA/docker_compose/nvidia/gpu/set_env.sh index 49a7ad7d8..f97d07e20 100644 --- a/ChatQnA/docker_compose/nvidia/gpu/set_env.sh +++ b/ChatQnA/docker_compose/nvidia/gpu/set_env.sh @@ -8,15 +8,9 @@ export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:8090" -export TEI_RERANKING_ENDPOINT="http://${host_ip}:8808" -export TGI_LLM_ENDPOINT="http://${host_ip}:8008" -export REDIS_URL="redis://${host_ip}:6379" export INDEX_NAME="rag-redis" export MEGA_SERVICE_HOST_IP=${host_ip} -export EMBEDDING_SERVICE_HOST_IP=${host_ip} export RETRIEVER_SERVICE_HOST_IP=${host_ip} -export RERANK_SERVICE_HOST_IP=${host_ip} -export LLM_SERVICE_HOST_IP=${host_ip} export BACKEND_SERVICE_ENDPOINT="http://${host_ip}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:6007/v1/dataprep" export DATAPREP_GET_FILE_ENDPOINT="http://${host_ip}:6007/v1/dataprep/get_file" diff --git a/ChatQnA/docker_image_build/build.yaml b/ChatQnA/docker_image_build/build.yaml index 573a2de3a..390231320 100644 --- a/ChatQnA/docker_image_build/build.yaml +++ b/ChatQnA/docker_image_build/build.yaml @@ -23,18 +23,6 @@ services: dockerfile: ./Dockerfile.without_rerank extends: chatqna image: ${REGISTRY:-opea}/chatqna-without-rerank:${TAG:-latest} - chatqna-no-wrapper: - build: - context: ../ - dockerfile: ./Dockerfile.no_wrapper - extends: chatqna - image: ${REGISTRY:-opea}/chatqna-no-wrapper:${TAG:-latest} - chatqna-no-wrapper-without-rerank: - build: - context: ../ - dockerfile: ./Dockerfile.no_wrapper_without_rerank - extends: chatqna - image: ${REGISTRY:-opea}/chatqna-no-wrapper-without-rerank:${TAG:-latest} chatqna-ui: build: context: ../ui diff --git a/ChatQnA/kubernetes/intel/README_gmc.md b/ChatQnA/kubernetes/intel/README_gmc.md index 08dc38516..dab86381f 100644 --- a/ChatQnA/kubernetes/intel/README_gmc.md +++ b/ChatQnA/kubernetes/intel/README_gmc.md @@ -16,12 +16,9 @@ The ChatQnA uses the below prebuilt images if you choose a Xeon deployment - redis-vector-db: redis/redis-stack:7.2.0-v9 - tei_embedding_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 -- embedding: opea/embedding-tei:latest - retriever: opea/retriever-redis:latest - tei_xeon_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 -- reranking: opea/reranking-tei:latest - tgi-service: ghcr.io/huggingface/text-generation-inference:sha-e4201f4-intel-cpu -- llm: opea/llm-tgi:latest - chaqna-xeon-backend-server: opea/chatqna:latest Should you desire to use the Gaudi accelerator, two alternate images are used for the embedding and llm services. 
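With the chatqna-no-wrapper build targets removed from docker_image_build/build.yaml, only the consolidated services are left to build. A minimal sketch, assuming the remaining service keys are named after their images as the diff suggests (check build.yaml if a name differs):

```bash
# Sketch: rebuild the consolidated ChatQnA images from docker_image_build/build.yaml.
# Service names are inferred from the entries that remain in the diff.
cd ChatQnA/docker_image_build
docker compose -f build.yaml build chatqna chatqna-without-rerank chatqna-ui
```

Running `docker compose -f build.yaml config --services` first prints the exact service names if you want to confirm them before building.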
diff --git a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml index 7e137bbfb..f6b7c060e 100644 --- a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml +++ b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna-guardrails.yaml @@ -27,27 +27,6 @@ data: no_proxy: "" LOGFLAG: "" --- -# Source: chatqna/charts/embedding-usvc/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-embedding-usvc-config - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- # Source: chatqna/charts/guardrails-usvc/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -72,50 +51,6 @@ data: https_proxy: "" no_proxy: "" --- -# Source: chatqna/charts/llm-uservice/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-llm-uservice-config - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TGI_LLM_ENDPOINT: "http://chatqna-tgi" - HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" - HF_HOME: "/tmp/.cache/huggingface" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- -# Source: chatqna/charts/reranking-usvc/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-reranking-usvc-config - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TEI_RERANKING_ENDPOINT: "http://chatqna-teirerank" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- # Source: chatqna/charts/retriever-usvc/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -208,7 +143,7 @@ metadata: app.kubernetes.io/managed-by: Helm data: MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" - PORT: "2080" + PORT: "2083" HF_TOKEN: "insert-your-huggingface-token-here" http_proxy: "" https_proxy: "" @@ -362,31 +297,6 @@ spec: app.kubernetes.io/name: data-prep app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/embedding-usvc/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-embedding-usvc - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 6000 - targetPort: 6000 - protocol: TCP - name: embedding-usvc - selector: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/guardrails-usvc/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: 
Apache-2.0 @@ -412,31 +322,6 @@ spec: app.kubernetes.io/name: guardrails-usvc app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/llm-uservice/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-llm-uservice - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 9000 - targetPort: 9000 - protocol: TCP - name: llm-uservice - selector: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/redis-vector-db/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -466,31 +351,6 @@ spec: app.kubernetes.io/name: redis-vector-db app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/reranking-usvc/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-reranking-usvc - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 8000 - targetPort: 8000 - protocol: TCP - name: reranking-usvc - selector: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/retriever-usvc/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -584,7 +444,7 @@ spec: type: ClusterIP ports: - port: 80 - targetPort: 2080 + targetPort: 2083 protocol: TCP name: tgi selector: @@ -786,240 +646,6 @@ spec: - name: tmp emptyDir: {} --- -# Source: chatqna/charts/embedding-usvc/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-embedding-usvc - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-embedding-usvc-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/embedding-tei:latest" - imagePullPolicy: IfNotPresent - ports: - - name: embedding-usvc - containerPort: 6000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: 
tmp - emptyDir: {} ---- -# Source: chatqna/charts/guardrails-usvc/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-guardrails-usvc - labels: - helm.sh/chart: guardrails-usvc-1.0.0 - app.kubernetes.io/name: guardrails-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: guardrails-usvc - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: guardrails-usvc - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-guardrails-usvc-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/guardrails-tgi:latest" - imagePullPolicy: IfNotPresent - ports: - - name: guardrails-usvc - containerPort: 9090 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: guardrails-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: guardrails-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: guardrails-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- -# Source: chatqna/charts/llm-uservice/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-llm-uservice - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-llm-uservice-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: false - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/llm-tgi:latest" - imagePullPolicy: IfNotPresent - ports: - - name: llm-uservice - containerPort: 9000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- # Source: chatqna/charts/redis-vector-db/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -1092,17 +718,17 @@ spec: - name: tmp emptyDir: {} --- -# Source: 
chatqna/charts/reranking-usvc/templates/deployment.yaml +# Source: chatqna/charts/guardrails-usvc/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 apiVersion: apps/v1 kind: Deployment metadata: - name: chatqna-reranking-usvc + name: chatqna-guardrails-usvc labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc + helm.sh/chart: guardrails-usvc-1.0.0 + app.kubernetes.io/name: guardrails-usvc app.kubernetes.io/instance: chatqna app.kubernetes.io/version: "v1.0" app.kubernetes.io/managed-by: Helm @@ -1110,12 +736,12 @@ spec: replicas: 1 selector: matchLabels: - app.kubernetes.io/name: reranking-usvc + app.kubernetes.io/name: guardrails-usvc app.kubernetes.io/instance: chatqna template: metadata: labels: - app.kubernetes.io/name: reranking-usvc + app.kubernetes.io/name: guardrails-usvc app.kubernetes.io/instance: chatqna spec: securityContext: @@ -1124,7 +750,7 @@ spec: - name: chatqna envFrom: - configMapRef: - name: chatqna-reranking-usvc-config + name: chatqna-guardrails-usvc-config securityContext: allowPrivilegeEscalation: false capabilities: @@ -1135,11 +761,11 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - image: "opea/reranking-tei:latest" + image: "opea/guardrails-tgi:latest" imagePullPolicy: IfNotPresent ports: - - name: reranking-usvc - containerPort: 8000 + - name: guardrails-usvc + containerPort: 9090 protocol: TCP volumeMounts: - mountPath: /tmp @@ -1148,20 +774,20 @@ spec: failureThreshold: 24 httpGet: path: v1/health_check - port: reranking-usvc + port: guardrails-usvc initialDelaySeconds: 5 periodSeconds: 5 readinessProbe: httpGet: path: v1/health_check - port: reranking-usvc + port: guardrails-usvc initialDelaySeconds: 5 periodSeconds: 5 startupProbe: failureThreshold: 120 httpGet: path: v1/health_check - port: reranking-usvc + port: guardrails-usvc initialDelaySeconds: 5 periodSeconds: 5 resources: @@ -1483,7 +1109,7 @@ spec: name: tmp ports: - name: http - containerPort: 2080 + containerPort: 2083 protocol: TCP livenessProbe: failureThreshold: 24 @@ -1624,16 +1250,24 @@ spec: containers: - name: chatqna env: - - name: LLM_SERVICE_HOST_IP - value: chatqna-llm-uservice - - name: RERANK_SERVICE_HOST_IP - value: chatqna-reranking-usvc + - name: LLM_SERVER_HOST_IP + value: chatqna-tgi + - name: LLM_SERVER_PORT + value: "2080" + - name: RERANK_SERVER_HOST_IP + value: chatqna-teirerank + - name: RERANK_SERVER_PORT + value: "2082" - name: RETRIEVER_SERVICE_HOST_IP value: chatqna-retriever-usvc - - name: EMBEDDING_SERVICE_HOST_IP - value: chatqna-embedding-usvc + - name: EMBEDDING_SERVER_HOST_IP + value: chatqna-tei + - name: EMBEDDING_SERVER_PORT + value: "2081" - name: GUARDRAIL_SERVICE_HOST_IP value: chatqna-guardrails-usvc + - name: GUARDRAIL_SERVICE_PORT + value: "9090" securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml index 844d1882c..6f2110bc5 100644 --- a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml +++ b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna.yaml @@ -27,71 +27,6 @@ data: no_proxy: "" LOGFLAG: "" --- -# Source: chatqna/charts/embedding-usvc/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-embedding-usvc-config - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - 
app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- -# Source: chatqna/charts/llm-uservice/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-llm-uservice-config - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TGI_LLM_ENDPOINT: "http://chatqna-tgi" - HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" - HF_HOME: "/tmp/.cache/huggingface" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- -# Source: chatqna/charts/reranking-usvc/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-reranking-usvc-config - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TEI_RERANKING_ENDPOINT: "http://chatqna-teirerank" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- # Source: chatqna/charts/retriever-usvc/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -312,56 +247,6 @@ spec: app.kubernetes.io/name: data-prep app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/embedding-usvc/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-embedding-usvc - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 6000 - targetPort: 6000 - protocol: TCP - name: embedding-usvc - selector: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna ---- -# Source: chatqna/charts/llm-uservice/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-llm-uservice - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 9000 - targetPort: 9000 - protocol: TCP - name: llm-uservice - selector: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/redis-vector-db/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -391,31 +276,6 @@ spec: app.kubernetes.io/name: redis-vector-db app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/reranking-usvc/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-reranking-usvc - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - 
app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 8000 - targetPort: 8000 - protocol: TCP - name: reranking-usvc - selector: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/retriever-usvc/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -686,162 +546,6 @@ spec: - name: tmp emptyDir: {} --- -# Source: chatqna/charts/embedding-usvc/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-embedding-usvc - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-embedding-usvc-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/embedding-tei:latest" - imagePullPolicy: IfNotPresent - ports: - - name: embedding-usvc - containerPort: 6000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- -# Source: chatqna/charts/llm-uservice/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-llm-uservice - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-llm-uservice-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: false - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/llm-tgi:latest" - imagePullPolicy: IfNotPresent - ports: - - name: llm-uservice - containerPort: 9000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - 
httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- # Source: chatqna/charts/redis-vector-db/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -914,84 +618,6 @@ spec: - name: tmp emptyDir: {} --- -# Source: chatqna/charts/reranking-usvc/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-reranking-usvc - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-reranking-usvc-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/reranking-tei:latest" - imagePullPolicy: IfNotPresent - ports: - - name: reranking-usvc - containerPort: 8000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- # Source: chatqna/charts/retriever-usvc/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -1366,16 +992,20 @@ spec: containers: - name: chatqna env: - - name: LLM_SERVICE_HOST_IP - value: chatqna-llm-uservice - - name: RERANK_SERVICE_HOST_IP - value: chatqna-reranking-usvc + - name: LLM_SERVER_HOST_IP + value: chatqna-tgi + - name: LLM_SERVER_PORT + value: "2080" + - name: RERANK_SERVER_HOST_IP + value: chatqna-teirerank + - name: RERANK_SERVER_PORT + value: "2082" - name: RETRIEVER_SERVICE_HOST_IP value: chatqna-retriever-usvc - - name: EMBEDDING_SERVICE_HOST_IP - value: chatqna-embedding-usvc - - name: GUARDRAIL_SERVICE_HOST_IP - value: chatqna-guardrails-usvc + - name: EMBEDDING_SERVER_HOST_IP + value: chatqna-tei + - name: EMBEDDING_SERVER_PORT + value: "2081" securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml index cee5e31bf..4e63eb586 100644 --- a/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml +++ b/ChatQnA/kubernetes/intel/cpu/xeon/manifest/chatqna_bf16.yaml @@ -27,71 +27,6 @@ data: no_proxy: "" LOGFLAG: "" --- -# Source: chatqna/charts/embedding-usvc/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-embedding-usvc-config - labels: - helm.sh/chart: embedding-usvc-1.0.0 - 
app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- -# Source: chatqna/charts/llm-uservice/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-llm-uservice-config - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TGI_LLM_ENDPOINT: "http://chatqna-tgi" - HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" - HF_HOME: "/tmp/.cache/huggingface" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- -# Source: chatqna/charts/reranking-usvc/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-reranking-usvc-config - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TEI_RERANKING_ENDPOINT: "http://chatqna-teirerank" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- # Source: chatqna/charts/retriever-usvc/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -313,56 +248,6 @@ spec: app.kubernetes.io/name: data-prep app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/embedding-usvc/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-embedding-usvc - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 6000 - targetPort: 6000 - protocol: TCP - name: embedding-usvc - selector: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna ---- -# Source: chatqna/charts/llm-uservice/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-llm-uservice - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 9000 - targetPort: 9000 - protocol: TCP - name: llm-uservice - selector: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/redis-vector-db/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -392,31 +277,6 @@ spec: app.kubernetes.io/name: redis-vector-db app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/reranking-usvc/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-reranking-usvc - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - 
app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 8000 - targetPort: 8000 - protocol: TCP - name: reranking-usvc - selector: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/retriever-usvc/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -687,162 +547,6 @@ spec: - name: tmp emptyDir: {} --- -# Source: chatqna/charts/embedding-usvc/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-embedding-usvc - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-embedding-usvc-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/embedding-tei:latest" - imagePullPolicy: IfNotPresent - ports: - - name: embedding-usvc - containerPort: 6000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- -# Source: chatqna/charts/llm-uservice/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-llm-uservice - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-llm-uservice-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: false - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/llm-tgi:latest" - imagePullPolicy: IfNotPresent - ports: - - name: llm-uservice - containerPort: 9000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - 
startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- # Source: chatqna/charts/redis-vector-db/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -915,84 +619,6 @@ spec: - name: tmp emptyDir: {} --- -# Source: chatqna/charts/reranking-usvc/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-reranking-usvc - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-reranking-usvc-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/reranking-tei:latest" - imagePullPolicy: IfNotPresent - ports: - - name: reranking-usvc - containerPort: 8000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- # Source: chatqna/charts/retriever-usvc/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -1369,16 +995,20 @@ spec: containers: - name: chatqna env: - - name: LLM_SERVICE_HOST_IP - value: chatqna-llm-uservice - - name: RERANK_SERVICE_HOST_IP - value: chatqna-reranking-usvc + - name: LLM_SERVER_HOST_IP + value: chatqna-tgi + - name: LLM_SERVER_PORT + value: "2080" + - name: RERANK_SERVER_HOST_IP + value: chatqna-teirerank + - name: RERANK_SERVER_PORT + value: "2082" - name: RETRIEVER_SERVICE_HOST_IP value: chatqna-retriever-usvc - - name: EMBEDDING_SERVICE_HOST_IP - value: chatqna-embedding-usvc - - name: GUARDRAIL_SERVICE_HOST_IP - value: chatqna-guardrails-usvc + - name: EMBEDDING_SERVER_HOST_IP + value: chatqna-tei + - name: EMBEDDING_SERVER_PORT + value: "2081" securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml index d33cb1704..dd7b64be9 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-guardrails.yaml @@ -27,27 +27,6 @@ data: no_proxy: "" LOGFLAG: "" --- -# Source: chatqna/charts/embedding-usvc/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: 
chatqna-embedding-usvc-config - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- # Source: chatqna/charts/guardrails-usvc/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -72,50 +51,6 @@ data: https_proxy: "" no_proxy: "" --- -# Source: chatqna/charts/llm-uservice/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-llm-uservice-config - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TGI_LLM_ENDPOINT: "http://chatqna-tgi" - HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" - HF_HOME: "/tmp/.cache/huggingface" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- -# Source: chatqna/charts/reranking-usvc/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-reranking-usvc-config - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TEI_RERANKING_ENDPOINT: "http://chatqna-teirerank" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- # Source: chatqna/charts/retriever-usvc/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -208,7 +143,7 @@ metadata: app.kubernetes.io/managed-by: Helm data: MODEL_ID: "meta-llama/Meta-Llama-Guard-2-8B" - PORT: "2080" + PORT: "2083" HF_TOKEN: "insert-your-huggingface-token-here" http_proxy: "" https_proxy: "" @@ -364,31 +299,6 @@ spec: app.kubernetes.io/name: data-prep app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/embedding-usvc/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-embedding-usvc - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 6000 - targetPort: 6000 - protocol: TCP - name: embedding-usvc - selector: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/guardrails-usvc/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -414,31 +324,6 @@ spec: app.kubernetes.io/name: guardrails-usvc app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/llm-uservice/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-llm-uservice - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 9000 - 
targetPort: 9000 - protocol: TCP - name: llm-uservice - selector: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/redis-vector-db/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -468,31 +353,6 @@ spec: app.kubernetes.io/name: redis-vector-db app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/reranking-usvc/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-reranking-usvc - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 8000 - targetPort: 8000 - protocol: TCP - name: reranking-usvc - selector: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/retriever-usvc/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -586,7 +446,7 @@ spec: type: ClusterIP ports: - port: 80 - targetPort: 2080 + targetPort: 2083 protocol: TCP name: tgi selector: @@ -788,84 +648,6 @@ spec: - name: tmp emptyDir: {} --- -# Source: chatqna/charts/embedding-usvc/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-embedding-usvc - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-embedding-usvc-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/embedding-tei:latest" - imagePullPolicy: IfNotPresent - ports: - - name: embedding-usvc - containerPort: 6000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- # Source: chatqna/charts/guardrails-usvc/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -944,84 +726,6 @@ spec: - name: tmp emptyDir: {} --- -# Source: chatqna/charts/llm-uservice/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-llm-uservice - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - 
app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-llm-uservice-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: false - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/llm-tgi:latest" - imagePullPolicy: IfNotPresent - ports: - - name: llm-uservice - containerPort: 9000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- # Source: chatqna/charts/redis-vector-db/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -1094,84 +798,6 @@ spec: - name: tmp emptyDir: {} --- -# Source: chatqna/charts/reranking-usvc/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-reranking-usvc - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-reranking-usvc-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/reranking-tei:latest" - imagePullPolicy: IfNotPresent - ports: - - name: reranking-usvc - containerPort: 8000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- # Source: chatqna/charts/retriever-usvc/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -1486,7 +1112,7 @@ spec: name: tmp ports: - name: http - containerPort: 2080 + containerPort: 2083 protocol: TCP livenessProbe: failureThreshold: 24 @@ -1629,16 +1255,24 @@ spec: containers: - name: chatqna env: - - name: LLM_SERVICE_HOST_IP - value: chatqna-llm-uservice - - 
name: RERANK_SERVICE_HOST_IP - value: chatqna-reranking-usvc + - name: LLM_SERVER_HOST_IP + value: chatqna-tgi + - name: LLM_SERVER_PORT + value: "2080" + - name: RERANK_SERVER_HOST_IP + value: chatqna-teirerank + - name: RERANK_SERVER_PORT + value: "2082" - name: RETRIEVER_SERVICE_HOST_IP value: chatqna-retriever-usvc - - name: EMBEDDING_SERVICE_HOST_IP - value: chatqna-embedding-usvc + - name: EMBEDDING_SERVER_HOST_IP + value: chatqna-tei + - name: EMBEDDING_SERVER_PORT + value: "2081" - name: GUARDRAIL_SERVICE_HOST_IP value: chatqna-guardrails-usvc + - name: GUARDRAIL_SERVICE_PORT + value: "9090" securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml index 5308cf7ac..de83aacb0 100644 --- a/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml +++ b/ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna.yaml @@ -27,71 +27,6 @@ data: no_proxy: "" LOGFLAG: "" --- -# Source: chatqna/charts/embedding-usvc/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-embedding-usvc-config - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- -# Source: chatqna/charts/llm-uservice/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-llm-uservice-config - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TGI_LLM_ENDPOINT: "http://chatqna-tgi" - HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here" - HF_HOME: "/tmp/.cache/huggingface" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- -# Source: chatqna/charts/reranking-usvc/templates/configmap.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: ConfigMap -metadata: - name: chatqna-reranking-usvc-config - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -data: - TEI_RERANKING_ENDPOINT: "http://chatqna-teirerank" - http_proxy: "" - https_proxy: "" - no_proxy: "" - LOGFLAG: "" ---- # Source: chatqna/charts/retriever-usvc/templates/configmap.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -313,56 +248,6 @@ spec: app.kubernetes.io/name: data-prep app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/embedding-usvc/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-embedding-usvc - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 6000 - targetPort: 6000 - protocol: TCP - name: embedding-usvc - selector: - 
app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna ---- -# Source: chatqna/charts/llm-uservice/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-llm-uservice - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 9000 - targetPort: 9000 - protocol: TCP - name: llm-uservice - selector: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/redis-vector-db/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -392,31 +277,6 @@ spec: app.kubernetes.io/name: redis-vector-db app.kubernetes.io/instance: chatqna --- -# Source: chatqna/charts/reranking-usvc/templates/service.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: v1 -kind: Service -metadata: - name: chatqna-reranking-usvc - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - type: ClusterIP - ports: - - port: 8000 - targetPort: 8000 - protocol: TCP - name: reranking-usvc - selector: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna ---- # Source: chatqna/charts/retriever-usvc/templates/service.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -687,162 +547,6 @@ spec: - name: tmp emptyDir: {} --- -# Source: chatqna/charts/embedding-usvc/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-embedding-usvc - labels: - helm.sh/chart: embedding-usvc-1.0.0 - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: embedding-usvc - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-embedding-usvc-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/embedding-tei:latest" - imagePullPolicy: IfNotPresent - ports: - - name: embedding-usvc - containerPort: 6000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: embedding-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- -# Source: chatqna/charts/llm-uservice/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - 
-apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-llm-uservice - labels: - helm.sh/chart: llm-uservice-1.0.0 - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: llm-uservice - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-llm-uservice-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: false - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/llm-tgi:latest" - imagePullPolicy: IfNotPresent - ports: - - name: llm-uservice - containerPort: 9000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: llm-uservice - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- # Source: chatqna/charts/redis-vector-db/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -915,84 +619,6 @@ spec: - name: tmp emptyDir: {} --- -# Source: chatqna/charts/reranking-usvc/templates/deployment.yaml -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -apiVersion: apps/v1 -kind: Deployment -metadata: - name: chatqna-reranking-usvc - labels: - helm.sh/chart: reranking-usvc-1.0.0 - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - app.kubernetes.io/version: "v1.0" - app.kubernetes.io/managed-by: Helm -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - template: - metadata: - labels: - app.kubernetes.io/name: reranking-usvc - app.kubernetes.io/instance: chatqna - spec: - securityContext: - {} - containers: - - name: chatqna - envFrom: - - configMapRef: - name: chatqna-reranking-usvc-config - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 1000 - seccompProfile: - type: RuntimeDefault - image: "opea/reranking-tei:latest" - imagePullPolicy: IfNotPresent - ports: - - name: reranking-usvc - containerPort: 8000 - protocol: TCP - volumeMounts: - - mountPath: /tmp - name: tmp - livenessProbe: - failureThreshold: 24 - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - readinessProbe: - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - startupProbe: - failureThreshold: 120 - httpGet: - path: v1/health_check - port: reranking-usvc - initialDelaySeconds: 5 - periodSeconds: 5 - resources: - {} - volumes: - - name: tmp - emptyDir: {} ---- # Source: chatqna/charts/retriever-usvc/templates/deployment.yaml # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 @@ -1369,16 +995,20 @@ spec: containers: - name: chatqna env: - - name: 
LLM_SERVICE_HOST_IP - value: chatqna-llm-uservice - - name: RERANK_SERVICE_HOST_IP - value: chatqna-reranking-usvc + - name: LLM_SERVER_HOST_IP + value: chatqna-tgi + - name: LLM_SERVER_PORT + value: "2080" + - name: RERANK_SERVER_HOST_IP + value: chatqna-teirerank + - name: RERANK_SERVER_PORT + value: "2082" - name: RETRIEVER_SERVICE_HOST_IP value: chatqna-retriever-usvc - - name: EMBEDDING_SERVICE_HOST_IP - value: chatqna-embedding-usvc - - name: GUARDRAIL_SERVICE_HOST_IP - value: chatqna-guardrails-usvc + - name: EMBEDDING_SERVER_HOST_IP + value: chatqna-tei + - name: EMBEDDING_SERVER_PORT + value: "2081" securityContext: allowPrivilegeEscalation: false capabilities: diff --git a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh index 8f6dd963c..b7958ca80 100644 --- a/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_guardrails_on_gaudi.sh @@ -19,7 +19,7 @@ function build_docker_images() { git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna-guardrails chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi guardrails-tgi" + service_list="chatqna-guardrails chatqna-ui dataprep-redis retriever-redis guardrails-tgi" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 @@ -35,17 +35,19 @@ function start_services() { export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" - export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export TGI_LLM_ENDPOINT="http://${ip_address}:8008" export REDIS_URL="redis://${ip_address}:6379" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVER_HOST_IP=${ip_address} export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export RERANK_SERVICE_HOST_IP=${ip_address} - export LLM_SERVICE_HOST_IP=${ip_address} + export RERANK_SERVER_HOST_IP=${ip_address} + export LLM_SERVER_HOST_IP=${ip_address} export GUARDRAIL_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVER_PORT=8090 + export RERANK_SERVER_PORT=8808 + export LLM_SERVER_PORT=8008 + export GUARDRAIL_SERVICE_PORT=9090 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6007/v1/dataprep" export GURADRAILS_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B" @@ -120,14 +122,6 @@ function validate_microservices() { "tei-embedding-gaudi-server" \ '{"inputs":"What is Deep Learning?"}' - # embedding microservice - validate_services \ - "${ip_address}:6000/v1/embeddings" \ - '"text":"What is Deep Learning?","embedding":[' \ - "embedding" \ - "embedding-tei-server" \ - '{"text":"What is Deep Learning?"}' - sleep 1m # retrieval can't curl as expected, try to wait for more time # retrieval microservice @@ -147,14 +141,6 @@ function validate_microservices() { "tei-reranking-gaudi-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # rerank microservice - validate_services \ - "${ip_address}:8000/v1/reranking" \ - "Deep learning is..." 
\ - "rerank" \ - "reranking-tei-gaudi-server" \ - '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' - # tgi for llm service validate_services \ "${ip_address}:8008/generate" \ @@ -163,22 +149,6 @@ function validate_microservices() { "tgi-gaudi-server" \ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - # llm microservice - validate_services \ - "${ip_address}:9000/v1/chat/completions" \ - "data: " \ - "llm" \ - "llm-tgi-gaudi-server" \ - '{"query":"What is Deep Learning?"}' - - # tgi for guardrails service - validate_services \ - "${ip_address}:8088/generate" \ - "generated_text" \ - "tgi-guardrails" \ - "tgi-guardrails-server" \ - '{"inputs":"How do you buy a tiger in the US?","parameters":{"max_new_tokens":32}}' - # guardrails microservice validate_services \ "${ip_address}:9090/v1/guardrails" \ @@ -186,14 +156,13 @@ function validate_microservices() { "guardrails" \ "guardrails-tgi-gaudi-server" \ '{"text":"How do you buy a tiger in the US?"}' - } function validate_megaservice() { # Curl the Mega Service validate_services \ "${ip_address}:8888/v1/chatqna" \ - "billion" \ + "data: " \ "mega-chatqna" \ "chatqna-gaudi-guardrails-server" \ '{"messages": "What is the revenue of Nike in 2023?"}' diff --git a/ChatQnA/tests/test_compose_no_wrapper_on_gaudi.sh b/ChatQnA/tests/test_compose_no_wrapper_on_gaudi.sh deleted file mode 100644 index c760d9eb7..000000000 --- a/ChatQnA/tests/test_compose_no_wrapper_on_gaudi.sh +++ /dev/null @@ -1,251 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -e -IMAGE_REPO=${IMAGE_REPO:-"opea"} -IMAGE_TAG=${IMAGE_TAG:-"latest"} -echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" -echo "TAG=IMAGE_TAG=${IMAGE_TAG}" -export REGISTRY=${IMAGE_REPO} -export TAG=${IMAGE_TAG} - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') - -function build_docker_images() { - cd $WORKPATH/docker_image_build - git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - - echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna-no-wrapper chatqna-ui dataprep-redis retriever-redis" - docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 - docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/tei-gaudi:latest - - docker images && sleep 1s -} - -function start_services() { - cd $WORKPATH/docker_compose/intel/hpu/gaudi - export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" - export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" - export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" - export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export TGI_LLM_ENDPOINT="http://${ip_address}:8005" - export REDIS_URL="redis://${ip_address}:6379" - export REDIS_HOST=${ip_address} - export INDEX_NAME="rag-redis" - export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} - export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVER_HOST_IP=${ip_address} - export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export RERANK_SERVER_HOST_IP=${ip_address} - export LLM_SERVER_HOST_IP=${ip_address} - export EMBEDDING_SERVER_PORT=8090 - export RERANK_SERVER_PORT=8808 - export LLM_SERVER_PORT=8005 - export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/chatqna" - export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6007/v1/dataprep" - export DATAPREP_GET_FILE_ENDPOINT="http://${ip_address}:6008/v1/dataprep/get_file" - export DATAPREP_DELETE_FILE_ENDPOINT="http://${ip_address}:6009/v1/dataprep/delete_file" - - sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env - - # Start Docker Containers - docker compose -f compose_no_wrapper.yaml up -d > ${LOG_PATH}/start_services_with_compose.log - - n=0 - until [[ "$n" -ge 500 ]]; do - docker logs tgi-gaudi-server > ${LOG_PATH}/tgi_service_start.log - if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then - break - fi - sleep 1s - n=$((n+1)) - done -} - -function validate_service() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then - cd $LOG_PATH - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL") - else - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - fi - HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') - RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') - - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - - # check response status - if [ "$HTTP_STATUS" -ne "200" ]; then - echo "[ $SERVICE_NAME ] HTTP status is not 200. 
Received status was $HTTP_STATUS" - exit 1 - else - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - fi - # check response body - if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then - echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" - exit 1 - else - echo "[ $SERVICE_NAME ] Content is as expected." - fi - - sleep 1s -} - -function validate_microservices() { - # Check if the microservices are running correctly. - - # tei for embedding service - validate_service \ - "${ip_address}:8090/embed" \ - "[[" \ - "tei-embedding" \ - "tei-embedding-gaudi-server" \ - '{"inputs":"What is Deep Learning?"}' - - sleep 1m # retrieval can't curl as expected, try to wait for more time - - # test /v1/dataprep upload file - echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt - validate_service \ - "http://${ip_address}:6007/v1/dataprep" \ - "Data preparation succeeded" \ - "dataprep_upload_file" \ - "dataprep-redis-server" - - # test /v1/dataprep upload link - validate_service \ - "http://${ip_address}:6007/v1/dataprep" \ - "Data preparation succeeded" \ - "dataprep_upload_link" \ - "dataprep-redis-server" - - # test /v1/dataprep/get_file - validate_service \ - "http://${ip_address}:6007/v1/dataprep/get_file" \ - '{"name":' \ - "dataprep_get" \ - "dataprep-redis-server" - - # test /v1/dataprep/delete_file - validate_service \ - "http://${ip_address}:6007/v1/dataprep/delete_file" \ - '{"status":true}' \ - "dataprep_del" \ - "dataprep-redis-server" - - # retrieval microservice - test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - validate_service \ - "${ip_address}:7000/v1/retrieval" \ - "retrieved_docs" \ - "retrieval-microservice" \ - "retriever-redis-server" \ - "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" - - # tei for rerank microservice - validate_service \ - "${ip_address}:8808/rerank" \ - '{"index":1,"score":' \ - "tei-rerank" \ - "tei-reranking-gaudi-server" \ - '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - - # tgi for llm service - validate_service \ - "${ip_address}:8005/generate" \ - "generated_text" \ - "tgi-llm" \ - "tgi-gaudi-server" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - -} - -function validate_megaservice() { - # Curl the Mega Service - validate_service \ - "${ip_address}:8888/v1/chatqna" \ - "data: " \ - "chatqna-megaservice" \ - "chatqna-gaudi-backend-server" \ - '{"messages": "What is the revenue of Nike in 2023?"}' - -} - -function validate_frontend() { - cd $WORKPATH/ui/svelte - local conda_env_name="OPEA_e2e" - export PATH=${HOME}/miniforge3/bin/:$PATH - if conda info --envs | grep -q "$conda_env_name"; then - echo "$conda_env_name exist!" - else - conda create -n ${conda_env_name} python=3.12 -y - fi - source activate ${conda_env_name} - - sed -i "s/localhost/$ip_address/g" playwright.config.ts - - conda install -c conda-forge nodejs -y - npm install && npm ci && npx playwright install --with-deps - node -v && npm -v && pip list - - exit_status=0 - npx playwright test || exit_status=$? 
- - if [ $exit_status -ne 0 ]; then - echo "[TEST INFO]: ---------frontend test failed---------" - exit $exit_status - else - echo "[TEST INFO]: ---------frontend test passed---------" - fi -} - -function stop_docker() { - cd $WORKPATH/docker_compose/intel/hpu/gaudi - docker compose stop && docker compose rm -f -} - -function main() { - - stop_docker - if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi - start_time=$(date +%s) - start_services - end_time=$(date +%s) - duration=$((end_time-start_time)) - echo "Mega service start duration is $duration s" - - if [ "${mode}" == "perf" ]; then - python3 $WORKPATH/tests/chatqna_benchmark.py - elif [ "${mode}" == "" ]; then - validate_microservices - validate_megaservice - # validate_frontend - fi - - stop_docker - echo y | docker system prune - -} - -main diff --git a/ChatQnA/tests/test_compose_no_wrapper_on_xeon.sh b/ChatQnA/tests/test_compose_no_wrapper_on_xeon.sh deleted file mode 100644 index 70ad7dc08..000000000 --- a/ChatQnA/tests/test_compose_no_wrapper_on_xeon.sh +++ /dev/null @@ -1,256 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -e -IMAGE_REPO=${IMAGE_REPO:-"opea"} -IMAGE_TAG=${IMAGE_TAG:-"latest"} -echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" -echo "TAG=IMAGE_TAG=${IMAGE_TAG}" -export REGISTRY=${IMAGE_REPO} -export TAG=${IMAGE_TAG} - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') - -function build_docker_images() { - cd $WORKPATH/docker_image_build - git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - - echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna-no-wrapper chatqna-ui chatqna-conversation-ui dataprep-redis retriever-redis" - docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - - docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 - docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - - docker images && sleep 1s -} - -function start_services() { - cd $WORKPATH/docker_compose/intel/cpu/xeon - - export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" - export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" - export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:6006" - export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export TGI_LLM_ENDPOINT="http://${ip_address}:9009" - export REDIS_URL="redis://${ip_address}:6379" - export REDIS_HOST=${ip_address} - export INDEX_NAME="rag-redis" - export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} - export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVER_HOST_IP=${ip_address} - export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export RERANK_SERVER_HOST_IP=${ip_address} - export LLM_SERVER_HOST_IP=${ip_address} - export EMBEDDING_SERVER_PORT=6006 - export RERANK_SERVER_PORT=8808 - export LLM_SERVER_PORT=9009 - export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/chatqna" - export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6007/v1/dataprep" - export DATAPREP_GET_FILE_ENDPOINT="http://${ip_address}:6007/v1/dataprep/get_file" - export DATAPREP_DELETE_FILE_ENDPOINT="http://${ip_address}:6007/v1/dataprep/delete_file" - - sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env - - # Start Docker Containers - docker compose -f compose_no_wrapper.yaml up -d > 
${LOG_PATH}/start_services_with_compose.log - - n=0 - until [[ "$n" -ge 500 ]]; do - docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log - if grep -q Connected ${LOG_PATH}/tgi_service_start.log; then - break - fi - sleep 1s - n=$((n+1)) - done -} - -function validate_service() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - if [[ $SERVICE_NAME == *"dataprep_upload_file"* ]]; then - cd $LOG_PATH - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'files=@./dataprep_file.txt' -H 'Content-Type: multipart/form-data' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_upload_link"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -F 'link_list=["https://www.ces.tech/"]' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_get"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' "$URL") - elif [[ $SERVICE_NAME == *"dataprep_del"* ]]; then - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d '{"file_path": "all"}' -H 'Content-Type: application/json' "$URL") - else - HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - fi - HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') - RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') - - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - - # check response status - if [ "$HTTP_STATUS" -ne "200" ]; then - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - exit 1 - else - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - fi - # check response body - if [[ "$RESPONSE_BODY" != *"$EXPECTED_RESULT"* ]]; then - echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" - exit 1 - else - echo "[ $SERVICE_NAME ] Content is as expected." - fi - - sleep 1s -} - -function validate_microservices() { - # Check if the microservices are running correctly. - - # tei for embedding service - validate_service \ - "${ip_address}:6006/embed" \ - "[[" \ - "tei-embedding" \ - "tei-embedding-server" \ - '{"inputs":"What is Deep Learning?"}' - - sleep 1m # retrieval can't curl as expected, try to wait for more time - - # test /v1/dataprep upload file - echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." 
> $LOG_PATH/dataprep_file.txt - validate_service \ - "http://${ip_address}:6007/v1/dataprep" \ - "Data preparation succeeded" \ - "dataprep_upload_file" \ - "dataprep-redis-server" - - # test /v1/dataprep upload link - validate_service \ - "http://${ip_address}:6007/v1/dataprep" \ - "Data preparation succeeded" \ - "dataprep_upload_link" \ - "dataprep-redis-server" - - # test /v1/dataprep/get_file - validate_service \ - "http://${ip_address}:6007/v1/dataprep/get_file" \ - '{"name":' \ - "dataprep_get" \ - "dataprep-redis-server" - - # test /v1/dataprep/delete_file - validate_service \ - "http://${ip_address}:6007/v1/dataprep/delete_file" \ - '{"status":true}' \ - "dataprep_del" \ - "dataprep-redis-server" - - # retrieval microservice - test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - validate_service \ - "${ip_address}:7000/v1/retrieval" \ - "retrieved_docs" \ - "retrieval-microservice" \ - "retriever-redis-server" \ - "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" - - # tei for rerank microservice - validate_service \ - "${ip_address}:8808/rerank" \ - '{"index":1,"score":' \ - "tei-rerank" \ - "tei-reranking-server" \ - '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - - # tgi for llm service - validate_service \ - "${ip_address}:9009/generate" \ - "generated_text" \ - "tgi-llm" \ - "tgi-service" \ - '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - -} - -function validate_megaservice() { - # Curl the Mega Service - validate_service \ - "${ip_address}:8888/v1/chatqna" \ - "data: " \ - "chatqna-megaservice" \ - "chatqna-xeon-backend-server" \ - '{"messages": "What is the revenue of Nike in 2023?"}' - -} - -function validate_frontend() { - echo "[ TEST INFO ]: --------- frontend test started ---------" - cd $WORKPATH/ui/svelte - local conda_env_name="OPEA_e2e" - export PATH=${HOME}/miniforge3/bin/:$PATH - if conda info --envs | grep -q "$conda_env_name"; then - echo "$conda_env_name exist!" - else - conda create -n ${conda_env_name} python=3.12 -y - fi - source activate ${conda_env_name} - echo "[ TEST INFO ]: --------- conda env activated ---------" - - sed -i "s/localhost/$ip_address/g" playwright.config.ts - - conda install -c conda-forge nodejs -y - npm install && npm ci && npx playwright install --with-deps - node -v && npm -v && pip list - - exit_status=0 - npx playwright test || exit_status=$? 
- - if [ $exit_status -ne 0 ]; then - echo "[TEST INFO]: ---------frontend test failed---------" - exit $exit_status - else - echo "[TEST INFO]: ---------frontend test passed---------" - fi -} - -function stop_docker() { - cd $WORKPATH/docker_compose/intel/cpu/xeon - docker compose stop && docker compose rm -f -} - -function main() { - - stop_docker - if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi - start_time=$(date +%s) - start_services - end_time=$(date +%s) - duration=$((end_time-start_time)) - echo "Mega service start duration is $duration s" && sleep 1s - - if [ "${mode}" == "perf" ]; then - python3 $WORKPATH/tests/chatqna_benchmark.py - elif [ "${mode}" == "" ]; then - validate_microservices - echo "==== microservices validated ====" - validate_megaservice - echo "==== megaservice validated ====" - # validate_frontend - # echo "==== frontend validated ====" - fi - - stop_docker - echo y | docker system prune - -} - -main diff --git a/ChatQnA/tests/test_compose_on_gaudi.sh b/ChatQnA/tests/test_compose_on_gaudi.sh index 37b822285..2820ce321 100644 --- a/ChatQnA/tests/test_compose_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_on_gaudi.sh @@ -19,7 +19,7 @@ function build_docker_images() { git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi nginx" + service_list="chatqna chatqna-ui dataprep-redis retriever-redis" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 @@ -33,7 +33,7 @@ function start_services() { cd $WORKPATH/docker_compose/intel/hpu/gaudi export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" export TGI_LLM_ENDPOINT="http://${ip_address}:8005" @@ -42,27 +42,22 @@ function start_services() { export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVER_HOST_IP=${ip_address} export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export RERANK_SERVICE_HOST_IP=${ip_address} - export LLM_SERVICE_HOST_IP=${ip_address} + export RERANK_SERVER_HOST_IP=${ip_address} + export LLM_SERVER_HOST_IP=${ip_address} + export EMBEDDING_SERVER_PORT=8090 + export RERANK_SERVER_PORT=8808 + export LLM_SERVER_PORT=8005 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6007/v1/dataprep" export DATAPREP_GET_FILE_ENDPOINT="http://${ip_address}:6008/v1/dataprep/get_file" export DATAPREP_DELETE_FILE_ENDPOINT="http://${ip_address}:6009/v1/dataprep/delete_file" - export llm_service_devices=all - export tei_embedding_devices=all - export FRONTEND_SERVICE_IP=${host_ip} - export FRONTEND_SERVICE_PORT=5173 - export BACKEND_SERVICE_NAME=chatqna - export BACKEND_SERVICE_IP=${host_ip} - export BACKEND_SERVICE_PORT=8888 - export NGINX_PORT=80 sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env # Start Docker Containers - docker compose up -d > 
${LOG_PATH}/start_services_with_compose.log + docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log n=0 until [[ "$n" -ge 500 ]]; do @@ -128,14 +123,6 @@ function validate_microservices() { "tei-embedding-gaudi-server" \ '{"inputs":"What is Deep Learning?"}' - # embedding microservice - validate_service \ - "${ip_address}:6000/v1/embeddings" \ - '"text":"What is Deep Learning?","embedding":[' \ - "embedding-microservice" \ - "embedding-tei-server" \ - '{"text":"What is Deep Learning?"}' - sleep 1m # retrieval can't curl as expected, try to wait for more time # test /v1/dataprep upload file @@ -184,14 +171,6 @@ function validate_microservices() { "tei-reranking-gaudi-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # rerank microservice - validate_service \ - "${ip_address}:8000/v1/reranking" \ - "Deep learning is..." \ - "rerank-microservice" \ - "reranking-tei-gaudi-server" \ - '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' - # tgi for llm service validate_service \ "${ip_address}:8005/generate" \ @@ -200,14 +179,6 @@ function validate_microservices() { "tgi-gaudi-server" \ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - # llm microservice - validate_service \ - "${ip_address}:9000/v1/chat/completions" \ - "data: " \ - "llm-microservice" \ - "llm-tgi-gaudi-server" \ - '{"query":"What is Deep Learning?"}' - } function validate_megaservice() { @@ -264,9 +235,13 @@ function main() { duration=$((end_time-start_time)) echo "Mega service start duration is $duration s" - validate_microservices - validate_megaservice - validate_frontend + if [ "${mode}" == "perf" ]; then + python3 $WORKPATH/tests/chatqna_benchmark.py + elif [ "${mode}" == "" ]; then + validate_microservices + validate_megaservice + # validate_frontend + fi stop_docker echo y | docker system prune diff --git a/ChatQnA/tests/test_compose_on_xeon.sh b/ChatQnA/tests/test_compose_on_xeon.sh index e110984d6..6008b0f73 100644 --- a/ChatQnA/tests/test_compose_on_xeon.sh +++ b/ChatQnA/tests/test_compose_on_xeon.sh @@ -19,7 +19,7 @@ function build_docker_images() { git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna chatqna-ui chatqna-conversation-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-tgi nginx" + service_list="chatqna chatqna-ui chatqna-conversation-ui dataprep-redis retriever-redis" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 @@ -29,38 +29,34 @@ function build_docker_images() { } function start_services() { - cd $WORKPATH/docker_compose/intel/cpu/xeon/ + cd $WORKPATH/docker_compose/intel/cpu/xeon export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" + export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:6006" - export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export TGI_LLM_ENDPOINT="http://${ip_address}:9009" export REDIS_URL="redis://${ip_address}:6379" export REDIS_HOST=${ip_address} export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVER_HOST_IP=${ip_address} export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export RERANK_SERVICE_HOST_IP=${ip_address} - export LLM_SERVICE_HOST_IP=${ip_address} + export RERANK_SERVER_HOST_IP=${ip_address} + export LLM_SERVER_HOST_IP=${ip_address} + export EMBEDDING_SERVER_PORT=6006 + export RERANK_SERVER_PORT=8808 + export LLM_SERVER_PORT=9009 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6007/v1/dataprep" export DATAPREP_GET_FILE_ENDPOINT="http://${ip_address}:6007/v1/dataprep/get_file" export DATAPREP_DELETE_FILE_ENDPOINT="http://${ip_address}:6007/v1/dataprep/delete_file" - export FRONTEND_SERVICE_IP=${host_ip} - export FRONTEND_SERVICE_PORT=5173 - export BACKEND_SERVICE_NAME=chatqna - export BACKEND_SERVICE_IP=${host_ip} - export BACKEND_SERVICE_PORT=8888 - export NGINX_PORT=80 sed -i "s/backend_address/$ip_address/g" $WORKPATH/ui/svelte/.env # Start Docker Containers - docker compose up -d > ${LOG_PATH}/start_services_with_compose.log + docker compose -f compose.yaml up -d > ${LOG_PATH}/start_services_with_compose.log + n=0 until [[ "$n" -ge 500 ]]; do docker logs tgi-service > ${LOG_PATH}/tgi_service_start.log @@ -125,14 +121,6 @@ function validate_microservices() { "tei-embedding-server" \ '{"inputs":"What is Deep Learning?"}' - # embedding microservice - validate_service \ - "${ip_address}:6000/v1/embeddings" \ - '"text":"What is Deep Learning?","embedding":[' \ - "embedding-microservice" \ - "embedding-tei-server" \ - '{"text":"What is Deep Learning?"}' - sleep 1m # retrieval can't curl as expected, try to wait for more time # test /v1/dataprep upload file @@ -181,14 +169,6 @@ function validate_microservices() { "tei-reranking-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # rerank microservice - validate_service \ - "${ip_address}:8000/v1/reranking" \ - "Deep learning is..." 
\ - "rerank-microservice" \ - "reranking-tei-xeon-server" \ - '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' - # tgi for llm service validate_service \ "${ip_address}:9009/generate" \ @@ -197,14 +177,6 @@ function validate_microservices() { "tgi-service" \ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - # llm microservice - validate_service \ - "${ip_address}:9000/v1/chat/completions" \ - "data: " \ - "llm-microservice" \ - "llm-tgi-server" \ - '{"query":"What is Deep Learning?"}' - } function validate_megaservice() { @@ -249,7 +221,7 @@ function validate_frontend() { } function stop_docker() { - cd $WORKPATH/docker_compose/intel/cpu/xeon/ + cd $WORKPATH/docker_compose/intel/cpu/xeon docker compose stop && docker compose rm -f } @@ -270,8 +242,8 @@ function main() { echo "==== microservices validated ====" validate_megaservice echo "==== megaservice validated ====" - validate_frontend - echo "==== frontend validated ====" + # validate_frontend + # echo "==== frontend validated ====" fi stop_docker diff --git a/ChatQnA/tests/test_compose_qdrant_on_xeon.sh b/ChatQnA/tests/test_compose_qdrant_on_xeon.sh index ddb2ebba0..bcc4cdc5f 100644 --- a/ChatQnA/tests/test_compose_qdrant_on_xeon.sh +++ b/ChatQnA/tests/test_compose_qdrant_on_xeon.sh @@ -19,7 +19,7 @@ function build_docker_images() { git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." - service_list="chatqna chatqna-ui dataprep-qdrant embedding-tei retriever-qdrant reranking-tei llm-tgi" + service_list="chatqna chatqna-ui dataprep-qdrant retriever-qdrant" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker images && sleep 1s @@ -32,21 +32,20 @@ function start_services() { export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:6040" - export TEI_RERANKING_ENDPOINT="http://${ip_address}:6041" - export TGI_LLM_ENDPOINT="http://${ip_address}:6042" export QDRANT_HOST=${ip_address} export QDRANT_PORT=6333 export INDEX_NAME="rag-qdrant" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVER_HOST_IP=${ip_address} export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export RERANK_SERVICE_HOST_IP=${ip_address} - export LLM_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVICE_PORT=6044 + export RERANK_SERVER_HOST_IP=${ip_address} + export LLM_SERVER_HOST_IP=${ip_address} + export MEGA_SERVICE_PORT=8912 + export EMBEDDING_SERVER_PORT=6040 export RETRIEVER_SERVICE_PORT=6045 - export RERANK_SERVICE_PORT=6046 - export LLM_SERVICE_PORT=6047 + export RERANK_SERVER_PORT=6041 + export LLM_SERVER_PORT=6042 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8912/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6043/v1/dataprep" @@ -114,14 +113,6 @@ function validate_microservices() { "tei-embedding-server" \ '{"inputs":"What is Deep Learning?"}' - # embedding microservice - validate_services \ - "${ip_address}:6044/v1/embeddings" \ - '"text":"What is Deep Learning?","embedding":[' \ - "embedding" \ - "embedding-tei-server" \ - '{"text":"What is Deep Learning?"}' - # test /v1/dataprep 
upload file echo "Deep learning is a subset of machine learning that utilizes neural networks with multiple layers to analyze various levels of abstract data representations. It enables computers to identify patterns and make decisions with minimal human intervention by learning from large amounts of data." > $LOG_PATH/dataprep_file.txt validate_services \ @@ -154,14 +145,6 @@ function validate_microservices() { "tei-reranking-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # rerank microservice - validate_services \ - "${ip_address}:6046/v1/reranking" \ - "Deep learning is..." \ - "rerank" \ - "reranking-tei-xeon-server" \ - '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' - # tgi for llm service validate_services \ "${ip_address}:6042/generate" \ @@ -169,15 +152,6 @@ function validate_microservices() { "tgi-llm" \ "tgi-service" \ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - - # llm microservice - validate_services \ - "${ip_address}:6047/v1/chat/completions" \ - "data: " \ - "llm" \ - "llm-tgi-server" \ - '{"query":"Deep Learning"}' - } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh index ab45e195c..5fa2257c3 100644 --- a/ChatQnA/tests/test_compose_vllm_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_vllm_on_gaudi.sh @@ -19,7 +19,7 @@ function build_docker_images() { git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-vllm-hpu llm-vllm" + service_list="chatqna chatqna-ui dataprep-redis retriever-redis llm-vllm-hpu" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 @@ -33,17 +33,17 @@ function start_services() { export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" - export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export vLLM_LLM_ENDPOINT="http://${ip_address}:8007" - export LLM_SERVICE_PORT=9000 export REDIS_URL="redis://${ip_address}:6379" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVER_HOST_IP=${ip_address} export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export RERANK_SERVICE_HOST_IP=${ip_address} - export LLM_SERVICE_HOST_IP=${ip_address} + export RERANK_SERVER_HOST_IP=${ip_address} + export LLM_SERVER_HOST_IP=${ip_address} + export EMBEDDING_SERVER_PORT=8090 + export RERANK_SERVER_PORT=8808 + export LLM_SERVER_PORT=8007 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6007/v1/dataprep" @@ -102,14 +102,6 @@ function validate_microservices() { "tei-embedding-gaudi-server" \ '{"inputs":"What is Deep Learning?"}' - # embedding microservice - validate_services \ - "${ip_address}:6000/v1/embeddings" \ - '"text":"What is Deep Learning?","embedding":\[' \ - "embedding" \ - "embedding-tei-server" \ - '{"text":"What is Deep Learning?"}' - sleep 1m # retrieval can't curl as expected, try to wait for more time # retrieval microservice @@ -129,14 +121,6 @@ function validate_microservices() { "tei-reranking-gaudi-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # rerank microservice - validate_services \ - "${ip_address}:8000/v1/reranking" \ - "Deep learning is..." \ - "rerank" \ - "reranking-tei-gaudi-server" \ - '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' - # vllm for llm service validate_services \ "${ip_address}:8007/v1/completions" \ @@ -144,15 +128,6 @@ function validate_microservices() { "vllm-llm" \ "vllm-gaudi-server" \ '{"model": "Intel/neural-chat-7b-v3-3","prompt": "What is Deep Learning?","max_tokens": 32,"temperature": 0}' - - # llm microservice - validate_services \ - "${ip_address}:9000/v1/chat/completions" \ - "data: " \ - "llm" \ - "llm-vllm-gaudi-server" \ - '{"query":"What is Deep Learning?"}' - } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_vllm_on_xeon.sh b/ChatQnA/tests/test_compose_vllm_on_xeon.sh index e5274a199..2f6b05523 100644 --- a/ChatQnA/tests/test_compose_vllm_on_xeon.sh +++ b/ChatQnA/tests/test_compose_vllm_on_xeon.sh @@ -20,7 +20,7 @@ function build_docker_images() { git clone https://github.com/vllm-project/vllm.git echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-vllm vllm" + service_list="chatqna chatqna-ui dataprep-redis retriever-redis vllm" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 @@ -36,17 +36,17 @@ function start_services() { export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:6006" - export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export vLLM_LLM_ENDPOINT="http://${ip_address}:9009" - export LLM_SERVICE_PORT=9000 export REDIS_URL="redis://${ip_address}:6379" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVER_HOST_IP=${ip_address} export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export RERANK_SERVICE_HOST_IP=${ip_address} - export LLM_SERVICE_HOST_IP=${ip_address} + export RERANK_SERVER_HOST_IP=${ip_address} + export LLM_SERVER_HOST_IP=${ip_address} + export EMBEDDING_SERVER_PORT=6006 + export RERANK_SERVER_PORT=8808 + export LLM_SERVER_PORT=9009 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6007/v1/dataprep" @@ -104,14 +104,6 @@ function validate_microservices() { "tei-embedding-server" \ '{"inputs":"What is Deep Learning?"}' - # embedding microservice - validate_services \ - "${ip_address}:6000/v1/embeddings" \ - '"text":"What is Deep Learning?","embedding":\[' \ - "embedding" \ - "embedding-tei-server" \ - '{"text":"What is Deep Learning?"}' - sleep 1m # retrieval can't curl as expected, try to wait for more time # retrieval microservice @@ -131,14 +123,6 @@ function validate_microservices() { "tei-reranking-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # rerank microservice - validate_services \ - "${ip_address}:8000/v1/reranking" \ - "Deep learning is..." \ - "rerank" \ - "reranking-tei-xeon-server" \ - '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' - # vllm for llm service validate_services \ "${ip_address}:9009/v1/completions" \ @@ -146,15 +130,6 @@ function validate_microservices() { "vllm-llm" \ "vllm-service" \ '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32, "temperature": 0}' - - # llm microservice - validate_services \ - "${ip_address}:9000/v1/chat/completions" \ - "data: " \ - "llm" \ - "llm-vllm-server" \ - '{"query":"What is Deep Learning?"}' - } function validate_megaservice() { @@ -217,7 +192,7 @@ function main() { elif [ "${mode}" == "" ]; then validate_microservices validate_megaservice - validate_frontend + #validate_frontend fi stop_docker diff --git a/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh b/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh index 4e2b12e61..88eaa39b4 100644 --- a/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh @@ -19,7 +19,7 @@ function build_docker_images() { git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna chatqna-ui dataprep-redis embedding-tei retriever-redis reranking-tei llm-vllm-ray-hpu llm-vllm-ray" + service_list="chatqna chatqna-ui dataprep-redis retriever-redis llm-vllm-ray-hpu" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 @@ -34,17 +34,17 @@ function start_services() { export RERANK_MODEL_ID="BAAI/bge-reranker-base" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" - export TEI_RERANKING_ENDPOINT="http://${ip_address}:8808" - export vLLM_RAY_LLM_ENDPOINT="http://${ip_address}:8006" - export LLM_SERVICE_PORT=9000 export REDIS_URL="redis://${ip_address}:6379" export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVER_HOST_IP=${ip_address} export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export RERANK_SERVICE_HOST_IP=${ip_address} - export LLM_SERVICE_HOST_IP=${ip_address} + export RERANK_SERVER_HOST_IP=${ip_address} + export LLM_SERVER_HOST_IP=${ip_address} + export EMBEDDING_SERVER_PORT=8090 + export RERANK_SERVER_PORT=8808 + export LLM_SERVER_PORT=8006 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6007/v1/dataprep" @@ -103,14 +103,6 @@ function validate_microservices() { "tei-embedding-gaudi-server" \ '{"inputs":"What is Deep Learning?"}' - # embedding microservice - validate_services \ - "${ip_address}:6000/v1/embeddings" \ - '"text":"What is Deep Learning?","embedding":\[' \ - "embedding" \ - "embedding-tei-server" \ - '{"text":"What is Deep Learning?"}' - sleep 1m # retrieval can't curl as expected, try to wait for more time # retrieval microservice @@ -130,14 +122,6 @@ function validate_microservices() { "tei-reranking-gaudi-server" \ '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - # rerank microservice - validate_services \ - "${ip_address}:8000/v1/reranking" \ - "Deep learning is..." \ - "rerank" \ - "reranking-tei-gaudi-server" \ - '{"initial_query":"What is Deep Learning?", "retrieved_docs": [{"text":"Deep Learning is not..."}, {"text":"Deep learning is..."}]}' - # vllm-on-ray for llm service validate_services \ "${ip_address}:8006/v1/chat/completions" \ @@ -145,15 +129,6 @@ function validate_microservices() { "vllm-ray-llm" \ "vllm-ray-gaudi-server" \ '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' - - # llm microservice - validate_services \ - "${ip_address}:9000/v1/chat/completions" \ - "data: " \ - "llm" \ - "llm-vllm-ray-gaudi-server" \ - '{"query":"What is Deep Learning?"}' - } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh index 3de5c435d..d699f79eb 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_gaudi.sh @@ -19,7 +19,7 @@ function build_docker_images() { git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna-without-rerank chatqna-ui dataprep-redis embedding-tei retriever-redis llm-tgi" + service_list="chatqna-without-rerank chatqna-ui dataprep-redis retriever-redis" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 @@ -34,15 +34,16 @@ function start_services() { export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:8090" - export TGI_LLM_ENDPOINT="http://${ip_address}:8005" export REDIS_URL="redis://${ip_address}:6379" export REDIS_HOST=${ip_address} export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVER_HOST_IP=${ip_address} export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export LLM_SERVICE_HOST_IP=${ip_address} + export LLM_SERVER_HOST_IP=${ip_address} + export EMBEDDING_SERVER_PORT=8090 + export LLM_SERVER_PORT=8005 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6007/v1/dataprep" export DATAPREP_GET_FILE_ENDPOINT="http://${ip_address}:6008/v1/dataprep/get_file" @@ -117,14 +118,6 @@ function validate_microservices() { "tei-embedding-gaudi-server" \ '{"inputs":"What is Deep Learning?"}' - # embedding microservice - validate_service \ - "${ip_address}:6000/v1/embeddings" \ - '"text":"What is Deep Learning?","embedding":[' \ - "embedding-microservice" \ - "embedding-tei-server" \ - '{"text":"What is Deep Learning?"}' - sleep 1m # retrieval can't curl as expected, try to wait for more time # test /v1/dataprep upload file @@ -172,15 +165,6 @@ function validate_microservices() { "tgi-llm" \ "tgi-gaudi-server" \ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - - # llm microservice - validate_service \ - "${ip_address}:9000/v1/chat/completions" \ - "data: " \ - "llm-microservice" \ - "llm-tgi-gaudi-server" \ - '{"query":"What is Deep Learning?"}' - } function validate_megaservice() { diff --git a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh index 3ab079765..bac771f5a 100644 --- a/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh +++ b/ChatQnA/tests/test_compose_without_rerank_on_xeon.sh @@ -19,7 +19,7 @@ function build_docker_images() { git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna-without-rerank chatqna-ui chatqna-conversation-ui dataprep-redis embedding-tei retriever-redis llm-tgi" + service_list="chatqna-without-rerank chatqna-ui chatqna-conversation-ui dataprep-redis retriever-redis" docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5 @@ -34,15 +34,16 @@ function start_services() { export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:6006" - export TGI_LLM_ENDPOINT="http://${ip_address}:9009" export REDIS_URL="redis://${ip_address}:6379" export REDIS_HOST=${ip_address} export INDEX_NAME="rag-redis" export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} export MEGA_SERVICE_HOST_IP=${ip_address} - export EMBEDDING_SERVICE_HOST_IP=${ip_address} + export EMBEDDING_SERVER_HOST_IP=${ip_address} export RETRIEVER_SERVICE_HOST_IP=${ip_address} - export LLM_SERVICE_HOST_IP=${ip_address} + export LLM_SERVER_HOST_IP=${ip_address} + export EMBEDDING_SERVER_PORT=6006 + export LLM_SERVER_PORT=9009 export BACKEND_SERVICE_ENDPOINT="http://${ip_address}:8888/v1/chatqna" export DATAPREP_SERVICE_ENDPOINT="http://${ip_address}:6007/v1/dataprep" export DATAPREP_GET_FILE_ENDPOINT="http://${ip_address}:6007/v1/dataprep/get_file" @@ -116,14 +117,6 @@ function validate_microservices() { "tei-embedding-server" \ '{"inputs":"What is Deep Learning?"}' - # embedding microservice - validate_service \ - "${ip_address}:6000/v1/embeddings" \ - '"text":"What is Deep Learning?","embedding":[' \ - "embedding-microservice" \ - "embedding-tei-server" \ - '{"text":"What is Deep Learning?"}' - sleep 1m # retrieval can't curl as expected, try to wait for more time # test /v1/dataprep upload file @@ -171,15 +164,6 @@ function validate_microservices() { "tgi-llm" \ "tgi-service" \ '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":17, "do_sample": true}}' - - # llm microservice - validate_service \ - "${ip_address}:9000/v1/chat/completions" \ - "data: " \ - "llm-microservice" \ - "llm-tgi-server" \ - '{"query":"What is Deep Learning?"}' - } function validate_megaservice() {