diff --git a/AudioQnA/docker/xeon/compose.yaml b/AudioQnA/docker/xeon/compose.yaml
index 7c111176d..f640ecae3 100644
--- a/AudioQnA/docker/xeon/compose.yaml
+++ b/AudioQnA/docker/xeon/compose.yaml
@@ -41,7 +41,7 @@ services:
     environment:
       TTS_ENDPOINT: ${TTS_ENDPOINT}
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:1.4
+    image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     container_name: tgi-service
     ports:
       - "3006:80"
@@ -53,7 +53,7 @@ services:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    command: --model-id ${LLM_MODEL_ID}
+    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
   llm:
     image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
     container_name: llm-tgi-server
diff --git a/ChatQnA/docker/gpu/compose.yaml b/ChatQnA/docker/gpu/compose.yaml
index 31bcfac9a..5e50214d1 100644
--- a/ChatQnA/docker/gpu/compose.yaml
+++ b/ChatQnA/docker/gpu/compose.yaml
@@ -116,7 +116,7 @@ services:
       HF_HUB_ENABLE_HF_TRANSFER: 0
     restart: unless-stopped
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:2.0
+    image: ghcr.io/huggingface/text-generation-inference:2.2.0
     container_name: tgi-server
     ports:
       - "8008:80"
diff --git a/ChatQnA/docker/xeon/compose.yaml b/ChatQnA/docker/xeon/compose.yaml
index 3bad42f57..3828aa57f 100644
--- a/ChatQnA/docker/xeon/compose.yaml
+++ b/ChatQnA/docker/xeon/compose.yaml
@@ -102,7 +102,7 @@ services:
       HF_HUB_ENABLE_HF_TRANSFER: 0
     restart: unless-stopped
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:2.1.0
+    image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     container_name: tgi-service
     ports:
       - "9009:80"
@@ -116,7 +116,7 @@ services:
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       HF_HUB_DISABLE_PROGRESS_BARS: 1
       HF_HUB_ENABLE_HF_TRANSFER: 0
-    command: --model-id ${LLM_MODEL_ID}
+    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
   llm:
     image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
     container_name: llm-tgi-server
diff --git a/ChatQnA/docker/xeon/compose_qdrant.yaml b/ChatQnA/docker/xeon/compose_qdrant.yaml
index 9c4bbf023..a149d9426 100644
--- a/ChatQnA/docker/xeon/compose_qdrant.yaml
+++ b/ChatQnA/docker/xeon/compose_qdrant.yaml
@@ -102,7 +102,7 @@ services:
       HF_HUB_ENABLE_HF_TRANSFER: 0
     restart: unless-stopped
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:2.1.0
+    image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     container_name: tgi-service
     ports:
       - "6042:80"
@@ -116,7 +116,7 @@ services:
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
       HF_HUB_DISABLE_PROGRESS_BARS: 1
       HF_HUB_ENABLE_HF_TRANSFER: 0
-    command: --model-id ${LLM_MODEL_ID}
+    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
   llm:
     image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
     container_name: llm-tgi-server
diff --git a/ChatQnA/kubernetes/README.md b/ChatQnA/kubernetes/README.md
index bcbca25b6..55be03943 100644
--- a/ChatQnA/kubernetes/README.md
+++ b/ChatQnA/kubernetes/README.md
@@ -20,7 +20,7 @@ The ChatQnA uses the below prebuilt images if you choose a Xeon deployment
 - retriever: opea/retriever-redis:latest
 - tei_xeon_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
 - reranking: opea/reranking-tei:latest
-- tgi-service: ghcr.io/huggingface/text-generation-inference:1.4
+- tgi-service: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 - llm: opea/llm-tgi:latest
 - chaqna-xeon-backend-server: opea/chatqna:latest
 
diff --git a/ChatQnA/kubernetes/manifests/xeon/chatqna.yaml b/ChatQnA/kubernetes/manifests/xeon/chatqna.yaml
index b8b1ebaae..aba91c8ae 100644
--- a/ChatQnA/kubernetes/manifests/xeon/chatqna.yaml
+++ b/ChatQnA/kubernetes/manifests/xeon/chatqna.yaml
@@ -190,6 +190,7 @@ metadata:
 data:
   MODEL_ID: "Intel/neural-chat-7b-v3-3"
   PORT: "2080"
+  CUDA_GRAPHS: "0"
   HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
   HF_TOKEN: "insert-your-huggingface-token-here"
   MAX_INPUT_TOKENS: "1024"
@@ -993,7 +994,7 @@ spec:
               name: chatqna-tgi-config
           securityContext:
             {}
-          image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
+          image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
           imagePullPolicy: IfNotPresent
           volumeMounts:
             - mountPath: /data
diff --git a/CodeGen/docker/xeon/compose.yaml b/CodeGen/docker/xeon/compose.yaml
index c33a12ef4..ba7bcdabf 100644
--- a/CodeGen/docker/xeon/compose.yaml
+++ b/CodeGen/docker/xeon/compose.yaml
@@ -3,7 +3,7 @@
 
 services:
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:1.4
+    image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     container_name: tgi-service
     ports:
       - "8028:80"
@@ -15,7 +15,7 @@ services:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    command: --model-id ${LLM_MODEL_ID}
+    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
   llm:
     image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
     container_name: llm-tgi-server
diff --git a/CodeGen/kubernetes/manifests/xeon/codegen.yaml b/CodeGen/kubernetes/manifests/xeon/codegen.yaml
index 06b8a7544..7ee9e5448 100644
--- a/CodeGen/kubernetes/manifests/xeon/codegen.yaml
+++ b/CodeGen/kubernetes/manifests/xeon/codegen.yaml
@@ -41,6 +41,7 @@ metadata:
 data:
   MODEL_ID: "meta-llama/CodeLlama-7b-hf"
   PORT: "2080"
+  CUDA_GRAPHS: "0"
   HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
   HF_TOKEN: "insert-your-huggingface-token-here"
   MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
               name: codegen-tgi-config
           securityContext:
             {}
-          image: "ghcr.io/huggingface/text-generation-inference:1.4"
+          image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
           imagePullPolicy: IfNotPresent
           volumeMounts:
             - mountPath: /data
diff --git a/CodeGen/kubernetes/manifests/xeon/ui/react-codegen.yaml b/CodeGen/kubernetes/manifests/xeon/ui/react-codegen.yaml
index 874893bb4..ac21f6835 100644
--- a/CodeGen/kubernetes/manifests/xeon/ui/react-codegen.yaml
+++ b/CodeGen/kubernetes/manifests/xeon/ui/react-codegen.yaml
@@ -117,6 +117,8 @@ spec:
             value: ise-uiuc/Magicoder-S-DS-6.7B
           - name: PORT
             value: "80"
+          - name: CUDA_GRAPHS
+            value: "0"
           - name: http_proxy
             value:
           - name: https_proxy
@@ -124,7 +126,7 @@ spec:
           - name: no_proxy
             value:
         securityContext: {}
-        image: "ghcr.io/huggingface/text-generation-inference:1.4"
+        image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
         imagePullPolicy: IfNotPresent
         volumeMounts:
           - mountPath: /data
diff --git a/CodeGen/tests/test_codegen_on_xeon.sh b/CodeGen/tests/test_codegen_on_xeon.sh
index 0a781452a..6e759dc4e 100644
--- a/CodeGen/tests/test_codegen_on_xeon.sh
+++ b/CodeGen/tests/test_codegen_on_xeon.sh
@@ -22,7 +22,7 @@ function build_docker_images() {
     service_list="codegen codegen-ui llm-tgi"
     docker compose -f docker_build_compose.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
 
-    docker pull ghcr.io/huggingface/text-generation-inference:1.4
+    docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     docker images
 }
diff --git a/CodeTrans/docker/xeon/compose.yaml b/CodeTrans/docker/xeon/compose.yaml
index f4a5ad68a..c4666621c 100644
--- a/CodeTrans/docker/xeon/compose.yaml
+++ b/CodeTrans/docker/xeon/compose.yaml
@@ -3,7 +3,7 @@
 
 services:
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:1.4
+    image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     container_name: codetrans-tgi-service
     ports:
       - "8008:80"
@@ -15,7 +15,7 @@ services:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    command: --model-id ${LLM_MODEL_ID}
+    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
   llm:
     image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
     container_name: llm-tgi-server
diff --git a/CodeTrans/kubernetes/manifests/xeon/codetrans.yaml b/CodeTrans/kubernetes/manifests/xeon/codetrans.yaml
index bbe1621f0..26359f705 100644
--- a/CodeTrans/kubernetes/manifests/xeon/codetrans.yaml
+++ b/CodeTrans/kubernetes/manifests/xeon/codetrans.yaml
@@ -41,6 +41,7 @@ metadata:
 data:
   MODEL_ID: "HuggingFaceH4/mistral-7b-grok"
   PORT: "2080"
+  CUDA_GRAPHS: "0"
   HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
   HF_TOKEN: "insert-your-huggingface-token-here"
   MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
               name: codetrans-tgi-config
           securityContext:
             {}
-          image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
+          image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
          imagePullPolicy: IfNotPresent
          volumeMounts:
            - mountPath: /data
diff --git a/DocSum/docker/xeon/compose.yaml b/DocSum/docker/xeon/compose.yaml
index 33c92ce52..ffb2ba7f5 100644
--- a/DocSum/docker/xeon/compose.yaml
+++ b/DocSum/docker/xeon/compose.yaml
@@ -3,7 +3,7 @@
 
 services:
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:1.4
+    image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     container_name: tgi-service
     ports:
       - "8008:80"
@@ -16,7 +16,7 @@ services:
     volumes:
       - "./data:/data"
     shm_size: 1g
-    command: --model-id ${LLM_MODEL_ID}
+    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
   llm:
     image: ${REGISTRY:-opea}/llm-docsum-tgi:${TAG:-latest}
     container_name: llm-docsum-server
diff --git a/DocSum/kubernetes/README.md b/DocSum/kubernetes/README.md
index 0172b940a..6627f8cb7 100644
--- a/DocSum/kubernetes/README.md
+++ b/DocSum/kubernetes/README.md
@@ -8,7 +8,7 @@ Install GMC in your Kubernetes cluster, if you have not already done so, by foll
 The DocSum application is defined as a Custom Resource (CR) file that the above GMC operator acts upon. It first checks if the microservices listed in the CR yaml file are running, if not it starts them and then proceeds to connect them. When the DocSum RAG pipeline is ready, the service endpoint details are returned, letting you use the application. Should you use "kubectl get pods" commands you will see all the component microservices, in particular embedding, retriever, rerank, and llm.
 
 The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image llm-docsum-tgi:latest which internally leverages the
-the image ghcr.io/huggingface/text-generation-inference:1.4. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
+the image ghcr.io/huggingface/text-generation-inference:latest-intel-cpu. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
 service tgi-gaudi-svc, which uses the image ghcr.io/huggingface/tgi-gaudi:1.2.1. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use Intel/neural-chat-7b-v3-3.
 
 [NOTE]
diff --git a/DocSum/kubernetes/manifests/xeon/docsum.yaml b/DocSum/kubernetes/manifests/xeon/docsum.yaml
index f4b4997ae..dbfa4f27b 100644
--- a/DocSum/kubernetes/manifests/xeon/docsum.yaml
+++ b/DocSum/kubernetes/manifests/xeon/docsum.yaml
@@ -41,6 +41,7 @@ metadata:
 data:
   MODEL_ID: "Intel/neural-chat-7b-v3-3"
   PORT: "2080"
+  CUDA_GRAPHS: "0"
   HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
   HF_TOKEN: "insert-your-huggingface-token-here"
   MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
               name: docsum-tgi-config
           securityContext:
             {}
-          image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
+          image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
           imagePullPolicy: IfNotPresent
           volumeMounts:
             - mountPath: /data
diff --git a/DocSum/kubernetes/manifests/xeon/ui/react-docsum.yaml b/DocSum/kubernetes/manifests/xeon/ui/react-docsum.yaml
index eb83b00b0..4f902a22a 100644
--- a/DocSum/kubernetes/manifests/xeon/ui/react-docsum.yaml
+++ b/DocSum/kubernetes/manifests/xeon/ui/react-docsum.yaml
@@ -117,6 +117,8 @@ spec:
             value: Intel/neural-chat-7b-v3-3
           - name: PORT
             value: "80"
+          - name: CUDA_GRAPHS
+            value: "0"
           - name: http_proxy
             value:
           - name: https_proxy
@@ -124,7 +126,7 @@ spec:
           - name: no_proxy
             value:
         securityContext: {}
-        image: "ghcr.io/huggingface/text-generation-inference:1.4"
+        image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
         imagePullPolicy: IfNotPresent
         volumeMounts:
           - mountPath: /data
diff --git a/FaqGen/docker/xeon/compose.yaml b/FaqGen/docker/xeon/compose.yaml
index 406f4ca43..d5d955984 100644
--- a/FaqGen/docker/xeon/compose.yaml
+++ b/FaqGen/docker/xeon/compose.yaml
@@ -3,7 +3,7 @@
 
 services:
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:1.4
+    image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     container_name: tgi-xeon-server
     ports:
       - "8008:80"
@@ -16,7 +16,7 @@ services:
     volumes:
      - "./data:/data"
     shm_size: 1g
-    command: --model-id ${LLM_MODEL_ID}
+    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
   llm_faqgen:
     image: ${REGISTRY:-opea}/llm-faqgen-tgi:${TAG:-latest}
     container_name: llm-faqgen-server
diff --git a/FaqGen/kubernetes/manifests/gaudi/faqgen.yaml b/FaqGen/kubernetes/manifests/gaudi/faqgen.yaml
index 24581e8a4..76a68080c 100644
--- a/FaqGen/kubernetes/manifests/gaudi/faqgen.yaml
+++ b/FaqGen/kubernetes/manifests/gaudi/faqgen.yaml
@@ -48,6 +48,8 @@ spec:
         args:
           - --model-id
           - 'meta-llama/Meta-Llama-3-8B-Instruct'
+          - --cuda_graphs
+          - '0'
          - --max-input-length
          - '3096'
          - --max-total-tokens
diff --git a/FaqGen/kubernetes/manifests/xeon/faqgen.yaml b/FaqGen/kubernetes/manifests/xeon/faqgen.yaml
index b1d102df9..ddf81afdf 100644
--- a/FaqGen/kubernetes/manifests/xeon/faqgen.yaml
+++ b/FaqGen/kubernetes/manifests/xeon/faqgen.yaml
@@ -34,6 +34,8 @@ spec:
         args:
           - --model-id
           - 'meta-llama/Meta-Llama-3-8B-Instruct'
+          - --cuda_graphs
+          - '0'
          - --max-input-length
          - '3096'
          - --max-total-tokens
diff --git a/FaqGen/kubernetes/manifests/xeon/ui/react-faqgen.yaml b/FaqGen/kubernetes/manifests/xeon/ui/react-faqgen.yaml
index 2b38848fd..bfb3be0b0 100644
--- a/FaqGen/kubernetes/manifests/xeon/ui/react-faqgen.yaml
+++ b/FaqGen/kubernetes/manifests/xeon/ui/react-faqgen.yaml
@@ -117,6 +117,8 @@ spec:
             value: Intel/neural-chat-7b-v3-3
           - name: PORT
             value: "80"
+          - name: CUDA_GRAPHS
+            value: "0"
           - name: http_proxy
             value:
           - name: https_proxy
@@ -124,7 +126,7 @@ spec:
           - name: no_proxy
             value:
         securityContext: {}
-        image: "ghcr.io/huggingface/text-generation-inference:1.4"
+        image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
         imagePullPolicy: IfNotPresent
         volumeMounts:
           - mountPath: /data
diff --git a/SearchQnA/docker/xeon/compose.yaml b/SearchQnA/docker/xeon/compose.yaml
index cdbf87e0e..4dcf9b923 100644
--- a/SearchQnA/docker/xeon/compose.yaml
+++ b/SearchQnA/docker/xeon/compose.yaml
@@ -73,7 +73,7 @@ services:
       HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
     restart: unless-stopped
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:1.4
+    image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     container_name: tgi-service
     ports:
       - "3006:80"
@@ -85,7 +85,7 @@ services:
       http_proxy: ${http_proxy}
       https_proxy: ${https_proxy}
       HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
-    command: --model-id ${LLM_MODEL_ID}
+    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
   llm:
     image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
     container_name: llm-tgi-server
diff --git a/SearchQnA/tests/test_searchqna_on_xeon.sh b/SearchQnA/tests/test_searchqna_on_xeon.sh
index 583723a09..8c083dc9a 100644
--- a/SearchQnA/tests/test_searchqna_on_xeon.sh
+++ b/SearchQnA/tests/test_searchqna_on_xeon.sh
@@ -23,7 +23,7 @@ function build_docker_images() {
     docker compose -f docker_build_compose.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log
 
     docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
-    docker pull ghcr.io/huggingface/text-generation-inference:1.4
+    docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     docker images
 }
diff --git a/Translation/docker/xeon/compose.yaml b/Translation/docker/xeon/compose.yaml
index da7a43763..1ca7cee9d 100644
--- a/Translation/docker/xeon/compose.yaml
+++ b/Translation/docker/xeon/compose.yaml
@@ -3,7 +3,7 @@
 
 services:
   tgi-service:
-    image: ghcr.io/huggingface/text-generation-inference:1.4
+    image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     container_name: tgi-service
     ports:
       - "8008:80"
@@ -15,7 +15,7 @@ services:
     volumes:
       - "./data:/data"
     shm_size: 1g
-    command: --model-id ${LLM_MODEL_ID}
+    command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
   llm:
     image: ${REGISTRY:-opea}/llm-tgi:${TAG:-latest}
     container_name: llm-tgi-server
diff --git a/VisualQnA/docker/xeon/README.md b/VisualQnA/docker/xeon/README.md
index 32103aff0..346f3bc4e 100644
--- a/VisualQnA/docker/xeon/README.md
+++ b/VisualQnA/docker/xeon/README.md
@@ -70,20 +70,15 @@ docker build --no-cache -t opea/visualqna-ui:latest --build-arg https_proxy=$htt
 cd ../../../..
 ```
 
-### 4. Build TGI Xeon Image
-
-Since TGI official image has not supported llava-next for CPU, we'll need to build it based on Dockerfile_intel.
+### 4. Pull TGI Xeon Image
 
 ```bash
-git clone https://github.com/huggingface/text-generation-inference
-cd text-generation-inference/
-docker build -t opea/llava-tgi-xeon:latest --build-arg PLATFORM=cpu --build-arg http_proxy=${http_proxy} --build-arg https_proxy=${https_proxy} . -f Dockerfile_intel
-cd ../
+docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
 ```
 
 Then run the command `docker images`, you will have the following 4 Docker Images:
 
-1. `opea/llava-tgi-xeon:latest`
+1. `ghcr.io/huggingface/text-generation-inference:latest-intel-cpu`
 2. `opea/lvm-tgi:latest`
 3. `opea/visualqna:latest`
 4. `opea/visualqna-ui:latest`
diff --git a/VisualQnA/docker/xeon/compose.yaml b/VisualQnA/docker/xeon/compose.yaml
index e38f64cf9..6bb9206b4 100644
--- a/VisualQnA/docker/xeon/compose.yaml
+++ b/VisualQnA/docker/xeon/compose.yaml
@@ -3,7 +3,7 @@
 
 services:
   llava-tgi-service:
-    image: ${REGISTRY:-opea}/llava-tgi-xeon:${TAG:-latest}
+    image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
     container_name: tgi-llava-xeon-server
     ports:
       - "9399:80"
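
For a quick sanity check of the retagged CPU image outside of the compose and Kubernetes stacks, the sketch below runs it the same way the updated services do (`latest-intel-cpu` tag plus `--cuda-graphs 0`). The model ID, host port, and prompt are illustrative placeholders borrowed from the ChatQnA Xeon settings above, not values this change mandates:

```bash
# Illustrative only: LLM_MODEL_ID and the 9009 host port are assumptions taken
# from the ChatQnA Xeon compose file; substitute your own values.
export LLM_MODEL_ID=Intel/neural-chat-7b-v3-3

docker run --rm -p 9009:80 --shm-size 1g \
  -v $PWD/data:/data \
  -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
  ghcr.io/huggingface/text-generation-inference:latest-intel-cpu \
  --model-id ${LLM_MODEL_ID} --cuda-graphs 0

# Once the model has finished loading, query TGI's native generate endpoint.
curl http://localhost:9009/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 64}}'
```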
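
On the Kubernetes side the same setting travels through the environment rather than the command line: the `CUDA_GRAPHS: "0"` key added to the TGI ConfigMaps (and the `CUDA_GRAPHS` container env var in the react manifests) assumes the TGI launcher's behaviour of also reading its flags from identically named environment variables, so it should have the same effect as appending `--cuda-graphs 0` to the command. Either way the intent is to disable CUDA graph capture, which the CPU-only image has no use for.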