Using TGI official release docker image for Intel CPU #581

Merged · 10 commits · Aug 18, 2024
4 changes: 2 additions & 2 deletions AudioQnA/docker/xeon/compose.yaml
@@ -44,7 +44,7 @@ services:
environment:
TTS_ENDPOINT: ${TTS_ENDPOINT}
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "3006:80"
@@ -56,7 +56,7 @@
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
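The two edits above are the heart of the switch: the service now runs Hugging Face's official Intel-CPU TGI image, and `--cuda-graphs 0` turns off CUDA graph capture, a GPU-only optimization that does not apply on Xeon. As a quick sanity check the image can also be launched outside of compose; the sketch below is illustrative only and assumes `LLM_MODEL_ID` and `HUGGINGFACEHUB_API_TOKEN` are already exported, reusing the 3006:80 mapping of the service above.

```bash
# Illustrative smoke test of the Intel-CPU TGI image outside of compose
docker run --rm -p 3006:80 --shm-size 1g \
  -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
  -v $(pwd)/data:/data \
  ghcr.io/huggingface/text-generation-inference:latest-intel-cpu \
  --model-id ${LLM_MODEL_ID} --cuda-graphs 0

# From another shell, query TGI's standard /generate endpoint
curl http://localhost:3006/generate \
  -X POST -H 'Content-Type: application/json' \
  -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 64}}'
```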
2 changes: 1 addition & 1 deletion ChatQnA/docker/gpu/compose.yaml
@@ -119,7 +119,7 @@ services:
HF_HUB_ENABLE_HF_TRANSFER: 0
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.0
image: ghcr.io/huggingface/text-generation-inference:2.2.0
container_name: tgi-server
ports:
- "8008:80"
4 changes: 2 additions & 2 deletions ChatQnA/docker/xeon/compose.yaml
@@ -105,7 +105,7 @@ services:
HF_HUB_ENABLE_HF_TRANSFER: 0
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.1.0
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "9009:80"
@@ -119,7 +119,7 @@
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
4 changes: 2 additions & 2 deletions ChatQnA/docker/xeon/compose_qdrant.yaml
@@ -112,7 +112,7 @@ services:
LANGCHAIN_PROJECT: "opea-reranking-service"
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.1.0
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "9009:80"
@@ -126,7 +126,7 @@
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
2 changes: 1 addition & 1 deletion ChatQnA/kubernetes/README.md
@@ -20,7 +20,7 @@ The ChatQnA uses the below prebuilt images if you choose a Xeon deployment
- retriever: opea/retriever-redis:latest
- tei_xeon_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
- reranking: opea/reranking-tei:latest
- tgi-service: ghcr.io/huggingface/text-generation-inference:1.4
- tgi-service: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
- llm: opea/llm-tgi:latest
- chaqna-xeon-backend-server: opea/chatqna:latest
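If the cluster node also has a local Docker daemon, the updated TGI tag can be pre-pulled to confirm it resolves before the manifests are applied; this step is optional and purely illustrative.

```bash
# Optional: verify the Intel-CPU TGI tag resolves before deploying
docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
```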

3 changes: 2 additions & 1 deletion ChatQnA/kubernetes/manifests/xeon/chatqna.yaml
@@ -190,6 +190,7 @@ metadata:
data:
MODEL_ID: "Intel/neural-chat-7b-v3-3"
PORT: "2080"
CUDA_GRAPHS: "0"
HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
HF_TOKEN: "insert-your-huggingface-token-here"
MAX_INPUT_TOKENS: "1024"
@@ -993,7 +994,7 @@ spec:
name: chatqna-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
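Once this manifest is applied, the new `CUDA_GRAPHS` key can be read back to confirm it reaches the TGI container. Only the ConfigMap name `chatqna-tgi-config` comes from the manifest above; the deployment name in the second command is a placeholder to adjust.

```bash
# Confirm the new key landed in the ConfigMap the TGI container reads from
kubectl get configmap chatqna-tgi-config -o jsonpath='{.data.CUDA_GRAPHS}'; echo

# Placeholder deployment name; substitute whatever the manifest actually created
kubectl exec deploy/chatqna-tgi -- printenv CUDA_GRAPHS
```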
4 changes: 2 additions & 2 deletions CodeGen/docker/xeon/compose.yaml
@@ -6,7 +6,7 @@ version: "3.8"

services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "8028:80"
@@ -18,7 +18,7 @@
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
3 changes: 2 additions & 1 deletion CodeGen/kubernetes/manifests/xeon/codegen.yaml
@@ -41,6 +41,7 @@ metadata:
data:
MODEL_ID: "meta-llama/CodeLlama-7b-hf"
PORT: "2080"
CUDA_GRAPHS: "0"
HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
HF_TOKEN: "insert-your-huggingface-token-here"
MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
name: codegen-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 3 additions & 1 deletion CodeGen/kubernetes/manifests/xeon/ui/react-codegen.yaml
@@ -117,14 +117,16 @@ spec:
value: ise-uiuc/Magicoder-S-DS-6.7B
- name: PORT
value: "80"
- name: CUDA_GRAPHS
value: "0"
- name: http_proxy
value:
- name: https_proxy
value:
- name: no_proxy
value:
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
2 changes: 1 addition & 1 deletion CodeGen/tests/test_codegen_on_xeon.sh
@@ -15,7 +15,7 @@ function build_docker_images() {

docker build -t opea/llm-tgi:latest -f comps/llms/text-generation/tgi/Dockerfile .

docker pull ghcr.io/huggingface/text-generation-inference:1.4
docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu

cd $WORKPATH/docker
docker build --no-cache -t opea/codegen:latest -f Dockerfile .
4 changes: 2 additions & 2 deletions CodeTrans/docker/xeon/compose.yaml
@@ -6,7 +6,7 @@ version: "3.8"

services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: codetrans-tgi-service
ports:
- "8008:80"
@@ -18,7 +18,7 @@
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
3 changes: 2 additions & 1 deletion CodeTrans/kubernetes/manifests/xeon/codetrans.yaml
@@ -41,6 +41,7 @@ metadata:
data:
MODEL_ID: "HuggingFaceH4/mistral-7b-grok"
PORT: "2080"
CUDA_GRAPHS: "0"
HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
HF_TOKEN: "insert-your-huggingface-token-here"
MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
name: codetrans-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 2 additions & 2 deletions DocSum/docker/xeon/compose.yaml
@@ -6,7 +6,7 @@ version: "3.8"

services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "8008:80"
@@ -19,7 +19,7 @@
volumes:
- "./data:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-docsum-tgi:latest
container_name: llm-docsum-server
2 changes: 1 addition & 1 deletion DocSum/kubernetes/README.md
@@ -8,7 +8,7 @@ Install GMC in your Kubernetes cluster, if you have not already done so, by foll
The DocSum application is defined as a Custom Resource (CR) file that the above GMC operator acts upon. It first checks whether the microservices listed in the CR yaml file are running; if not, it starts them and then connects them. When the DocSum RAG pipeline is ready, the service endpoint details are returned, letting you use the application. If you run "kubectl get pods", you will see all the component microservices, in particular embedding, retriever, rerank, and llm.

The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image llm-docsum-tgi:latest which internally leverages the
the image ghcr.io/huggingface/text-generation-inference:1.4. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
the image ghcr.io/huggingface/text-generation-inference:latest-intel-cpu. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
service tgi-gaudi-svc, which uses the image ghcr.io/huggingface/tgi-gaudi:1.2.1. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use Intel/neural-chat-7b-v3-3.
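A minimal sketch of that flow on the Xeon path, assuming the manifest location touched by this PR and the default namespace:

```bash
# Model served by the TGI service in this example
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"

# Apply the DocSum resources, then confirm the component microservices are running
kubectl apply -f DocSum/kubernetes/manifests/xeon/docsum.yaml
kubectl get pods
```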

[NOTE]
3 changes: 2 additions & 1 deletion DocSum/kubernetes/manifests/xeon/docsum.yaml
@@ -41,6 +41,7 @@ metadata:
data:
MODEL_ID: "Intel/neural-chat-7b-v3-3"
PORT: "2080"
CUDA_GRAPHS: "0"
HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
HF_TOKEN: "insert-your-huggingface-token-here"
MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
name: docsum-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 3 additions & 1 deletion DocSum/kubernetes/manifests/xeon/ui/react-docsum.yaml
@@ -117,14 +117,16 @@ spec:
value: Intel/neural-chat-7b-v3-3
- name: PORT
value: "80"
- name: CUDA_GRAPHS
value: "0"
- name: http_proxy
value:
- name: https_proxy
value:
- name: no_proxy
value:
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 2 additions & 2 deletions FaqGen/docker/xeon/compose.yaml
@@ -5,7 +5,7 @@ version: "3.8"

services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-xeon-server
ports:
- "8008:80"
@@ -18,7 +18,7 @@
volumes:
- "./data:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm_faqgen:
image: opea/llm-faqgen-tgi:latest
container_name: llm-faqgen-server
4 changes: 3 additions & 1 deletion FaqGen/kubernetes/manifests/gaudi/faqgen.yaml
@@ -96,14 +96,16 @@ spec:
value: Intel/neural-chat-7b-v3-3
- name: PORT
value: "80"
- name: CUDA_GRAPHS
value: "0"
- name: http_proxy
value:
- name: https_proxy
value:
- name: no_proxy
value:
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 3 additions & 1 deletion FaqGen/kubernetes/manifests/xeon/faqgen.yaml
@@ -96,14 +96,16 @@ spec:
value: Intel/neural-chat-7b-v3-3
- name: PORT
value: "80"
- name: CUDA_GRAPHS
value: "0"
- name: http_proxy
value:
- name: https_proxy
value:
- name: no_proxy
value:
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 3 additions & 1 deletion FaqGen/kubernetes/manifests/xeon/ui/react-faqgen.yaml
@@ -117,14 +117,16 @@ spec:
value: Intel/neural-chat-7b-v3-3
- name: PORT
value: "80"
- name: CUDA_GRAPHS
value: "0"
- name: http_proxy
value:
- name: https_proxy
value:
- name: no_proxy
value:
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 2 additions & 2 deletions SearchQnA/docker/xeon/compose.yaml
@@ -76,7 +76,7 @@ services:
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "3006:80"
@@ -88,7 +88,7 @@
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
2 changes: 1 addition & 1 deletion SearchQnA/tests/test_searchqna_on_xeon.sh
@@ -18,7 +18,7 @@ function build_docker_images() {
docker build -t opea/reranking-tei:latest -f comps/reranks/tei/docker/Dockerfile .
docker build -t opea/llm-tgi:latest -f comps/llms/text-generation/tgi/Dockerfile .
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/text-generation-inference:1.4
docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
cd $WORKPATH/docker
docker build -t opea/searchqna:latest -f Dockerfile .

4 changes: 2 additions & 2 deletions Translation/docker/xeon/compose.yaml
@@ -16,7 +16,7 @@ version: "3.8"

services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "8008:80"
@@ -28,7 +28,7 @@
volumes:
- "./data:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
4 changes: 2 additions & 2 deletions VisualQnA/docker/xeon/README.md
@@ -71,12 +71,12 @@ cd ../../../..
### 4. Pull TGI image

```bash
docker pull ghcr.io/huggingface/text-generation-inference:2.2.0
docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
```

Then run `docker images`; you should see the following 4 Docker images (a quick check follows the list):

1. `ghcr.io/huggingface/text-generation-inference:2.2.0`
1. `ghcr.io/huggingface/text-generation-inference:latest-intel-cpu`
2. `opea/lvm-tgi:latest`
3. `opea/visualqna:latest`
4. `opea/visualqna-ui:latest`
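A quick optional way to confirm all four are present locally; the grep pattern is illustrative.

```bash
# List only the images this deployment needs
docker images | grep -E 'text-generation-inference|lvm-tgi|visualqna'
```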
4 changes: 2 additions & 2 deletions VisualQnA/docker/xeon/compose.yaml
@@ -6,7 +6,7 @@ version: "3.8"

services:
llava-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.2.0
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-llava-xeon-server
ports:
- "9399:80"
@@ -19,7 +19,7 @@
https_proxy: ${https_proxy}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
command: --model-id ${LVM_MODEL_ID}
command: --model-id ${LVM_MODEL_ID} --cuda-graphs 0
lvm-tgi:
image: opea/lvm-tgi:latest
container_name: lvm-tgi-server