Using TGI official release docker image for Intel CPU #581

Merged · 10 commits · Aug 18, 2024
4 changes: 2 additions & 2 deletions AudioQnA/docker/xeon/compose.yaml
@@ -44,7 +44,7 @@ services:
environment:
TTS_ENDPOINT: ${TTS_ENDPOINT}
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "3006:80"
@@ -56,7 +56,7 @@
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
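The two edits above are the heart of the switch: the service now runs Hugging Face's official Intel-CPU TGI image, and `--cuda-graphs 0` turns off CUDA graph capture, a GPU-only optimization that does not apply on Xeon. As a quick sanity check the image can also be launched outside of compose; the sketch below is illustrative only and assumes `LLM_MODEL_ID` and `HUGGINGFACEHUB_API_TOKEN` are already exported, reusing the 3006:80 mapping of the service above.

```bash
# Illustrative smoke test of the Intel-CPU TGI image outside of compose
docker run --rm -p 3006:80 --shm-size 1g \
  -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
  -v $(pwd)/data:/data \
  ghcr.io/huggingface/text-generation-inference:latest-intel-cpu \
  --model-id ${LLM_MODEL_ID} --cuda-graphs 0

# From another shell, query TGI's standard /generate endpoint
curl http://localhost:3006/generate \
  -X POST -H 'Content-Type: application/json' \
  -d '{"inputs": "What is deep learning?", "parameters": {"max_new_tokens": 64}}'
```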
2 changes: 1 addition & 1 deletion ChatQnA/docker/gpu/compose.yaml
@@ -119,7 +119,7 @@ services:
HF_HUB_ENABLE_HF_TRANSFER: 0
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.0
image: ghcr.io/huggingface/text-generation-inference:2.2.0
container_name: tgi-server
ports:
- "8008:80"
4 changes: 2 additions & 2 deletions ChatQnA/docker/xeon/compose.yaml
@@ -105,7 +105,7 @@ services:
HF_HUB_ENABLE_HF_TRANSFER: 0
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.1.0
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "9009:80"
@@ -119,7 +119,7 @@
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
4 changes: 2 additions & 2 deletions ChatQnA/docker/xeon/compose_qdrant.yaml
@@ -112,7 +112,7 @@ services:
LANGCHAIN_PROJECT: "opea-reranking-service"
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.1.0
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "9009:80"
@@ -126,7 +126,7 @@
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
2 changes: 1 addition & 1 deletion ChatQnA/kubernetes/README.md
@@ -20,7 +20,7 @@ The ChatQnA uses the below prebuilt images if you choose a Xeon deployment
- retriever: opea/retriever-redis:latest
- tei_xeon_service: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
- reranking: opea/reranking-tei:latest
- tgi-service: ghcr.io/huggingface/text-generation-inference:1.4
- tgi-service: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
- llm: opea/llm-tgi:latest
- chaqna-xeon-backend-server: opea/chatqna:latest
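If the cluster node also has a local Docker daemon, the updated TGI tag can be pre-pulled to confirm it resolves before the manifests are applied; this step is optional and purely illustrative.

```bash
# Optional: verify the Intel-CPU TGI tag resolves before deploying
docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
```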

3 changes: 2 additions & 1 deletion ChatQnA/kubernetes/manifests/xeon/chatqna.yaml
@@ -190,6 +190,7 @@ metadata:
data:
MODEL_ID: "Intel/neural-chat-7b-v3-3"
PORT: "2080"
CUDA_GRAPHS: "0"
HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
HF_TOKEN: "insert-your-huggingface-token-here"
MAX_INPUT_TOKENS: "1024"
@@ -993,7 +994,7 @@ spec:
name: chatqna-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
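Once this manifest is applied, the new `CUDA_GRAPHS` key can be read back to confirm it reaches the TGI container. Only the ConfigMap name `chatqna-tgi-config` comes from the manifest above; the deployment name in the second command is a placeholder to adjust.

```bash
# Confirm the new key landed in the ConfigMap the TGI container reads from
kubectl get configmap chatqna-tgi-config -o jsonpath='{.data.CUDA_GRAPHS}'; echo

# Placeholder deployment name; substitute whatever the manifest actually created
kubectl exec deploy/chatqna-tgi -- printenv CUDA_GRAPHS
```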
4 changes: 2 additions & 2 deletions CodeGen/docker/xeon/compose.yaml
@@ -6,7 +6,7 @@ version: "3.8"

services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "8028:80"
@@ -18,7 +18,7 @@
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
3 changes: 2 additions & 1 deletion CodeGen/kubernetes/manifests/xeon/codegen.yaml
@@ -41,6 +41,7 @@ metadata:
data:
MODEL_ID: "meta-llama/CodeLlama-7b-hf"
PORT: "2080"
CUDA_GRAPHS: "0"
HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
HF_TOKEN: "insert-your-huggingface-token-here"
MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
name: codegen-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 3 additions & 1 deletion CodeGen/kubernetes/manifests/xeon/ui/react-codegen.yaml
@@ -117,14 +117,16 @@ spec:
value: ise-uiuc/Magicoder-S-DS-6.7B
- name: PORT
value: "80"
- name: CUDA_GRAPHS
value: "0"
- name: http_proxy
value:
- name: https_proxy
value:
- name: no_proxy
value:
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
2 changes: 1 addition & 1 deletion CodeGen/tests/test_codegen_on_xeon.sh
@@ -15,7 +15,7 @@ function build_docker_images() {

docker build -t opea/llm-tgi:latest -f comps/llms/text-generation/tgi/Dockerfile .

docker pull ghcr.io/huggingface/text-generation-inference:1.4
docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu

cd $WORKPATH/docker
docker build --no-cache -t opea/codegen:latest -f Dockerfile .
4 changes: 2 additions & 2 deletions CodeTrans/docker/xeon/compose.yaml
@@ -6,7 +6,7 @@ version: "3.8"

services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: codetrans-tgi-service
ports:
- "8008:80"
@@ -18,7 +18,7 @@
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
3 changes: 2 additions & 1 deletion CodeTrans/kubernetes/manifests/xeon/codetrans.yaml
@@ -41,6 +41,7 @@ metadata:
data:
MODEL_ID: "HuggingFaceH4/mistral-7b-grok"
PORT: "2080"
CUDA_GRAPHS: "0"
HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
HF_TOKEN: "insert-your-huggingface-token-here"
MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
name: codetrans-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 2 additions & 2 deletions DocSum/docker/xeon/compose.yaml
@@ -6,7 +6,7 @@ version: "3.8"

services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "8008:80"
@@ -19,7 +19,7 @@
volumes:
- "./data:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-docsum-tgi:latest
container_name: llm-docsum-server
2 changes: 1 addition & 1 deletion DocSum/kubernetes/README.md
@@ -8,7 +8,7 @@ Install GMC in your Kubernetes cluster, if you have not already done so, by foll
The DocSum application is defined as a Custom Resource (CR) file that the above GMC operator acts upon. It first checks whether the microservices listed in the CR yaml file are running; if not, it starts them and then connects them. When the DocSum RAG pipeline is ready, the service endpoint details are returned, letting you use the application. If you run "kubectl get pods", you will see all the component microservices, in particular embedding, retriever, rerank, and llm.

The DocSum pipeline uses prebuilt images. The Xeon version uses the prebuilt image llm-docsum-tgi:latest which internally leverages the
the image ghcr.io/huggingface/text-generation-inference:1.4. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
the image ghcr.io/huggingface/text-generation-inference:latest-intel-cpu. The service is called tgi-svc. Meanwhile, the Gaudi version launches the
service tgi-gaudi-svc, which uses the image ghcr.io/huggingface/tgi-gaudi:1.2.1. Both TGI model services serve the model specified in the LLM_MODEL_ID variable that is exported by you. In the below example we use Intel/neural-chat-7b-v3-3.
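A minimal sketch of that flow on the Xeon path, assuming the manifest location touched by this PR and the default namespace:

```bash
# Model served by the TGI service in this example
export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3"

# Apply the DocSum resources, then confirm the component microservices are running
kubectl apply -f DocSum/kubernetes/manifests/xeon/docsum.yaml
kubectl get pods
```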

[NOTE]
3 changes: 2 additions & 1 deletion DocSum/kubernetes/manifests/xeon/docsum.yaml
@@ -41,6 +41,7 @@ metadata:
data:
MODEL_ID: "Intel/neural-chat-7b-v3-3"
PORT: "2080"
CUDA_GRAPHS: "0"
HUGGING_FACE_HUB_TOKEN: "insert-your-huggingface-token-here"
HF_TOKEN: "insert-your-huggingface-token-here"
MAX_INPUT_TOKENS: "1024"
@@ -229,7 +230,7 @@ spec:
name: docsum-tgi-config
securityContext:
{}
image: "ghcr.io/huggingface/text-generation-inference:2.1.0"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 3 additions & 1 deletion DocSum/kubernetes/manifests/xeon/ui/react-docsum.yaml
@@ -117,14 +117,16 @@ spec:
value: Intel/neural-chat-7b-v3-3
- name: PORT
value: "80"
- name: CUDA_GRAPHS
value: "0"
- name: http_proxy
value:
- name: https_proxy
value:
- name: no_proxy
value:
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 2 additions & 2 deletions FaqGen/docker/xeon/compose.yaml
@@ -5,7 +5,7 @@ version: "3.8"

services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-xeon-server
ports:
- "8008:80"
@@ -18,7 +18,7 @@
volumes:
- "./data:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm_faqgen:
image: opea/llm-faqgen-tgi:latest
container_name: llm-faqgen-server
4 changes: 3 additions & 1 deletion FaqGen/kubernetes/manifests/gaudi/faqgen.yaml
@@ -96,14 +96,16 @@ spec:
value: Intel/neural-chat-7b-v3-3
- name: PORT
value: "80"
- name: CUDA_GRAPHS
value: "0"
- name: http_proxy
value:
- name: https_proxy
value:
- name: no_proxy
value:
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 3 additions & 1 deletion FaqGen/kubernetes/manifests/xeon/faqgen.yaml
@@ -96,14 +96,16 @@ spec:
value: Intel/neural-chat-7b-v3-3
- name: PORT
value: "80"
- name: CUDA_GRAPHS
value: "0"
- name: http_proxy
value:
- name: https_proxy
value:
- name: no_proxy
value:
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 3 additions & 1 deletion FaqGen/kubernetes/manifests/xeon/ui/react-faqgen.yaml
@@ -117,14 +117,16 @@ spec:
value: Intel/neural-chat-7b-v3-3
- name: PORT
value: "80"
- name: CUDA_GRAPHS
value: "0"
- name: http_proxy
value:
- name: https_proxy
value:
- name: no_proxy
value:
securityContext: {}
image: "ghcr.io/huggingface/text-generation-inference:1.4"
image: "ghcr.io/huggingface/text-generation-inference:latest-intel-cpu"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
4 changes: 2 additions & 2 deletions SearchQnA/docker/xeon/compose.yaml
@@ -76,7 +76,7 @@ services:
HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
restart: unless-stopped
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "3006:80"
@@ -88,7 +88,7 @@
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
2 changes: 1 addition & 1 deletion SearchQnA/tests/test_searchqna_on_xeon.sh
@@ -18,7 +18,7 @@ function build_docker_images() {
docker build -t opea/reranking-tei:latest -f comps/reranks/tei/docker/Dockerfile .
docker build -t opea/llm-tgi:latest -f comps/llms/text-generation/tgi/Dockerfile .
docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
docker pull ghcr.io/huggingface/text-generation-inference:1.4
docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
cd $WORKPATH/docker
docker build -t opea/searchqna:latest -f Dockerfile .

4 changes: 2 additions & 2 deletions Translation/docker/xeon/compose.yaml
@@ -16,7 +16,7 @@ version: "3.8"

services:
tgi-service:
image: ghcr.io/huggingface/text-generation-inference:1.4
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-service
ports:
- "8008:80"
@@ -28,7 +28,7 @@
volumes:
- "./data:/data"
shm_size: 1g
command: --model-id ${LLM_MODEL_ID}
command: --model-id ${LLM_MODEL_ID} --cuda-graphs 0
llm:
image: opea/llm-tgi:latest
container_name: llm-tgi-server
4 changes: 2 additions & 2 deletions VisualQnA/docker/xeon/README.md
@@ -71,12 +71,12 @@ cd ../../../..
### 4. Pull TGI image

```bash
docker pull ghcr.io/huggingface/text-generation-inference:2.2.0
docker pull ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
```

Then run `docker images`; you should see the following 4 Docker images (a quick check follows the list):

1. `ghcr.io/huggingface/text-generation-inference:2.2.0`
1. `ghcr.io/huggingface/text-generation-inference:latest-intel-cpu`
2. `opea/lvm-tgi:latest`
3. `opea/visualqna:latest`
4. `opea/visualqna-ui:latest`
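A quick optional way to confirm all four are present locally; the grep pattern is illustrative.

```bash
# List only the images this deployment needs
docker images | grep -E 'text-generation-inference|lvm-tgi|visualqna'
```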
4 changes: 2 additions & 2 deletions VisualQnA/docker/xeon/compose.yaml
@@ -6,7 +6,7 @@ version: "3.8"

services:
llava-tgi-service:
image: ghcr.io/huggingface/text-generation-inference:2.2.0
image: ghcr.io/huggingface/text-generation-inference:latest-intel-cpu
container_name: tgi-llava-xeon-server
ports:
- "9399:80"
@@ -19,7 +19,7 @@
https_proxy: ${https_proxy}
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
command: --model-id ${LVM_MODEL_ID}
command: --model-id ${LVM_MODEL_ID} --cuda-graphs 0
lvm-tgi:
image: opea/lvm-tgi:latest
container_name: lvm-tgi-server