diff --git a/intel_extension_for_transformers/neural_chat/docker/vllm_serving/Dockerfile_vllm b/intel_extension_for_transformers/neural_chat/docker/vllm_serving/Dockerfile_vllm
new file mode 100644
index 00000000000..f4d85e2c381
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/docker/vllm_serving/Dockerfile_vllm
@@ -0,0 +1,63 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+#
+# ============================================================================
+
+
+## NVIDIA GPU environment
+FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 as nvgpu
+
+ARG ITREX_VER=main
+ARG PYTHON_VERSION=3.10
+ARG REPO=https://github.com/intel/intel-extension-for-transformers.git
+ARG REPO_PATH=""
+
+# See http://bugs.python.org/issue19846
+ENV LANG C.UTF-8
+
+# Install system dependencies
+SHELL ["/bin/bash", "--login", "-c"]
+RUN apt update \
+    && apt install -y build-essential \
+    && apt install -y wget numactl git nvidia-cuda* \
+    && apt install -y openssh-server \
+    && apt install -y python${PYTHON_VERSION} python3-pip \
+    && apt clean \
+    && rm -rf /var/lib/apt/lists/*
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# Download ITREX code
+RUN mkdir -p /intel-extension-for-transformers
+COPY ${REPO_PATH} /intel-extension-for-transformers
+RUN if [ "$REPO_PATH" == "" ]; then rm -rf intel-extension-for-transformers/* && rm -rf intel-extension-for-transformers/.* ; git clone --single-branch --branch=${ITREX_VER} ${REPO} intel-extension-for-transformers ; fi
+WORKDIR /intel-extension-for-transformers
+
+RUN pip install -r /intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/requirements.txt
+RUN cd /intel-extension-for-transformers && sed -i '/^torch==/d' requirements.txt && pip install -r requirements.txt && pip install -v .
+
+RUN pip install --upgrade --force-reinstall vllm
+
+WORKDIR /intel-extension-for-transformers
+
+CMD ["/usr/sbin/sshd", "-D"]
+
+ENTRYPOINT ["neuralchat_server"]
+CMD ["start", "--config_file", "/vllm.yaml"]
diff --git a/intel_extension_for_transformers/neural_chat/docker/vllm_serving/README.md b/intel_extension_for_transformers/neural_chat/docker/vllm_serving/README.md
new file mode 100644
index 00000000000..1d6dfc6cbb9
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/docker/vllm_serving/README.md
@@ -0,0 +1,35 @@
+Intel Neural Chat Inference Dockerfile installer for Ubuntu 22.04
+
+# Start NeuralChat and vLLM serving with Docker
+
+## Environment Setup
+
+### Setup NVIDIA GPU environment
+Use `Dockerfile_vllm` to build the Docker image in your environment.
+```bash
+docker build . -f Dockerfile_vllm -t neuralchat_vllm:latest
+```
+If you need to set proxy settings, add `--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy` as shown below.
+```bash
+docker build . -f Dockerfile_vllm -t neuralchat_vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+```
+
+### Start NeuralChat Service
+Before starting the NeuralChat service, configure `vllm.yaml` according to your real environment.
+Make sure the specified `port` is available and that `device` is set to `cuda` (`auto` will not work).
+```bash
+docker run -it --runtime=nvidia --gpus all --net=host --ipc=host -v /var/run/docker.sock:/var/run/docker.sock -v ./vllm.yaml:/vllm.yaml neuralchat_vllm:latest
+```
+If you need to set proxy settings, add `-e http_proxy= -e https_proxy=` as shown below.
+```bash
+docker run -it --runtime=nvidia --gpus all -e http_proxy= -e https_proxy= --net=host --ipc=host -v /var/run/docker.sock:/var/run/docker.sock -v ./vllm.yaml:/vllm.yaml neuralchat_vllm:latest
+```
+
+## Consume the Service
+Once the `docker run` command has executed successfully, you can consume the HTTP services offered by NeuralChat.
+
+Here is an example of consuming the vLLM service; remember to substitute your real IP and port.
+
+```bash
+curl -X POST -H "Content-Type: application/json" -d '{"prompt": "Tell me about Intel Xeon processors."}' http://localhost:8000/v1/chat/completions
+```
diff --git a/intel_extension_for_transformers/neural_chat/docker/vllm_serving/vllm.yaml b/intel_extension_for_transformers/neural_chat/docker/vllm_serving/vllm.yaml
new file mode 100644
index 00000000000..6862d53cf91
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/docker/vllm_serving/vllm.yaml
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is the parameter configuration file for NeuralChat Serving.
+
+#################################################################################
+#                                SERVER SETTING                                 #
+#################################################################################
+
+host: 0.0.0.0
+port: 8000
+
+model_name_or_path: "facebook/opt-125m"
+device: "cuda"
+
+serving:
+    framework: "vllm"
+    vllm_engine_params:
+        # To use continuous batching during serving, set use_async_engine to True.
+        # Otherwise, serving is offline and synchronous, i.e. the next batch is only
+        # queued and processed after processing of the previous batch has finished.
+        use_async_engine: False
+        tensor_parallel_size: 1
+        gpu_memory_utilization: 0.9
+        swap_space: 4
+
+# task choices = ['textchat', 'voicechat', 'retrieval', 'text2image', 'finetune']
+tasks_list: ['textchat']
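As a companion to the "Consume the Service" step in the README above, here is a minimal Python sketch equivalent to the curl example. It assumes the service listens on `localhost:8000` as configured in `vllm.yaml` and returns a JSON body; since the exact response schema depends on the NeuralChat version, the sketch prints the raw JSON rather than indexing into specific fields.

```python
# Minimal Python equivalent of the curl example in the README.
# Assumptions (not part of the original PR): the service is reachable at
# localhost:8000, i.e. the host/port configured in vllm.yaml, and returns JSON.
import json

import requests

URL = "http://localhost:8000/v1/chat/completions"  # substitute your real IP and port

payload = {"prompt": "Tell me about Intel Xeon processors."}

# json=payload serializes the body and sets the Content-Type: application/json
# header, matching the -H/-d flags of the curl command.
response = requests.post(URL, json=payload, timeout=300)
response.raise_for_status()

# The response schema depends on the NeuralChat version, so print the raw JSON.
print(json.dumps(response.json(), indent=2))
```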