[NeuralChat] Support Neuralchat-vLLM serving with Docker (#1187)
* add vllm docker

Co-authored-by: lvliang-intel <liang1.lv@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
3 people authored Feb 5, 2024
1 parent 476bee8 commit 1988ddc
Showing 3 changed files with 140 additions and 0 deletions.
Dockerfile_vllm
@@ -0,0 +1,63 @@
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
#
# THIS IS A GENERATED DOCKERFILE.
#
# This file was assembled from multiple pieces, whose use is documented
# throughout. Please refer to the Intel Extension for Transformers
# dockerfiles documentation for more information.
#
# ============================================================================


## NVIDIA GPU environment
FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04 AS nvgpu

ARG ITREX_VER=main
ARG PYTHON_VERSION=3.10
ARG REPO=https://github.com/intel/intel-extension-for-transformers.git
ARG REPO_PATH=""

# See http://bugs.python.org/issue19846
ENV LANG=C.UTF-8

# Install system dependencies
SHELL ["/bin/bash", "--login", "-c"]
RUN apt update \
&& apt install -y build-essential \
&& apt install -y wget numactl git nvidia-cuda* \
&& apt install -y openssh-server \
&& apt install -y python${PYTHON_VERSION} python3-pip \
&& apt clean \
&& rm -rf /var/lib/apt/lists/*
RUN ln -s /usr/bin/python3 /usr/bin/python

# Download ITREX code
RUN mkdir -p /intel-extension-for-transformers
COPY ${REPO_PATH} /intel-extension-for-transformers
RUN if [ "$REPO_PATH" == "" ]; then rm -rf /intel-extension-for-transformers && git clone --single-branch --branch=${ITREX_VER} ${REPO} /intel-extension-for-transformers ; fi
WORKDIR /intel-extension-for-transformers

RUN pip install -r /intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/requirements.txt
# Drop the pinned torch requirement; vLLM (installed below) provides its own CUDA-enabled torch.
RUN sed -i '/^torch==/d' requirements.txt && pip install -r requirements.txt && pip install -v .

RUN pip install --upgrade --force-reinstall vllm

ENTRYPOINT ["neuralchat_server"]
CMD ["start", "--config_file", "/vllm.yaml"]
README.md
@@ -0,0 +1,35 @@
Intel NeuralChat Inference Dockerfile installer for Ubuntu 22.04

# Start NeuralChat and vLLM serving with Docker

## Environment Setup

### Setup NVIDIA GPU environment
Use `Dockerfile_vllm` to build the Docker image in your environment.
```bash
docker build . -f Dockerfile_vllm -t neuralchat_vllm:latest
```
If you need to set proxy settings, add `--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy` as shown below.
```bash
docker build . -f Dockerfile_vllm -t neuralchat_vllm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
```

### Start NeuralChat Service
Before starting NeuralChat services, you need to configure `vllm.yaml` according to your real environment.
Make sure the specified `port` is available and `device` is set to `cuda` (`auto` will not work).
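For reference, the `vllm.yaml` added by this commit (shown in full below) looks like this; adjust `model_name_or_path`, `port`, and the engine parameters to match your environment:
```yaml
host: 0.0.0.0
port: 8000

model_name_or_path: "facebook/opt-125m"
device: "cuda"

serving:
  framework: "vllm"
  vllm_engine_params:
    use_async_engine: False
    tensor_parallel_size: 1
    gpu_memory_utilization: 0.9
    swap_space: 4

tasks_list: ['textchat']
```
Then start the service: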
```bash
docker run -it --runtime=nvidia --gpus all --net=host --ipc=host -v /var/run/docker.sock:/var/run/docker.sock -v ./vllm.yaml:/vllm.yaml neuralchat_vllm:latest
```
If you need to set proxy settings, add `-e http_proxy=<your proxy> -e https_proxy=<your proxy>` as shown below.
```bash
docker run -it --runtime=nvidia --gpus all -e http_proxy=<your proxy> -e https_proxy=<your proxy> --net=host --ipc=host -v /var/run/docker.sock:/var/run/docker.sock -v ./vllm.yaml:/vllm.yaml neuralchat_vllm:latest
```

## Consume the Service
Once the `docker run` command has executed successfully, you can consume the HTTP services offered by NeuralChat.

Here is an example of consuming the vLLM service; remember to substitute your real IP and port.

```bash
curl -X POST -H "Content-Type: application/json" -d '{"prompt": "Tell me about Intel Xeon processors."}' http://localhost:8000/v1/chat/completions
```
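
For programmatic access, the same request can be sent from Python. Below is a minimal sketch using the `requests` library; it assumes the server address from the curl example above, and the exact response schema may vary across NeuralChat versions:

```python
import requests

# NeuralChat serving endpoint (substitute your real IP and port).
URL = "http://localhost:8000/v1/chat/completions"

# Same request body as the curl example above.
payload = {"prompt": "Tell me about Intel Xeon processors."}

response = requests.post(URL, json=payload, timeout=300)
response.raise_for_status()

# Print the raw JSON response to inspect the fields returned by
# your NeuralChat version.
print(response.json())
```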
vllm.yaml
@@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This is the parameter configuration file for NeuralChat Serving.

#################################################################################
# SERVER SETTING #
#################################################################################

host: 0.0.0.0
port: 8000

model_name_or_path: "facebook/opt-125m"
device: "cuda"

serving:
  framework: "vllm"
  vllm_engine_params:
    # To use continuous batching during serving, set use_async_engine to true;
    # otherwise serving is offline and synchronous, meaning the next batch is
    # queued and processed only after the previous batch has finished.
    use_async_engine: False
    tensor_parallel_size: 1
    gpu_memory_utilization: 0.9
    swap_space: 4
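    # Hypothetical alternative (not part of this commit): for online serving
    # with continuous batching, enable the async engine instead:
    #   use_async_engine: True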

# task choices = ['textchat', 'voicechat', 'retrieval', 'text2image', 'finetune']
tasks_list: ['textchat']
