From 4657036f6581e11724c34e2e4630d0cfff923e70 Mon Sep 17 00:00:00 2001
From: Sihan Chen <39623753+Spycsh@users.noreply.github.com>
Date: Fri, 23 Feb 2024 13:24:21 +0800
Subject: [PATCH] [NeuralChat] CUDA serving with Triton Inference Server (#1293)

---
 .../triton_inference_sever/cuda/README.md    |  79 +++++++++++
 .../triton/text_generation/cuda/config.pbtxt |  33 +++++
 .../triton/text_generation/cuda/model.py     | 132 ++++++++++++++++++
 3 files changed, 244 insertions(+)
 create mode 100644 intel_extension_for_transformers/neural_chat/examples/serving/triton_inference_sever/cuda/README.md
 create mode 100644 intel_extension_for_transformers/neural_chat/serving/triton/text_generation/cuda/config.pbtxt
 create mode 100644 intel_extension_for_transformers/neural_chat/serving/triton/text_generation/cuda/model.py

diff --git a/intel_extension_for_transformers/neural_chat/examples/serving/triton_inference_sever/cuda/README.md b/intel_extension_for_transformers/neural_chat/examples/serving/triton_inference_sever/cuda/README.md
new file mode 100644
index 00000000000..ce3b1bef2eb
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/examples/serving/triton_inference_sever/cuda/README.md
@@ -0,0 +1,79 @@
+# Serving NeuralChat Text Generation with Triton Inference Server (CUDA)
+
+NVIDIA Triton Inference Server is widely adopted inference serving software. We also support serving and deploying NeuralChat models with Triton Inference Server on CUDA devices.
+
+## Prepare serving scripts
+
+```
+cd /neural_chat/examples/serving
+mkdir -p models/text_generation/1/
+cp ../../serving/triton/text_generation/cuda/model.py models/text_generation/1/model.py
+cp ../../serving/triton/text_generation/cuda/config.pbtxt models/text_generation/config.pbtxt
+```
+
+Your folder structure under the current `serving` folder should then look like this:
+
+```
+serving/
+├── models
+│   └── text_generation
+│       ├── 1
+│       │   └── model.py
+│       └── config.pbtxt
+└── README.md
+```
+
+## Start Triton Inference Server
+
+```
+cd /neural_chat/examples/serving
+docker run -d --gpus all -e PYTHONPATH=/opt/tritonserver/intel-extension-for-transformers --net=host -v ${PWD}/models:/models spycsh/triton_neuralchat_gpu:v2 tritonserver --model-repository=/models --http-port 8021
+```
+
+Pass `-v` to map the model repository on your host machine into the Docker container.
+
+## Multi-card serving (optional)
+
+You can also do multi-card serving to get better throughput by specifying an instance group, a feature provided by Triton Inference Server.
+
+To do that, edit the `instance_group` field in your `config.pbtxt`.
+
+One example looks like the following:
+
+```
+instance_group [
+  {
+    count: 1
+    kind: KIND_GPU
+    gpus: [0, 1]
+  }
+]
+```
+
+This initializes one execution instance on each of the listed GPU devices. Please check the configuration details through this [link](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/model_configuration.html#multiple-model-instances).
+
+## Quick check whether the server is up
+
+To check whether the server is up:
+
+```
+curl -v localhost:8021/v2/health/ready
+```
+
+You will see `HTTP/1.1 200 OK` if your server is up and ready to receive requests.
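+
+You can also check readiness from Python. The snippet below is a minimal sketch (not part of the shipped scripts); it assumes the `tritonclient[http]` package is installed on the host and uses the `--http-port 8021` configured above:
+
+```
+import tritonclient.http as httpclient
+
+# Connect to the Triton HTTP endpoint started above.
+client = httpclient.InferenceServerClient(url="localhost:8021")
+
+# True once the server can accept inference requests.
+print(client.is_server_ready())
+```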
+
+## Use Triton client to send inference request
+
+Start the Triton client container and enter it:
+
+```
+cd /neural_chat/examples/serving
+docker run --gpus all --net=host -it --rm -v ${PWD}/../../serving/triton/text_generation/client.py:/workspace/text_generation/client.py nvcr.io/nvidia/tritonserver:23.11-py3-sdk
+```
+
+Send a request:
+
+```
+python /workspace/text_generation/client.py --prompt="Tell me about Intel Xeon Scalable Processors." --url=localhost:8021
+```
diff --git a/intel_extension_for_transformers/neural_chat/serving/triton/text_generation/cuda/config.pbtxt b/intel_extension_for_transformers/neural_chat/serving/triton/text_generation/cuda/config.pbtxt
new file mode 100644
index 00000000000..fb833271af2
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/serving/triton/text_generation/cuda/config.pbtxt
@@ -0,0 +1,33 @@
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: "text_generation"
+backend: "python"
+
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  }
+]
+output [
+  {
+    name: "OUTPUT0"
+    data_type: TYPE_STRING
+    dims: [ 1 ]
+  }
+]
+
+instance_group [{ kind: KIND_GPU }]
diff --git a/intel_extension_for_transformers/neural_chat/serving/triton/text_generation/cuda/model.py b/intel_extension_for_transformers/neural_chat/serving/triton/text_generation/cuda/model.py
new file mode 100644
index 00000000000..9ff7d88fdc7
--- /dev/null
+++ b/intel_extension_for_transformers/neural_chat/serving/triton/text_generation/cuda/model.py
@@ -0,0 +1,132 @@
+# !/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import numpy as np
+
+# triton_python_backend_utils is available in every Triton Python model. You
+# need to use this module to create inference requests and responses. It also
+# contains some utility functions for extracting information from model_config
+# and converting Triton input/output types to numpy types.
+import triton_python_backend_utils as pb_utils
+
+from intel_extension_for_transformers.neural_chat import build_chatbot, PipelineConfig
+
+class TritonPythonModel:
+    """Your Python model must use the same class name. Every Python model
+    that is created must have "TritonPythonModel" as the class name.
+    """
+
+    def initialize(self, args):
+        """`initialize` is called only once when the model is being loaded.
+        Implementing the `initialize` function is optional. This function allows
+        the model to initialize any state associated with this model.
+
+        Parameters
+        ----------
+        args : dict
+          Both keys and values are strings. The dictionary keys and values are:
+          * model_config: A JSON string containing the model configuration
+          * model_instance_kind: A string containing the model instance kind
+          * model_instance_device_id: A string containing the model instance device ID
+          * model_repository: Model repository path
+          * model_version: Model version
+          * model_name: Model name
+        """
+
+        # You must parse model_config. The JSON string is not parsed here for you.
+        self.model_config = model_config = json.loads(args["model_config"])
+        self.model_instance_device_id = json.loads(args["model_instance_device_id"])
+        import numba.cuda as cuda
+        cuda.select_device(self.model_instance_device_id)
+
+        # Get OUTPUT0 configuration
+        output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0")
+
+        # Convert Triton types to numpy types
+        self.output0_dtype = pb_utils.triton_string_to_numpy(
+            output0_config["data_type"]
+        )
+        self.config = PipelineConfig()
+        self.chatbot = build_chatbot(self.config)
+
+    def execute(self, requests):
+        """`execute` MUST be implemented in every Python model. The `execute`
+        function receives a list of pb_utils.InferenceRequest as the only
+        argument. This function is called when an inference request is made
+        for this model. Depending on the batching configuration (e.g. Dynamic
+        Batching) used, `requests` may contain multiple requests. Every
+        Python model must create one pb_utils.InferenceResponse for every
+        pb_utils.InferenceRequest in `requests`. If there is an error, you can
+        set the error argument when creating a pb_utils.InferenceResponse.
+
+        Parameters
+        ----------
+        requests : list
+          A list of pb_utils.InferenceRequest
+
+        Returns
+        -------
+        list
+          A list of pb_utils.InferenceResponse. The length of this list must
+          be the same as `requests`
+        """
+
+        output0_dtype = self.output0_dtype
+        chatbot = self.chatbot
+
+        responses = []
+
+        # Every Python backend must iterate over every one of the requests
+        # and create a pb_utils.InferenceResponse for each of them.
+        for request in requests:
+            # Get INPUT0
+            in_0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
+            in_0 = in_0.as_numpy()
+            text = in_0[0].decode("utf-8")
+            print(f"input prompt: {text}")
+
+            out_0 = chatbot.predict(query=text)
+
+            # Create output tensors. You need pb_utils.Tensor
+            # objects to create pb_utils.InferenceResponse.
+            out_0 = np.array(out_0)
+
+            out_tensor_0 = pb_utils.Tensor("OUTPUT0", out_0.astype(output0_dtype))
+
+            # Create InferenceResponse. You can set an error here in case
+            # there was a problem with handling this inference request.
+            # Below is an example of how you can set errors in inference
+            # response:
+            #
+            # pb_utils.InferenceResponse(
+            #     output_tensors=..., TritonError("An error occurred"))
+            inference_response = pb_utils.InferenceResponse(
+                output_tensors=[out_tensor_0]
+            )
+            responses.append(inference_response)
+
+        # You should return a list of pb_utils.InferenceResponse. The length
+        # of this list must match the length of the `requests` list.
+        return responses
+
+    def finalize(self):
+        """`finalize` is called only once when the model is being unloaded.
+        Implementing the `finalize` function is OPTIONAL. This function allows
+        the model to perform any necessary clean-ups before exit.
+        """
+        print("Cleaning up...")
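
The `client.py` referenced in the README is not part of this patch. For reference, a minimal client matching the `INPUT0`/`OUTPUT0` string interface declared in `config.pbtxt` could look like the sketch below; the function name and defaults are illustrative, and it assumes `tritonclient[http]` and `numpy` are installed and the server listens on port 8021 as configured above.

```
import numpy as np
import tritonclient.http as httpclient

def generate(prompt: str, url: str = "localhost:8021") -> str:
    client = httpclient.InferenceServerClient(url=url)

    # INPUT0 is a TYPE_STRING tensor with dims [1]: send one UTF-8 encoded string.
    input0 = httpclient.InferInput("INPUT0", [1], "BYTES")
    input0.set_data_from_numpy(np.array([prompt.encode("utf-8")], dtype=np.object_))

    result = client.infer(model_name="text_generation", inputs=[input0])

    # OUTPUT0 is also a string tensor; decode the single returned element.
    out = result.as_numpy("OUTPUT0")[0]
    return out.decode("utf-8") if isinstance(out, bytes) else str(out)

if __name__ == "__main__":
    print(generate("Tell me about Intel Xeon Scalable Processors."))
```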