stateful inference (#2513)
* stateful inference-core layer

* add grpc layer

* add google rpc submodule

* fmt

* update sequence batch img

* update sequence batch img

* fmt

* delete unused file

* fmt

* fix log and update doc

* update log

* fmt

* make BatchAggregator a base class

* fix conflict

* fix conflict

* add SequenceBatchAggregator

* update ci for submodule

* refactor

* fmt

* fmt

* fix lint

* code refactor

* update readme

* update readme

* fmt

* fmt

* test workflow

* revert test

* revert test response

* fmt

* fmt

* update readme

* allow the number of jobGroups to be larger than the batch size

* fmt

* fix typo

* add stateful test data

* fmt

* fmt

* fmt

* fmt

* set default maxNumSequence

* fmt

* fmt

* revert back config.properties

* fmt
lxning authored Nov 8, 2023
1 parent d0f8905 commit e1c31e1
Showing 53 changed files with 1,747 additions and 270 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/benchmark_nightly.yml
@@ -36,6 +36,8 @@ jobs:
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          sudo apt-get update -y
2 changes: 2 additions & 0 deletions .github/workflows/ci_cpu.yml
@@ -35,6 +35,8 @@ jobs:
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev
2 changes: 2 additions & 0 deletions .github/workflows/ci_gpu.yml
@@ -39,6 +39,8 @@ jobs:
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121
2 changes: 2 additions & 0 deletions .github/workflows/codeql.yml
@@ -34,6 +34,8 @@ jobs:
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
+        with:
+          submodules: recursive

      - name: Setup Python 3.8
        uses: actions/setup-python@v4
2 changes: 2 additions & 0 deletions .github/workflows/docker-ci.yaml
@@ -17,6 +17,8 @@ jobs:
        python-version: ["3.8", "3.9", "3.10"]
    steps:
      - uses: actions/checkout@v3
+        with:
+          submodules: recursive

      - name: Test build_image.sh script with custom tagging and gpu flag
        working-directory: docker
2 changes: 2 additions & 0 deletions .github/workflows/docker-nightly-build.yml
@@ -22,6 +22,8 @@ jobs:
          architecture: x64
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Login to Docker
        env:
          DOCKER_PASSWORD: ${{secrets.DOCKER_PASSWORD}}
2 changes: 2 additions & 0 deletions .github/workflows/regression_tests_cpu.yml
@@ -34,6 +34,8 @@ jobs:
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev
2 changes: 2 additions & 0 deletions .github/workflows/regression_tests_cpu_binaries.yml
@@ -21,6 +21,8 @@ jobs:
        binaries: ["pypi", "conda"]
    steps:
      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Setup conda with Python ${{ matrix.python-version }}
        uses: s-weigand/setup-conda@v1
        with:
2 changes: 2 additions & 0 deletions .github/workflows/regression_tests_docker.yml
@@ -29,6 +29,8 @@ jobs:
          docker system prune --all --volumes -f
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Branch name
        run: |
          echo $GITHUB_REF_NAME
2 changes: 2 additions & 0 deletions .github/workflows/regression_tests_gpu.yml
@@ -42,6 +42,8 @@ jobs:
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121
2 changes: 2 additions & 0 deletions .github/workflows/regression_tests_gpu_binaries.yml
@@ -28,6 +28,8 @@ jobs:
          ls -la ./
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - uses: conda-incubator/setup-miniconda@v2
        with:
          miniconda-version: "latest"
2 changes: 2 additions & 0 deletions .github/workflows/torchserve-nightly-build.yml
@@ -14,6 +14,8 @@ jobs:
      - run: conda install -y conda-build anaconda-client
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
+[submodule "third_party/google/rpc"]
+	path = third_party/google/rpc
+	url = https://github.com/googleapis/googleapis.git
7 changes: 4 additions & 3 deletions docs/grpc_api.md
@@ -33,12 +33,13 @@ Run following commands to Register, run inference and unregister, densenet161 model
```bash
git clone https://github.com/pytorch/serve
cd serve
+git submodule init
```

- Install gRPC python dependencies

```bash
-pip install -U grpcio protobuf grpcio-tools
+pip install -U grpcio protobuf grpcio-tools googleapis-common-protos
```

- Start torchServe
@@ -51,7 +52,7 @@ torchserve --start --model-store models/
- Generate python gRPC client stub using the proto files

```bash
-python -m grpc_tools.protoc --proto_path=frontend/server/src/main/resources/proto/ --python_out=ts_scripts --grpc_python_out=ts_scripts frontend/server/src/main/resources/proto/inference.proto frontend/server/src/main/resources/proto/management.proto
+python -m grpc_tools.protoc -I third_party/google/rpc --proto_path=frontend/server/src/main/resources/proto/ --python_out=ts_scripts --grpc_python_out=ts_scripts frontend/server/src/main/resources/proto/inference.proto frontend/server/src/main/resources/proto/management.proto
```
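Once the stubs are generated, they can be driven directly from Python. A minimal sketch, assuming TorchServe's default gRPC inference port 7070 and a local `kitten.jpg` test image; the module and class names (`inference_pb2`, `inference_pb2_grpc`, `InferenceAPIsServiceStub`, `PredictionsRequest`) follow from `inference.proto`, while the file name and running server are illustrative assumptions, not part of this commit:

```python
import grpc

import inference_pb2  # generated into ts_scripts by the protoc command above
import inference_pb2_grpc

# Plain-text channel to TorchServe's gRPC inference endpoint.
channel = grpc.insecure_channel("localhost:7070")
stub = inference_pb2_grpc.InferenceAPIsServiceStub(channel)

with open("kitten.jpg", "rb") as f:  # illustrative input file
    request = inference_pb2.PredictionsRequest(
        model_name="densenet161",
        input={"data": f.read()},
    )

response = stub.Predictions(request)
print(response.prediction.decode("utf-8"))
```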

- Register densenet161 model
@@ -95,4 +96,4 @@ def handle(data, context):
    for i in range(3):
        send_intermediate_predict_response(["intermediate_response"], context.request_ids, "Intermediate Prediction success", 200, context)
    return ["hello world "]
```
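On the client side, each `send_intermediate_predict_response` call in the handler above surfaces as one message on the response stream. A minimal sketch, assuming the `StreamPredictions` RPC defined in TorchServe's `inference.proto` and a hypothetical model named `streaming_model`:

```python
import grpc

import inference_pb2
import inference_pb2_grpc

channel = grpc.insecure_channel("localhost:7070")
stub = inference_pb2_grpc.InferenceAPIsServiceStub(channel)

# Server-streaming RPC: iterating yields the three intermediate
# responses, followed by the final "hello world " prediction.
responses = stub.StreamPredictions(
    inference_pb2.PredictionsRequest(
        model_name="streaming_model",  # hypothetical model name
        input={"data": b"test"},
    )
)
for resp in responses:
    print(resp.prediction.decode("utf-8"))
```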
Binary file added docs/images/stateful_batch.jpg
@@ -0,0 +1,140 @@
import logging
from abc import ABC

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

from ts.context import Context
from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)
logger.info("Transformers version %s", transformers.__version__)


class LlamaHandler(BaseHandler, ABC):
    """
    Transformers handler class for text generation with Llama-style causal
    language models.
    """

    def __init__(self):
        super(LlamaHandler, self).__init__()
        self.max_length = None
        self.max_new_tokens = None
        self.tokenizer = None
        self.initialized = False

    def initialize(self, ctx: Context):
        """In this initialize function, the HF large model is loaded in
        8-bit precision and spread across the available devices via the
        device_map setting.
        Args:
            ctx (context): It is a JSON Object containing information
            pertaining to the model artifact parameters.
        """
        model_dir = ctx.system_properties.get("model_dir")
        self.max_length = int(ctx.model_yaml_config["handler"]["max_length"])
        self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"])
        model_name = ctx.model_yaml_config["handler"]["model_name"]
        model_path = f'{model_dir}/{ctx.model_yaml_config["handler"]["model_path"]}'
        seed = int(ctx.model_yaml_config["handler"]["manual_seed"])
        torch.manual_seed(seed)

        logger.info("Model %s loading tokenizer", ctx.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="balanced",
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
            load_in_8bit=True,
            trust_remote_code=True,
        )
        if ctx.model_yaml_config["handler"]["fast_kernels"]:
            from optimum.bettertransformer import BetterTransformer

            try:
                self.model = BetterTransformer.transform(self.model)
            except RuntimeError:
                logger.warning(
                    "HuggingFace Optimum does not support this model; for the list of "
                    "supported models, see "
                    "https://huggingface.co/docs/optimum/bettertransformer/overview"
                )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        logger.info("Model %s loaded successfully", ctx.model_name)
        self.initialized = True

    def preprocess(self, requests):
        """
        Basic text preprocessing, based on the user's choice of application mode.
        Args:
            requests (list): A list of dictionaries with a "data" or "body" field, each
            containing the input text to be processed.
        Returns:
            tuple: A tuple with two tensors: the batch of input ids and the batch of
            attention masks.
        """
        input_texts = [data.get("data") or data.get("body") for data in requests]
        input_ids_batch, attention_mask_batch = [], []
        for input_text in input_texts:
            input_ids, attention_mask = self.encode_input_text(input_text)
            input_ids_batch.append(input_ids)
            attention_mask_batch.append(attention_mask)
        # Inputs are encoded with padding=False, so concatenating along dim 0
        # assumes the encoded sequences in a batch share the same length.
        input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.model.device)
        attention_mask_batch = torch.cat(attention_mask_batch, dim=0).to(
            self.model.device
        )
        return input_ids_batch, attention_mask_batch

    def encode_input_text(self, input_text):
        """
        Encodes a single input text using the tokenizer.
        Args:
            input_text (str): The input text to be encoded.
        Returns:
            tuple: A tuple with two tensors: the encoded input ids and the attention mask.
        """
        if isinstance(input_text, (bytes, bytearray)):
            input_text = input_text.decode("utf-8")
        logger.info("Received text: '%s'", input_text)
        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_length,
            padding=False,
            add_special_tokens=True,
            return_tensors="pt",
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        return input_ids, attention_mask

    def inference(self, input_batch):
        """
        Generates a completion for the received text using the loaded
        transformers checkpoint.
        Args:
            input_batch (tuple): A tuple with two tensors: the batch of input ids and the batch
            of attention masks, as returned by the preprocess function.
        Returns:
            list: A list of strings with the generated text for each input in the batch.
        """
        input_ids_batch, attention_mask_batch = input_batch
        input_ids_batch = input_ids_batch.to(self.model.device)
        outputs = self.model.generate(
            input_ids_batch,
            attention_mask=attention_mask_batch,
            max_new_tokens=self.max_new_tokens,
        )

        inferences = self.tokenizer.batch_decode(
            outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        logger.info("Generated text: %s", inferences)
        return inferences

    def postprocess(self, inference_output):
        """Post Process Function converts the predicted response into Torchserve readable format.
        Args:
            inference_output (list): It contains the predicted response of the input text.
        Returns:
            (list): Returns a list of the predictions.
        """
        return inference_output
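The `initialize()` method above pulls several values out of `ctx.model_yaml_config["handler"]`. For orientation, a hypothetical config fragment, written as the Python dict the handler would receive; the key names come from the code above, while every value is an illustrative placeholder:

```python
# Hypothetical ctx.model_yaml_config contents; key names are taken from the
# handler code above, values are illustrative placeholders only.
model_yaml_config = {
    "handler": {
        "model_name": "meta-llama/Llama-2-7b-chat-hf",  # illustrative
        "model_path": "model/llama",  # resolved relative to model_dir
        "max_length": 256,
        "max_new_tokens": 50,
        "manual_seed": 40,
        "fast_kernels": False,
    }
}
```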
