stateful inference (#2513)
* stateful inference-core layer

* add grpc layer

* add google rpc submodule

* fmt

* update sequence batch img

* update sequence batch img

* fmt

* delete unused file

* fmt

* fix log and update doc

* update log

* fmt

* make BatchAggregator a base class

* fix conflict

* fix conflict

* add SequenceBatchAggregator

* update ci for submodule

* refactor

* fmt

* fmt

* fix lint

* code refactor

* update readme

* update readme

* fmt

* fmt

* test workflow

* revert test

* revert test response

* fmt

* fmt

* update readme

* allow the number of jobGroups to be larger than the batch size

* fmt

* fix typo

* add stateful test data

* fmt

* fmt

* fmt

* fmt

* set default maxNumSequence

* fmt

* fmt

* revert back config.properties

* fmt
lxning authored Nov 8, 2023
1 parent d0f8905 commit e1c31e1
Showing 53 changed files with 1,747 additions and 270 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/benchmark_nightly.yml
@@ -36,6 +36,8 @@ jobs:
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          sudo apt-get update -y
2 changes: 2 additions & 0 deletions .github/workflows/ci_cpu.yml
@@ -35,6 +35,8 @@ jobs:
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev
2 changes: 2 additions & 0 deletions .github/workflows/ci_gpu.yml
@@ -39,6 +39,8 @@ jobs:
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121
2 changes: 2 additions & 0 deletions .github/workflows/codeql.yml
@@ -34,6 +34,8 @@ jobs:
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3
+        with:
+          submodules: recursive

      - name: Setup Python 3.8
        uses: actions/setup-python@v4
2 changes: 2 additions & 0 deletions .github/workflows/docker-ci.yaml
@@ -17,6 +17,8 @@ jobs:
        python-version: ["3.8", "3.9", "3.10"]
    steps:
      - uses: actions/checkout@v3
+        with:
+          submodules: recursive

      - name: Test build_image.sh script with custom tagging and gpu flag
        working-directory: docker
2 changes: 2 additions & 0 deletions .github/workflows/docker-nightly-build.yml
@@ -22,6 +22,8 @@ jobs:
          architecture: x64
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Login to Docker
        env:
          DOCKER_PASSWORD: ${{secrets.DOCKER_PASSWORD}}
2 changes: 2 additions & 0 deletions .github/workflows/regression_tests_cpu.yml
@@ -34,6 +34,8 @@ jobs:
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev
2 changes: 2 additions & 0 deletions .github/workflows/regression_tests_cpu_binaries.yml
@@ -21,6 +21,8 @@ jobs:
        binaries: ["pypi", "conda"]
    steps:
      - uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Setup conda with Python ${{ matrix.python-version }}
        uses: s-weigand/setup-conda@v1
        with:
2 changes: 2 additions & 0 deletions .github/workflows/regression_tests_docker.yml
@@ -29,6 +29,8 @@ jobs:
          docker system prune --all --volumes -f
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Branch name
        run: |
          echo $GITHUB_REF_NAME
2 changes: 2 additions & 0 deletions .github/workflows/regression_tests_gpu.yml
@@ -42,6 +42,8 @@ jobs:
          java-version: '17'
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev --cuda=cu121
2 changes: 2 additions & 0 deletions .github/workflows/regression_tests_gpu_binaries.yml
@@ -28,6 +28,8 @@ jobs:
          ls -la ./
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - uses: conda-incubator/setup-miniconda@v2
        with:
          miniconda-version: "latest"
2 changes: 2 additions & 0 deletions .github/workflows/torchserve-nightly-build.yml
@@ -14,6 +14,8 @@ jobs:
      - run: conda install -y conda-build anaconda-client
      - name: Checkout TorchServe
        uses: actions/checkout@v3
+        with:
+          submodules: recursive
      - name: Install dependencies
        run: |
          python ts_scripts/install_dependencies.py --environment=dev
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
+[submodule "third_party/google/rpc"]
+	path = third_party/google/rpc
+	url = https://github.com/googleapis/googleapis.git
7 changes: 4 additions & 3 deletions docs/grpc_api.md
@@ -33,12 +33,13 @@ Run following commands to Register, run inference and unregister, densenet161 model
```bash
git clone https://github.com/pytorch/serve
cd serve
+git submodule init
```

- Install gRPC python dependencies

```bash
-pip install -U grpcio protobuf grpcio-tools
+pip install -U grpcio protobuf grpcio-tools googleapis-common-protos
```

- Start torchServe
@@ -51,7 +52,7 @@ torchserve --start --model-store models/
- Generate python gRPC client stub using the proto files

```bash
-python -m grpc_tools.protoc --proto_path=frontend/server/src/main/resources/proto/ --python_out=ts_scripts --grpc_python_out=ts_scripts frontend/server/src/main/resources/proto/inference.proto frontend/server/src/main/resources/proto/management.proto
+python -m grpc_tools.protoc -I third_party/google/rpc --proto_path=frontend/server/src/main/resources/proto/ --python_out=ts_scripts --grpc_python_out=ts_scripts frontend/server/src/main/resources/proto/inference.proto frontend/server/src/main/resources/proto/management.proto
```
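Once the stubs are generated, they can be driven directly from Python. A minimal sketch, assuming TorchServe's default gRPC inference port 7070 and a local `kitten.jpg` test image; the module and class names (`inference_pb2`, `inference_pb2_grpc`, `InferenceAPIsServiceStub`, `PredictionsRequest`) follow from `inference.proto`, while the file name and running server are illustrative assumptions, not part of this commit:

```python
import grpc

import inference_pb2  # generated into ts_scripts by the protoc command above
import inference_pb2_grpc

# Plain-text channel to TorchServe's gRPC inference endpoint.
channel = grpc.insecure_channel("localhost:7070")
stub = inference_pb2_grpc.InferenceAPIsServiceStub(channel)

with open("kitten.jpg", "rb") as f:  # illustrative input file
    request = inference_pb2.PredictionsRequest(
        model_name="densenet161",
        input={"data": f.read()},
    )

response = stub.Predictions(request)
print(response.prediction.decode("utf-8"))
```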

- Register densenet161 model
@@ -95,4 +96,4 @@ def handle(data, context):
    for i in range(3):
        send_intermediate_predict_response(["intermediate_response"], context.request_ids, "Intermediate Prediction success", 200, context)
    return ["hello world "]
```
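On the client side, each `send_intermediate_predict_response` call in the handler above surfaces as one message on the response stream. A minimal sketch, assuming the `StreamPredictions` RPC defined in TorchServe's `inference.proto` and a hypothetical model named `streaming_model`:

```python
import grpc

import inference_pb2
import inference_pb2_grpc

channel = grpc.insecure_channel("localhost:7070")
stub = inference_pb2_grpc.InferenceAPIsServiceStub(channel)

# Server-streaming RPC: iterating yields the three intermediate
# responses, followed by the final "hello world " prediction.
responses = stub.StreamPredictions(
    inference_pb2.PredictionsRequest(
        model_name="streaming_model",  # hypothetical model name
        input={"data": b"test"},
    )
)
for resp in responses:
    print(resp.prediction.decode("utf-8"))
```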
Binary file added docs/images/stateful_batch.jpg
@@ -0,0 +1,140 @@
import logging
from abc import ABC

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

from ts.context import Context
from ts.torch_handler.base_handler import BaseHandler

logger = logging.getLogger(__name__)
logger.info("Transformers version %s", transformers.__version__)


class LlamaHandler(BaseHandler, ABC):
    """
    Transformers handler class for text generation with Llama-style causal
    language models.
    """

    def __init__(self):
        super(LlamaHandler, self).__init__()
        self.max_length = None
        self.max_new_tokens = None
        self.tokenizer = None
        self.initialized = False

    def initialize(self, ctx: Context):
        """In this initialize function, the HF large model is loaded in
        8-bit precision and spread across the available devices via the
        device_map setting.
        Args:
            ctx (context): It is a JSON Object containing information
            pertaining to the model artifact parameters.
        """
        model_dir = ctx.system_properties.get("model_dir")
        self.max_length = int(ctx.model_yaml_config["handler"]["max_length"])
        self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"])
        model_name = ctx.model_yaml_config["handler"]["model_name"]
        model_path = f'{model_dir}/{ctx.model_yaml_config["handler"]["model_path"]}'
        seed = int(ctx.model_yaml_config["handler"]["manual_seed"])
        torch.manual_seed(seed)

        logger.info("Model %s loading tokenizer", ctx.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="balanced",
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
            load_in_8bit=True,
            trust_remote_code=True,
        )
        if ctx.model_yaml_config["handler"]["fast_kernels"]:
            from optimum.bettertransformer import BetterTransformer

            try:
                self.model = BetterTransformer.transform(self.model)
            except RuntimeError:
                logger.warning(
                    "HuggingFace Optimum does not support this model; for the list of "
                    "supported models, see "
                    "https://huggingface.co/docs/optimum/bettertransformer/overview"
                )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        logger.info("Model %s loaded successfully", ctx.model_name)
        self.initialized = True

    def preprocess(self, requests):
        """
        Basic text preprocessing, based on the user's choice of application mode.
        Args:
            requests (list): A list of dictionaries with a "data" or "body" field, each
            containing the input text to be processed.
        Returns:
            tuple: A tuple with two tensors: the batch of input ids and the batch of
            attention masks.
        """
        input_texts = [data.get("data") or data.get("body") for data in requests]
        input_ids_batch, attention_mask_batch = [], []
        for input_text in input_texts:
            input_ids, attention_mask = self.encode_input_text(input_text)
            input_ids_batch.append(input_ids)
            attention_mask_batch.append(attention_mask)
        # Inputs are encoded with padding=False, so concatenating along dim 0
        # assumes the encoded sequences in a batch share the same length.
        input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.model.device)
        attention_mask_batch = torch.cat(attention_mask_batch, dim=0).to(
            self.model.device
        )
        return input_ids_batch, attention_mask_batch

    def encode_input_text(self, input_text):
        """
        Encodes a single input text using the tokenizer.
        Args:
            input_text (str): The input text to be encoded.
        Returns:
            tuple: A tuple with two tensors: the encoded input ids and the attention mask.
        """
        if isinstance(input_text, (bytes, bytearray)):
            input_text = input_text.decode("utf-8")
        logger.info("Received text: '%s'", input_text)
        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_length,
            padding=False,
            add_special_tokens=True,
            return_tensors="pt",
            truncation=True,
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        return input_ids, attention_mask

    def inference(self, input_batch):
        """
        Generates a completion for the received text using the loaded
        transformers checkpoint.
        Args:
            input_batch (tuple): A tuple with two tensors: the batch of input ids and the batch
            of attention masks, as returned by the preprocess function.
        Returns:
            list: A list of strings with the generated text for each input in the batch.
        """
        input_ids_batch, attention_mask_batch = input_batch
        input_ids_batch = input_ids_batch.to(self.model.device)
        outputs = self.model.generate(
            input_ids_batch,
            attention_mask=attention_mask_batch,
            max_new_tokens=self.max_new_tokens,
        )

        inferences = self.tokenizer.batch_decode(
            outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        logger.info("Generated text: %s", inferences)
        return inferences

    def postprocess(self, inference_output):
        """Post Process Function converts the predicted response into Torchserve readable format.
        Args:
            inference_output (list): It contains the predicted response of the input text.
        Returns:
            (list): Returns a list of the predictions.
        """
        return inference_output
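The `initialize()` method above pulls several values out of `ctx.model_yaml_config["handler"]`. For orientation, a hypothetical config fragment, written as the Python dict the handler would receive; the key names come from the code above, while every value is an illustrative placeholder:

```python
# Hypothetical ctx.model_yaml_config contents; key names are taken from the
# handler code above, values are illustrative placeholders only.
model_yaml_config = {
    "handler": {
        "model_name": "meta-llama/Llama-2-7b-chat-hf",  # illustrative
        "model_path": "model/llama",  # resolved relative to model_dir
        "max_length": 256,
        "max_new_tokens": 50,
        "manual_seed": 40,
        "fast_kernels": False,
    }
}
```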
