
Merge branch 'main' into feat/max-concurrency
aarnphm committed May 30, 2023
2 parents 02a1432 + bd56fa9 commit b5f5a72
Showing 37 changed files with 237 additions and 380 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -73,7 +73,7 @@ jobs:
        run: |
          npm install -g npm@^7 pyright
      - name: Setup bufbuild/buf
-        uses: bufbuild/buf-setup-action@v1.18.0
+        uses: bufbuild/buf-setup-action@v1.19.0
        with:
          github_token: ${{ github.token }}
      - name: Cache pip dependencies
1 change: 1 addition & 0 deletions DEVELOPMENT.md
@@ -33,6 +33,7 @@ If you are interested in proposing a new feature, make sure to create a new feat

```bash
git switch main # ensure you're on the main branch
+git fetch upstream --tags
git branch --set-upstream-to=upstream/main
```

2 changes: 1 addition & 1 deletion README.md
@@ -118,7 +118,7 @@ bentoml serve
Send a prediction request:

```bash
-curl -F 'image=@samples/1.png' http://127.0.0.1:3000/predict_image
+curl -F 'image=@samples/1.png' http://127.0.0.1:3000/predict
```

Build a Bento and generate a docker image:
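
As a sketch, those follow-up commands typically look like this; the Bento tag below is illustrative and will differ in your own build output:

```bash
bentoml build
bentoml containerize my_demo_service:latest
```
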
49 changes: 49 additions & 0 deletions docs/source/guides/batching.rst
@@ -88,3 +88,52 @@ Error handling
If adaptive batching cannot keep up with the rate of incoming requests while satisfying the max
latency configuration, HTTP 503 Service Unavailable is returned. To work around this error, consider
relaxing the max latency requirement and further scaling the underlying hardware resources.
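
For example, the latency budget can be relaxed in ``bentoml_configuration.yaml`` through the standard runner ``batching`` keys; the values below are illustrative and should be tuned to your workload:

.. code-block:: yaml
    :caption: `bentoml_configuration.yaml`

    runners:
        batching:
            enabled: true
            max_batch_size: 100
            max_latency_ms: 10000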

Custom Batching
---------------

Currently, adaptive batching is only effective for certain parameter types, including (non-exhaustively) ``numpy.ndarray``, ``pandas.Series``, and framework-specific types such as ``torch.Tensor``.
Batch parameters of other types are simply collected into a list and passed to the inference function. If your model accepts parameters that are not batchable by default,
you can achieve adaptive batching by wrapping the inference function in a custom :ref:`Runner <concepts/runner:Custom Runner>`.

We will demonstrate this with a PyTorch example which accepts a dictionary of ``torch.Tensor`` as input.

.. code-block:: python
    :caption: `service.py`

    import bentoml
    import torch


    class MyRunnable(bentoml.Runnable):
        def __init__(self):
            super().__init__()
            if torch.cuda.is_available():
                self.device_id = "cuda"
                # By default, torch.FloatTensor is used on CPU; default to CUDA
                # tensors when a GPU is available.
                torch.set_default_tensor_type("torch.cuda.FloatTensor")
            else:
                self.device_id = "cpu"
            self.model = bentoml.pytorch.load_model("my_pytorch_model", device_id=self.device_id)
            # Turn off dropout and batch normalization when running inference.
            self.model.train(False)

        @bentoml.Runnable.method(batchable=True)
        def __call__(self, **batch):
            # The model accepts a dictionary of Tensors as input, but we use ``**``
            # to deconstruct it into keyword arguments so that each argument can be
            # batched correctly. This works only if every parameter is batchable,
            # such as torch.Tensor or np.ndarray.
            # Move the tensors to the target device and pass them to the model as a dict.
            batch = {k: tensor.to(self.device_id) for k, tensor in batch.items()}
            with torch.inference_mode():
                return self.model(batch)


    # Build the runner from the runnable manually, instead of calling the
    # model.to_runner() method.
    runner = bentoml.Runner(MyRunnable)

    svc = bentoml.Service(name="my_ml_service", runners=[runner])


    # Define an API endpoint for the inference.
    @svc.api(input=bentoml.io.JSON(), output=bentoml.io.JSON())
    async def predict(api_input):
        # Deconstruct the input dictionary into keyword arguments for batching.
        return await runner.async_run(**api_input)
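
With the service running via ``bentoml serve service:svc``, a request could look like the sketch below; the field names and tensor shapes are purely illustrative and depend on what ``my_pytorch_model`` expects:

.. code-block:: bash

    curl -X POST -H "Content-Type: application/json" \
        --data '{"input_a": [[1.0, 2.0, 3.0]], "input_b": [[4.0, 5.0, 6.0]]}' \
        http://127.0.0.1:3000/predict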
3 changes: 3 additions & 0 deletions docs/source/guides/logging.rst
@@ -132,6 +132,9 @@ When using BentoML as a library, BentoML does not configure any logs. By default
bentoml_logger.addHandler(ch)
bentoml_logger.setLevel(logging.DEBUG)
.. note::

    :bdg-info:`Important:` ``bentoml serve`` forks the given ``service.py`` into child processes. This means that rotating file handlers (any ``RotatingFileHandler`` variant) are not supported within the service definition. See the `notes from the Python logging module <https://docs.python.org/3/howto/logging-cookbook.html#logging-to-a-single-file-from-multiple-processes>`_ for more information.
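
One work-around from that cookbook section is to have every process enqueue records through a ``QueueHandler`` while a single ``QueueListener`` owns the rotating file handler. The sketch below uses only standard-library names; the file name and rotation limits are illustrative:

.. code-block:: python

    import logging
    import logging.handlers
    import multiprocessing

    # A single queue shared by all worker processes.
    log_queue = multiprocessing.Queue(-1)

    # Forked workers only enqueue records; no file I/O happens in the workers.
    bentoml_logger = logging.getLogger("bentoml")
    bentoml_logger.addHandler(logging.handlers.QueueHandler(log_queue))

    # One listener thread owns the rotating file handler and drains the queue.
    file_handler = logging.handlers.RotatingFileHandler(
        "bentoml.log", maxBytes=10_000_000, backupCount=3
    )
    listener = logging.handlers.QueueListener(log_queue, file_handler)
    listener.start()
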
----

.. rubric:: Notes
3 changes: 2 additions & 1 deletion examples/custom_runner/torch_hub_yolov5/requirements.txt
@@ -5,7 +5,8 @@ matplotlib>=3.2.2
numpy>=1.18.5
opencv-python>=4.1.1
pandas>=1.1.4
-protobuf<4.21.3 # https://github.com/ultralytics/yolov5/issues/8012
+protobuf<=3.20.1 # https://github.com/ultralytics/yolov5/issues/8012
+ipython
requests>=2.23.0
scipy>=1.4.1 # Google Colab version
seaborn>=0.11.0
2 changes: 1 addition & 1 deletion examples/custom_runner/torch_hub_yolov5/service.py
@@ -10,7 +10,7 @@ class Yolov5Runnable(bentoml.Runnable):
SUPPORTS_CPU_MULTI_THREADING = True

def __init__(self):
-        self.model = torch.hub.load("ultralytics/yolov5", "yolov5s")
+        self.model = torch.hub.load("ultralytics/yolov5:v6.2", "yolov5s")

if torch.cuda.is_available():
self.model.cuda()
8 changes: 4 additions & 4 deletions requirements/tests-requirements.txt
@@ -2,13 +2,13 @@
pandas
pydantic
scikit-learn
-yamllint==1.31.0
+yamllint==1.32.0
black[jupyter]==22.12.0
-coverage[toml]==7.2.5
+coverage[toml]==7.2.6
setuptools>=63
isort==5.11.4
-ruff==0.0.269
-pytest-cov==4.0.0
+ruff==0.0.270
+pytest-cov==4.1.0
pytest==7.3.1
pytest-xdist[psutil]==3.3.1
pytest-asyncio==0.21.0
11 changes: 5 additions & 6 deletions src/bentoml/_internal/bento/build_config.py
@@ -8,7 +8,6 @@
import subprocess
from sys import version_info
from shlex import quote
-from typing import TYPE_CHECKING

import fs
import attr
@@ -34,7 +33,7 @@
from ..container.frontend.dockerfile import SUPPORTED_PYTHON_VERSIONS
from ..container.frontend.dockerfile import CONTAINER_SUPPORTED_DISTROS

-if TYPE_CHECKING:
+if t.TYPE_CHECKING:
    from attr import Attribute
    from fs.base import FS

@@ -278,7 +277,7 @@ def to_dict(self) -> dict[str, t.Any]:
return bentoml_cattr.unstructure(self)


-if TYPE_CHECKING:
+if t.TYPE_CHECKING:
    CondaPipType = dict[t.Literal["pip"], list[str]]
    DependencyType = list[str | CondaPipType]
else:
@@ -312,7 +311,7 @@ def conda_dependencies_validator(
)


-if TYPE_CHECKING:
+if t.TYPE_CHECKING:
    ListStr: t.TypeAlias = list[str]
    CondaYamlDict = dict[str, DependencyType | list[str]]
else:
@@ -689,7 +688,7 @@ def _python_options_structure_hook(d: t.Any, _: t.Type[PythonOptions]) -> PythonOptions:
bentoml_cattr.register_structure_hook(PythonOptions, _python_options_structure_hook)


-if TYPE_CHECKING:
+if t.TYPE_CHECKING:
    OptionsCls = DockerOptions | CondaOptions | PythonOptions


@@ -740,7 +739,7 @@ class BentoBuildConfig:
converter=dict_options_converter(CondaOptions),
)

-    if TYPE_CHECKING:
+    if t.TYPE_CHECKING:
        # NOTE: This is to ensure that BentoBuildConfig __init__
        # satisfies type checker. docker, python, and conda accepts
        # dict[str, t.Any] since our converter will handle the conversion.
218 changes: 0 additions & 218 deletions src/bentoml/_internal/frameworks/FRAMEWORK_TEMPLATE_PY

This file was deleted.

