diff --git a/requirements/dev.txt b/requirements/dev.txt
index 92e384668..818ae407c 100644
--- a/requirements/dev.txt
+++ b/requirements/dev.txt
@@ -40,4 +40,3 @@ twine==4.0.2
 
 # Fetch licenses
 pip-licenses==4.2.0
-
diff --git a/runtimes/huggingface/mlserver_huggingface/common.py b/runtimes/huggingface/mlserver_huggingface/common.py
index 8ba48e22e..c7b545685 100644
--- a/runtimes/huggingface/mlserver_huggingface/common.py
+++ b/runtimes/huggingface/mlserver_huggingface/common.py
@@ -5,6 +5,9 @@ from functools import partial
 
 from mlserver.settings import ModelSettings
 
+import torch
+import tensorflow as tf
+
 from optimum.pipelines import pipeline as opt_pipeline
 from transformers.pipelines import pipeline as trf_pipeline
 from transformers.pipelines.base import Pipeline
@@ -31,6 +34,20 @@ def load_pipeline_from_settings(
     tokenizer = hf_settings.pretrained_tokenizer
     if not tokenizer:
         tokenizer = hf_settings.pretrained_model
+    if hf_settings.framework == "tf":
+        if hf_settings.inter_op_threads is not None:
+            tf.config.threading.set_inter_op_parallelism_threads(
+                hf_settings.inter_op_threads
+            )
+        if hf_settings.intra_op_threads is not None:
+            tf.config.threading.set_intra_op_parallelism_threads(
+                hf_settings.intra_op_threads
+            )
+    elif hf_settings.framework == "pt":
+        if hf_settings.inter_op_threads is not None:
+            torch.set_num_interop_threads(hf_settings.inter_op_threads)
+        if hf_settings.intra_op_threads is not None:
+            torch.set_num_threads(hf_settings.intra_op_threads)
 
     hf_pipeline = pipeline(
         hf_settings.task_name,
diff --git a/runtimes/huggingface/mlserver_huggingface/settings.py b/runtimes/huggingface/mlserver_huggingface/settings.py
index f46758463..0c3087b5a 100644
--- a/runtimes/huggingface/mlserver_huggingface/settings.py
+++ b/runtimes/huggingface/mlserver_huggingface/settings.py
@@ -85,6 +85,24 @@ class Config:
     or a GPU ordinal rank like 1).
     """
 
+    inter_op_threads: Optional[int] = None
+    """
+    Threads used for parallelism between independent operations.
+    PyTorch:
+    https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html
+    Tensorflow:
+    https://www.tensorflow.org/api_docs/python/tf/config/threading/set_inter_op_parallelism_threads
+    """
+
+    intra_op_threads: Optional[int] = None
+    """
+    Threads used within an individual op for parallelism.
+    PyTorch:
+    https://pytorch.org/docs/stable/notes/cpu_threading_torchscript_inference.html
+    Tensorflow:
+    https://www.tensorflow.org/api_docs/python/tf/config/threading/set_intra_op_parallelism_threads
+    """
+
     @property
     def task_name(self):
         if self.task == "translation":
diff --git a/runtimes/huggingface/setup.py b/runtimes/huggingface/setup.py
index 4430b3354..6a37e1ff9 100644
--- a/runtimes/huggingface/setup.py
+++ b/runtimes/huggingface/setup.py
@@ -36,6 +36,7 @@ def _load_description() -> str:
     install_requires=[
         "mlserver",
         "optimum[onnxruntime]>=1.4.0, <1.8.0",
+        "tensorflow",
         "Pillow",
     ],
     long_description=_load_description(),
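
For reference, a minimal sketch of what the new settings do at pipeline-load time, mirroring the `load_pipeline_from_settings()` change above. The `framework` and thread-count values here are hypothetical example values; in the runtime they come from `HuggingFaceSettings`:

```python
import torch
import tensorflow as tf

framework = "pt"      # "pt" (PyTorch) or "tf" (TensorFlow); example value
inter_op_threads = 1  # parallelism across independent ops
intra_op_threads = 4  # parallelism within a single op

if framework == "tf":
    # TensorFlow thread-pool sizes must be set before the pools are
    # initialized, i.e. before the first op runs.
    tf.config.threading.set_inter_op_parallelism_threads(inter_op_threads)
    tf.config.threading.set_intra_op_parallelism_threads(intra_op_threads)
elif framework == "pt":
    # PyTorch only allows the inter-op pool size to be set once, before any
    # inter-op parallel work has started, which is why the patch applies
    # these before constructing the pipeline.
    torch.set_num_interop_threads(inter_op_threads)
    torch.set_num_threads(intra_op_threads)
```

Since `HuggingFaceSettings` is populated from the `parameters.extra` block of `model-settings.json`, users would presumably opt in with something like `"parameters": {"extra": {"inter_op_threads": 1, "intra_op_threads": 4}}` alongside their existing task settings; treat that exact config path as an assumption based on how the runtime loads its other settings.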