diff --git a/openllm-client/src/openllm_client/__init__.pyi b/openllm-client/src/openllm_client/__init__.pyi index 3b5ecfb30..bec8fc114 100644 --- a/openllm-client/src/openllm_client/__init__.pyi +++ b/openllm-client/src/openllm_client/__init__.pyi @@ -15,11 +15,17 @@ class HTTPClient: address: str helpers: _Helpers @overload - def __init__(self, address: str, timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ... + def __init__( + self, address: str, timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ... + ) -> None: ... @overload - def __init__(self, address: str = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ... + def __init__( + self, address: str = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ... + ) -> None: ... @overload - def __init__(self, address: None = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ... + def __init__( + self, address: None = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ... + ) -> None: ... @property def is_ready(self) -> bool: ... def health(self) -> bool: ... @@ -60,11 +66,17 @@ class AsyncHTTPClient: address: str helpers: _AsyncHelpers @overload - def __init__(self, address: str, timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ... + def __init__( + self, address: str, timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ... + ) -> None: ... @overload - def __init__(self, address: str = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ... + def __init__( + self, address: str = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ... + ) -> None: ... @overload - def __init__(self, address: None = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ...) -> None: ... + def __init__( + self, address: None = ..., timeout: int = ..., verify: bool = ..., max_retries: int = ..., api_version: str = ... + ) -> None: ... @property def is_ready(self) -> bool: ... async def health(self) -> bool: ... 
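Reviewer note, not part of the patch: the three re-wrapped overloads above change only line breaks, so construction is unchanged. A minimal usage sketch of the stubbed HTTPClient follows; the address, timeout, and prompt are placeholder assumptions and a running OpenLLM server is assumed.

  from openllm_client import HTTPClient

  # Placeholder address; any reachable OpenLLM server works here.
  client = HTTPClient('http://localhost:3000', timeout=30, max_retries=3)
  if client.is_ready:  # property declared in the stub above
    res = client.generate('What is one plus one?')  # signature per _http.py below
    print(res.outputs[0].text)  # Response.outputs mirrors openllm_client._schemas.Response
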
diff --git a/openllm-client/src/openllm_client/_http.py b/openllm-client/src/openllm_client/_http.py index 9923468e0..ed802a67b 100644 --- a/openllm-client/src/openllm_client/_http.py +++ b/openllm-client/src/openllm_client/_http.py @@ -70,10 +70,14 @@ def query(self, prompt, **attrs): return self.generate(prompt, **attrs) def health(self): - response = self._get('/readyz', response_cls=None, options={'return_raw_response': True, 'max_retries': self._max_retries}) + response = self._get( + '/readyz', response_cls=None, options={'return_raw_response': True, 'max_retries': self._max_retries} + ) return response.status_code == 200 - def generate(self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs) -> Response: + def generate( + self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs + ) -> Response: if timeout is None: timeout = self._timeout if verify is None: @@ -96,7 +100,9 @@ def generate_stream( for response_chunk in self.generate_iterator(prompt, llm_config, stop, adapter_name, timeout, verify, **attrs): yield StreamingResponse.from_response_chunk(response_chunk) - def generate_iterator(self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs) -> t.Iterator[Response]: + def generate_iterator( + self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs + ) -> t.Iterator[Response]: if timeout is None: timeout = self._timeout if verify is None: @@ -146,7 +152,9 @@ def _build_auth_headers(self) -> t.Dict[str, str]: @property async def _metadata(self) -> t.Awaitable[Metadata]: if self.__metadata is None: - self.__metadata = await self._post(f'/{self._api_version}/metadata', response_cls=Metadata, json={}, options={'max_retries': self._max_retries}) + self.__metadata = await self._post( + f'/{self._api_version}/metadata', response_cls=Metadata, json={}, options={'max_retries': self._max_retries} + ) return self.__metadata @property @@ -159,10 +167,14 @@ async def query(self, prompt, **attrs): return await self.generate(prompt, **attrs) async def health(self): - response = await self._get('/readyz', response_cls=None, options={'return_raw_response': True, 'max_retries': self._max_retries}) + response = await self._get( + '/readyz', response_cls=None, options={'return_raw_response': True, 'max_retries': self._max_retries} + ) return response.status_code == 200 - async def generate(self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs) -> Response: + async def generate( + self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs + ) -> Response: if timeout is None: timeout = self._timeout if verify is None: @@ -183,7 +195,9 @@ async def generate(self, prompt, llm_config=None, stop=None, adapter_name=None, async def generate_stream( self, prompt, llm_config=None, stop=None, adapter_name=None, timeout=None, verify=None, **attrs ) -> t.AsyncGenerator[StreamingResponse, t.Any]: - async for response_chunk in self.generate_iterator(prompt, llm_config, stop, adapter_name, timeout, verify, **attrs): + async for response_chunk in self.generate_iterator( + prompt, llm_config, stop, adapter_name, timeout, verify, **attrs + ): yield StreamingResponse.from_response_chunk(response_chunk) async def generate_iterator( diff --git a/openllm-client/src/openllm_client/_schemas.py b/openllm-client/src/openllm_client/_schemas.py index 037c92b3f..1723d583f 100644 --- 
a/openllm-client/src/openllm_client/_schemas.py +++ b/openllm-client/src/openllm_client/_schemas.py @@ -17,7 +17,7 @@ from ._shim import AsyncClient, Client -__all__ = ['Response', 'CompletionChunk', 'Metadata', 'StreamingResponse', 'Helpers'] +__all__ = ['CompletionChunk', 'Helpers', 'Metadata', 'Response', 'StreamingResponse'] @attr.define @@ -42,7 +42,11 @@ def _structure_metadata(data: t.Dict[str, t.Any], cls: type[Metadata]) -> Metada raise RuntimeError(f'Malformed metadata configuration (Server-side issue): {e}') from None try: return cls( - model_id=data['model_id'], timeout=data['timeout'], model_name=data['model_name'], backend=data['backend'], configuration=configuration + model_id=data['model_id'], + timeout=data['timeout'], + model_name=data['model_name'], + backend=data['backend'], + configuration=configuration, ) except Exception as e: raise RuntimeError(f'Malformed metadata (Server-side issue): {e}') from None @@ -61,7 +65,10 @@ class StreamingResponse(_SchemaMixin): @classmethod def from_response_chunk(cls, response: Response) -> StreamingResponse: return cls( - request_id=response.request_id, index=response.outputs[0].index, text=response.outputs[0].text, token_ids=response.outputs[0].token_ids[0] + request_id=response.request_id, + index=response.outputs[0].index, + text=response.outputs[0].text, + token_ids=response.outputs[0].token_ids[0], ) @@ -88,11 +95,17 @@ def async_client(self): return self._async_client def messages(self, messages, add_generation_prompt=False): - return self.client._post('/v1/helpers/messages', response_cls=str, json=dict(messages=messages, add_generation_prompt=add_generation_prompt)) + return self.client._post( + '/v1/helpers/messages', + response_cls=str, + json=dict(messages=messages, add_generation_prompt=add_generation_prompt), + ) async def async_messages(self, messages, add_generation_prompt=False): return await self.async_client._post( - '/v1/helpers/messages', response_cls=str, json=dict(messages=messages, add_generation_prompt=add_generation_prompt) + '/v1/helpers/messages', + response_cls=str, + json=dict(messages=messages, add_generation_prompt=add_generation_prompt), ) @classmethod diff --git a/openllm-client/src/openllm_client/_shim.py b/openllm-client/src/openllm_client/_shim.py index 04a9a730a..4c7470e7f 100644 --- a/openllm-client/src/openllm_client/_shim.py +++ b/openllm-client/src/openllm_client/_shim.py @@ -140,7 +140,9 @@ def parse(self): data = self._raw_response.json() try: - return self._client._process_response_data(data=data, response_cls=self._response_cls, raw_response=self._raw_response) + return self._client._process_response_data( + data=data, response_cls=self._response_cls, raw_response=self._raw_response + ) except Exception as exc: raise ValueError(exc) from None # validation error here @@ -271,10 +273,16 @@ def _build_headers(self, options: RequestOptions) -> httpx.Headers: def _build_request(self, options: RequestOptions) -> httpx.Request: return self._inner.build_request( - method=options.method, headers=self._build_headers(options), url=self._prepare_url(options.url), json=options.json, params=options.params + method=options.method, + headers=self._build_headers(options), + url=self._prepare_url(options.url), + json=options.json, + params=options.params, ) - def _calculate_retry_timeout(self, remaining_retries: int, options: RequestOptions, headers: t.Optional[httpx.Headers] = None) -> float: + def _calculate_retry_timeout( + self, remaining_retries: int, options: RequestOptions, headers: 
t.Optional[httpx.Headers] = None + ) -> float: max_retries = options.get_max_retries(self._max_retries) # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After try: @@ -315,7 +323,9 @@ def _should_retry(self, response: httpx.Response) -> bool: return True return False - def _process_response_data(self, *, response_cls: type[Response], data: t.Dict[str, t.Any], raw_response: httpx.Response) -> Response: + def _process_response_data( + self, *, response_cls: type[Response], data: t.Dict[str, t.Any], raw_response: httpx.Response + ) -> Response: return converter.structure(data, response_cls) def _process_response( @@ -328,13 +338,24 @@ def _process_response( stream_cls: type[_Stream] | type[_AsyncStream] | None, ) -> Response: return APIResponse( - raw_response=raw_response, client=self, response_cls=response_cls, stream=stream, stream_cls=stream_cls, options=options + raw_response=raw_response, + client=self, + response_cls=response_cls, + stream=stream, + stream_cls=stream_cls, + options=options, ).parse() @attr.define(init=False) class Client(BaseClient[httpx.Client, Stream[t.Any]]): - def __init__(self, base_url: str | httpx.URL, version: str, timeout: int | httpx.Timeout = DEFAULT_TIMEOUT, max_retries: int = MAX_RETRIES): + def __init__( + self, + base_url: str | httpx.URL, + version: str, + timeout: int | httpx.Timeout = DEFAULT_TIMEOUT, + max_retries: int = MAX_RETRIES, + ): super().__init__( base_url=base_url, version=version, @@ -366,7 +387,13 @@ def request( stream: bool = False, stream_cls: type[_Stream] | None = None, ) -> Response | _Stream: - return self._request(response_cls=response_cls, options=options, remaining_retries=remaining_retries, stream=stream, stream_cls=stream_cls) + return self._request( + response_cls=response_cls, + options=options, + remaining_retries=remaining_retries, + stream=stream, + stream_cls=stream_cls, + ) def _request( self, @@ -385,7 +412,9 @@ def _request( response.raise_for_status() except httpx.HTTPStatusError as exc: if retries > 0 and self._should_retry(exc.response): - return self._retry_request(response_cls, options, retries, exc.response.headers, stream=stream, stream_cls=stream_cls) + return self._retry_request( + response_cls, options, retries, exc.response.headers, stream=stream, stream_cls=stream_cls + ) # If the response is streamed then we need to explicitly read the completed response exc.response.read() raise ValueError(exc.message) from None @@ -398,7 +427,9 @@ def _request( return self._retry_request(response_cls, options, retries, stream=stream, stream_cls=stream_cls) raise ValueError(request) from None # connection error - return self._process_response(response_cls=response_cls, options=options, raw_response=response, stream=stream, stream_cls=stream_cls) + return self._process_response( + response_cls=response_cls, options=options, raw_response=response, stream=stream, stream_cls=stream_cls + ) def _retry_request( self, @@ -428,7 +459,9 @@ def _get( ) -> Response | _Stream: if options is None: options = {} - return self.request(response_cls, RequestOptions(method='GET', url=path, **options), stream=stream, stream_cls=stream_cls) + return self.request( + response_cls, RequestOptions(method='GET', url=path, **options), stream=stream, stream_cls=stream_cls + ) def _post( self, @@ -442,12 +475,20 @@ def _post( ) -> Response | _Stream: if options is None: options = {} - return self.request(response_cls, RequestOptions(method='POST', url=path, json=json, **options), stream=stream, stream_cls=stream_cls) + return 
self.request( + response_cls, RequestOptions(method='POST', url=path, json=json, **options), stream=stream, stream_cls=stream_cls + ) @attr.define(init=False) class AsyncClient(BaseClient[httpx.AsyncClient, AsyncStream[t.Any]]): - def __init__(self, base_url: str | httpx.URL, version: str, timeout: int | httpx.Timeout = DEFAULT_TIMEOUT, max_retries: int = MAX_RETRIES): + def __init__( + self, + base_url: str | httpx.URL, + version: str, + timeout: int | httpx.Timeout = DEFAULT_TIMEOUT, + max_retries: int = MAX_RETRIES, + ): super().__init__( base_url=base_url, version=version, @@ -486,7 +527,9 @@ async def request( stream: bool = False, stream_cls: type[_AsyncStream] | None = None, ) -> Response | _AsyncStream: - return await self._request(response_cls, options, remaining_retries=remaining_retries, stream=stream, stream_cls=stream_cls) + return await self._request( + response_cls, options, remaining_retries=remaining_retries, stream=stream, stream_cls=stream_cls + ) async def _request( self, @@ -506,7 +549,9 @@ async def _request( response.raise_for_status() except httpx.HTTPStatusError as exc: if retries > 0 and self._should_retry(exc.response): - return self._retry_request(response_cls, options, retries, exc.response.headers, stream=stream, stream_cls=stream_cls) + return self._retry_request( + response_cls, options, retries, exc.response.headers, stream=stream, stream_cls=stream_cls + ) # If the response is streamed then we need to explicitly read the completed response await exc.response.aread() raise ValueError(exc.message) from None @@ -526,7 +571,9 @@ async def _request( return await self._retry_request(response_cls, options, retries, stream=stream, stream_cls=stream_cls) raise ValueError(request) from err # connection error - return self._process_response(response_cls=response_cls, options=options, raw_response=response, stream=stream, stream_cls=stream_cls) + return self._process_response( + response_cls=response_cls, options=options, raw_response=response, stream=stream, stream_cls=stream_cls + ) async def _retry_request( self, @@ -555,7 +602,9 @@ async def _get( ) -> Response | _AsyncStream: if options is None: options = {} - return await self.request(response_cls, RequestOptions(method='GET', url=path, **options), stream=stream, stream_cls=stream_cls) + return await self.request( + response_cls, RequestOptions(method='GET', url=path, **options), stream=stream, stream_cls=stream_cls + ) async def _post( self, @@ -569,4 +618,6 @@ async def _post( ) -> Response | _AsyncStream: if options is None: options = {} - return await self.request(response_cls, RequestOptions(method='POST', url=path, json=json, **options), stream=stream, stream_cls=stream_cls) + return await self.request( + response_cls, RequestOptions(method='POST', url=path, json=json, **options), stream=stream, stream_cls=stream_cls + ) diff --git a/openllm-client/src/openllm_client/_stream.py b/openllm-client/src/openllm_client/_stream.py index a5103207a..e81a7fb07 100644 --- a/openllm-client/src/openllm_client/_stream.py +++ b/openllm-client/src/openllm_client/_stream.py @@ -38,7 +38,9 @@ def _stream(self) -> t.Iterator[Response]: if sse.data.startswith('[DONE]'): break if sse.event is None: - yield self._client._process_response_data(data=sse.model_dump(), response_cls=self._response_cls, raw_response=self._response) + yield self._client._process_response_data( + data=sse.model_dump(), response_cls=self._response_cls, raw_response=self._response + ) @attr.define(auto_attribs=True) @@ -69,7 +71,9 @@ async def 
_stream(self) -> t.AsyncGenerator[Response, None]: if sse.data.startswith('[DONE]'): break if sse.event is None: - yield self._client._process_response_data(data=sse.model_dump(), response_cls=self._response_cls, raw_response=self._response) + yield self._client._process_response_data( + data=sse.model_dump(), response_cls=self._response_cls, raw_response=self._response + ) @attr.define diff --git a/openllm-client/src/openllm_client/_typing_compat.py b/openllm-client/src/openllm_client/_typing_compat.py index 48bd0a855..15d86f8a3 100644 --- a/openllm-client/src/openllm_client/_typing_compat.py +++ b/openllm-client/src/openllm_client/_typing_compat.py @@ -11,5 +11,7 @@ overload as overload, ) -Platform = Annotated[LiteralString, Literal['MacOS', 'Linux', 'Windows', 'FreeBSD', 'OpenBSD', 'iOS', 'iPadOS', 'Android', 'Unknown'], str] +Platform = Annotated[ + LiteralString, Literal['MacOS', 'Linux', 'Windows', 'FreeBSD', 'OpenBSD', 'iOS', 'iPadOS', 'Android', 'Unknown'], str +] Architecture = Annotated[LiteralString, Literal['arm', 'arm64', 'x86', 'x86_64', 'Unknown'], str] diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index f7df0c3b2..db8da204a 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -48,7 +48,9 @@ @attr.define(slots=False, repr=False, init=False) class LLM(t.Generic[M, T]): - async def generate(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs): + async def generate( + self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs + ): if adapter_name is not None and self.__llm_backend__ != 'pt': raise NotImplementedError(f'Adapter is not supported with {self.__llm_backend__}.') config = self.config.model_construct_env(**attrs) @@ -63,10 +65,15 @@ async def generate(self, prompt, prompt_token_ids=None, stop=None, stop_token_id raise RuntimeError('No result is returned.') return final_result.with_options( prompt=prompt, - outputs=[output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) for output in final_result.outputs], + outputs=[ + output.with_options(text=''.join(texts[output.index]), token_ids=token_ids[output.index]) + for output in final_result.outputs + ], ) - async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs): + async def generate_iterator( + self, prompt, prompt_token_ids=None, stop=None, stop_token_ids=None, request_id=None, adapter_name=None, **attrs + ): from bentoml._internal.runner.runner_handle import DummyRunnerHandle if adapter_name is not None and self.__llm_backend__ != 'pt': @@ -131,7 +138,9 @@ async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop # The below are mainly for internal implementation that you don't have to worry about. _model_id: str _revision: t.Optional[str] # - _quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]] + _quantization_config: t.Optional[ + t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig] + ] _quantise: t.Optional[LiteralQuantise] _model_decls: t.Tuple[t.Any, ...] 
__model_attrs: t.Dict[str, t.Any] # @@ -147,7 +156,9 @@ async def generate_iterator(self, prompt, prompt_token_ids=None, stop=None, stop __llm_torch_dtype__: 'torch.dtype' = None __llm_config__: t.Optional[LLMConfig] = None __llm_backend__: LiteralBackend = None - __llm_quantization_config__: t.Optional[t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig]] = None + __llm_quantization_config__: t.Optional[ + t.Union[transformers.BitsAndBytesConfig, transformers.GPTQConfig, transformers.AwqConfig] + ] = None __llm_runner__: t.Optional[Runner[M, T]] = None __llm_model__: t.Optional[M] = None __llm_tokenizer__: t.Optional[T] = None @@ -178,7 +189,9 @@ def __init__( torch_dtype = attrs.pop('torch_dtype', None) # backward compatible if torch_dtype is not None: warnings.warn( - 'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', DeprecationWarning, stacklevel=3 + 'The argument "torch_dtype" is deprecated and will be removed in the future. Please use "dtype" instead.', + DeprecationWarning, + stacklevel=3, ) dtype = torch_dtype _local = False @@ -234,19 +247,27 @@ def __init__( class _Quantise: @staticmethod - def pt(llm: LLM, quantise=None): return quantise + def pt(llm: LLM, quantise=None): + return quantise + @staticmethod - def vllm(llm: LLM, quantise=None): return quantise + def vllm(llm: LLM, quantise=None): + return quantise @apply(lambda val: tuple(str.lower(i) if i else i for i in val)) def _make_tag_components(self, model_id: str, model_version: str | None, backend: str) -> tuple[str, str | None]: model_id, *maybe_revision = model_id.rsplit(':') if len(maybe_revision) > 0: if model_version is not None: - logger.warning("revision is specified (%s). 'model_version=%s' will be ignored.", maybe_revision[0], model_version) + logger.warning( + "revision is specified (%s). 
'model_version=%s' will be ignored.", maybe_revision[0], model_version + ) model_version = maybe_revision[0] if validate_is_path(model_id): - model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id)) + model_id, model_version = ( + resolve_filepath(model_id), + first_not_none(model_version, default=generate_hash_from_file(model_id)), + ) return f'{backend}-{normalise_model_name(model_id)}', model_version @functools.cached_property @@ -255,9 +276,11 @@ def _has_gpus(self): from cuda import cuda err, *_ = cuda.cuInit(0) - if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to initialise CUDA runtime binding.') + if err != cuda.CUresult.CUDA_SUCCESS: + raise RuntimeError('Failed to initialise CUDA runtime binding.') err, _ = cuda.cuDeviceGetCount() - if err != cuda.CUresult.CUDA_SUCCESS: raise RuntimeError('Failed to get CUDA device count.') + if err != cuda.CUresult.CUDA_SUCCESS: + raise RuntimeError('Failed to get CUDA device count.') return True except (ImportError, RuntimeError): return False @@ -269,7 +292,9 @@ def _torch_dtype(self): _map = _torch_dtype_mapping() if not isinstance(self.__llm_torch_dtype__, torch.dtype): try: - hf_config = transformers.AutoConfig.from_pretrained(self.bentomodel.path, trust_remote_code=self.trust_remote_code) + hf_config = transformers.AutoConfig.from_pretrained( + self.bentomodel.path, trust_remote_code=self.trust_remote_code + ) except OpenLLMException: hf_config = transformers.AutoConfig.from_pretrained(self.model_id, trust_remote_code=self.trust_remote_code) config_dtype = getattr(hf_config, 'torch_dtype', None) @@ -300,7 +325,9 @@ def _tokenizer_attrs(self): return {**self.import_kwargs[1], **self.__tokenizer_attrs} def _cascade_backend(self) -> LiteralBackend: - logger.warning('It is recommended to specify the backend explicitly. Cascading backend might lead to unexpected behaviour.') + logger.warning( + 'It is recommended to specify the backend explicitly. Cascading backend might lead to unexpected behaviour.' 
+ ) if self._has_gpus and is_vllm_available(): return 'vllm' else: @@ -330,7 +357,10 @@ def __repr__(self) -> str: @property def import_kwargs(self): - return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, {'padding_side': 'left', 'truncation_side': 'left'} + return {'device_map': 'auto' if self._has_gpus else None, 'torch_dtype': self._torch_dtype}, { + 'padding_side': 'left', + 'truncation_side': 'left', + } @property def trust_remote_code(self): @@ -363,7 +393,9 @@ def quantization_config(self): if self._quantization_config is not None: self.__llm_quantization_config__ = self._quantization_config elif self._quantise is not None: - self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config(self, self._quantise, **self._model_attrs) + self.__llm_quantization_config__, self._model_attrs = infer_quantisation_config( + self, self._quantise, **self._model_attrs + ) else: raise ValueError("Either 'quantization_config' or 'quantise' must be specified.") return self.__llm_quantization_config__ @@ -418,7 +450,11 @@ def prepare(self, adapter_type='lora', use_gradient_checking=True, **attrs): model = get_peft_model( prepare_model_for_kbit_training(self.model, use_gradient_checkpointing=use_gradient_checking), - self.config['fine_tune_strategies'].get(adapter_type, self.config.make_fine_tune_config(adapter_type)).train().with_config(**attrs).build(), + self.config['fine_tune_strategies'] + .get(adapter_type, self.config.make_fine_tune_config(adapter_type)) + .train() + .with_config(**attrs) + .build(), ) if DEBUG: model.print_trainable_parameters() @@ -438,7 +474,10 @@ def adapter_map(self): if self.__llm_adapter_map__ is None: _map: ResolvedAdapterMap = {k: {} for k in self._adapter_map} for adapter_type, adapter_tuple in self._adapter_map.items(): - base = first_not_none(self.config['fine_tune_strategies'].get(adapter_type), default=self.config.make_fine_tune_config(adapter_type)) + base = first_not_none( + self.config['fine_tune_strategies'].get(adapter_type), + default=self.config.make_fine_tune_config(adapter_type), + ) for adapter in adapter_tuple: _map[adapter_type][adapter.name] = (base.with_config(**adapter.config).build(), adapter.adapter_id) self.__llm_adapter_map__ = _map @@ -453,7 +492,9 @@ def model(self): import torch loaded_in_kbit = ( - getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False) + getattr(model, 'is_loaded_in_8bit', False) + or getattr(model, 'is_loaded_in_4bit', False) + or getattr(model, 'is_quantized', False) ) if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit: try: @@ -474,7 +515,6 @@ def _architecture_mappings(self): @property def config(self): - import transformers if self._local: config_file = os.path.join(self.model_id, CONFIG_FILE_NAME) else: @@ -482,7 +522,9 @@ def config(self): config_file = self.bentomodel.path_of(CONFIG_FILE_NAME) except OpenLLMException as err: if not is_transformers_available(): - raise MissingDependencyError("Requires 'transformers' to be available. Do 'pip install transformers'") from err + raise MissingDependencyError( + "Requires 'transformers' to be available. 
Do 'pip install transformers'" + ) from err from transformers.utils import cached_file try: @@ -499,7 +541,9 @@ def config(self): if 'architectures' in loaded_config: for architecture in loaded_config['architectures']: if architecture in self._architecture_mappings: - self.__llm_config__ = openllm_core.AutoConfig.for_model(self._architecture_mappings[architecture]).model_construct_env(**self._model_attrs) + self.__llm_config__ = openllm_core.AutoConfig.for_model( + self._architecture_mappings[architecture] + ).model_construct_env(**self._model_attrs) break else: raise ValueError(f"Failed to find architecture from 'config.json' (config_json_path={config_file})") @@ -520,12 +564,18 @@ def _torch_dtype_mapping() -> dict[str, torch.dtype]: def normalise_model_name(name: str) -> str: - return os.path.basename(resolve_filepath(name)) if validate_is_path(name) else inflection.dasherize(name.replace('/', '--')) + return ( + os.path.basename(resolve_filepath(name)) + if validate_is_path(name) + else inflection.dasherize(name.replace('/', '--')) + ) def convert_peft_config_type(adapter_map: dict[str, str]) -> AdapterMap: if not is_peft_available(): - raise RuntimeError("LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'") + raise RuntimeError( + "LoRA adapter requires 'peft' to be installed. Make sure to do 'pip install \"openllm[fine-tune]\"'" + ) from huggingface_hub import hf_hub_download resolved: AdapterMap = {} diff --git a/openllm-python/src/openllm/_runners.py b/openllm-python/src/openllm/_runners.py index 52961a8ed..1750ae2ae 100644 --- a/openllm-python/src/openllm/_runners.py +++ b/openllm-python/src/openllm/_runners.py @@ -46,7 +46,10 @@ def runner(llm: openllm.LLM[M, T]) -> Runner[M, T]: ( 'runner_methods', { - method.name: {'batchable': method.config.batchable, 'batch_dim': method.config.batch_dim if method.config.batchable else None} + method.name: { + 'batchable': method.config.batchable, + 'batch_dim': method.config.batch_dim if method.config.batchable else None, + } for method in _.runner_methods }, ), @@ -83,7 +86,9 @@ def __init__(self, llm): if dev >= 2: num_gpus = min(dev // 2 * 2, dev) quantise = llm.quantise if llm.quantise and llm.quantise in {'gptq', 'awq', 'squeezellm'} else None - dtype = torch.float16 if quantise == 'gptq' else llm._torch_dtype # NOTE: quantise GPTQ doesn't support bfloat16 yet. + dtype = ( + torch.float16 if quantise == 'gptq' else llm._torch_dtype + ) # NOTE: quantise GPTQ doesn't support bfloat16 yet. 
try: self.model = vllm.AsyncLLMEngine.from_engine_args( vllm.AsyncEngineArgs( @@ -102,7 +107,9 @@ def __init__(self, llm): ) except Exception as err: traceback.print_exc() - raise openllm.exceptions.OpenLLMException(f'Failed to initialise vLLMEngine due to the following error:\n{err}') from err + raise openllm.exceptions.OpenLLMException( + f'Failed to initialise vLLMEngine due to the following error:\n{err}' + ) from err @bentoml.Runnable.method(batchable=False) async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapter_name=None, **attrs): @@ -159,7 +166,9 @@ async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapt if config['logprobs']: # FIXME: logprobs is not supported raise NotImplementedError('Logprobs is yet to be supported with encoder-decoder models.') encoder_output = self.model.encoder(input_ids=torch.as_tensor([prompt_token_ids], device=self.device))[0] - start_ids = torch.as_tensor([[self.model.generation_config.decoder_start_token_id]], dtype=torch.int64, device=self.device) + start_ids = torch.as_tensor( + [[self.model.generation_config.decoder_start_token_id]], dtype=torch.int64, device=self.device + ) else: start_ids = torch.as_tensor([prompt_token_ids], device=self.device) @@ -187,7 +196,9 @@ async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapt ) logits = self.model.lm_head(out[0]) else: - out = self.model(input_ids=torch.as_tensor([[token]], device=self.device), past_key_values=past_key_values, use_cache=True) + out = self.model( + input_ids=torch.as_tensor([[token]], device=self.device), past_key_values=past_key_values, use_cache=True + ) logits = out.logits past_key_values = out.past_key_values if logits_processor: @@ -231,7 +242,12 @@ async def generate_iterator(self, prompt_token_ids, request_id, stop=None, adapt tmp_output_ids, rfind_start = output_token_ids[input_len:], 0 # XXX: Move this to API server - text = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True) + text = self.tokenizer.decode( + tmp_output_ids, + skip_special_tokens=True, + spaces_between_special_tokens=False, + clean_up_tokenization_spaces=True, + ) if len(stop) > 0: for it in stop: diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index 784b495a6..86e889875 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -101,7 +101,7 @@ def create_bento( 'base_name_or_path': llm.model_id, 'bundler': 'openllm.bundle', **{ - f'{package.replace("-","_")}_version': importlib.metadata.version(package) + f'{package.replace("-", "_")}_version': importlib.metadata.version(package) for package in {'openllm', 'openllm-core', 'openllm-client'} }, }) diff --git a/openllm-python/src/openllm_cli/_sdk.py b/openllm-python/src/openllm_cli/_sdk.py index 1c9c04de2..a5167cbe4 100644 --- a/openllm-python/src/openllm_cli/_sdk.py +++ b/openllm-python/src/openllm_cli/_sdk.py @@ -81,7 +81,7 @@ def _start( if adapter_map: args.extend( list( - itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()]) + itertools.chain.from_iterable([['--adapter-id', f"{k}{':' + v if v else ''}"] for k, v in adapter_map.items()]) ) ) if additional_args: @@ -173,7 +173,7 @@ def _build( if overwrite: args.append('--overwrite') if adapter_map: - args.extend([f"--adapter-id={k}{':'+v if v is not None else ''}" for k, v in 
adapter_map.items()]) + args.extend([f"--adapter-id={k}{':' + v if v is not None else ''}" for k, v in adapter_map.items()]) if model_version: args.extend(['--model-version', model_version]) if bento_version: diff --git a/openllm-python/src/openllm_cli/extension/dive_bentos.py b/openllm-python/src/openllm_cli/extension/dive_bentos.py index db488004d..541d07bf2 100644 --- a/openllm-python/src/openllm_cli/extension/dive_bentos.py +++ b/openllm-python/src/openllm_cli/extension/dive_bentos.py @@ -21,7 +21,9 @@ @machine_option @click.pass_context @inject -def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None: +def cli( + ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store] +) -> str | None: """Dive into a BentoLLM. This is synonymous to cd $(b get : -o path).""" try: bentomodel = _bento_store.get(bento) diff --git a/openllm-python/src/openllm_cli/extension/get_containerfile.py b/openllm-python/src/openllm_cli/extension/get_containerfile.py index 886054144..507988292 100644 --- a/openllm-python/src/openllm_cli/extension/get_containerfile.py +++ b/openllm-python/src/openllm_cli/extension/get_containerfile.py @@ -17,7 +17,9 @@ from bentoml._internal.bento import BentoStore -@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.') +@click.command( + 'get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.' +) @click.argument('bento', type=str, shell_complete=bento_complete_envvar) @click.pass_context @inject diff --git a/openllm-python/src/openllm_cli/extension/get_prompt.py b/openllm-python/src/openllm_cli/extension/get_prompt.py index b679577fe..0e64c2304 100644 --- a/openllm-python/src/openllm_cli/extension/get_prompt.py +++ b/openllm-python/src/openllm_cli/extension/get_prompt.py @@ -22,7 +22,9 @@ def vformat(self, format_string: str, args: t.Sequence[t.Any], kwargs: t.Mapping raise ValueError('Positional arguments are not supported') return super().vformat(format_string, args, kwargs) - def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> None: + def check_unused_args( + self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any] + ) -> None: extras = set(kwargs).difference(used_args) if extras: raise KeyError(f'Extra params passed: {extras}') @@ -56,7 +58,9 @@ def format(self, **attrs: t.Any) -> str: try: return self.template.format(**prompt_variables) except KeyError as e: - raise RuntimeError(f"Missing variable '{e.args[0]}' (required: {self._input_variables}) in the prompt template.") from None + raise RuntimeError( + f"Missing variable '{e.args[0]}' (required: {self._input_variables}) in the prompt template." + ) from None @click.command('get_prompt', context_settings=termui.CONTEXT_SETTINGS) @@ -124,15 +128,21 @@ def cli( if prompt_template_file and chat_template_file: ctx.fail('prompt-template-file and chat-template-file are mutually exclusive.') - acceptable = set(openllm.CONFIG_MAPPING_NAMES.keys()) | set(inflection.dasherize(name) for name in openllm.CONFIG_MAPPING_NAMES.keys()) + acceptable = set(openllm.CONFIG_MAPPING_NAMES.keys()) | set( + inflection.dasherize(name) for name in openllm.CONFIG_MAPPING_NAMES.keys() + ) if model_id in acceptable: - logger.warning('Using a default prompt from OpenLLM. 
Note that this prompt might not work for your intended usage.\n') + logger.warning( + 'Using a default prompt from OpenLLM. Note that this prompt might not work for your intended usage.\n' + ) config = openllm.AutoConfig.for_model(model_id) template = prompt_template_file.read() if prompt_template_file is not None else config.template system_message = system_message or config.system_message try: - formatted = PromptTemplate(template).with_options(system_message=system_message).format(instruction=prompt, **_memoized) + formatted = ( + PromptTemplate(template).with_options(system_message=system_message).format(instruction=prompt, **_memoized) + ) except RuntimeError as err: logger.debug('Exception caught while formatting prompt: %s', err) ctx.fail(str(err)) @@ -149,15 +159,21 @@ def cli( for architecture in config.architectures: if architecture in openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE(): system_message = ( - openllm.AutoConfig.infer_class_from_name(openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture]) + openllm.AutoConfig.infer_class_from_name( + openllm.AutoConfig._CONFIG_MAPPING_NAMES_TO_ARCHITECTURE()[architecture] + ) .model_construct_env() .system_message ) break else: - ctx.fail(f'Failed to infer system message from model architecture: {config.architectures}. Please pass in --system-message') + ctx.fail( + f'Failed to infer system message from model architecture: {config.architectures}. Please pass in --system-message' + ) messages = [{'role': 'system', 'content': system_message}, {'role': 'user', 'content': prompt}] - formatted = tokenizer.apply_chat_template(messages, chat_template=chat_template_file, add_generation_prompt=add_generation_prompt, tokenize=False) + formatted = tokenizer.apply_chat_template( + messages, chat_template=chat_template_file, add_generation_prompt=add_generation_prompt, tokenize=False + ) termui.echo(orjson.dumps({'prompt': formatted}, option=orjson.OPT_INDENT_2).decode(), fg='white') ctx.exit(0) diff --git a/openllm-python/src/openllm_cli/extension/list_models.py b/openllm-python/src/openllm_cli/extension/list_models.py index eb18ce0d6..6eb49e079 100644 --- a/openllm-python/src/openllm_cli/extension/list_models.py +++ b/openllm-python/src/openllm_cli/extension/list_models.py @@ -33,12 +33,17 @@ def cli(model_name: str | None) -> DictStrAny: } if model_name is not None: ids_in_local_store = { - k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] + k: [ + i + for i in v + if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name) + ] for k, v in ids_in_local_store.items() } ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v} local_models = { - k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] for k, val in ids_in_local_store.items() + k: [{'tag': str(i.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(i.path))} for i in val] + for k, val in ids_in_local_store.items() } termui.echo(orjson.dumps(local_models, option=orjson.OPT_INDENT_2).decode(), fg='white') return local_models diff --git a/openllm-python/src/openllm_cli/extension/playground.py b/openllm-python/src/openllm_cli/extension/playground.py index f8e5b4da4..fcbc128b2 100644 --- a/openllm-python/src/openllm_cli/extension/playground.py +++ b/openllm-python/src/openllm_cli/extension/playground.py @@ -32,7 +32,14 @@ def load_notebook_metadata() -> DictStrAny: 
@click.command('playground', context_settings=termui.CONTEXT_SETTINGS) @click.argument('output-dir', default=None, required=False) -@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, help='Default port for Jupyter server') +@click.option( + '--port', + envvar='JUPYTER_PORT', + show_envvar=True, + show_default=True, + default=8888, + help='Default port for Jupyter server', +) @click.pass_context def cli(ctx: click.Context, output_dir: str | None, port: int) -> None: """OpenLLM Playground. @@ -53,7 +60,9 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None: > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"' """ if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available(): - raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'") + raise RuntimeError( + "Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'" + ) metadata = load_notebook_metadata() _temp_dir = False if output_dir is None: @@ -65,7 +74,9 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None: termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue') for module in pkgutil.iter_modules(playground.__path__): if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')): - logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module') + logger.debug( + 'Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module' + ) continue if not isinstance(module.module_finder, importlib.machinery.FileFinder): continue diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py index 90069b797..fafa40984 100644 --- a/openllm-python/tests/configuration_test.py +++ b/openllm-python/tests/configuration_test.py @@ -66,8 +66,15 @@ def test_config_derived_follow_attrs_protocol(gen_settings: ModelSettings): st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), ) -def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float): - cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),)) +def test_complex_struct_dump( + gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float +): + cl_ = make_llm_config( + 'ComplexLLM', + gen_settings, + fields=(('field1', 'float', field1),), + generation_fields=(('temperature', temperature),), + ) sent = cl_() assert sent.model_dump()['field1'] == field1 assert sent.model_dump()['generation_config']['temperature'] == temperature diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py index 1efd9e4d1..e49b2656c 100644 --- a/openllm-python/tests/conftest.py +++ b/openllm-python/tests/conftest.py @@ -10,8 +10,14 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralBackend -_MODELING_MAPPING = {'flan_t5': 'google/flan-t5-small', 'opt': 'facebook/opt-125m', 'baichuan': 'baichuan-inc/Baichuan-7B'} -_PROMPT_MAPPING = {'qa': 'Answer the following yes/no question by reasoning step-by-step. 
Can you write a whole Haiku in a single tweet?'} +_MODELING_MAPPING = { + 'flan_t5': 'google/flan-t5-small', + 'opt': 'facebook/opt-125m', + 'baichuan': 'baichuan-inc/Baichuan-7B', +} +_PROMPT_MAPPING = { + 'qa': 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?' +} def parametrise_local_llm(model: str) -> t.Generator[tuple[str, openllm.LLM[t.Any, t.Any]], None, None]: @@ -25,7 +31,9 @@ def parametrise_local_llm(model: str) -> t.Generator[tuple[str, openllm.LLM[t.An def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: if os.getenv('GITHUB_ACTIONS') is None: if 'prompt' in metafunc.fixturenames and 'llm' in metafunc.fixturenames: - metafunc.parametrize('prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])]) + metafunc.parametrize( + 'prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])] + ) def pytest_sessionfinish(session: pytest.Session, exitstatus: int): diff --git a/openllm-python/tests/strategies_test.py b/openllm-python/tests/strategies_test.py index f801ed81d..6b95ac0df 100644 --- a/openllm-python/tests/strategies_test.py +++ b/openllm-python/tests/strategies_test.py @@ -73,9 +73,13 @@ def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch): mcls.setenv('CUDA_VISIBLE_DEVICES', '') assert len(NvidiaGpuResource.from_system()) >= 0 # TODO: real from_system tests - assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1]).match('Input list should be all string type.') + assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1]).match( + 'Input list should be all string type.' + ) assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match('Input list should be all string type.') - assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID') + assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match( + 'Failed to parse available GPUs UUID' + ) def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch):
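Reviewer note, not part of the patch: the async call sites re-wrapped above (AsyncHTTPClient.generate_stream / generate_iterator in _http.py and the SSE handling in _stream.py) are behaviour-preserving. A hedged sketch of the streaming path they serve, assuming a reachable server at a placeholder address:

  import asyncio

  from openllm_client import AsyncHTTPClient

  async def main() -> None:
    client = AsyncHTTPClient('http://localhost:3000', timeout=30)
    if await client.health():  # GET /readyz, as reformatted in _http.py above
      # generate_stream yields StreamingResponse chunks built from SSE events
      async for chunk in client.generate_stream('Write a haiku about refactoring'):
        print(chunk.text, end='', flush=True)

  asyncio.run(main())

The sketch only exercises public entry points touched by this diff; the retry and Retry-After handling reformatted in _shim.py is applied transparently underneath.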