diff --git a/xinference/model/llm/utils.py b/xinference/model/llm/utils.py
index 5b9c5fc70e..8bebb70817 100644
--- a/xinference/model/llm/utils.py
+++ b/xinference/model/llm/utils.py
@@ -34,7 +34,6 @@
     CompletionChunk,
     CompletionUsage,
 )
-from ..utils import ensure_cache_cleared
 from .llm_family import (
     LlamaCppLLMSpecV1,
     LLMFamilyV1,
@@ -249,7 +248,6 @@ def _get_final_chat_completion_chunk(
         return cast(ChatCompletionChunk, chat_chunk)
 
     @classmethod
-    @ensure_cache_cleared
     def _to_chat_completion_chunks(
         cls,
         chunks: Iterator[CompletionChunk],
@@ -282,7 +280,6 @@ async def _async_to_chat_completion_chunks(
             i += 1
 
     @staticmethod
-    @ensure_cache_cleared
     def _to_chat_completion(completion: Completion) -> ChatCompletion:
         return {
             "id": "chat" + completion["id"],
diff --git a/xinference/model/utils.py b/xinference/model/utils.py
index e03e3da69c..2ca0ec6c3c 100644
--- a/xinference/model/utils.py
+++ b/xinference/model/utils.py
@@ -11,10 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import functools
-import gc
-import inspect
 import json
 import logging
 import os
@@ -28,7 +24,7 @@
 import torch
 
 from ..constants import XINFERENCE_CACHE_DIR, XINFERENCE_ENV_MODEL_SRC
-from ..device_utils import empty_cache, get_available_device, is_device_available
+from ..device_utils import get_available_device, is_device_available
 from .core import CacheableModelSpec
 
 logger = logging.getLogger(__name__)
@@ -357,32 +353,6 @@ def convert_float_to_int_or_str(model_size: float) -> Union[int, str]:
         return str(model_size)
 
 
-def ensure_cache_cleared(func: Callable):
-    assert not inspect.iscoroutinefunction(func) and not inspect.isasyncgenfunction(
-        func
-    )
-    if inspect.isgeneratorfunction(func):
-
-        @functools.wraps(func)
-        def inner(*args, **kwargs):
-            for obj in func(*args, **kwargs):
-                yield obj
-            gc.collect()
-            empty_cache()
-
-    else:
-
-        @functools.wraps(func)
-        def inner(*args, **kwargs):
-            try:
-                return func(*args, **kwargs)
-            finally:
-                gc.collect()
-                empty_cache()
-
-    return inner
-
-
 def set_all_random_seed(seed: int):
     random.seed(seed)
     np.random.seed(seed)