Commit

Move profilers to vllm-hpu-extension (vllm-project#323)
Continuation of HabanaAI/vllm-hpu-extension#4

I've also removed is_tpu, which was mistakenly restored during the rebase; it is not present upstream.
kzawora-intel authored Sep 23, 2024
1 parent f9b222e commit c64dc83
Showing 6 changed files with 12 additions and 240 deletions.
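
For code that consumed these helpers from vllm itself, the change boils down to an import swap: the classes keep their names, and the worker-level Profiler becomes HabanaHighLevelProfiler. A minimal before/after sketch, based only on the import lines changed in the diffs below:

# Before: profilers shipped inside vllm
#   from vllm.utils import HabanaMemoryProfiler, format_bytes
#   from .profiler import Profiler  # i.e. vllm/worker/profiler.py

# After: profilers come from the vllm-hpu-extension package
from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler,
                                         HabanaMemoryProfiler, format_bytes)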
2 changes: 1 addition & 1 deletion requirements-hpu.txt
@@ -6,4 +6,4 @@ ray == 2.32.0
 triton
 pandas
 tabulate
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bdd4f2b
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0e05e25
6 changes: 4 additions & 2 deletions vllm/executor/habana_executor.py
@@ -6,14 +6,16 @@
 import os
 from typing import Any, Dict, List, Optional, Set, Tuple

+from vllm_hpu_extension.profiler import HabanaMemoryProfiler
+
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method,
-                        get_ip, get_open_port, make_async)
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
 from vllm.worker.worker_base import WorkerWrapperBase

 logger = init_logger(__name__)
104 changes: 0 additions & 104 deletions vllm/utils.py
@@ -374,15 +374,6 @@ def _is_built_for_hpu() -> bool:
     return False


-@lru_cache(maxsize=None)
-def is_tpu() -> bool:
-    try:
-        import libtpu
-    except ImportError:
-        libtpu = None
-    return libtpu is not None
-
-
 @lru_cache(maxsize=None)
 def is_xpu() -> bool:
     from importlib.metadata import PackageNotFoundError, version
@@ -785,107 +776,12 @@ def print_warning_once(msg: str) -> None:
     logger.warning(msg)


-# Adapted from https://stackoverflow.com/a/49361727
-def format_bytes(size):
-    # 2**10 = 1024
-    power = 2**10
-    n = 0
-    power_labels = {0: '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'}
-    while abs(size) > power:
-        size /= power
-        n += 1
-    return f'{size:.4g} {power_labels[n]+"B"}'
-
-
-def get_device() -> str:
-    if is_hpu():
-        return "hpu"
-    return "cuda"
-
-
-class HabanaMemoryProfiler:
-
-    def __init__(self, device=None):
-        self.device = device
-
-    @staticmethod
-    def current_device_memory_usage() -> float:
-        if is_fake_hpu():
-            return 0
-        # Return the device memory usage in bytes.
-        free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info()
-        return total_hpu_memory - free_hpu_memory
-
-    @staticmethod
-    def current_free_device_memory() -> float:
-        if is_fake_hpu():
-            return 0
-        # Return the device memory usage in bytes.
-        free_hpu_memory, _ = torch.hpu.mem_get_info()
-        return free_hpu_memory
-
-    @staticmethod
-    def total_device_memory() -> float:
-        if is_fake_hpu():
-            return 0
-        # Return the device memory usage in bytes.
-        _, total_hpu_memory = torch.hpu.mem_get_info()
-        return total_hpu_memory
-
-    @staticmethod
-    def current_host_memory_usage() -> float:
-        # Return the host memory usage in bytes.
-        return HabanaMemoryProfiler.total_host_memory(
-        ) - HabanaMemoryProfiler.current_free_host_memory()
-
-    @staticmethod
-    def current_free_host_memory() -> float:
-        # Return the host memory usage in bytes.
-        return psutil.virtual_memory().available
-
-    @staticmethod
-    def total_host_memory() -> float:
-        # Return the host memory usage in bytes.
-        return psutil.virtual_memory().total
-
-    def get_summary_string(self):
-        if getattr(self, 'final_device_memory', None) is None or getattr(
-                self, 'final_host_memory', None) is None:
-            raise RuntimeError(
-                "HabanaMemoryProfiler.get_summary_string() can only be called "
-                "after closing context manager")
-        return (
-            f"{format_bytes(self.consumed_device_memory)} of device memory "
-            f"({format_bytes(self.final_device_memory)}/"
-            f"{format_bytes(HabanaMemoryProfiler.total_device_memory())} used)"
-            f" and {format_bytes(self.consumed_host_memory)} of host memory "
-            f"({format_bytes(self.final_host_memory)}/"
-            f"{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)")
-
-    def __enter__(self):
-        # Force garbage collection
-        gc.collect()
-        self.initial_device_memory = \
-            HabanaMemoryProfiler.current_device_memory_usage()
-        self.initial_host_memory = \
-            HabanaMemoryProfiler.current_host_memory_usage()
-        # This allows us to call methods of the context manager if needed
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        # Force garbage collection
-        gc.collect()
-        self.final_device_memory = \
-            HabanaMemoryProfiler.current_device_memory_usage(
-            )
-        self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage(
-        )
-        self.consumed_device_memory = \
-            self.final_device_memory - self.initial_device_memory
-        self.consumed_host_memory = \
-            self.final_host_memory - self.initial_host_memory
-
-
 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:

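
For reference, the class deleted above is a context manager that snapshots device and host memory around a block of work. A minimal usage sketch, assuming the copy in vllm_hpu_extension.profiler keeps the interface shown above; the HPU tensor is just a stand-in workload:

import habana_frameworks.torch as htorch  # noqa: F401  (registers the HPU backend)
import torch
from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes

with HabanaMemoryProfiler() as m:
    # Any device work goes here; a throwaway HPU allocation as a stand-in.
    scratch = torch.zeros(1024, 1024, device="hpu")

# Deltas are computed in __exit__, so read them after the block closes.
print(m.get_summary_string())
print(format_bytes(m.consumed_device_memory))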
10 changes: 5 additions & 5 deletions vllm/worker/habana_model_runner.py
@@ -21,6 +21,8 @@
 import habana_frameworks.torch.internal.bridge_config as bc
 import torch
 from vllm_hpu_extension.ops import LoraMask as LoraMask
+from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler,
+                                         HabanaMemoryProfiler, format_bytes)

 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
@@ -39,17 +41,15 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
-from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_fake_hpu,
-                        is_pin_memory_available, make_tensor_with_pad)
+from vllm.utils import (is_fake_hpu, is_pin_memory_available,
+                        make_tensor_with_pad)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase,
     _add_attn_metadata_broadcastable_dict,
     _add_sampling_metadata_broadcastable_dict,
     _init_attn_metadata_from_tensor_dict,
     _init_sampling_metadata_from_tensor_dict)

-from .profiler import Profiler
-
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend

@@ -517,7 +517,7 @@ def __init__(
         self.prompt_adapter_config = prompt_adapter_config
         self.return_hidden_states = return_hidden_states
         self.observability_config = observability_config
-        self.profiler = Profiler()
+        self.profiler = HabanaHighLevelProfiler()

         self.sliding_window = (model_config.get_sliding_window()
                                if model_config is not None else None)
4 changes: 2 additions & 2 deletions vllm/worker/habana_worker.py
@@ -9,6 +9,7 @@
 import habana_frameworks.torch as htorch  # noqa:F401
 import torch
 import torch.distributed
+from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes

 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ObservabilityConfig, ParallelConfig,
@@ -21,8 +22,7 @@
 from vllm.model_executor import set_random_seed
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (HabanaMemoryProfiler, format_bytes, hpu_backend_string,
-                        hpu_device_string, is_fake_hpu)
+from vllm.utils import hpu_backend_string, hpu_device_string, is_fake_hpu
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.habana_model_runner import HabanaModelRunner
 from vllm.worker.model_runner_base import ModelRunnerBase
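
habana_worker.py keeps using format_bytes, now imported from the extension. Assuming it behaves like the implementation removed from vllm/utils.py above, it renders raw byte counts with binary prefixes:

from vllm_hpu_extension.profiler import format_bytes

print(format_bytes(512))        # '512 B'
print(format_bytes(3 * 2**30))  # '3 GiB'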
126 changes: 0 additions & 126 deletions vllm/worker/profiler.py

This file was deleted.
