diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 56caa4ba0386..1ab81898b5f7 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -6,4 +6,4 @@ ray == 2.32.0
 triton
 pandas
 tabulate
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bdd4f2b
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0e05e25
diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py
index 6e92da024583..44226fc89821 100644
--- a/vllm/executor/habana_executor.py
+++ b/vllm/executor/habana_executor.py
@@ -6,14 +6,16 @@
 import os
 from typing import Any, Dict, List, Optional, Set, Tuple
 
+from vllm_hpu_extension.profiler import HabanaMemoryProfiler
+
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method,
-                        get_ip, get_open_port, make_async)
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
diff --git a/vllm/utils.py b/vllm/utils.py
index f7e7a64619b1..e5cef9b4419c 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -374,15 +374,6 @@ def _is_built_for_hpu() -> bool:
         return False
 
 
-@lru_cache(maxsize=None)
-def is_tpu() -> bool:
-    try:
-        import libtpu
-    except ImportError:
-        libtpu = None
-    return libtpu is not None
-
-
 @lru_cache(maxsize=None)
 def is_xpu() -> bool:
     from importlib.metadata import PackageNotFoundError, version
@@ -785,107 +776,12 @@ def print_warning_once(msg: str) -> None:
     logger.warning(msg)
 
 
-# Adapted from https://stackoverflow.com/a/49361727
-def format_bytes(size):
-    # 2**10 = 1024
-    power = 2**10
-    n = 0
-    power_labels = {0: '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'}
-    while abs(size) > power:
-        size /= power
-        n += 1
-    return f'{size:.4g} {power_labels[n]+"B"}'
-
-
 def get_device() -> str:
     if is_hpu():
         return "hpu"
     return "cuda"
 
 
-class HabanaMemoryProfiler:
-
-    def __init__(self, device=None):
-        self.device = device
-
-    @staticmethod
-    def current_device_memory_usage() -> float:
-        if is_fake_hpu():
-            return 0
-        # Return the device memory usage in bytes.
-        free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info()
-        return total_hpu_memory - free_hpu_memory
-
-    @staticmethod
-    def current_free_device_memory() -> float:
-        if is_fake_hpu():
-            return 0
-        # Return the device memory usage in bytes.
-        free_hpu_memory, _ = torch.hpu.mem_get_info()
-        return free_hpu_memory
-
-    @staticmethod
-    def total_device_memory() -> float:
-        if is_fake_hpu():
-            return 0
-        # Return the device memory usage in bytes.
-        _, total_hpu_memory = torch.hpu.mem_get_info()
-        return total_hpu_memory
-
-    @staticmethod
-    def current_host_memory_usage() -> float:
-        # Return the host memory usage in bytes.
-        return HabanaMemoryProfiler.total_host_memory(
-        ) - HabanaMemoryProfiler.current_free_host_memory()
-
-    @staticmethod
-    def current_free_host_memory() -> float:
-        # Return the host memory usage in bytes.
-        return psutil.virtual_memory().available
-
-    @staticmethod
-    def total_host_memory() -> float:
-        # Return the host memory usage in bytes.
-        return psutil.virtual_memory().total
-
-    def get_summary_string(self):
-        if getattr(self, 'final_device_memory', None) is None or getattr(
-                self, 'final_host_memory', None) is None:
-            raise RuntimeError(
-                "HabanaMemoryProfiler.get_summary_string() can only be called "
-                "after closing context manager")
-        return (
-            f"{format_bytes(self.consumed_device_memory)} of device memory "
-            f"({format_bytes(self.final_device_memory)}/"
-            f"{format_bytes(HabanaMemoryProfiler.total_device_memory())} used)"
-            f" and {format_bytes(self.consumed_host_memory)} of host memory "
-            f"({format_bytes(self.final_host_memory)}/"
-            f"{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)")
-
-    def __enter__(self):
-        # Force garbage collection
-        gc.collect()
-        self.initial_device_memory = \
-            HabanaMemoryProfiler.current_device_memory_usage()
-        self.initial_host_memory = \
-            HabanaMemoryProfiler.current_host_memory_usage()
-        # This allows us to call methods of the context manager if needed
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        # Force garbage collection
-        gc.collect()
-        self.final_device_memory = \
-            HabanaMemoryProfiler.current_device_memory_usage(
-        )
-        self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage(
-        )
-        self.consumed_device_memory = \
-            self.final_device_memory - self.initial_device_memory
-        self.consumed_host_memory = \
-            self.final_host_memory - self.initial_host_memory
-
-
 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:
 
diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index 6cd80eb15107..c99500ef1296 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -21,6 +21,8 @@
 import habana_frameworks.torch.internal.bridge_config as bc
 import torch
 from vllm_hpu_extension.ops import LoraMask as LoraMask
+from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler,
+                                         HabanaMemoryProfiler, format_bytes)
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
@@ -39,8 +41,8 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
-from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_fake_hpu,
-                        is_pin_memory_available, make_tensor_with_pad)
+from vllm.utils import (is_fake_hpu, is_pin_memory_available,
+                        make_tensor_with_pad)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase,
     _add_attn_metadata_broadcastable_dict,
@@ -48,8 +50,6 @@
     _init_attn_metadata_from_tensor_dict,
     _init_sampling_metadata_from_tensor_dict)
 
-from .profiler import Profiler
-
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend
 
@@ -517,7 +517,7 @@ def __init__(
         self.prompt_adapter_config = prompt_adapter_config
         self.return_hidden_states = return_hidden_states
         self.observability_config = observability_config
-        self.profiler = Profiler()
+        self.profiler = HabanaHighLevelProfiler()
 
         self.sliding_window = (model_config.get_sliding_window()
                                if model_config is not None else None)
diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py
index f2678c5e405d..8cdbba02fbb3 100644
--- a/vllm/worker/habana_worker.py
+++ b/vllm/worker/habana_worker.py
@@ -9,6 +9,7 @@
 import habana_frameworks.torch as htorch  # noqa:F401
 import torch
 import torch.distributed
+from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ObservabilityConfig, ParallelConfig,
@@ -21,8 +22,7 @@
 from vllm.model_executor import set_random_seed
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (HabanaMemoryProfiler, format_bytes, hpu_backend_string,
-                        hpu_device_string, is_fake_hpu)
+from vllm.utils import hpu_backend_string, hpu_device_string, is_fake_hpu
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.habana_model_runner import HabanaModelRunner
 from vllm.worker.model_runner_base import ModelRunnerBase
diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py
deleted file mode 100644
index 48348de41f52..000000000000
--- a/vllm/worker/profiler.py
+++ /dev/null
@@ -1,126 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-###############################################################################
-
-import json
-import os
-import queue
-import threading
-import time
-from contextlib import contextmanager
-from typing import Any, List
-
-from vllm.logger import init_logger
-from vllm.utils import get_vllm_instance_id
-
-logger = init_logger(__name__)
-
-
-class FileWriter(threading.Thread):
-
-    def __init__(self, filename, event_queue):
-        super().__init__()
-        self.filename = filename
-        self.event_queue = event_queue
-        self.daemon = True
-        self.timer_event = threading.Event()
-
-    def _drain_event_queue(self):
-        content = ''
-        while True:
-            try:
-                element = self.event_queue.get_nowait()
-                content += element
-            except queue.Empty:
-                break
-        return content
-
-    def run(self):
-        # don't check the queue too often
-        while not self.timer_event.wait(1):
-            # Block and wait for the next item in the queue
-            content = self.event_queue.get()
-            # Collect any other items in the queue
-            content += self._drain_event_queue()
-
-            with open(self.filename, 'a') as outfile:
-                outfile.write(content)
-
-
-class Profiler:
-    profiling_trace_events: queue.Queue = queue.Queue()
-    event_tid = {'counter': 1, 'external': 2, 'internal': 3}
-    vllm_instance_id = get_vllm_instance_id()
-    filename = f'server_events_{vllm_instance_id}.json'
-    event_cache: List[Any] = []
-
-    def __init__(self):
-        self.enabled = os.getenv('VLLM_PROFILER_ENABLED',
-                                 'false').lower() == 'true' and int(
-                                     os.getenv('RANK', '0')) == 0
-        msg = f'Profiler enabled for: {self.vllm_instance_id}'
-        logger.info(msg)
-        if self.enabled:
-            # initialize the trace file (JSON Array Format)
-            with open(self.filename, 'w') as outfile:
-                outfile.write('[')
-            file_writer = FileWriter(self.filename,
-                                     self.profiling_trace_events)
-            file_writer.start()
-
-    def _dump_with_sep(self, entry):
-        entry = json.dumps(entry) + ','
-        self.profiling_trace_events.put(entry)
-
-    def get_timestamp_us(self):
-        return time.time() * 1000000.0
-
-    def record_counter(self, ts, counter):
-        if self.enabled:
-            self._dump_with_sep({
-                'pid': 1,
-                'tid': self.event_tid['counter'],
-                'ph': 'C',
-                'name': 'utils',
-                'ts': ts,
-                'args': counter
-            })
-
-    def start(self, type, name, args=None):
-        if self.enabled:
-            ts = self.get_timestamp_us()
-            if args is not None and 'counter' in args:
-                self.record_counter(ts, args['counter'])
-                del args['counter']
-            event = {
-                'pid': 1,
-                'tid': self.event_tid[type],
-                'ph': 'X',
-                'name': name,
-                'ts': ts,
-                'dur': None,
-                'args': args
-            }
-            self.event_cache.append(event)
-
-    def end(self):
-        if self.enabled:
-            ts = self.get_timestamp_us()
-            if not self.event_cache:
-                logger.warning(
-                    'Profiler: end() call does not have matching start() call. '
-                    'Disabling profiler.')
-                self.enabled = False
-                return
-            event = self.event_cache.pop()
-            event['dur'] = ts - event['ts']
-            self._dump_with_sep(event)
-
-    @contextmanager
-    def record_event(self, type, name, args=None):
-        if self.enabled:
-            self.start(type, name, args)
-            yield
-            self.end()
-        else:
-            yield
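
Note (reviewer sketch, not part of the patch): after this change, both profilers come from vllm_hpu_extension.profiler. The snippet below illustrates how the relocated classes are expected to be used, assuming they keep the interfaces removed above (the context-manager fields of HabanaMemoryProfiler, the format_bytes helper, and the record_event API of the old Profiler under the HabanaHighLevelProfiler name). The workload tensors are hypothetical, and the code requires an HPU host with vllm-hpu-extension installed.

# Sketch only: assumes vllm_hpu_extension.profiler preserves the interfaces
# removed from vllm/utils.py and vllm/worker/profiler.py in this patch.
import habana_frameworks.torch as htorch  # noqa: F401  (registers the "hpu" device)
import torch
from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler,
                                         HabanaMemoryProfiler, format_bytes)

# Memory profiling: __enter__/__exit__ snapshot device and host usage and
# populate the consumed_*/final_* fields that get_summary_string() formats.
with HabanaMemoryProfiler() as m:
    dummy = torch.zeros(1024, 1024, device="hpu")  # hypothetical workload
print(f"workload consumed {format_bytes(m.consumed_device_memory)} "
      "of device memory")
print(m.get_summary_string())

# High-level trace profiling: assumed to keep the old Profiler behavior, i.e.
# it is gated by VLLM_PROFILER_ENABLED=true on rank 0 and record_event() emits
# chrome-trace events to a server_events_<vllm_instance_id>.json file.
profiler = HabanaHighLevelProfiler()
with profiler.record_event('internal', 'dummy_workload'):
    dummy2 = torch.ones(1024, 1024, device="hpu")  # hypothetical workload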