Commit

Move profilers to vllm-hpu-extension (vllm-project#323)
Continuation of HabanaAI/vllm-hpu-extension#4

I've also removed is_tpu, which was mistakenly restored during the rebase; it is not present upstream.
kzawora-intel authored Sep 23, 2024
1 parent f9b222e commit c64dc83
Showing 6 changed files with 12 additions and 240 deletions.
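
For code that consumed these helpers from vllm itself, the change boils down to an import swap: the classes keep their names, and the worker-level Profiler becomes HabanaHighLevelProfiler. A minimal before/after sketch, based only on the import lines changed in the diffs below:

# Before: profilers shipped inside vllm
#   from vllm.utils import HabanaMemoryProfiler, format_bytes
#   from .profiler import Profiler  # i.e. vllm/worker/profiler.py

# After: profilers come from the vllm-hpu-extension package
from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler,
                                         HabanaMemoryProfiler, format_bytes)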
2 changes: 1 addition & 1 deletion requirements-hpu.txt
@@ -6,4 +6,4 @@ ray == 2.32.0
 triton
 pandas
 tabulate
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bdd4f2b
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0e05e25
6 changes: 4 additions & 2 deletions vllm/executor/habana_executor.py
@@ -6,14 +6,16 @@
 import os
 from typing import Any, Dict, List, Optional, Set, Tuple

+from vllm_hpu_extension.profiler import HabanaMemoryProfiler
+
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method,
-                        get_ip, get_open_port, make_async)
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
 from vllm.worker.worker_base import WorkerWrapperBase

 logger = init_logger(__name__)
104 changes: 0 additions & 104 deletions vllm/utils.py
@@ -374,15 +374,6 @@ def _is_built_for_hpu() -> bool:
     return False


-@lru_cache(maxsize=None)
-def is_tpu() -> bool:
-    try:
-        import libtpu
-    except ImportError:
-        libtpu = None
-    return libtpu is not None
-
-
 @lru_cache(maxsize=None)
 def is_xpu() -> bool:
     from importlib.metadata import PackageNotFoundError, version
@@ -785,107 +776,12 @@ def print_warning_once(msg: str) -> None:
     logger.warning(msg)


-# Adapted from https://stackoverflow.com/a/49361727
-def format_bytes(size):
-    # 2**10 = 1024
-    power = 2**10
-    n = 0
-    power_labels = {0: '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'}
-    while abs(size) > power:
-        size /= power
-        n += 1
-    return f'{size:.4g} {power_labels[n]+"B"}'
-
-
-def get_device() -> str:
-    if is_hpu():
-        return "hpu"
-    return "cuda"
-
-
-class HabanaMemoryProfiler:
-
-    def __init__(self, device=None):
-        self.device = device
-
-    @staticmethod
-    def current_device_memory_usage() -> float:
-        if is_fake_hpu():
-            return 0
-        # Return the device memory usage in bytes.
-        free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info()
-        return total_hpu_memory - free_hpu_memory
-
-    @staticmethod
-    def current_free_device_memory() -> float:
-        if is_fake_hpu():
-            return 0
-        # Return the device memory usage in bytes.
-        free_hpu_memory, _ = torch.hpu.mem_get_info()
-        return free_hpu_memory
-
-    @staticmethod
-    def total_device_memory() -> float:
-        if is_fake_hpu():
-            return 0
-        # Return the device memory usage in bytes.
-        _, total_hpu_memory = torch.hpu.mem_get_info()
-        return total_hpu_memory
-
-    @staticmethod
-    def current_host_memory_usage() -> float:
-        # Return the host memory usage in bytes.
-        return HabanaMemoryProfiler.total_host_memory(
-        ) - HabanaMemoryProfiler.current_free_host_memory()
-
-    @staticmethod
-    def current_free_host_memory() -> float:
-        # Return the host memory usage in bytes.
-        return psutil.virtual_memory().available
-
-    @staticmethod
-    def total_host_memory() -> float:
-        # Return the host memory usage in bytes.
-        return psutil.virtual_memory().total
-
-    def get_summary_string(self):
-        if getattr(self, 'final_device_memory', None) is None or getattr(
-                self, 'final_host_memory', None) is None:
-            raise RuntimeError(
-                "HabanaMemoryProfiler.get_summary_string() can only be called "
-                "after closing context manager")
-        return (
-            f"{format_bytes(self.consumed_device_memory)} of device memory "
-            f"({format_bytes(self.final_device_memory)}/"
-            f"{format_bytes(HabanaMemoryProfiler.total_device_memory())} used)"
-            f" and {format_bytes(self.consumed_host_memory)} of host memory "
-            f"({format_bytes(self.final_host_memory)}/"
-            f"{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)")
-
-    def __enter__(self):
-        # Force garbage collection
-        gc.collect()
-        self.initial_device_memory = \
-            HabanaMemoryProfiler.current_device_memory_usage()
-        self.initial_host_memory = \
-            HabanaMemoryProfiler.current_host_memory_usage()
-        # This allows us to call methods of the context manager if needed
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        # Force garbage collection
-        gc.collect()
-        self.final_device_memory = \
-            HabanaMemoryProfiler.current_device_memory_usage(
-            )
-        self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage(
-        )
-        self.consumed_device_memory = \
-            self.final_device_memory - self.initial_device_memory
-        self.consumed_host_memory = \
-            self.final_host_memory - self.initial_host_memory
-
-
 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:

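
For reference, the class deleted above is a context manager that snapshots device and host memory around a block of work. A minimal usage sketch, assuming the copy in vllm_hpu_extension.profiler keeps the interface shown above; the HPU tensor is just a stand-in workload:

import habana_frameworks.torch as htorch  # noqa: F401  (registers the HPU backend)
import torch
from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes

with HabanaMemoryProfiler() as m:
    # Any device work goes here; a throwaway HPU allocation as a stand-in.
    scratch = torch.zeros(1024, 1024, device="hpu")

# Deltas are computed in __exit__, so read them after the block closes.
print(m.get_summary_string())
print(format_bytes(m.consumed_device_memory))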
10 changes: 5 additions & 5 deletions vllm/worker/habana_model_runner.py
@@ -21,6 +21,8 @@
 import habana_frameworks.torch.internal.bridge_config as bc
 import torch
 from vllm_hpu_extension.ops import LoraMask as LoraMask
+from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler,
+                                         HabanaMemoryProfiler, format_bytes)

 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
@@ -39,17 +41,15 @@
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SequenceData,
                            SequenceGroupMetadata)
-from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_fake_hpu,
-                        is_pin_memory_available, make_tensor_with_pad)
+from vllm.utils import (is_fake_hpu, is_pin_memory_available,
+                        make_tensor_with_pad)
 from vllm.worker.model_runner_base import (
     ModelRunnerBase, ModelRunnerInputBase,
     _add_attn_metadata_broadcastable_dict,
     _add_sampling_metadata_broadcastable_dict,
     _init_attn_metadata_from_tensor_dict,
     _init_sampling_metadata_from_tensor_dict)

-from .profiler import Profiler
-
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionBackend

@@ -517,7 +517,7 @@ def __init__(
         self.prompt_adapter_config = prompt_adapter_config
         self.return_hidden_states = return_hidden_states
         self.observability_config = observability_config
-        self.profiler = Profiler()
+        self.profiler = HabanaHighLevelProfiler()

         self.sliding_window = (model_config.get_sliding_window()
                                if model_config is not None else None)
4 changes: 2 additions & 2 deletions vllm/worker/habana_worker.py
@@ -9,6 +9,7 @@
 import habana_frameworks.torch as htorch  # noqa:F401
 import torch
 import torch.distributed
+from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes

 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ObservabilityConfig, ParallelConfig,
@@ -21,8 +22,7 @@
 from vllm.model_executor import set_random_seed
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (HabanaMemoryProfiler, format_bytes, hpu_backend_string,
-                        hpu_device_string, is_fake_hpu)
+from vllm.utils import hpu_backend_string, hpu_device_string, is_fake_hpu
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.habana_model_runner import HabanaModelRunner
 from vllm.worker.model_runner_base import ModelRunnerBase
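
habana_worker.py keeps using format_bytes, now imported from the extension. Assuming it behaves like the implementation removed from vllm/utils.py above, it renders raw byte counts with binary prefixes:

from vllm_hpu_extension.profiler import format_bytes

print(format_bytes(512))        # '512 B'
print(format_bytes(3 * 2**30))  # '3 GiB'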
126 changes: 0 additions & 126 deletions vllm/worker/profiler.py

This file was deleted.
