Skip to content

Commit

Permalink
feat: add CPU offloading support (#598)
Browse files Browse the repository at this point in the history
  • Loading branch information
AlpinDale committed Aug 24, 2024
1 parent fb4c017 commit 6671e3a
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 3 deletions.
2 changes: 2 additions & 0 deletions aphrodite/common/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -543,6 +543,7 @@ def __init__(
num_gpu_blocks_override: Optional[int] = None,
sliding_window: Optional[int] = None,
enable_prefix_caching: bool = False,
cpu_offload_gb: float = 0.0,
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
Expand All @@ -551,6 +552,7 @@ def __init__(
self.cache_dtype = cache_dtype
self.sliding_window = sliding_window
self.enable_prefix_caching = enable_prefix_caching
self.cpu_offload_gb = cpu_offload_gb
self._verify_args()
self._verify_cache_dtype()
self._verify_prefix_caching()
Expand Down
6 changes: 6 additions & 0 deletions aphrodite/endpoints/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ class LLM:
when their `best_of` sampling parameters are larger than 1. If all
requests will have `best_of=1`, you can safely set this to 0.
Otherwise, too small values may cause out-of-memory (OOM) errors.
cpu_offload_gb: The size (GiB) of CPU memory to use for offloading
the model weights. This virtually increases the GPU memory space
you can use to hold the model weights, at the cost of CPU-GPU data
transfer for every forward pass.
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
Expand Down Expand Up @@ -110,6 +114,7 @@ def __init__(
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: int = 4,
cpu_offload_gb: float = 0,
enforce_eager: bool = False,
max_context_len_to_capture: Optional[int] = None,
max_seq_len_to_capture: int = 8192,
Expand Down Expand Up @@ -137,6 +142,7 @@ def __init__(
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space,
cpu_offload_gb=cpu_offload_gb,
enforce_eager=enforce_eager,
max_context_len_to_capture=max_context_len_to_capture,
max_seq_len_to_capture=max_seq_len_to_capture,
Expand Down
21 changes: 21 additions & 0 deletions aphrodite/engine/args_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class EngineArgs:
disable_sliding_window: bool = False
use_v2_block_manager: bool = False
swap_space: int = 4 # GiB
cpu_offload_gb: float = 0 # GiB of CPU memory for weight offloading; the CLI flag parses this as a float
gpu_memory_utilization: float = 0.90
max_num_batched_tokens: Optional[int] = None
max_num_seqs: int = 256
Expand Down Expand Up @@ -332,6 +333,20 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
default=EngineArgs.swap_space,
help="CPU swap space size (GiB) per GPU",
)
# Weight offloading budget. The default is taken from EngineArgs so the CLI
# and the dataclass cannot drift apart (same pattern as --swap-space).
parser.add_argument(
    "--cpu-offload-gb",
    type=float,
    default=EngineArgs.cpu_offload_gb,
    # Every fragment except the last ends with a space: adjacent string
    # literals are concatenated verbatim, so a missing trailing space
    # glues two words together in the rendered --help output.
    help="The space in GiB to offload to CPU, per GPU. "
    "Default is 0, which means no offloading. Intuitively, "
    "this argument can be seen as a virtual way to increase "
    "the GPU memory size. For example, if you have one 24 GB "
    "GPU and set this to 10, virtually you can think of it as "
    "a 34 GB GPU. Then you can load a 13B model with BF16 weight, "
    "which requires at least 26GB GPU memory. Note that this "
    "requires fast CPU-GPU interconnect, as part of the model is "
    "loaded from CPU memory to GPU memory on the fly in each "
    "model forward pass.")
parser.add_argument(
"--gpu-memory-utilization",
"-gmu",
Expand Down Expand Up @@ -698,6 +713,11 @@ def create_engine_config(self, ) -> EngineConfig:
raise ValueError(
"BitsAndBytes load format and QLoRA adapter only support "
f"'bitsandbytes' quantization, but got {self.quantization}")

# Validate user input with an explicit exception: `assert` statements are
# removed when Python runs with -O, which would silently accept a negative
# budget. `not (x >= 0)` (rather than `x < 0`) keeps rejecting NaN, exactly
# as the original assertion did.
if not (self.cpu_offload_gb >= 0):
    raise ValueError("CPU offload space must be non-negative"
                     f", but got {self.cpu_offload_gb}")

multimodal_config = MultiModalConfig()

device_config = DeviceConfig(device=self.device)
Expand Down Expand Up @@ -739,6 +759,7 @@ def create_engine_config(self, ) -> EngineConfig:
num_gpu_blocks_override=self.num_gpu_blocks_override,
sliding_window=model_config.get_sliding_window(),
enable_prefix_caching=self.enable_prefix_caching,
cpu_offload_gb=self.cpu_offload_gb,
)

parallel_config = ParallelConfig(
Expand Down
73 changes: 70 additions & 3 deletions aphrodite/modeling/models/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from typing import Callable, Dict, List, Tuple

import torch
from torch.func import functional_call

from aphrodite.common.utils import is_pin_memory_available
from aphrodite.multimodal import BatchedTensors


Expand Down Expand Up @@ -52,6 +54,70 @@ def __init__(self, *args, **kwargs):
super().__init__()


# Running total, in bytes, of parameter data already moved to CPU memory.
_CPU_OFFLOAD_BYTES = 0
# Budget, in bytes, that the running total is compared against.
_CPU_OFFLOAD_MAX_BYTES = 0


def set_cpu_offload_max_bytes(max_bytes: int) -> None:
    """Install a new CPU offload budget and restart the usage counter.

    Args:
        max_bytes: New value for the module-level offload budget, in bytes.
    """
    global _CPU_OFFLOAD_BYTES, _CPU_OFFLOAD_MAX_BYTES
    _CPU_OFFLOAD_BYTES, _CPU_OFFLOAD_MAX_BYTES = 0, max_bytes


def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
    """Offload ``module``'s parameters to CPU while the offload budget lasts.

    Parameters are moved one by one until the module-level byte counter
    reaches the configured maximum, so a module may end up with only some of
    its parameters on CPU. When anything was offloaded, ``module.forward`` is
    replaced by a wrapper that copies the state dict to the original device
    for each call and runs the original forward through ``functional_call``.

    Args:
        module: The layer to consider for offloading. It is modified in place.

    Returns:
        The same ``module`` object. It is returned untouched when it has no
        parameters, already lives on CPU, or the budget is exhausted.
    """
    first_param = next(module.parameters(), None)
    if first_param is None:
        # A parameter-free layer has nothing to offload; without the default
        # passed to `next` above, this case would raise StopIteration.
        return module

    device = first_param.device
    if device == torch.device("cpu"):
        return module

    global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
    if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
        return module

    pin_memory = is_pin_memory_available()

    # offload parameters to CPU
    # use pin_memory if possible, which helps cudagraph capture speed
    for p in module.parameters():
        if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
            # we use per-parameter offloading
            # one module might have some parameters offloaded and some not
            break

        # `torch.empty_like` does not support `pin_memory` argument
        cpu_data = torch.empty(size=p.data.size(),
                               dtype=p.data.dtype,
                               layout=p.data.layout,
                               device='cpu',
                               pin_memory=pin_memory)
        cpu_data.copy_(p.data)
        p.data = cpu_data
        _CPU_OFFLOAD_BYTES += p.data.numel() * p.data.element_size()

    # Captured once, after offloading; the wrapper below reuses this mapping
    # on every call.
    state_dict: Dict[str, torch.Tensor] = module.state_dict()

    original_forward = module.forward

    def forward(*args, **kwargs):
        # Temporarily restore the real forward: `functional_call` invokes the
        # module itself, which would otherwise re-enter this wrapper.
        module.forward = original_forward
        try:
            device_state = {
                # here we blindly call `to(device)`
                # if the parameter is already on the device, it will be a no-op
                k: v.to(device, non_blocking=True)
                for k, v in state_dict.items()
            }
            return functional_call(module,
                                   device_state,
                                   args=args,
                                   kwargs=kwargs)
        finally:
            # Re-install the wrapper even when the forward pass raises, so
            # later calls do not run against CPU-resident parameters.
            module.forward = forward

    module.forward = forward

    return module


def make_layers(
num_hidden_layers: int, layer_fn: Callable[[], torch.nn.Module]
) -> Tuple[int, int, torch.nn.ModuleList]:
Expand All @@ -64,9 +130,10 @@ def make_layers(
get_pp_group().rank_in_group,
get_pp_group().world_size)
modules = torch.nn.ModuleList(
[PPMissingLayer() for _ in range(start_layer)] +
[layer_fn() for _ in range(start_layer, end_layer)] +
[PPMissingLayer() for _ in range(end_layer, num_hidden_layers)])
[PPMissingLayer() for _ in range(start_layer)] + [
maybe_offload_to_cpu(layer_fn())
for _ in range(start_layer, end_layer)
] + [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)])
return start_layer, end_layer, modules


Expand Down
4 changes: 4 additions & 0 deletions aphrodite/task_handler/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from aphrodite.modeling.model_loader import get_model
from aphrodite.modeling.model_loader.tensorizer import TensorizerConfig
from aphrodite.modeling.models.interfaces import supports_lora, supports_vision
from aphrodite.modeling.models.utils import set_cpu_offload_max_bytes
from aphrodite.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
MultiModalInputs)
from aphrodite.prompt_adapter.layers import PromptAdapterMapping
Expand Down Expand Up @@ -547,6 +548,9 @@ def __init__(
self.flashinfer_prefill_workspace_buffer = None
self.flashinfer_prefill_wrapper = None

set_cpu_offload_max_bytes(
int(self.cache_config.cpu_offload_gb * 1024**3))

def load_model(self) -> None:
with CudaMemoryProfiler() as m:
# measure the time it takes to load the model
Expand Down

0 comments on commit 6671e3a

Please sign in to comment.