This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Rs/sparse integration test clean 2 #67

Merged: 20 commits on Feb 28, 2024
80 changes: 55 additions & 25 deletions tests/conftest.py
@@ -197,6 +197,59 @@ def generate(
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

    def generate_greedy(
        self,
        prompts: List[str],
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_beam_search(
        self,
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        beam_search_params = SamplingParams(n=beam_width,
                                            use_beam_search=True,
                                            temperature=0.0,
                                            max_tokens=max_tokens)
        outputs = self.generate(prompts, beam_search_params)
        return outputs


@pytest.fixture
def vllm_runner():
    return VllmRunner


class VllmRunnerNm(VllmRunner):

    def __init__(
        self,
        model_name: str,
        sparsity: Optional[str] = None,
        tokenizer_name: Optional[str] = None,
        dtype: str = "half",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        max_model_len: Optional[int] = None,
    ) -> None:
        self.model = LLM(
            model=model_name,
            sparsity=sparsity,
            tokenizer=tokenizer_name,
            trust_remote_code=True,
            dtype=dtype,
            swap_space=0,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
        )

    def generate_w_logprobs(
        self,
        prompts: List[str],
@@ -215,16 +268,6 @@ def generate_w_logprobs(
            outputs.append((output_ids, output_str, output_logprobs))
        return outputs

    def generate_greedy(
        self,
        prompts: List[str],
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: List[str],
@@ -239,20 +282,7 @@ def generate_greedy_logprobs(
        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]

    def generate_beam_search(
        self,
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        beam_search_params = SamplingParams(n=beam_width,
                                            use_beam_search=True,
                                            temperature=0.0,
                                            max_tokens=max_tokens)
        outputs = self.generate(prompts, beam_search_params)
        return outputs


@pytest.fixture
def vllm_runner():
    return VllmRunner
def vllm_runner_nm():
    return VllmRunnerNm
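
For orientation, here is a minimal sketch of how the new vllm_runner_nm fixture is meant to be used from a test. The test name, prompt, and token budget are illustrative assumptions; the model name and sparsity string are taken from the model pairs exercised later in this PR.

# Hypothetical usage sketch of the vllm_runner_nm fixture added above.
# The prompt and max_tokens values are illustrative, not part of this PR.
from typing import List


def test_sparse_runner_smoke(vllm_runner_nm) -> None:
    prompts: List[str] = ["Hello, my name is"]
    runner = vllm_runner_nm(model_name="nm-testing/OpenHermes-2.5-Mistral-7B-pruned50",
                            sparsity="sparse_w16a16",
                            dtype="half",
                            max_model_len=1024)
    # generate_greedy is inherited from VllmRunner and returns one
    # (token_ids, text) pair per prompt.
    outputs = runner.generate_greedy(prompts, max_tokens=8)
    assert len(outputs) == len(prompts)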
69 changes: 69 additions & 0 deletions tests/models/test_compressed.py
@@ -0,0 +1,69 @@
"""Compare the outputs of a sparse model running sparse vs sparse model running dense.
Note: sparse kernels do not have bitwise correctness vs the dense models.
As a result, in this test, we just confirm that the top selected tokens of the
sparse models are in the top N selections of same model running dense.
Run `pytest tests/models/test_sparse.py --forked`.
"""

import gc
import pytest
import torch
from compare_utils import check_logprobs_close

MAX_MODEL_LEN = 1024
MODEL_FORMAT_PAIRS = [
("nm-testing/TinyLlama-1.1B-Chat-v1.0-pruned2.4",
"semi_structured_sparse_w16a16"),
("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16"),
("nm-testing/OpenHermes-2.5-Mistral-7B-pruned2.4",
"semi_structured_sparse_w16a16"),
]


@pytest.mark.parametrize("model_format_pairs", MODEL_FORMAT_PAIRS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner_nm,
    example_prompts,
    model_format_pairs,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    model_name, sparsity = model_format_pairs

    sparse_model = vllm_runner_nm(model_name=model_name,
                                  sparsity=sparsity,
                                  dtype=dtype,
                                  max_model_len=MAX_MODEL_LEN)
    sparse_outputs = sparse_model.generate_greedy_logprobs(
        example_prompts, max_tokens, num_logprobs)

    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del sparse_model.model.llm_engine.driver_worker
    del sparse_model
    torch.cuda.empty_cache()
    gc.collect()

    dense_model = vllm_runner_nm(model_name=model_name,
                                 sparsity=None,
                                 dtype=dtype,
                                 max_model_len=MAX_MODEL_LEN)
    dense_outputs = dense_model.generate_greedy_logprobs(
        example_prompts, max_tokens, num_logprobs)

    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del dense_model.model.llm_engine.driver_worker
    del dense_model
    torch.cuda.empty_cache()
    gc.collect()

    # loop through the prompts
    check_logprobs_close(
        outputs_0_lst=dense_outputs,
        outputs_1_lst=sparse_outputs,
        name_0="dense",
        name_1="sparse",
    )
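
check_logprobs_close is imported from compare_utils, which is not part of this diff. Based on the docstring above, the check it performs should look roughly like the sketch below; the signature and tuple layout are assumptions, not the PR's actual helper.

# Hypothetical sketch of compare_utils.check_logprobs_close, inferred from the
# docstring: tokens may differ, but each model's pick must appear in the other
# model's top-N logprobs for that position.
def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
    for prompt_idx, (outputs_0, outputs_1) in enumerate(
            zip(outputs_0_lst, outputs_1_lst)):
        output_ids_0, output_str_0, logprobs_0 = outputs_0
        output_ids_1, output_str_1, logprobs_1 = outputs_1
        for idx, (id_0, id_1) in enumerate(zip(output_ids_0, output_ids_1)):
            if id_0 != id_1:
                assert id_0 in logprobs_1[idx], (
                    f"{name_0} token not in {name_1} top-N "
                    f"(prompt {prompt_idx}, position {idx})")
                assert id_1 in logprobs_0[idx], (
                    f"{name_1} token not in {name_0} top-N "
                    f"(prompt {prompt_idx}, position {idx})")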
62 changes: 62 additions & 0 deletions tests/models/test_compressed_memory.py
@@ -0,0 +1,62 @@
"""Checks the memory usage of the sparse model is < memory usage of the
dense model by checking that the number of KV cache blocks is
bigger for the sparse model rather than the dense model. vLLM pre-allocates
the memory for the KV-cache after checking availability once the model
is loaded. This implies that using a compressed model should give more space
for the KV cache and thus more allocated blocks.

Run `pytest tests/models/test_sparse_memory.py --forked`.
"""

import gc
import pytest
import torch

MODEL_FORMAT_EXTRABLOCKS = [
("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16", 2000),
("nm-testing/OpenHermes-2.5-Mistral-7B-pruned2.4",
"semi_structured_sparse_w16a16", 2000),
]


@pytest.mark.parametrize("model_format_extrablocks", MODEL_FORMAT_EXTRABLOCKS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [3])
def test_models(
    vllm_runner_nm,
    example_prompts,
    model_format_extrablocks,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    model_name, sparsity, num_extra_blocks = model_format_extrablocks
    dense_model = vllm_runner_nm(model_name=model_name,
                                 sparsity=None,
                                 dtype=dtype,
                                 max_model_len=1024)
    dense_num_kv_blocks = dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks

    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del dense_model.model.llm_engine.driver_worker
    del dense_model
    torch.cuda.empty_cache()
    gc.collect()

    sparse_model = vllm_runner_nm(model_name=model_name,
                                  sparsity=sparsity,
                                  dtype=dtype,
                                  max_model_len=1024)
    sparse_num_kv_blocks = sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks

    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del sparse_model.model.llm_engine.driver_worker
    del sparse_model
    torch.cuda.empty_cache()
    gc.collect()

    assert sparse_num_kv_blocks > dense_num_kv_blocks + num_extra_blocks, (
        f"Test {model_name}: Sparse model KV cache size {sparse_num_kv_blocks} "
        f"not bigger than dense model KV cache size {dense_num_kv_blocks} + "
        f"expected num_extra_blocks {num_extra_blocks}")
26 changes: 16 additions & 10 deletions tests/models/test_marlin.py
@@ -8,7 +8,11 @@
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.

Run `pytest tests/models/test_marlin.py --forked`.
Note: This test currently fails when run with --forked, with the following error:
RuntimeError: Cannot re-initialize CUDA in forked subprocess.
To use CUDA with multiprocessing, you must use the 'spawn' start method

Run `pytest tests/models/test_marlin.py`.
"""

import pytest
@@ -17,6 +21,8 @@
from dataclasses import dataclass
from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY

MAX_MODEL_LEN = 1024

capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
marlin_not_supported = (
@@ -47,31 +53,31 @@ class ModelPair:
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [3])
def test_models(
    vllm_runner,
    vllm_runner_nm,
    example_prompts,
    model_pair: ModelPair,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    marlin_model = vllm_runner(model_pair.model_marlin, dtype=dtype)
    marlin_model = vllm_runner_nm(model_pair.model_marlin,
                                  dtype=dtype,
                                  max_model_len=MAX_MODEL_LEN)
    marlin_outputs = marlin_model.generate_greedy_logprobs(
        example_prompts, max_tokens, num_logprobs)

    # Note: not sure why, but deleting just the model on Ada Lovelace
    # does not free the GPU memory. On Ampere, deleting just the model
    # frees the memory.
    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del marlin_model.model.llm_engine.driver_worker
    del marlin_model

    gptq_model = vllm_runner(model_pair.model_gptq, dtype=dtype)
    gptq_model = vllm_runner_nm(model_pair.model_gptq,
                                dtype=dtype,
                                max_model_len=MAX_MODEL_LEN)
    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
                                                       max_tokens,
                                                       num_logprobs)

    # Note: not sure why, but deleting just the model on Ada Lovelace
    # does not free the GPU memory. On Ampere, deleting just the model
    # frees the memory.
    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del gptq_model.model.llm_engine.driver_worker
    del gptq_model
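
The re-run-up-to-3-times behavior mentioned in the docstring is handled outside the lines shown in this hunk. One way to express that kind of retry around a slightly nondeterministic comparison (a sketch under that assumption, not necessarily the mechanism this PR uses) is:

# Hypothetical retry helper for a slightly nondeterministic comparison.
def run_with_retries(check_fn, attempts: int = 3) -> None:
    last_error = None
    for _ in range(attempts):
        try:
            check_fn()
            return  # passed on this attempt
        except AssertionError as err:
            last_error = err
    raise last_error  # all attempts failed; surface the final error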
