This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Rs/sparse integration test clean 2 #67

Merged: 20 commits on Feb 28, 2024
80 changes: 55 additions & 25 deletions tests/conftest.py
@@ -197,6 +197,59 @@ def generate(
            outputs.append((req_sample_output_ids, req_sample_output_strs))
        return outputs

    def generate_greedy(
        self,
        prompts: List[str],
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_beam_search(
        self,
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        beam_search_params = SamplingParams(n=beam_width,
                                            use_beam_search=True,
                                            temperature=0.0,
                                            max_tokens=max_tokens)
        outputs = self.generate(prompts, beam_search_params)
        return outputs


@pytest.fixture
def vllm_runner():
    return VllmRunner


class VllmRunnerNm(VllmRunner):

    def __init__(
        self,
        model_name: str,
        sparsity: Optional[str] = None,
        tokenizer_name: Optional[str] = None,
        dtype: str = "half",
        disable_log_stats: bool = True,
        tensor_parallel_size: int = 1,
        max_model_len: Optional[int] = None,
    ) -> None:
        self.model = LLM(
            model=model_name,
            sparsity=sparsity,
            tokenizer=tokenizer_name,
            trust_remote_code=True,
            dtype=dtype,
            swap_space=0,
            disable_log_stats=disable_log_stats,
            tensor_parallel_size=tensor_parallel_size,
            max_model_len=max_model_len,
        )

    def generate_w_logprobs(
        self,
        prompts: List[str],
@@ -215,16 +268,6 @@ def generate_w_logprobs(
            outputs.append((output_ids, output_str, output_logprobs))
        return outputs

    def generate_greedy(
        self,
        prompts: List[str],
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
        outputs = self.generate(prompts, greedy_params)
        return [(output_ids[0], output_str[0])
                for output_ids, output_str in outputs]

    def generate_greedy_logprobs(
        self,
        prompts: List[str],
@@ -239,20 +282,7 @@ def generate_greedy_logprobs(
        return [(output_ids, output_str, output_logprobs)
                for output_ids, output_str, output_logprobs in outputs]

    def generate_beam_search(
        self,
        prompts: List[str],
        beam_width: int,
        max_tokens: int,
    ) -> List[Tuple[List[int], str]]:
        beam_search_params = SamplingParams(n=beam_width,
                                            use_beam_search=True,
                                            temperature=0.0,
                                            max_tokens=max_tokens)
        outputs = self.generate(prompts, beam_search_params)
        return outputs


@pytest.fixture
def vllm_runner():
    return VllmRunner
def vllm_runner_nm():
    return VllmRunnerNm
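
For orientation, here is a minimal sketch of how the new vllm_runner_nm fixture is meant to be used from a test. The test name, prompt, and token budget are illustrative assumptions; the model name and sparsity string are taken from the model pairs exercised later in this PR.

# Hypothetical usage sketch of the vllm_runner_nm fixture added above.
# The prompt and max_tokens values are illustrative, not part of this PR.
from typing import List


def test_sparse_runner_smoke(vllm_runner_nm) -> None:
    prompts: List[str] = ["Hello, my name is"]
    runner = vllm_runner_nm(model_name="nm-testing/OpenHermes-2.5-Mistral-7B-pruned50",
                            sparsity="sparse_w16a16",
                            dtype="half",
                            max_model_len=1024)
    # generate_greedy is inherited from VllmRunner and returns one
    # (token_ids, text) pair per prompt.
    outputs = runner.generate_greedy(prompts, max_tokens=8)
    assert len(outputs) == len(prompts)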
69 changes: 69 additions & 0 deletions tests/models/test_compressed.py
@@ -0,0 +1,69 @@
"""Compare the outputs of a sparse model running sparse vs sparse model running dense.
Note: sparse kernels do not have bitwise correctness vs the dense models.
As a result, in this test, we just confirm that the top selected tokens of the
sparse models are in the top N selections of same model running dense.
Run `pytest tests/models/test_sparse.py --forked`.
"""

import gc
import pytest
import torch
from compare_utils import check_logprobs_close

MAX_MODEL_LEN = 1024
MODEL_FORMAT_PAIRS = [
("nm-testing/TinyLlama-1.1B-Chat-v1.0-pruned2.4",
"semi_structured_sparse_w16a16"),
("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16"),
("nm-testing/OpenHermes-2.5-Mistral-7B-pruned2.4",
"semi_structured_sparse_w16a16"),
]


@pytest.mark.parametrize("model_format_pairs", MODEL_FORMAT_PAIRS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
    vllm_runner_nm,
    example_prompts,
    model_format_pairs,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    model_name, sparsity = model_format_pairs

    sparse_model = vllm_runner_nm(model_name=model_name,
                                  sparsity=sparsity,
                                  dtype=dtype,
                                  max_model_len=MAX_MODEL_LEN)
    sparse_outputs = sparse_model.generate_greedy_logprobs(
        example_prompts, max_tokens, num_logprobs)

    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del sparse_model.model.llm_engine.driver_worker
    del sparse_model
    torch.cuda.empty_cache()
    gc.collect()

    dense_model = vllm_runner_nm(model_name=model_name,
                                 sparsity=None,
                                 dtype=dtype,
                                 max_model_len=MAX_MODEL_LEN)
    dense_outputs = dense_model.generate_greedy_logprobs(
        example_prompts, max_tokens, num_logprobs)

    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del dense_model.model.llm_engine.driver_worker
    del dense_model
    torch.cuda.empty_cache()
    gc.collect()

    # loop through the prompts
    check_logprobs_close(
        outputs_0_lst=dense_outputs,
        outputs_1_lst=sparse_outputs,
        name_0="dense",
        name_1="sparse",
    )
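
check_logprobs_close is imported from compare_utils, which is not part of this diff. Based on the docstring above, the check it performs should look roughly like the sketch below; the signature and tuple layout are assumptions, not the PR's actual helper.

# Hypothetical sketch of compare_utils.check_logprobs_close, inferred from the
# docstring: tokens may differ, but each model's pick must appear in the other
# model's top-N logprobs for that position.
def check_logprobs_close(outputs_0_lst, outputs_1_lst, name_0, name_1):
    for prompt_idx, (outputs_0, outputs_1) in enumerate(
            zip(outputs_0_lst, outputs_1_lst)):
        output_ids_0, output_str_0, logprobs_0 = outputs_0
        output_ids_1, output_str_1, logprobs_1 = outputs_1
        for idx, (id_0, id_1) in enumerate(zip(output_ids_0, output_ids_1)):
            if id_0 != id_1:
                assert id_0 in logprobs_1[idx], (
                    f"{name_0} token not in {name_1} top-N "
                    f"(prompt {prompt_idx}, position {idx})")
                assert id_1 in logprobs_0[idx], (
                    f"{name_1} token not in {name_0} top-N "
                    f"(prompt {prompt_idx}, position {idx})")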
62 changes: 62 additions & 0 deletions tests/models/test_compressed_memory.py
@@ -0,0 +1,62 @@
"""Checks the memory usage of the sparse model is < memory usage of the
dense model by checking that the number of KV cache blocks is
bigger for the sparse model rather than the dense model. vLLM pre-allocates
the memory for the KV-cache after checking availability once the model
is loaded. This implies that using a compressed model should give more space
for the KV cache and thus more allocated blocks.

Run `pytest tests/models/test_sparse_memory.py --forked`.
"""

import gc
import pytest
import torch

MODEL_FORMAT_EXTRABLOCKS = [
("nm-testing/OpenHermes-2.5-Mistral-7B-pruned50", "sparse_w16a16", 2000),
("nm-testing/OpenHermes-2.5-Mistral-7B-pruned2.4",
"semi_structured_sparse_w16a16", 2000),
]


@pytest.mark.parametrize("model_format_extrablocks", MODEL_FORMAT_EXTRABLOCKS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [3])
def test_models(
    vllm_runner_nm,
    example_prompts,
    model_format_extrablocks,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    model_name, sparsity, num_extra_blocks = model_format_extrablocks
    dense_model = vllm_runner_nm(model_name=model_name,
                                 sparsity=None,
                                 dtype=dtype,
                                 max_model_len=1024)
    dense_num_kv_blocks = dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks

    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del dense_model.model.llm_engine.driver_worker
    del dense_model
    torch.cuda.empty_cache()
    gc.collect()

    sparse_model = vllm_runner_nm(model_name=model_name,
                                  sparsity=sparsity,
                                  dtype=dtype,
                                  max_model_len=1024)
    sparse_num_kv_blocks = sparse_model.model.llm_engine.scheduler.block_manager.gpu_allocator.num_blocks

    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del sparse_model.model.llm_engine.driver_worker
    del sparse_model
    torch.cuda.empty_cache()
    gc.collect()

    assert sparse_num_kv_blocks > dense_num_kv_blocks + num_extra_blocks, (
        f"Test {model_name}: Sparse model KV cache size {sparse_num_kv_blocks} "
        f"not bigger than dense model KV cache size {dense_num_kv_blocks} + "
        f"expected num_extra_blocks {num_extra_blocks}")
26 changes: 16 additions & 10 deletions tests/models/test_marlin.py
@@ -8,7 +8,11 @@
result in very slight nondeterminism for Marlin. As a result, we re-run the test
up to 3 times to see if we pass.

Run `pytest tests/models/test_marlin.py --forked`.
Note: This test currently fails when run with --forked, with the following error:
RuntimeError: Cannot re-initialize CUDA in forked subprocess.
To use CUDA with multiprocessing, you must use the 'spawn' start method

Run `pytest tests/models/test_marlin.py`.
"""

import pytest
@@ -17,6 +21,8 @@
from dataclasses import dataclass
from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY

MAX_MODEL_LEN = 1024

capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
marlin_not_supported = (
@@ -47,31 +53,31 @@ class ModelPair:
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [3])
def test_models(
    vllm_runner,
    vllm_runner_nm,
    example_prompts,
    model_pair: ModelPair,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    marlin_model = vllm_runner(model_pair.model_marlin, dtype=dtype)
    marlin_model = vllm_runner_nm(model_pair.model_marlin,
                                  dtype=dtype,
                                  max_model_len=MAX_MODEL_LEN)
    marlin_outputs = marlin_model.generate_greedy_logprobs(
        example_prompts, max_tokens, num_logprobs)

    # Note: not sure why, but deleting just the model on Ada Lovelace
    # does not free the GPU memory. On Ampere, deleting just the model
    # frees the memory.
    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del marlin_model.model.llm_engine.driver_worker
    del marlin_model

    gptq_model = vllm_runner(model_pair.model_gptq, dtype=dtype)
    gptq_model = vllm_runner_nm(model_pair.model_gptq,
                                dtype=dtype,
                                max_model_len=MAX_MODEL_LEN)
    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
                                                       max_tokens,
                                                       num_logprobs)

    # Note: not sure why, but deleting just the model on Ada Lovelace
    # does not free the GPU memory. On Ampere, deleting just the model
    # frees the memory.
    # Note: deleting just the model does not always free the GPU memory, not sure why.
    del gptq_model.model.llm_engine.driver_worker
    del gptq_model
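
The re-run-up-to-3-times behavior mentioned in the docstring is handled outside the lines shown in this hunk. One way to express that kind of retry around a slightly nondeterministic comparison (a sketch under that assumption, not necessarily the mechanism this PR uses) is:

# Hypothetical retry helper for a slightly nondeterministic comparison.
def run_with_retries(check_fn, attempts: int = 3) -> None:
    last_error = None
    for _ in range(attempts):
        try:
            check_fn()
            return  # passed on this attempt
        except AssertionError as err:
            last_error = err
    raise last_error  # all attempts failed; surface the final error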
