From 2ad2e5608eeede10683412bbbfaf30b3a68019dc Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Wed, 4 Sep 2024 11:53:25 -0700
Subject: [PATCH] [MISC] Consolidate FP8 kv-cache tests (#8131)

---
 .buildkite/run-cpu-test.sh                    |   7 +-
 .../basic_correctness/test_chunked_prefill.py |  43 +----
 tests/models/test_fp8.py                      | 181 ++++++++----------
 tests/models/test_fp8kv_flashinfer.py         |  96 ----------
 4 files changed, 94 insertions(+), 233 deletions(-)
 delete mode 100644 tests/models/test_fp8kv_flashinfer.py

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 8e4be08f3aba..ca9cf15780e2 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -23,7 +23,12 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "
   pip install pytest matplotlib einops transformers_stream_generator
-  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \
+    --ignore=tests/models/test_oot_registration.py \
+    --ignore=tests/models/test_registry.py \
+    --ignore=tests/models/test_fp8.py \
+    --ignore=tests/models/test_jamba.py \
+    --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
 
 # online inference
 docker exec cpu-test bash -c "
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index a63ac380e859..9c34b2a13fd5 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -16,18 +16,6 @@
     "facebook/opt-125m",
     "meta-llama/Llama-2-7b-hf",
 ]
-E5M2_KV_MODELS = [
-    "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-chat-hf",
-]
-E4M3_KV_MODELS = [
-    "meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
-    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
-]
-KV_CACHE_QUANTIZATION_PATHS = {
-    "meta-llama/Llama-2-7b-chat-hf":
-    "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
-}
 
 
 @pytest.mark.parametrize("model", MODELS)
@@ -78,10 +66,10 @@ def test_models(
     )
 
 
-@pytest.mark.parametrize("kv_cache_dtype,model",
-                         [("fp8_e5m2", m)
-                          for m in E5M2_KV_MODELS] + [("fp8_e4m3", m)
-                                                      for m in E4M3_KV_MODELS])
+@pytest.mark.parametrize(
+    "kv_cache_dtype,model",
+    [("fp8_e4m3",
+      "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
 @pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@@ -104,30 +92,15 @@ def test_models_with_fp8_kv_cache(
     disable_async_output_proc: bool,
 ) -> None:
     """
-    Only checks log probs match between chunked-prefill and
-    non-chunked-prefill version of vLLM model runner.
-
-    This test is used when there is discrepancy in kernels
-    / numerics (e.g. when using lower-precision types like FP8).
+    Check output logprobs match between no_chunked_prefill and chunked_prefill
+    with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
+    so here we only check chunked prefill.
""" NUM_LOG_PROBS = 8 - if model == "facebook/opt-125m": - pytest.skip( - "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m" - ) - if ((model, kv_cache_dtype, chunked_prefill_token_size) == ( - "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", "fp8_e4m3", 4)): - pytest.skip("flakey test, see: #7874 #8051") - max_num_seqs = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size - extra_kwargs = {} - if model in KV_CACHE_QUANTIZATION_PATHS: - extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[ - model] - with vllm_runner( model, tensor_parallel_size=tensor_parallel_size, @@ -135,7 +108,6 @@ def test_models_with_fp8_kv_cache( max_num_seqs=max_num_seqs, kv_cache_dtype=kv_cache_dtype, disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, ) as vllm_model: no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -149,7 +121,6 @@ def test_models_with_fp8_kv_cache( max_num_seqs=max_num_seqs, kv_cache_dtype=kv_cache_dtype, disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, ) as vllm_model: chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 4ab968c01da0..17acdb52322f 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -3,116 +3,97 @@ Note: these tests will only pass on L4 GPU. """ import os -from typing import List +from typing import Optional import pytest -import torch -from transformers import AutoTokenizer +from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -MAX_MODEL_LEN = 1024 - -MODELS = [ - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", - "meta-llama/Meta-Llama-3-8B-Instruct", -] +from ..models.utils import check_logprobs_close -EXPECTED_STRS_MAP = { - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": { - "auto": [ - 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no' - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system made up of several basic components that work together to enable it to', - 'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
Here', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk' - ] - }, - "meta-llama/Meta-Llama-3-8B-Instruct": { - "auto": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu' - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', - 'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu' - ] - }, -} +os.environ["TOKENIZERS_PARALLELISM"] = "true" -# This test compares against golden strings for exact match since -# there is no baseline implementation to compare against -# and is unstable w.r.t specifics of the fp8 implementation or -# the hardware being run on. -# Disabled to prevent it from breaking the build -@pytest.mark.skip( - reason= - "Prevent unstable test based on golden strings from breaking the build.") @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") -@pytest.mark.parametrize("model_name", MODELS) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -def test_models(example_prompts, model_name, kv_cache_dtype) -> None: - model = LLM(model=model_name, - max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, - enforce_eager=True, - quantization="fp8", - kv_cache_dtype=kv_cache_dtype) +@pytest.mark.parametrize( + "kv_cache_dtype,base_model,test_model,scale_path", + [ + # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. + ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), + # Test FP16 checkpoint w. fp8_e5m2 kv-cache. + ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3-8B-Instruct", None), + # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. 
+        ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
+         "meta-llama/Llama-2-7b-chat-hf",
+         "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
+    ])
+# Due to low-precision numerical divergence, we only test logprob of 4 tokens
+@pytest.mark.parametrize("max_tokens", [4])
+@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
+# NOTE: Increasing this in this suite will fail CI because we currently cannot
+# reset distributed env properly. Use a value > 1 just when you test.
+@pytest.mark.parametrize("tensor_parallel_size", [1])
+# Due to low-precision numerical divergence, this test is too sensitive for
+# the async postprocessor
+@pytest.mark.parametrize("disable_async_output_proc", [True])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    kv_cache_dtype: str,
+    base_model: str,
+    test_model: str,
+    scale_path: Optional[str],
+    max_tokens: int,
+    enforce_eager: bool,
+    backend: str,
+    tensor_parallel_size: int,
+    disable_async_output_proc: bool,
+    monkeypatch,
+) -> None:
+    """
+    Only checks log probs match to cover the discrepancy in
+    numerical sensitive kernels.
+    """
+    override_backend_env_variable(monkeypatch, backend)
+
+    MAX_MODEL_LEN = 1024
+    NUM_LOG_PROBS = 8
+
+    with vllm_runner(
+            base_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype="auto",
+            disable_async_output_proc=disable_async_output_proc,
+    ) as vllm_model:
+        baseline_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, NUM_LOG_PROBS)
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    formatted_prompts = [
-        tokenizer.apply_chat_template([{
-            "role": "user",
-            "content": prompt
-        }],
-                                      tokenize=False,
-                                      add_generation_prompt=True)
-        for prompt in example_prompts
-    ]
+    extra_kwargs = {}
+    if scale_path is not None:
+        extra_kwargs["quantization_param_path"] = scale_path
 
-    params = SamplingParams(max_tokens=20, temperature=0)
-    generations: List[str] = []
-    # Note: these need to be run 1 at a time due to numerical precision,
-    # since the expected strs were generated this way.
-    for prompt in formatted_prompts:
-        outputs = model.generate(prompt, params)
-        generations.append(outputs[0].outputs[0].text)
-    del model
+    with vllm_runner(
+            test_model,
+            max_model_len=MAX_MODEL_LEN,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            kv_cache_dtype=kv_cache_dtype,
+            disable_async_output_proc=disable_async_output_proc,
+            **extra_kwargs,
+    ) as vllm_model:
+        test_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, NUM_LOG_PROBS)
 
-    print(model_name, kv_cache_dtype, generations)
-    expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype]
-    for i in range(len(example_prompts)):
-        generated_str = generations[i]
-        expected_str = expected_strs[i]
-        assert expected_str == generated_str, (
-            f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}")
+    check_logprobs_close(
+        outputs_0_lst=baseline_outputs,
+        outputs_1_lst=test_outputs,
+        name_0="fp16_kv_cache",
+        name_1="fp8_kv_cache",
+    )
diff --git a/tests/models/test_fp8kv_flashinfer.py b/tests/models/test_fp8kv_flashinfer.py
deleted file mode 100644
index ff2a44162b6c..000000000000
--- a/tests/models/test_fp8kv_flashinfer.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# flake8: noqa
-"""Tests fp8 models against ground truth generation
-This verifies the flashinfer backend with fp8
-quantization and fp8 KV Cache without scaling
-factors Note: these tests will only pass on H100 GPU.
-"""
-import os
-from typing import List
-
-import pytest
-from transformers import AutoTokenizer
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm import LLM, SamplingParams
-
-os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
-MAX_MODEL_LEN = 1024
-
-MODELS = [
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8",
-]
-
-EXPECTED_STRS_MAP = {
-    "nm-testing/Meta-Llama-3-8B-Instruct-FP8": {
-        "auto": [
-            'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (',
-            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-            'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne',
-            'In the sterile, metallic halls of the robotics lab, a peculiar phenomenon occurred. Zeta-5',
-            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The',
-            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya aki no tori, mushi o',
-        ],
-        "fp8": [
-            'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained',
-            'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ',
-            'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.',
-            'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne',
-            'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep',
-            'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. Here',
-            'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of',
-            'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o',
-        ]
-    }
-}
-
-
-# This test compares against golden strings for exact match since
-# there is no baseline implementation to compare against
-# and is unstable w.r.t specifics of the fp8 implementation or
-# the hardware being run on.
-# No assert to prevent it from breaking the build
-@pytest.mark.skipif(not is_quant_method_supported("fp8"),
-                    reason="fp8 is not supported on this GPU type.")
-@pytest.mark.parametrize("model_name", MODELS)
-@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
-@pytest.mark.parametrize("backend", ["XFORMERS", "FLASHINFER"])
-def test_models(example_prompts, model_name, kv_cache_dtype, backend) -> None:
-    # Note that the golden strings may not work for FLASHINFER Backend.
-    # The intention is to test the path
-    os.environ["VLLM_ATTENTION_BACKEND"] = backend
-    model = LLM(model=model_name,
-                max_model_len=MAX_MODEL_LEN,
-                trust_remote_code=True,
-                quantization="fp8",
-                kv_cache_dtype=kv_cache_dtype)
-
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    formatted_prompts = [
-        tokenizer.apply_chat_template([{
-            "role": "user",
-            "content": prompt
-        }],
-                                      tokenize=False,
-                                      add_generation_prompt=True)
-        for prompt in example_prompts
-    ]
-
-    params = SamplingParams(max_tokens=20, temperature=0)
-    generations: List[str] = []
-    # Note: these need to be run 1 at a time due to numerical precision,
-    # since the expected strs were generated this way.
-    for prompt in formatted_prompts:
-        outputs = model.generate(prompt, params)
-        generations.append(outputs[0].outputs[0].text)
-    del model
-
-    print(f"Testing: {model_name} with kv_cache_dtype: {kv_cache_dtype}")
-    expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype]
-    for i in range(len(example_prompts)):
-        generated_str = generations[i]
-        expected_str = expected_strs[i]
-        print(f"generated_str\n: {generated_str}")
-        print(f"expected_str\n: {expected_str}")
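
Illustrative note (not part of the diff): the consolidated test replaces golden-string matching with a logprob comparison between a baseline kv-cache run and an fp8 kv-cache run. A minimal sketch of that pattern follows; it reuses only calls that appear in the patch above (the vllm_runner fixture, generate_greedy_logprobs, check_logprobs_close), while the specific model names, token counts, and the helper name compare_fp8_kv_cache are illustrative assumptions rather than part of the commit.

    # Sketch only: mirrors the baseline-vs-fp8 logprob comparison used by
    # tests/models/test_fp8.py in this patch. Assumes vLLM's pytest fixtures
    # (vllm_runner, example_prompts) are available in the calling test.
    from tests.models.utils import check_logprobs_close

    def compare_fp8_kv_cache(vllm_runner, example_prompts,
                             max_tokens: int = 4, num_logprobs: int = 8):
        # Baseline run: default ("auto") kv-cache dtype.
        with vllm_runner("meta-llama/Meta-Llama-3-8B-Instruct",
                         kv_cache_dtype="auto",
                         enforce_eager=True) as baseline_model:
            baseline_outputs = baseline_model.generate_greedy_logprobs(
                example_prompts, max_tokens, num_logprobs)

        # Test run: fp8_e4m3 kv-cache, with scales shipped in the FP8-KV checkpoint.
        with vllm_runner("nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV",
                         kv_cache_dtype="fp8_e4m3",
                         enforce_eager=True) as fp8_model:
            test_outputs = fp8_model.generate_greedy_logprobs(
                example_prompts, max_tokens, num_logprobs)

        # Require close per-token logprobs rather than identical strings, so the
        # check tolerates the numerical noise introduced by the fp8 kv-cache.
        check_logprobs_close(outputs_0_lst=baseline_outputs,
                             outputs_1_lst=test_outputs,
                             name_0="fp16_kv_cache",
                             name_1="fp8_kv_cache")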