From 50149c380088a3374174c3f02e69f999c1d65c83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Sat, 20 Jul 2024 12:26:06 +0200 Subject: [PATCH] Add FP8 release test (#2261) --- .../test_flash_llama_fp8.json | 89 +++++ .../test_flash_llama_fp8_all_params.json | 89 +++++ .../test_flash_llama_fp8_load.json | 358 ++++++++++++++++++ .../models/test_flash_llama_fp8.py | 62 +++ 4 files changed, 598 insertions(+) create mode 100644 integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8.json create mode 100644 integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_all_params.json create mode 100644 integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_load.json create mode 100644 integration-tests/models/test_flash_llama_fp8.py diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8.json new file mode 100644 index 00000000000..85cfb91f1ec --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8.json @@ -0,0 +1,89 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 2323, + "logprob": -9.421875, + "text": "Test" + }, + { + "id": 1715, + "logprob": -10.546875, + "text": " request" + } + ], + "seed": null, + "tokens": [ + { + "id": 369, + "logprob": -2.1816406, + "special": false, + "text": " for" + }, + { + "id": 279, + "logprob": -2.6992188, + "special": false, + "text": " the" + }, + { + "id": 220, + "logprob": -3.6308594, + "special": false, + "text": " " + }, + { + "id": 679, + "logprob": -1.7900391, + "special": false, + "text": "201" + }, + { + "id": 24, + "logprob": -1.3554688, + "special": false, + "text": "9" + }, + { + "id": 12, + "logprob": 
-2.0039062, + "special": false, + "text": "-" + }, + { + "id": 2366, + "logprob": -0.4489746, + "special": false, + "text": "202" + }, + { + "id": 15, + "logprob": -0.037109375, + "special": false, + "text": "0" + }, + { + "id": 2978, + "logprob": -0.8100586, + "special": false, + "text": " school" + }, + { + "id": 1060, + "logprob": -0.013015747, + "special": false, + "text": " year" + } + ], + "top_tokens": null + }, + "generated_text": " for the 2019-2020 school year" +} diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_all_params.json new file mode 100644 index 00000000000..dcb4d063a27 --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_all_params.json @@ -0,0 +1,89 @@ +{ + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 2323, + "logprob": -9.421875, + "text": "Test" + }, + { + "id": 1715, + "logprob": -10.546875, + "text": " request" + } + ], + "seed": 0, + "tokens": [ + { + "id": 25, + "logprob": -0.8535156, + "special": false, + "text": ":" + }, + { + "id": 2209, + "logprob": -2.4804688, + "special": false, + "text": " Is" + }, + { + "id": 279, + "logprob": -0.7167969, + "special": false, + "text": " the" + }, + { + "id": 734, + "logprob": -2.625, + "special": false, + "text": " function" + }, + { + "id": 330, + "logprob": -0.35131836, + "special": false, + "text": " \"" + }, + { + "id": 4110, + "logprob": -2.4101562, + "special": false, + "text": "Create" + }, + { + "id": 264, + "logprob": -0.23181152, + "special": false, + "text": " a" + }, + { + "id": 502, + "logprob": -0.25512695, + "special": false, + "text": " new" + }, + { + "id": 1052, + "logprob": -1.2792969, + "special": false, + "text": " file" + }, + { + "id": 
1, + "logprob": -1.2529297, + "special": false, + "text": "\"" + } + ], + "top_tokens": null + }, + "generated_text": "Test request: Is the function \"Create a new file\"" +} diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_load.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_load.json new file mode 100644 index 00000000000..36c87c0975a --- /dev/null +++ b/integration-tests/models/__snapshots__/test_flash_llama_fp8/test_flash_llama_fp8_load.json @@ -0,0 +1,358 @@ +[ + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 2323, + "logprob": -9.421875, + "text": "Test" + }, + { + "id": 1715, + "logprob": -10.546875, + "text": " request" + } + ], + "seed": null, + "tokens": [ + { + "id": 369, + "logprob": -2.1816406, + "special": false, + "text": " for" + }, + { + "id": 279, + "logprob": -2.6992188, + "special": false, + "text": " the" + }, + { + "id": 220, + "logprob": -3.6308594, + "special": false, + "text": " " + }, + { + "id": 679, + "logprob": -1.7988281, + "special": false, + "text": "201" + }, + { + "id": 24, + "logprob": -1.3535156, + "special": false, + "text": "9" + }, + { + "id": 12, + "logprob": -2.0058594, + "special": false, + "text": "-" + }, + { + "id": 2366, + "logprob": -0.45410156, + "special": false, + "text": "202" + }, + { + "id": 15, + "logprob": -0.037109375, + "special": false, + "text": "0" + }, + { + "id": 2978, + "logprob": -0.8095703, + "special": false, + "text": " school" + }, + { + "id": 1060, + "logprob": -0.013053894, + "special": false, + "text": " year" + } + ], + "top_tokens": null + }, + "generated_text": " for the 2019-2020 school year" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + 
"text": "<|begin_of_text|>" + }, + { + "id": 2323, + "logprob": -9.421875, + "text": "Test" + }, + { + "id": 1715, + "logprob": -10.546875, + "text": " request" + } + ], + "seed": null, + "tokens": [ + { + "id": 369, + "logprob": -2.1816406, + "special": false, + "text": " for" + }, + { + "id": 279, + "logprob": -2.6992188, + "special": false, + "text": " the" + }, + { + "id": 220, + "logprob": -3.6308594, + "special": false, + "text": " " + }, + { + "id": 679, + "logprob": -1.7988281, + "special": false, + "text": "201" + }, + { + "id": 24, + "logprob": -1.3535156, + "special": false, + "text": "9" + }, + { + "id": 12, + "logprob": -2.0058594, + "special": false, + "text": "-" + }, + { + "id": 2366, + "logprob": -0.45410156, + "special": false, + "text": "202" + }, + { + "id": 15, + "logprob": -0.037109375, + "special": false, + "text": "0" + }, + { + "id": 2978, + "logprob": -0.8095703, + "special": false, + "text": " school" + }, + { + "id": 1060, + "logprob": -0.013053894, + "special": false, + "text": " year" + } + ], + "top_tokens": null + }, + "generated_text": " for the 2019-2020 school year" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 2323, + "logprob": -9.421875, + "text": "Test" + }, + { + "id": 1715, + "logprob": -10.546875, + "text": " request" + } + ], + "seed": null, + "tokens": [ + { + "id": 369, + "logprob": -2.1816406, + "special": false, + "text": " for" + }, + { + "id": 279, + "logprob": -2.6992188, + "special": false, + "text": " the" + }, + { + "id": 220, + "logprob": -3.6308594, + "special": false, + "text": " " + }, + { + "id": 679, + "logprob": -1.7988281, + "special": false, + "text": "201" + }, + { + "id": 24, + "logprob": -1.3535156, + "special": false, + "text": "9" + }, + { + "id": 12, + "logprob": -2.0058594, + "special": false, + "text": "-" + }, + { + "id": 2366, + 
"logprob": -0.45410156, + "special": false, + "text": "202" + }, + { + "id": 15, + "logprob": -0.037109375, + "special": false, + "text": "0" + }, + { + "id": 2978, + "logprob": -0.8095703, + "special": false, + "text": " school" + }, + { + "id": 1060, + "logprob": -0.013053894, + "special": false, + "text": " year" + } + ], + "top_tokens": null + }, + "generated_text": " for the 2019-2020 school year" + }, + { + "details": { + "best_of_sequences": null, + "finish_reason": "length", + "generated_tokens": 10, + "prefill": [ + { + "id": 128000, + "logprob": null, + "text": "<|begin_of_text|>" + }, + { + "id": 2323, + "logprob": -9.421875, + "text": "Test" + }, + { + "id": 1715, + "logprob": -10.546875, + "text": " request" + } + ], + "seed": null, + "tokens": [ + { + "id": 369, + "logprob": -2.1816406, + "special": false, + "text": " for" + }, + { + "id": 279, + "logprob": -2.6992188, + "special": false, + "text": " the" + }, + { + "id": 220, + "logprob": -3.6308594, + "special": false, + "text": " " + }, + { + "id": 679, + "logprob": -1.7988281, + "special": false, + "text": "201" + }, + { + "id": 24, + "logprob": -1.3535156, + "special": false, + "text": "9" + }, + { + "id": 12, + "logprob": -2.0058594, + "special": false, + "text": "-" + }, + { + "id": 2366, + "logprob": -0.45410156, + "special": false, + "text": "202" + }, + { + "id": 15, + "logprob": -0.037109375, + "special": false, + "text": "0" + }, + { + "id": 2978, + "logprob": -0.8095703, + "special": false, + "text": " school" + }, + { + "id": 1060, + "logprob": -0.013053894, + "special": false, + "text": " year" + } + ], + "top_tokens": null + }, + "generated_text": " for the 2019-2020 school year" + } +] diff --git a/integration-tests/models/test_flash_llama_fp8.py b/integration-tests/models/test_flash_llama_fp8.py new file mode 100644 index 00000000000..fe5df590c2a --- /dev/null +++ b/integration-tests/models/test_flash_llama_fp8.py @@ -0,0 +1,62 @@ +import pytest + + +@pytest.fixture(scope="module") 
+def flash_llama_fp8_handle(launcher):
+    """Start the FP8-quantized Llama 3 8B model and yield its launcher handle.
+
+    Calls ``launcher`` with model id ``meta-llama/Meta-Llama-3-8B``,
+    ``num_shard=2`` and ``quantize="fp8"``. The handle is yielded from inside
+    the ``with`` block, so the launcher context is only exited once the
+    fixture is finalized.
+    """
+    with launcher("meta-llama/Meta-Llama-3-8B", num_shard=2, quantize="fp8") as handle:
+        yield handle
+
+
+@pytest.fixture(scope="module")
+async def flash_llama_fp8(flash_llama_fp8_handle):
+    """Await the handle's health check, then return the handle's client."""
+    # NOTE(review): 300 is presumably a timeout in seconds -- confirm against
+    # the handle's health() implementation.
+    await flash_llama_fp8_handle.health(300)
+    return flash_llama_fp8_handle.client
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_fp8(flash_llama_fp8, response_snapshot):
+    """Generate with only ``max_new_tokens`` and ``decoder_input_details`` set.
+
+    Asserts that exactly 10 tokens were generated and that the response equals
+    ``response_snapshot``.
+    """
+    response = await flash_llama_fp8.generate(
+        "Test request", max_new_tokens=10, decoder_input_details=True
+    )
+
+    assert response.details.generated_tokens == 10
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_fp8_all_params(flash_llama_fp8, response_snapshot):
+    """Generate with many generation options set at once.
+
+    Passes repetition penalty, full-text return, a stop sequence, temperature,
+    top-p, top-k, truncation, typical-p, watermarking, decoder input details
+    and a seed in a single request, then asserts that the response equals
+    ``response_snapshot``.
+    """
+    response = await flash_llama_fp8.generate(
+        "Test request",
+        max_new_tokens=10,
+        repetition_penalty=1.2,
+        return_full_text=True,
+        stop_sequences=["test"],
+        temperature=0.5,
+        top_p=0.9,
+        top_k=10,
+        truncate=5,
+        typical_p=0.9,
+        watermark=True,
+        decoder_input_details=True,
+        # NOTE(review): the fixed seed presumably keeps the sampled output
+        # reproducible so it can be compared with the snapshot -- confirm.
+        seed=0,
+    )
+
+    assert response == response_snapshot
+
+
+@pytest.mark.release
+@pytest.mark.asyncio
+@pytest.mark.private
+async def test_flash_llama_fp8_load(flash_llama_fp8, generate_load, response_snapshot):
+    """Request ``n=4`` generations of the same prompt through ``generate_load``.
+
+    Asserts that 4 responses come back, that they all have the same
+    ``generated_text``, and that the list of responses equals
+    ``response_snapshot``.
+    """
+    responses = await generate_load(
+        flash_llama_fp8, "Test request", max_new_tokens=10, n=4
+    )
+
+    assert len(responses) == 4
+    # Every response is compared against the first one's generated text.
+    assert all([r.generated_text == responses[0].generated_text for r in responses])
+
+    assert responses == response_snapshot