diff --git a/ods_ci/tests/Resources/Files/llm/model_expected_responses.json b/ods_ci/tests/Resources/Files/llm/model_expected_responses.json
index 3e1baf2ea..4f4f3d8b8 100644
--- a/ods_ci/tests/Resources/Files/llm/model_expected_responses.json
+++ b/ods_ci/tests/Resources/Files/llm/model_expected_responses.json
@@ -42,6 +42,22 @@
             "tgis-runtime": {
                 "tokenize_response_text": "{'responses':[{'tokenCount':9,'tokens':['\\u003cs\\u003e','▁At','▁what','▁temperature','▁does','▁water','▁bo','il','?']}]}"
             }
+        },
+        "granite-8b-code-base": {
+            "response_tokens": 20,
+            "response_text": "\n\nWater boils at 100 degrees Celsius or 212",
+            "streamed_response_text": "{ 'inputTokenCount': 8 } { 'generatedTokenCount': 2, 'text': '\n' } { 'generatedTokenCount': 3, 'text': '\n' } { 'generatedTokenCount': 4, 'text': 'Wate' } { 'generatedTokenCount': 5, 'text': 'r b' } { 'generatedTokenCount': 6, 'text': 'oil' } { 'generatedTokenCount': 7, 'text': 's a' } { 'generatedTokenCount': 8, 'text': 't' } { 'generatedTokenCount': 9, 'text': ' ' } { 'generatedTokenCount': 10, 'text': '1' } { 'generatedTokenCount': 11, 'text': '0' } { 'generatedTokenCount': 12, 'text': '0 degree' } { 'generatedTokenCount': 13, 'text': 's Ce' } { 'generatedTokenCount': 14, 'text': 'ls' } { 'generatedTokenCount': 15, 'text': 'iu' } { 'generatedTokenCount': 16, 'text': 's o' } { 'generatedTokenCount': 17, 'text': 'r' } { 'generatedTokenCount': 18, 'text': ' ' } { 'generatedTokenCount': 19, 'text': '2' } { 'generatedTokenCount': 20, 'text': '12', 'stopReason': 'MAX_TOKENS' }",
+            "tgis-runtime": {
+                "tokenize_response_text": "{'responses':[{'tokenCount':9,'tokens':['\\u003cs\\u003e','▁At','▁what','▁temperature','▁does','▁water','▁bo','il','?']}]}"
+            }
+        },
+        "merlinite-7b-lab": {
+            "response_tokens": 20,
+            "response_text": "\n\nWater boils at 100 degrees Celsius or 212",
+            "streamed_response_text": "{ 'inputTokenCount': 8 } { 'generatedTokenCount': 2, 'text': '\n' } { 'generatedTokenCount': 3, 'text': '\n' } { 'generatedTokenCount': 4, 'text': 'Wate' } { 'generatedTokenCount': 5, 'text': 'r b' } { 'generatedTokenCount': 6, 'text': 'oil' } { 'generatedTokenCount': 7, 'text': 's a' } { 'generatedTokenCount': 8, 'text': 't' } { 'generatedTokenCount': 9, 'text': ' ' } { 'generatedTokenCount': 10, 'text': '1' } { 'generatedTokenCount': 11, 'text': '0' } { 'generatedTokenCount': 12, 'text': '0 degree' } { 'generatedTokenCount': 13, 'text': 's Ce' } { 'generatedTokenCount': 14, 'text': 'ls' } { 'generatedTokenCount': 15, 'text': 'iu' } { 'generatedTokenCount': 16, 'text': 's o' } { 'generatedTokenCount': 17, 'text': 'r' } { 'generatedTokenCount': 18, 'text': ' ' } { 'generatedTokenCount': 19, 'text': '2' } { 'generatedTokenCount': 20, 'text': '12', 'stopReason': 'MAX_TOKENS' }",
+            "tgis-runtime": {
+                "tokenize_response_text": "{'responses':[{'tokenCount':9,'tokens':['\\u003cs\\u003e','▁At','▁what','▁temperature','▁does','▁water','▁bo','il','?']}]}"
+            }
         }
     }
 },
@@ -184,6 +200,16 @@
             "vllm-runtime": {
                 "completions_response_text": "{\"object\":\"text_completion\",\"created\":,\"model\":\"llama-2-13b-chat\",\"choices\":[{\"index\":0,\"text\":\" beautiful city that is full of life and energy. Located along the San Francisco\",\"logprobs\":null,\"finish_reason\":\"length\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":5,\"total_tokens\":21,\"completion_tokens\":16}}"
             }
+        },
+        "granite-8b-code-base":{
+            "vllm-runtime": {
+                "completions_response_text": "{\"object\":\"text_completion\",\"created\":,\"model\":\"granite-8b-code-base\",\"choices\":[{\"index\":0,\"text\":\" city in California, one of the most populous cities in the United States\",\"logprobs\":null,\"finish_reason\":\"length\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":5,\"total_tokens\":21,\"completion_tokens\":16}}"
+            }
+        },
+        "merlinite-7b-lab":{
+            "vllm-runtime": {
+                "completions_response_text": "{\"object\":\"text_completion\",\"created\":,\"model\":\"merlinite-7b-lab\",\"choices\":[{\"index\":0,\"text\":\" city in California, one of the most populous cities in the United States\",\"logprobs\":null,\"finish_reason\":\"length\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":5,\"total_tokens\":21,\"completion_tokens\":16}}"
+            }
         }
     }
 },
@@ -194,6 +220,16 @@
             "vllm-runtime": {
                 "chat-completions_response_text": "{\"object\":\"chat.completion\",\"created\":,\"model\":\"llama-2-13b-chat\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\" This is a test!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":15,\"total_tokens\":22,\"completion_tokens\":7}}"
             }
+        },
+        "granite-8b-code-base":{
+            "vllm-runtime": {
+                "chat-completions_response_text": "{\"object\":\"chat.completion\",\"created\":,\"model\":\"granite-8b-code-base\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\" This is a test!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":15,\"total_tokens\":22,\"completion_tokens\":7}}"
+            }
+        },
+        "merlinite-7b-lab":{
+            "vllm-runtime": {
+                "chat-completions_response_text": "{\"object\":\"chat.completion\",\"created\":,\"model\":\"merlinite-7b-lab\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\" This is a test!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":15,\"total_tokens\":22,\"completion_tokens\":7}}"
+            }
         }
     }
 }
@@ -228,6 +264,12 @@
         },
         "llama-2-13b-chat-hf":{
             "tgis-runtime": "{ 'maxSequenceLength': 4096,'maxNewTokens': 1024}"
+        },
+        "granite-8b-code-base":{
+            "tgis-runtime": "{ 'maxSequenceLength': 4096,'maxNewTokens': 1024}"
+        },
+        "merlinite-7b-lab": {
+            "tgis-runtime": "{ 'maxSequenceLength': 4096,'maxNewTokens': 1024}"
         }
     }
 }
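Reviewer note on how these fixtures fit together (not part of the patch): `response_text` is simply the concatenation of the `'text'` fields in `streamed_response_text`. A minimal, self-contained sketch of that relationship, abridged to the first chunks of the `granite-8b-code-base` entry above; since the chunks use single quotes they are not valid JSON, hence a regex rather than `json.loads`:

```python
# Sketch only: relates "streamed_response_text" to "response_text" in the
# fixture above. Abridged to the first eight chunks of granite-8b-code-base.
import re

streamed = (
    "{ 'inputTokenCount': 8 } "
    "{ 'generatedTokenCount': 2, 'text': '\n' } "
    "{ 'generatedTokenCount': 3, 'text': '\n' } "
    "{ 'generatedTokenCount': 4, 'text': 'Wate' } "
    "{ 'generatedTokenCount': 5, 'text': 'r b' } "
    "{ 'generatedTokenCount': 6, 'text': 'oil' } "
    "{ 'generatedTokenCount': 7, 'text': 's a' } "
    "{ 'generatedTokenCount': 8, 'text': 't' }"
)

# Collect every 'text' fragment; re.S lets '.' match the embedded newlines.
fragments = re.findall(r"'text':\s*'(.*?)'", streamed, flags=re.S)
assert "".join(fragments) == "\n\nWater boils at"
```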
diff --git a/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_grpc.yaml b/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_grpc.yaml
index 54049fb72..81af81e26 100644
--- a/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_grpc.yaml
+++ b/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_grpc.yaml
@@ -13,7 +13,7 @@ spec:
         - /models-cache
         - --port
         - "8080"
-      image: quay.io/opendatahub/vllm:fast-ibm-nightly-2024-05-21
+      image: quay.io/opendatahub/vllm:stable-e392b03
       name: kserve-container
       command:
         - python3
diff --git a/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_http.yaml b/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_http.yaml
index c2c983d40..42decb4fa 100644
--- a/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_http.yaml
+++ b/ods_ci/tests/Resources/Files/llm/serving_runtimes/vllm_servingruntime_http.yaml
@@ -13,7 +13,7 @@ spec:
         - /models-cache
         - --port
         - "8080"
-      image: quay.io/opendatahub/vllm:fast-ibm-nightly-2024-05-21
+      image: quay.io/opendatahub/vllm:stable-e392b03
       name: kserve-container
       command:
         - python3
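Both runtime templates move from the `fast-ibm-nightly-2024-05-21` tag to the pinned `stable-e392b03` build. As an aside (not in this PR), a post-deploy check that a cluster actually picked up the pinned image could look like the following; the namespace and runtime name are placeholders:

```python
# Hypothetical verification sketch: read the ServingRuntime custom resource
# and confirm the kserve-container image matches the pinned tag.
from kubernetes import client, config

config.load_kube_config()
api = client.CustomObjectsApi()
runtime = api.get_namespaced_custom_object(
    group="serving.kserve.io", version="v1alpha1",
    plural="servingruntimes", namespace="my-llm-namespace",  # placeholder
    name="vllm-runtime",
)
images = {c["name"]: c["image"] for c in runtime["spec"]["containers"]}
assert images["kserve-container"] == "quay.io/opendatahub/vllm:stable-e392b03"
```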
diff --git a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/422__model_serving_llm_models.robot b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/422__model_serving_llm_models.robot
index ac3b3d658..5268aa301 100644
--- a/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/422__model_serving_llm_models.robot
+++ b/ods_ci/tests/Tests/400__ods_dashboard/420__model_serving/LLMs/422__model_serving_llm_models.robot
@@ -471,6 +471,128 @@ Verify User Can Serve And Query A google/flan-t5-xl Prompt Tuned Model
     ...    AND
     ...    Run Keyword If    "${KSERVE_MODE}"=="RawDeployment"    Terminate Process    llm-query-process    kill=true
 
+Verify User Can Serve And Query An instructlab/merlinite-7b-lab Model
+    [Documentation]    Basic tests for preparing, deploying and querying an LLM model
+    ...    using KServe with the TGIS standalone or vLLM runtime
+    [Tags]    RHOAIENG-7690    VLLM
+    Setup Test Variables    model_name=merlinite-7b-lab    use_pvc=${USE_PVC}    use_gpu=${USE_GPU}
+    ...    kserve_mode=${KSERVE_MODE}    model_path=merlinite-7b-lab
+    Set Project And Runtime    runtime=${RUNTIME_NAME}    namespace=${test_namespace}
+    ...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=${model_name}    protocol=${PROTOCOL}
+    ...    storage_size=70Gi    model_path=${model_path}
+    ${requests}=    Create Dictionary    memory=40Gi
+    IF    "${OVERLAY}" != "${EMPTY}"
+        ${overlays}=    Create List    ${OVERLAY}
+    ELSE
+        ${overlays}=    Create List
+    END
+    Compile Inference Service YAML    isvc_name=${model_name}
+    ...    sa_name=${EMPTY}
+    ...    model_storage_uri=${storage_uri}
+    ...    model_format=${MODEL_FORMAT}    serving_runtime=${RUNTIME_NAME}
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
+    ...    overlays=${overlays}
+    Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
+    ...    namespace=${test_namespace}
+    Wait For Model KServe Deployment To Be Ready    label_selector=serving.kserve.io/inferenceservice=${model_name}
+    ...    namespace=${test_namespace}    runtime=${RUNTIME_NAME}    timeout=900s
+    ${pod_name}=    Get Pod Name    namespace=${test_namespace}    label_selector=serving.kserve.io/inferenceservice=${model_name}
+    Run Keyword If    "${KSERVE_MODE}"=="RawDeployment"
+    ...    Start Port-forwarding    namespace=${test_namespace}    pod_name=${pod_name}
+    IF    "${RUNTIME_NAME}" == "tgis-runtime" or "${KSERVE_MODE}" == "RawDeployment"
+        Set Test Variable    ${RUNTIME_NAME}    tgis-runtime
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}
+        ...    inference_type=all-tokens    n_times=1    protocol=${PROTOCOL}
+        ...    namespace=${test_namespace}    query_idx=0    validate_response=${FALSE}    # temp
+        ...    port_forwarding=${use_port_forwarding}
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}
+        ...    inference_type=streaming    n_times=1    protocol=${PROTOCOL}
+        ...    namespace=${test_namespace}    query_idx=0    validate_response=${FALSE}
+        ...    port_forwarding=${use_port_forwarding}
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}
+        ...    inference_type=model-info    n_times=0
+        ...    namespace=${test_namespace}    validate_response=${TRUE}    string_check_only=${TRUE}
+        ...    port_forwarding=${use_port_forwarding}
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}
+        ...    inference_type=tokenize    n_times=0    query_idx=0
+        ...    namespace=${test_namespace}    validate_response=${TRUE}    string_check_only=${TRUE}
+        ...    port_forwarding=${use_port_forwarding}
+    ELSE IF    "${RUNTIME_NAME}" == "vllm-runtime" and "${KSERVE_MODE}" == "Serverless"
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}    protocol=http
+        ...    inference_type=chat-completions    n_times=1    query_idx=12
+        ...    namespace=${test_namespace}    string_check_only=${TRUE}
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}    protocol=http
+        ...    inference_type=completions    n_times=1    query_idx=11
+        ...    namespace=${test_namespace}    string_check_only=${TRUE}
+    END
+    [Teardown]    Run Keywords
+    ...    Clean Up Test Project    test_ns=${test_namespace}
+    ...    isvc_names=${models_names}    wait_prj_deletion=${FALSE}
+    ...    kserve_mode=${KSERVE_MODE}
+    ...    AND
+    ...    Run Keyword If    "${KSERVE_MODE}"=="RawDeployment"    Terminate Process    llm-query-process    kill=true
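For reference (a sketch, not what `Query Model Multiple Times` literally runs): in the Serverless/vLLM branch, `inference_type=completions` with `query_idx=11` boils down to a call against vLLM's OpenAI-compatible `/v1/completions` endpoint. The URL and prompt below are assumptions; the suite resolves both from its own fixtures:

```python
# Rough equivalent of the vLLM "completions" query issued above.
import requests

base_url = "https://merlinite-7b-lab-my-ns.apps.example.com"  # placeholder
payload = {
    "model": "merlinite-7b-lab",
    "prompt": "San Francisco is a",  # assumed prompt for query_idx=11
    "max_tokens": 16,
}
resp = requests.post(f"{base_url}/v1/completions", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```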
+
+Verify User Can Serve And Query An ibm-granite/granite-8b-code-base Model
+    [Documentation]    Basic tests for preparing, deploying and querying an LLM model
+    ...    using KServe with the TGIS standalone or vLLM runtime
+    [Tags]    RHOAIENG-7689    VLLM
+    Setup Test Variables    model_name=granite-8b-code-base    use_pvc=${USE_PVC}    use_gpu=${USE_GPU}
+    ...    kserve_mode=${KSERVE_MODE}    model_path=granite-8b-code-base
+    Set Project And Runtime    runtime=${RUNTIME_NAME}    namespace=${test_namespace}
+    ...    download_in_pvc=${DOWNLOAD_IN_PVC}    model_name=${model_name}    protocol=${PROTOCOL}
+    ...    storage_size=40Gi    model_path=${model_path}
+    ${requests}=    Create Dictionary    memory=40Gi
+    IF    "${OVERLAY}" != "${EMPTY}"
+        ${overlays}=    Create List    ${OVERLAY}
+    ELSE
+        ${overlays}=    Create List
+    END
+    Compile Inference Service YAML    isvc_name=${model_name}
+    ...    sa_name=${EMPTY}
+    ...    model_storage_uri=${storage_uri}
+    ...    model_format=${MODEL_FORMAT}    serving_runtime=${RUNTIME_NAME}
+    ...    limits_dict=${limits}    requests_dict=${requests}    kserve_mode=${KSERVE_MODE}
+    ...    overlays=${overlays}
+    Deploy Model Via CLI    isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
+    ...    namespace=${test_namespace}
+    Wait For Model KServe Deployment To Be Ready    label_selector=serving.kserve.io/inferenceservice=${model_name}
+    ...    namespace=${test_namespace}    runtime=${RUNTIME_NAME}    timeout=900s
+    ${pod_name}=    Get Pod Name    namespace=${test_namespace}    label_selector=serving.kserve.io/inferenceservice=${model_name}
+    Run Keyword If    "${KSERVE_MODE}"=="RawDeployment"
+    ...    Start Port-forwarding    namespace=${test_namespace}    pod_name=${pod_name}
+    IF    "${RUNTIME_NAME}" == "tgis-runtime" or "${KSERVE_MODE}" == "RawDeployment"
+        Set Test Variable    ${RUNTIME_NAME}    tgis-runtime
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}
+        ...    inference_type=all-tokens    n_times=1    protocol=${PROTOCOL}
+        ...    namespace=${test_namespace}    query_idx=0    validate_response=${FALSE}    # temp
+        ...    port_forwarding=${use_port_forwarding}
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}
+        ...    inference_type=streaming    n_times=1    protocol=${PROTOCOL}
+        ...    namespace=${test_namespace}    query_idx=0    validate_response=${FALSE}
+        ...    port_forwarding=${use_port_forwarding}
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}
+        ...    inference_type=model-info    n_times=0
+        ...    namespace=${test_namespace}    validate_response=${TRUE}    string_check_only=${TRUE}
+        ...    port_forwarding=${use_port_forwarding}
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}
+        ...    inference_type=tokenize    n_times=0    query_idx=0
+        ...    namespace=${test_namespace}    validate_response=${TRUE}    string_check_only=${TRUE}
+        ...    port_forwarding=${use_port_forwarding}
+    ELSE IF    "${RUNTIME_NAME}" == "vllm-runtime" and "${KSERVE_MODE}" == "Serverless"
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}    protocol=http
+        ...    inference_type=chat-completions    n_times=1    query_idx=12
+        ...    namespace=${test_namespace}    string_check_only=${TRUE}
+        Query Model Multiple Times    model_name=${model_name}    runtime=${RUNTIME_NAME}    protocol=http
+        ...    inference_type=completions    n_times=1    query_idx=11
+        ...    namespace=${test_namespace}    string_check_only=${TRUE}
+    END
+    [Teardown]    Run Keywords
+    ...    Clean Up Test Project    test_ns=${test_namespace}
+    ...    isvc_names=${models_names}    wait_prj_deletion=${FALSE}
+    ...    kserve_mode=${KSERVE_MODE}
+    ...    AND
+    ...    Run Keyword If    "${KSERVE_MODE}"=="RawDeployment"    Terminate Process    llm-query-process    kill=true
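The chat-completions counterpart (`query_idx=12`) hits vLLM's `/v1/chat/completions` endpoint. Again a sketch with placeholder URL and message content, not the suite's actual keyword internals; both queries use `string_check_only` because generated wording can vary between runs:

```python
# Rough equivalent of the vLLM "chat-completions" query used above.
import requests

base_url = "https://granite-8b-code-base-my-ns.apps.example.com"  # placeholder
payload = {
    "model": "granite-8b-code-base",
    "messages": [{"role": "user", "content": "This is a test"}],  # assumed
    "max_tokens": 32,
}
resp = requests.post(f"{base_url}/v1/chat/completions", json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```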
+
 *** Keywords ***
 Suite Setup