
Commit

Add new granite and instructlab merlinite-7b-lab model for tgis and vllm (red-hat-data-services#1486)

Signed-off-by: Tarun Kumar <takumar@redhat.com>
tarukumar authored May 30, 2024
1 parent 60ed928 commit de1c0f8
Showing 4 changed files with 166 additions and 2 deletions.
42 changes: 42 additions & 0 deletions ods_ci/tests/Resources/Files/llm/model_expected_responses.json
@@ -42,6 +42,22 @@
"tgis-runtime": {
"tokenize_response_text": "{'responses':[{'tokenCount':9,'tokens':['\\u003cs\\u003e','▁At','▁what','▁temperature','▁does','▁water','▁bo','il','?']}]}"
}
},
"granite-8b-code-base": {
"response_tokens": 20,
"response_text": "\n\nWater boils at 100 degrees Celsius or 212",
"streamed_response_text": "{ 'inputTokenCount': 8 } { 'generatedTokenCount': 2, 'text': '\n' } { 'generatedTokenCount': 3, 'text': '\n' } { 'generatedTokenCount': 4, 'text': 'Wate' } { 'generatedTokenCount': 5, 'text': 'r b' } { 'generatedTokenCount': 6, 'text': 'oil' } { 'generatedTokenCount': 7, 'text': 's a' } { 'generatedTokenCount': 8, 'text': 't' } { 'generatedTokenCount': 9, 'text': ' ' } { 'generatedTokenCount': 10, 'text': '1' } { 'generatedTokenCount': 11, 'text': '0' } { 'generatedTokenCount': 12, 'text': '0 degree' } { 'generatedTokenCount': 13, 'text': 's Ce' } { 'generatedTokenCount': 14, 'text': 'ls' } { 'generatedTokenCount': 15, 'text': 'iu' } { 'generatedTokenCount': 16, 'text': 's o' } { 'generatedTokenCount': 17, 'text': 'r' } { 'generatedTokenCount': 18, 'text': ' ' } { 'generatedTokenCount': 19, 'text': '2' } { 'generatedTokenCount': 20, 'text': '12', 'stopReason': 'MAX_TOKENS' }",
"tgis-runtime": {
"tokenize_response_text": "{'responses':[{'tokenCount':9,'tokens':['\\u003cs\\u003e','▁At','▁what','▁temperature','▁does','▁water','▁bo','il','?']}]}"
}
},
"merlinite-7b-lab": {
"response_tokens": 20,
"response_text": "\n\nWater boils at 100 degrees Celsius or 212",
"streamed_response_text": "{ 'inputTokenCount': 8 } { 'generatedTokenCount': 2, 'text': '\n' } { 'generatedTokenCount': 3, 'text': '\n' } { 'generatedTokenCount': 4, 'text': 'Wate' } { 'generatedTokenCount': 5, 'text': 'r b' } { 'generatedTokenCount': 6, 'text': 'oil' } { 'generatedTokenCount': 7, 'text': 's a' } { 'generatedTokenCount': 8, 'text': 't' } { 'generatedTokenCount': 9, 'text': ' ' } { 'generatedTokenCount': 10, 'text': '1' } { 'generatedTokenCount': 11, 'text': '0' } { 'generatedTokenCount': 12, 'text': '0 degree' } { 'generatedTokenCount': 13, 'text': 's Ce' } { 'generatedTokenCount': 14, 'text': 'ls' } { 'generatedTokenCount': 15, 'text': 'iu' } { 'generatedTokenCount': 16, 'text': 's o' } { 'generatedTokenCount': 17, 'text': 'r' } { 'generatedTokenCount': 18, 'text': ' ' } { 'generatedTokenCount': 19, 'text': '2' } { 'generatedTokenCount': 20, 'text': '12', 'stopReason': 'MAX_TOKENS' }",
"tgis-runtime": {
"tokenize_response_text": "{'responses':[{'tokenCount':9,'tokens':['\\u003cs\\u003e','▁At','▁what','▁temperature','▁does','▁water','▁bo','il','?']}]}"
}
}
}
},
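The entries above give per-model expected outputs that the suite compares against live responses. As a minimal sketch of consuming this fixture (the file path is the one added in this commit, but the flat {model: ...} layout and the helper are hypothetical, since the enclosing keys are not visible in this hunk):

```python
import json

# Fixture added in this commit, relative to the repository root.
FIXTURE = "ods_ci/tests/Resources/Files/llm/model_expected_responses.json"

def expected_response_text(data: dict, model: str) -> str:
    # Hypothetical lookup: assumes a flat {model: {...}} layout purely
    # for illustration; the real suite resolves these entries through
    # Robot Framework keywords, not this function.
    return data[model]["response_text"]

with open(FIXTURE) as f:
    fixture = json.load(f)

actual = "\n\nWater boils at 100 degrees Celsius or 212"
assert actual == expected_response_text(fixture, "merlinite-7b-lab")
```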
@@ -184,6 +200,16 @@
"vllm-runtime": {
"completions_response_text": "{\"object\":\"text_completion\",\"created\":,\"model\":\"llama-2-13b-chat\",\"choices\":[{\"index\":0,\"text\":\" beautiful city that is full of life and energy. Located along the San Francisco\",\"logprobs\":null,\"finish_reason\":\"length\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":5,\"total_tokens\":21,\"completion_tokens\":16}}"
}
},
"granite-8b-code-base":{
"vllm-runtime": {
"completions_response_text": "{\"object\":\"text_completion\",\"created\":,\"model\":\"granite-8b-code-base\",\"choices\":[{\"index\":0,\"text\":\" city in California, one of the most populous cities in the United States\",\"logprobs\":null,\"finish_reason\":\"length\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":5,\"total_tokens\":20,\"completion_tokens\":16}}"
}
},
"merlinite-7b-lab":{
"vllm-runtime": {
"completions_response_text": "{\"object\":\"text_completion\",\"created\":,\"model\":\"granite-8b-code-base\",\"choices\":[{\"index\":0,\"text\":\" city in California, one of the most populous cities in the United States\",\"logprobs\":null,\"finish_reason\":\"length\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":5,\"total_tokens\":20,\"completion_tokens\":16}}"
}
}
}
},
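These completions entries mirror vLLM's OpenAI-compatible REST API. A minimal sketch of the kind of request that yields such a response, assuming a local placeholder endpoint (the real tests resolve the KServe route, or a port-forward in RawDeployment mode) and an illustrative prompt:

```python
import requests

# Placeholder endpoint; in the tests this is the KServe inference
# service URL, or localhost via port-forwarding in RawDeployment mode.
URL = "http://localhost:8080/v1/completions"

payload = {
    "model": "granite-8b-code-base",
    "prompt": "San Francisco is a",  # illustrative; the suite's actual
    "max_tokens": 16,                # queries live in a query fixture
}
resp = requests.post(URL, json=payload, timeout=60).json()

# Shape matches completions_response_text above: object is
# "text_completion", with choices[0].text and usage token counts.
print(resp["object"], resp["choices"][0]["text"], resp["usage"])
```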
@@ -194,6 +220,16 @@
"vllm-runtime": {
"chat-completions_response_text": "{\"object\":\"chat.completion\",\"created\":,\"model\":\"llama-2-13b-chat\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\" This is a test!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":15,\"total_tokens\":22,\"completion_tokens\":7}}"
}
},
"granite-8b-code-base":{
"vllm-runtime": {
"chat-completions_response_text": "{\"object\":\"chat.completion\",\"created\":,\"model\":\"llama-2-13b-chat\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\" This is a test!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":15,\"total_tokens\":22,\"completion_tokens\":7}}"
}
},
"merlinite-7b-lab":{
"vllm-runtime": {
"chat-completions_response_text": "{\"object\":\"chat.completion\",\"created\":,\"model\":\"llama-2-13b-chat\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\" This is a test!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":15,\"total_tokens\":22,\"completion_tokens\":7}}"
}
}
}
}
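The chat-completions entries use the corresponding /v1/chat/completions route. A matching sketch, with the same placeholder-endpoint caveats:

```python
import requests

URL = "http://localhost:8080/v1/chat/completions"  # placeholder endpoint

payload = {
    "model": "merlinite-7b-lab",
    "messages": [{"role": "user", "content": "say this is a test"}],  # illustrative
    "max_tokens": 10,
}
resp = requests.post(URL, json=payload, timeout=60).json()

# Shape matches chat-completions_response_text above: object is
# "chat.completion" and the reply sits in choices[0].message.content.
print(resp["choices"][0]["message"]["content"])
```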
@@ -228,6 +264,12 @@
},
"llama-2-13b-chat-hf":{
"tgis-runtime": "{ 'maxSequenceLength': 4096,'maxNewTokens': 1024}"
},
"granite-8b-code-base":{
"tgis-runtime": "{ 'maxSequenceLength': 4096,'maxNewTokens': 1024}"
},
"merlinite-7b-lab": {
"tgis-runtime": "{ 'maxSequenceLength': 4096,'maxNewTokens': 1024}"
}
}
}
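The tgis-runtime strings above correspond to TGIS's gRPC ModelInfo response. A hedged sketch of issuing that call with grpcurl from Python; the fmaas.GenerationService name, the empty request body, and port 8033 follow TGIS's published generation.proto and default setup, but verify them against your deployment:

```python
import json
import subprocess

# Assumptions: TGIS listens on gRPC port 8033 with server reflection
# enabled and exposes fmaas.GenerationService per its generation.proto.
cmd = [
    "grpcurl", "-plaintext",
    "-d", "{}",  # empty ModelInfoRequest; add a model_id if required
    "localhost:8033",
    "fmaas.GenerationService/ModelInfo",
]
out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
info = json.loads(out)

# Expected to align with the fixture values above, e.g.
# maxSequenceLength 4096 and maxNewTokens 1024.
print(info.get("maxSequenceLength"), info.get("maxNewTokens"))
```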
Second changed file (vLLM serving runtime YAML):
@@ -13,7 +13,7 @@ spec:
- /models-cache
- --port
- "8080"
- image: quay.io/opendatahub/vllm:fast-ibm-nightly-2024-05-21
+ image: quay.io/opendatahub/vllm:stable-e392b03
name: kserve-container
command:
- python3
Third changed file (vLLM serving runtime YAML):
@@ -13,7 +13,7 @@ spec:
- /models-cache
- --port
- "8080"
- image: quay.io/opendatahub/vllm:fast-ibm-nightly-2024-05-21
+ image: quay.io/opendatahub/vllm:stable-e392b03
name: kserve-container
command:
- python3
Fourth changed file (Robot Framework LLM serving test suite):
@@ -471,6 +471,128 @@ Verify User Can Serve And Query A google/flan-t5-xl Prompt Tuned Model
... AND
... Run Keyword If "${KSERVE_MODE}"=="RawDeployment" Terminate Process llm-query-process kill=true

Verify User Can Serve And Query A instructlab/merlinite-7b-lab Model
[Documentation] Basic tests for preparing, deploying and querying an LLM model
... on KServe using the TGIS standalone or vLLM runtime
[Tags] RHOAIENG-7690 VLLM
Setup Test Variables model_name=merlinite-7b-lab use_pvc=${USE_PVC} use_gpu=${USE_GPU}
... kserve_mode=${KSERVE_MODE} model_path=merlinite-7b-lab
Set Project And Runtime runtime=${RUNTIME_NAME} namespace=${test_namespace}
... download_in_pvc=${DOWNLOAD_IN_PVC} model_name=${model_name} protocol=${PROTOCOL}
... storage_size=70Gi model_path=${model_path}
${requests}= Create Dictionary memory=40Gi
IF "${OVERLAY}" != "${EMPTY}"
${overlays}= Create List ${OVERLAY}
ELSE
${overlays}= Create List
END
Compile Inference Service YAML isvc_name=${model_name}
... sa_name=${EMPTY}
... model_storage_uri=${storage_uri}
... model_format=${MODEL_FORMAT} serving_runtime=${RUNTIME_NAME}
... limits_dict=${limits} requests_dict=${requests} kserve_mode=${KSERVE_MODE}
... overlays=${overlays}
Deploy Model Via CLI isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
... namespace=${test_namespace}
Wait For Model KServe Deployment To Be Ready label_selector=serving.kserve.io/inferenceservice=${model_name}
... namespace=${test_namespace} runtime=${RUNTIME_NAME} timeout=900s
${pod_name}= Get Pod Name namespace=${test_namespace} label_selector=serving.kserve.io/inferenceservice=${model_name}
Run Keyword If "${KSERVE_MODE}"=="RawDeployment"
... Start Port-forwarding namespace=${test_namespace} pod_name=${pod_name}
IF "${RUNTIME_NAME}" == "tgis-runtime" or "${KSERVE_MODE}" == "RawDeployment"
Set Test Variable ${RUNTIME_NAME} tgis-runtime
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=all-tokens n_times=1 protocol=${PROTOCOL}
... namespace=${test_namespace} query_idx=0 validate_response=${FALSE} # temp
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=streaming n_times=1 protocol=${PROTOCOL}
... namespace=${test_namespace} query_idx=0 validate_response=${FALSE}
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=model-info n_times=0
... namespace=${test_namespace} validate_response=${TRUE} string_check_only=${TRUE}
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=tokenize n_times=0 query_idx=0
... namespace=${test_namespace} validate_response=${TRUE} string_check_only=${TRUE}
... port_forwarding=${use_port_forwarding}
ELSE IF "${RUNTIME_NAME}" == "vllm-runtime" and "${KSERVE_MODE}" == "Serverless"
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME} protocol=http
... inference_type=chat-completions n_times=1 query_idx=12
... namespace=${test_namespace} string_check_only=${TRUE}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME} protocol=http
... inference_type=completions n_times=1 query_idx=11
... namespace=${test_namespace} string_check_only=${TRUE}
END
[Teardown] Run Keywords
... Clean Up Test Project test_ns=${test_namespace}
... isvc_names=${models_names} wait_prj_deletion=${FALSE}
... kserve_mode=${KSERVE_MODE}
... AND
... Run Keyword If "${KSERVE_MODE}"=="RawDeployment" Terminate Process llm-query-process kill=true

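The tgis-runtime branch above exercises all-tokens, streaming, model-info and tokenize queries. For orientation, a sketch of the tokenize call behind the last of these, again via grpcurl against TGIS's fmaas gRPC API; the service name, port and returnTokens field come from TGIS's proto and should be treated as assumptions:

```python
import json
import subprocess

# Query text reconstructed from the fixture's token list:
# '<s>', '▁At', '▁what', '▁temperature', '▁does', '▁water', '▁bo', 'il', '?'
payload = {
    "requests": [{"text": "At what temperature does water boil?"}],
    "returnTokens": True,  # assumed field; asks TGIS to echo the tokens
}
cmd = [
    "grpcurl", "-plaintext",
    "-d", json.dumps(payload),
    "localhost:8033",  # assumed port, e.g. via the test's port-forward
    "fmaas.GenerationService/Tokenize",
]
out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

# Expected to resemble tokenize_response_text in the fixture:
# {'responses': [{'tokenCount': 9, 'tokens': ['<s>', '▁At', ...]}]}
print(json.loads(out))
```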
Verify User Can Serve And Query A ibm-granite/granite-8b-code-base Model
[Documentation] Basic tests for preparing, deploying and querying an LLM model
... on KServe using the TGIS standalone or vLLM runtime
[Tags] RHOAIENG-7689 VLLM
Setup Test Variables model_name=granite-8b-code-base use_pvc=${USE_PVC} use_gpu=${USE_GPU}
... kserve_mode=${KSERVE_MODE} model_path=granite-8b-code-base
Set Project And Runtime runtime=${RUNTIME_NAME} namespace=${test_namespace}
... download_in_pvc=${DOWNLOAD_IN_PVC} model_name=${model_name} protocol=${PROTOCOL}
... storage_size=40Gi model_path=${model_path}
${requests}= Create Dictionary memory=40Gi
IF "${OVERLAY}" != "${EMPTY}"
${overlays}= Create List ${OVERLAY}
ELSE
${overlays}= Create List
END
Compile Inference Service YAML isvc_name=${model_name}
... sa_name=${EMPTY}
... model_storage_uri=${storage_uri}
... model_format=${MODEL_FORMAT} serving_runtime=${RUNTIME_NAME}
... limits_dict=${limits} requests_dict=${requests} kserve_mode=${KSERVE_MODE}
... overlays=${overlays}
Deploy Model Via CLI isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
... namespace=${test_namespace}
Wait For Model KServe Deployment To Be Ready label_selector=serving.kserve.io/inferenceservice=${model_name}
... namespace=${test_namespace} runtime=${RUNTIME_NAME} timeout=900s
${pod_name}= Get Pod Name namespace=${test_namespace} label_selector=serving.kserve.io/inferenceservice=${model_name}
Run Keyword If "${KSERVE_MODE}"=="RawDeployment"
... Start Port-forwarding namespace=${test_namespace} pod_name=${pod_name}
IF "${RUNTIME_NAME}" == "tgis-runtime" or "${KSERVE_MODE}" == "RawDeployment"
Set Test Variable ${RUNTIME_NAME} tgis-runtime
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=all-tokens n_times=1 protocol=${PROTOCOL}
... namespace=${test_namespace} query_idx=0 validate_response=${FALSE} # temp
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=streaming n_times=1 protocol=${PROTOCOL}
... namespace=${test_namespace} query_idx=0 validate_response=${FALSE}
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=model-info n_times=0
... namespace=${test_namespace} validate_response=${TRUE} string_check_only=${TRUE}
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=tokenize n_times=0 query_idx=0
... namespace=${test_namespace} validate_response=${TRUE} string_check_only=${TRUE}
... port_forwarding=${use_port_forwarding}
ELSE IF "${RUNTIME_NAME}" == "vllm-runtime" and "${KSERVE_MODE}" == "Serverless"
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME} protocol=http
... inference_type=chat-completions n_times=1 query_idx=12
... namespace=${test_namespace} string_check_only=${TRUE}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME} protocol=http
... inference_type=completions n_times=1 query_idx=11
... namespace=${test_namespace} string_check_only=${TRUE}
END
[Teardown] Run Keywords
... Clean Up Test Project test_ns=${test_namespace}
... isvc_names=${models_names} wait_prj_deletion=${FALSE}
... kserve_mode=${KSERVE_MODE}
... AND
... Run Keyword If "${KSERVE_MODE}"=="RawDeployment" Terminate Process llm-query-process kill=true


*** Keywords ***
Suite Setup

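For RawDeployment mode, both tests port-forward the predictor pod before querying and kill the forwarder on teardown. A rough Python equivalent of that flow, with placeholder namespace and pod names (the suite does this through the Start Port-forwarding keyword and the llm-query-process handle):

```python
import subprocess
import time

import requests

NAMESPACE = "my-test-namespace"           # placeholder
POD = "merlinite-7b-lab-predictor-xxxxx"  # placeholder pod name

# Forward a local port to the serving container, as Start Port-forwarding
# does when KSERVE_MODE is RawDeployment.
pf = subprocess.Popen(
    ["kubectl", "port-forward", "-n", NAMESPACE, POD, "8080:8080"]
)
time.sleep(3)  # crude wait for the tunnel; the suite polls readiness

try:
    r = requests.post(
        "http://localhost:8080/v1/completions",  # vLLM case; TGIS uses gRPC
        json={"model": "merlinite-7b-lab", "prompt": "Hello", "max_tokens": 5},
        timeout=60,
    )
    print(r.json())
finally:
    pf.terminate()  # mirrors 'Terminate Process llm-query-process kill=true'
```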
