
Commit

Add new granite and instructlab merlinite-7b-lab model for tgis and vllm (red-hat-data-services#1486)

Signed-off-by: Tarun Kumar <takumar@redhat.com>
tarukumar authored May 30, 2024
1 parent 60ed928 commit de1c0f8
Showing 4 changed files with 166 additions and 2 deletions.
42 changes: 42 additions & 0 deletions ods_ci/tests/Resources/Files/llm/model_expected_responses.json
@@ -42,6 +42,22 @@
"tgis-runtime": {
"tokenize_response_text": "{'responses':[{'tokenCount':9,'tokens':['\\u003cs\\u003e','▁At','▁what','▁temperature','▁does','▁water','▁bo','il','?']}]}"
}
},
"granite-8b-code-base": {
"response_tokens": 20,
"response_text": "\n\nWater boils at 100 degrees Celsius or 212",
"streamed_response_text": "{ 'inputTokenCount': 8 } { 'generatedTokenCount': 2, 'text': '\n' } { 'generatedTokenCount': 3, 'text': '\n' } { 'generatedTokenCount': 4, 'text': 'Wate' } { 'generatedTokenCount': 5, 'text': 'r b' } { 'generatedTokenCount': 6, 'text': 'oil' } { 'generatedTokenCount': 7, 'text': 's a' } { 'generatedTokenCount': 8, 'text': 't' } { 'generatedTokenCount': 9, 'text': ' ' } { 'generatedTokenCount': 10, 'text': '1' } { 'generatedTokenCount': 11, 'text': '0' } { 'generatedTokenCount': 12, 'text': '0 degree' } { 'generatedTokenCount': 13, 'text': 's Ce' } { 'generatedTokenCount': 14, 'text': 'ls' } { 'generatedTokenCount': 15, 'text': 'iu' } { 'generatedTokenCount': 16, 'text': 's o' } { 'generatedTokenCount': 17, 'text': 'r' } { 'generatedTokenCount': 18, 'text': ' ' } { 'generatedTokenCount': 19, 'text': '2' } { 'generatedTokenCount': 20, 'text': '12', 'stopReason': 'MAX_TOKENS' }",
"tgis-runtime": {
"tokenize_response_text": "{'responses':[{'tokenCount':9,'tokens':['\\u003cs\\u003e','▁At','▁what','▁temperature','▁does','▁water','▁bo','il','?']}]}"
}
},
"merlinite-7b-lab": {
"response_tokens": 20,
"response_text": "\n\nWater boils at 100 degrees Celsius or 212",
"streamed_response_text": "{ 'inputTokenCount': 8 } { 'generatedTokenCount': 2, 'text': '\n' } { 'generatedTokenCount': 3, 'text': '\n' } { 'generatedTokenCount': 4, 'text': 'Wate' } { 'generatedTokenCount': 5, 'text': 'r b' } { 'generatedTokenCount': 6, 'text': 'oil' } { 'generatedTokenCount': 7, 'text': 's a' } { 'generatedTokenCount': 8, 'text': 't' } { 'generatedTokenCount': 9, 'text': ' ' } { 'generatedTokenCount': 10, 'text': '1' } { 'generatedTokenCount': 11, 'text': '0' } { 'generatedTokenCount': 12, 'text': '0 degree' } { 'generatedTokenCount': 13, 'text': 's Ce' } { 'generatedTokenCount': 14, 'text': 'ls' } { 'generatedTokenCount': 15, 'text': 'iu' } { 'generatedTokenCount': 16, 'text': 's o' } { 'generatedTokenCount': 17, 'text': 'r' } { 'generatedTokenCount': 18, 'text': ' ' } { 'generatedTokenCount': 19, 'text': '2' } { 'generatedTokenCount': 20, 'text': '12', 'stopReason': 'MAX_TOKENS' }",
"tgis-runtime": {
"tokenize_response_text": "{'responses':[{'tokenCount':9,'tokens':['\\u003cs\\u003e','▁At','▁what','▁temperature','▁does','▁water','▁bo','il','?']}]}"
}
}
}
},
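The entries above give per-model expected outputs that the suite compares against live responses. As a minimal sketch of consuming this fixture (the file path is the one added in this commit, but the flat {model: ...} layout and the helper are hypothetical, since the enclosing keys are not visible in this hunk):

```python
import json

# Fixture added in this commit, relative to the repository root.
FIXTURE = "ods_ci/tests/Resources/Files/llm/model_expected_responses.json"

def expected_response_text(data: dict, model: str) -> str:
    # Hypothetical lookup: assumes a flat {model: {...}} layout purely
    # for illustration; the real suite resolves these entries through
    # Robot Framework keywords, not this function.
    return data[model]["response_text"]

with open(FIXTURE) as f:
    fixture = json.load(f)

actual = "\n\nWater boils at 100 degrees Celsius or 212"
assert actual == expected_response_text(fixture, "merlinite-7b-lab")
```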
@@ -184,6 +200,16 @@
"vllm-runtime": {
"completions_response_text": "{\"object\":\"text_completion\",\"created\":,\"model\":\"llama-2-13b-chat\",\"choices\":[{\"index\":0,\"text\":\" beautiful city that is full of life and energy. Located along the San Francisco\",\"logprobs\":null,\"finish_reason\":\"length\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":5,\"total_tokens\":21,\"completion_tokens\":16}}"
}
},
"granite-8b-code-base":{
"vllm-runtime": {
"completions_response_text": "{\"object\":\"text_completion\",\"created\":,\"model\":\"granite-8b-code-base\",\"choices\":[{\"index\":0,\"text\":\" city in California, one of the most populous cities in the United States\",\"logprobs\":null,\"finish_reason\":\"length\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":5,\"total_tokens\":20,\"completion_tokens\":16}}"
}
},
"merlinite-7b-lab":{
"vllm-runtime": {
"completions_response_text": "{\"object\":\"text_completion\",\"created\":,\"model\":\"granite-8b-code-base\",\"choices\":[{\"index\":0,\"text\":\" city in California, one of the most populous cities in the United States\",\"logprobs\":null,\"finish_reason\":\"length\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":5,\"total_tokens\":20,\"completion_tokens\":16}}"
}
}
}
},
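These completions entries mirror vLLM's OpenAI-compatible REST API. A minimal sketch of the kind of request that yields such a response, assuming a local placeholder endpoint (the real tests resolve the KServe route, or a port-forward in RawDeployment mode) and an illustrative prompt:

```python
import requests

# Placeholder endpoint; in the tests this is the KServe inference
# service URL, or localhost via port-forwarding in RawDeployment mode.
URL = "http://localhost:8080/v1/completions"

payload = {
    "model": "granite-8b-code-base",
    "prompt": "San Francisco is a",  # illustrative; the suite's actual
    "max_tokens": 16,                # queries live in a query fixture
}
resp = requests.post(URL, json=payload, timeout=60).json()

# Shape matches completions_response_text above: object is
# "text_completion", with choices[0].text and usage token counts.
print(resp["object"], resp["choices"][0]["text"], resp["usage"])
```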
@@ -194,6 +220,16 @@
"vllm-runtime": {
"chat-completions_response_text": "{\"object\":\"chat.completion\",\"created\":,\"model\":\"llama-2-13b-chat\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\" This is a test!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":15,\"total_tokens\":22,\"completion_tokens\":7}}"
}
},
"granite-8b-code-base":{
"vllm-runtime": {
"chat-completions_response_text": "{\"object\":\"chat.completion\",\"created\":,\"model\":\"llama-2-13b-chat\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\" This is a test!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":15,\"total_tokens\":22,\"completion_tokens\":7}}"
}
},
"merlinite-7b-lab":{
"vllm-runtime": {
"chat-completions_response_text": "{\"object\":\"chat.completion\",\"created\":,\"model\":\"llama-2-13b-chat\",\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\" This is a test!\"},\"logprobs\":null,\"finish_reason\":\"stop\",\"stop_reason\":null}],\"usage\":{\"prompt_tokens\":15,\"total_tokens\":22,\"completion_tokens\":7}}"
}
}
}
}
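The chat-completions entries use the corresponding /v1/chat/completions route. A matching sketch, with the same placeholder-endpoint caveats:

```python
import requests

URL = "http://localhost:8080/v1/chat/completions"  # placeholder endpoint

payload = {
    "model": "merlinite-7b-lab",
    "messages": [{"role": "user", "content": "say this is a test"}],  # illustrative
    "max_tokens": 10,
}
resp = requests.post(URL, json=payload, timeout=60).json()

# Shape matches chat-completions_response_text above: object is
# "chat.completion" and the reply sits in choices[0].message.content.
print(resp["choices"][0]["message"]["content"])
```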
@@ -228,6 +264,12 @@
},
"llama-2-13b-chat-hf":{
"tgis-runtime": "{ 'maxSequenceLength': 4096,'maxNewTokens': 1024}"
},
"granite-8b-code-base":{
"tgis-runtime": "{ 'maxSequenceLength': 4096,'maxNewTokens': 1024}"
},
"merlinite-7b-lab": {
"tgis-runtime": "{ 'maxSequenceLength': 4096,'maxNewTokens': 1024}"
}
}
}
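The tgis-runtime strings above correspond to TGIS's gRPC ModelInfo response. A hedged sketch of issuing that call with grpcurl from Python; the fmaas.GenerationService name, the empty request body, and port 8033 follow TGIS's published generation.proto and default setup, but verify them against your deployment:

```python
import json
import subprocess

# Assumptions: TGIS listens on gRPC port 8033 with server reflection
# enabled and exposes fmaas.GenerationService per its generation.proto.
cmd = [
    "grpcurl", "-plaintext",
    "-d", "{}",  # empty ModelInfoRequest; add a model_id if required
    "localhost:8033",
    "fmaas.GenerationService/ModelInfo",
]
out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
info = json.loads(out)

# Expected to align with the fixture values above, e.g.
# maxSequenceLength 4096 and maxNewTokens 1024.
print(info.get("maxSequenceLength"), info.get("maxNewTokens"))
```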
Second changed file (vLLM serving runtime YAML):
@@ -13,7 +13,7 @@ spec:
- /models-cache
- --port
- "8080"
- image: quay.io/opendatahub/vllm:fast-ibm-nightly-2024-05-21
+ image: quay.io/opendatahub/vllm:stable-e392b03
name: kserve-container
command:
- python3
Third changed file (vLLM serving runtime YAML):
@@ -13,7 +13,7 @@ spec:
- /models-cache
- --port
- "8080"
- image: quay.io/opendatahub/vllm:fast-ibm-nightly-2024-05-21
+ image: quay.io/opendatahub/vllm:stable-e392b03
name: kserve-container
command:
- python3
Fourth changed file (Robot Framework LLM serving test suite):
@@ -471,6 +471,128 @@ Verify User Can Serve And Query A google/flan-t5-xl Prompt Tuned Model
... AND
... Run Keyword If "${KSERVE_MODE}"=="RawDeployment" Terminate Process llm-query-process kill=true

Verify User Can Serve And Query A instructlab/merlinite-7b-lab Model
[Documentation] Basic tests for preparing, deploying and querying an LLM model
... on KServe using the TGIS standalone or vLLM runtime
[Tags] RHOAIENG-7690 VLLM
Setup Test Variables model_name=merlinite-7b-lab use_pvc=${USE_PVC} use_gpu=${USE_GPU}
... kserve_mode=${KSERVE_MODE} model_path=merlinite-7b-lab
Set Project And Runtime runtime=${RUNTIME_NAME} namespace=${test_namespace}
... download_in_pvc=${DOWNLOAD_IN_PVC} model_name=${model_name} protocol=${PROTOCOL}
... storage_size=70Gi model_path=${model_path}
${requests}= Create Dictionary memory=40Gi
IF "${OVERLAY}" != "${EMPTY}"
${overlays}= Create List ${OVERLAY}
ELSE
${overlays}= Create List
END
Compile Inference Service YAML isvc_name=${model_name}
... sa_name=${EMPTY}
... model_storage_uri=${storage_uri}
... model_format=${MODEL_FORMAT} serving_runtime=${RUNTIME_NAME}
... limits_dict=${limits} requests_dict=${requests} kserve_mode=${KSERVE_MODE}
... overlays=${overlays}
Deploy Model Via CLI isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
... namespace=${test_namespace}
Wait For Model KServe Deployment To Be Ready label_selector=serving.kserve.io/inferenceservice=${model_name}
... namespace=${test_namespace} runtime=${RUNTIME_NAME} timeout=900s
${pod_name}= Get Pod Name namespace=${test_namespace} label_selector=serving.kserve.io/inferenceservice=${model_name}
Run Keyword If "${KSERVE_MODE}"=="RawDeployment"
... Start Port-forwarding namespace=${test_namespace} pod_name=${pod_name}
IF "${RUNTIME_NAME}" == "tgis-runtime" or "${KSERVE_MODE}" == "RawDeployment"
Set Test Variable ${RUNTIME_NAME} tgis-runtime
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=all-tokens n_times=1 protocol=${PROTOCOL}
... namespace=${test_namespace} query_idx=0 validate_response=${FALSE} # temp
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=streaming n_times=1 protocol=${PROTOCOL}
... namespace=${test_namespace} query_idx=0 validate_response=${FALSE}
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=model-info n_times=0
... namespace=${test_namespace} validate_response=${TRUE} string_check_only=${TRUE}
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=tokenize n_times=0 query_idx=0
... namespace=${test_namespace} validate_response=${TRUE} string_check_only=${TRUE}
... port_forwarding=${use_port_forwarding}
ELSE IF "${RUNTIME_NAME}" == "vllm-runtime" and "${KSERVE_MODE}" == "Serverless"
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME} protocol=http
... inference_type=chat-completions n_times=1 query_idx=12
... namespace=${test_namespace} string_check_only=${TRUE}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME} protocol=http
... inference_type=completions n_times=1 query_idx=11
... namespace=${test_namespace} string_check_only=${TRUE}
END
[Teardown] Run Keywords
... Clean Up Test Project test_ns=${test_namespace}
... isvc_names=${models_names} wait_prj_deletion=${FALSE}
... kserve_mode=${KSERVE_MODE}
... AND
... Run Keyword If "${KSERVE_MODE}"=="RawDeployment" Terminate Process llm-query-process kill=true

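The tgis-runtime branch above exercises all-tokens, streaming, model-info and tokenize queries. For orientation, a sketch of the tokenize call behind the last of these, again via grpcurl against TGIS's fmaas gRPC API; the service name, port and returnTokens field come from TGIS's proto and should be treated as assumptions:

```python
import json
import subprocess

# Query text reconstructed from the fixture's token list:
# '<s>', '▁At', '▁what', '▁temperature', '▁does', '▁water', '▁bo', 'il', '?'
payload = {
    "requests": [{"text": "At what temperature does water boil?"}],
    "returnTokens": True,  # assumed field; asks TGIS to echo the tokens
}
cmd = [
    "grpcurl", "-plaintext",
    "-d", json.dumps(payload),
    "localhost:8033",  # assumed port, e.g. via the test's port-forward
    "fmaas.GenerationService/Tokenize",
]
out = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout

# Expected to resemble tokenize_response_text in the fixture:
# {'responses': [{'tokenCount': 9, 'tokens': ['<s>', '▁At', ...]}]}
print(json.loads(out))
```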
Verify User Can Serve And Query A ibm-granite/granite-8b-code-base Model
[Documentation] Basic tests for preparing, deploying and querying an LLM model
... on KServe using the TGIS standalone or vLLM runtime
[Tags] RHOAIENG-7689 VLLM
Setup Test Variables model_name=granite-8b-code-base use_pvc=${USE_PVC} use_gpu=${USE_GPU}
... kserve_mode=${KSERVE_MODE} model_path=granite-8b-code-base
Set Project And Runtime runtime=${RUNTIME_NAME} namespace=${test_namespace}
... download_in_pvc=${DOWNLOAD_IN_PVC} model_name=${model_name} protocol=${PROTOCOL}
... storage_size=40Gi model_path=${model_path}
${requests}= Create Dictionary memory=40Gi
IF "${OVERLAY}" != "${EMPTY}"
${overlays}= Create List ${OVERLAY}
ELSE
${overlays}= Create List
END
Compile Inference Service YAML isvc_name=${model_name}
... sa_name=${EMPTY}
... model_storage_uri=${storage_uri}
... model_format=${MODEL_FORMAT} serving_runtime=${RUNTIME_NAME}
... limits_dict=${limits} requests_dict=${requests} kserve_mode=${KSERVE_MODE}
... overlays=${overlays}
Deploy Model Via CLI isvc_filepath=${INFERENCESERVICE_FILLED_FILEPATH}
... namespace=${test_namespace}
Wait For Model KServe Deployment To Be Ready label_selector=serving.kserve.io/inferenceservice=${model_name}
... namespace=${test_namespace} runtime=${RUNTIME_NAME} timeout=900s
${pod_name}= Get Pod Name namespace=${test_namespace} label_selector=serving.kserve.io/inferenceservice=${model_name}
Run Keyword If "${KSERVE_MODE}"=="RawDeployment"
... Start Port-forwarding namespace=${test_namespace} pod_name=${pod_name}
IF "${RUNTIME_NAME}" == "tgis-runtime" or "${KSERVE_MODE}" == "RawDeployment"
Set Test Variable ${RUNTIME_NAME} tgis-runtime
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=all-tokens n_times=1 protocol=${PROTOCOL}
... namespace=${test_namespace} query_idx=0 validate_response=${FALSE} # temp
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=streaming n_times=1 protocol=${PROTOCOL}
... namespace=${test_namespace} query_idx=0 validate_response=${FALSE}
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=model-info n_times=0
... namespace=${test_namespace} validate_response=${TRUE} string_check_only=${TRUE}
... port_forwarding=${use_port_forwarding}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME}
... inference_type=tokenize n_times=0 query_idx=0
... namespace=${test_namespace} validate_response=${TRUE} string_check_only=${TRUE}
... port_forwarding=${use_port_forwarding}
ELSE IF "${RUNTIME_NAME}" == "vllm-runtime" and "${KSERVE_MODE}" == "Serverless"
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME} protocol=http
... inference_type=chat-completions n_times=1 query_idx=12
... namespace=${test_namespace} string_check_only=${TRUE}
Query Model Multiple Times model_name=${model_name} runtime=${RUNTIME_NAME} protocol=http
... inference_type=completions n_times=1 query_idx=11
... namespace=${test_namespace} string_check_only=${TRUE}
END
[Teardown] Run Keywords
... Clean Up Test Project test_ns=${test_namespace}
... isvc_names=${models_names} wait_prj_deletion=${FALSE}
... kserve_mode=${KSERVE_MODE}
... AND
... Run Keyword If "${KSERVE_MODE}"=="RawDeployment" Terminate Process llm-query-process kill=true


*** Keywords ***
Suite Setup

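For RawDeployment mode, both tests port-forward the predictor pod before querying and kill the forwarder on teardown. A rough Python equivalent of that flow, with placeholder namespace and pod names (the suite does this through the Start Port-forwarding keyword and the llm-query-process handle):

```python
import subprocess
import time

import requests

NAMESPACE = "my-test-namespace"           # placeholder
POD = "merlinite-7b-lab-predictor-xxxxx"  # placeholder pod name

# Forward a local port to the serving container, as Start Port-forwarding
# does when KSERVE_MODE is RawDeployment.
pf = subprocess.Popen(
    ["kubectl", "port-forward", "-n", NAMESPACE, POD, "8080:8080"]
)
time.sleep(3)  # crude wait for the tunnel; the suite polls readiness

try:
    r = requests.post(
        "http://localhost:8080/v1/completions",  # vLLM case; TGIS uses gRPC
        json={"model": "merlinite-7b-lab", "prompt": "Hello", "max_tokens": 5},
        timeout=60,
    )
    print(r.json())
finally:
    pf.terminate()  # mirrors 'Terminate Process llm-query-process kill=true'
```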
