From 68f9b9f1ef4732a22486e69202c61b100f80ed60 Mon Sep 17 00:00:00 2001
From: bdattoma
Date: Thu, 17 Oct 2024 17:36:20 +0200
Subject: [PATCH] add nvidia-gpus tag along with resources-gpu

---
 .../1002__model_serving_modelmesh_gpu.robot   |   4 +-
 .../1005__model_serving_ovms_on_kserve.robot  |   2 +-
 .../1007__model_serving_llm.robot             |   2 +-
 .../1007__model_serving_llm_UI.robot          |   2 +-
 ..._model_serving_llm_other_runtimes_UI.robot |   6 +-
 .../1007__model_serving_llm_tgis.robot        |   2 +-
 .../1008__model_serving_vllm_metrics.robot    |   6 +-
 ...0__ods_dashboard_projects_additional.robot |   4 +-
 .../minimal-cuda-test.robot                   |  14 +--
 .../minimal-pytorch-test.robot                |  10 +-
 .../minimal-tensorflow-test.robot             |  10 +-
 .../501__ide_jupyterhub/multiple-gpus.robot   |   4 +-
 .../500__jupyterhub/minimal-cuda-test.robot   | 110 ++++++++++++++++++
 .../test-run-codeflare-tests.robot            |   2 +-
 14 files changed, 144 insertions(+), 34 deletions(-)
 create mode 100644 ods_ci/tests/Tests/500__jupyterhub/minimal-cuda-test.robot

diff --git a/ods_ci/tests/Tests/1000__model_serving/1002__model_serving_modelmesh_gpu.robot b/ods_ci/tests/Tests/1000__model_serving/1002__model_serving_modelmesh_gpu.robot
index 6e4564b5b..7efa30ad1 100644
--- a/ods_ci/tests/Tests/1000__model_serving/1002__model_serving_modelmesh_gpu.robot
+++ b/ods_ci/tests/Tests/1000__model_serving/1002__model_serving_modelmesh_gpu.robot
@@ -24,7 +24,7 @@ ${RUNTIME_NAME}=    Model Serving GPU Test
 *** Test Cases ***
 Verify GPU Model Deployment Via UI
     [Documentation]    Test the deployment of an openvino_ir model on a model server with GPUs attached
-    [Tags]    Sanity    Tier1    Resources-GPU
+    [Tags]    Sanity    Tier1    Resources-GPU    NVIDIA-GPUs
     ...    ODS-2214
     Clean All Models Of Current User
     Open Data Science Projects Home Page
@@ -57,7 +57,7 @@ Verify GPU Model Deployment Via UI
 
 Test Inference Load On GPU
     [Documentation]    Test the inference load on the GPU after sending random requests to the endpoint
-    [Tags]    Sanity    Tier1    Resources-GPU
+    [Tags]    Sanity    Tier1    Resources-GPU    NVIDIA-GPUs
     ...    ODS-2213
     ${url}=    Get Model Route Via UI    ${MODEL_NAME}
     Send Random Inference Request    endpoint=${url}    no_requests=100
diff --git a/ods_ci/tests/Tests/1000__model_serving/1005__model_serving_ovms_on_kserve.robot b/ods_ci/tests/Tests/1000__model_serving/1005__model_serving_ovms_on_kserve.robot
index 609714d90..93b94e3f2 100644
--- a/ods_ci/tests/Tests/1000__model_serving/1005__model_serving_ovms_on_kserve.robot
+++ b/ods_ci/tests/Tests/1000__model_serving/1005__model_serving_ovms_on_kserve.robot
@@ -103,7 +103,7 @@ Verify Multiple Projects With Same Model (OVMS on Kserve)
 
 Verify GPU Model Deployment Via UI (OVMS on Kserve)
     [Documentation]    Test the deployment of an openvino_ir model on a model server with GPUs attached
-    [Tags]    Sanity    Tier1    Resources-GPU
+    [Tags]    Sanity    Tier1    Resources-GPU    NVIDIA-GPUs
     ...    ODS-2630    ODS-2631    ProductBug    RHOAIENG-3355
     ${requests}=    Create Dictionary    nvidia.com/gpu=1
     ${limits}=    Create Dictionary    nvidia.com/gpu=1
diff --git a/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm.robot b/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm.robot
index fc5f64c87..325f5d0fd 100644
--- a/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm.robot
+++ b/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm.robot
@@ -345,7 +345,7 @@ Verify User Can Set Requests And Limits For A Model
 Verify Model Can Be Served And Query On A GPU Node
     [Documentation]    Basic tests for preparing, deploying and querying a LLM model on GPU node
     ...    using Kserve and Caikit+TGIS runtime
-    [Tags]    Sanity    Tier1    ODS-2381    Resources-GPU
+    [Tags]    Sanity    Tier1    ODS-2381    Resources-GPU    NVIDIA-GPUs
     [Setup]    Set Project And Runtime    namespace=singlemodel-gpu
     ${test_namespace}=    Set Variable    singlemodel-gpu
     ${model_name}=    Set Variable    flan-t5-small-caikit
diff --git a/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_UI.robot b/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_UI.robot
index 4b427fbac..05af8e12a 100644
--- a/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_UI.robot
+++ b/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_UI.robot
@@ -150,7 +150,7 @@ Verify User Can Set Requests And Limits For A Model Using The UI    # robocop: dis
 Verify Model Can Be Served And Query On A GPU Node Using The UI    # robocop: disable
     [Documentation]    Basic tests for preparing, deploying and querying a LLM model on GPU node
     ...    using Kserve and Caikit+TGIS runtime
-    [Tags]    Sanity    Tier1    ODS-2523    Resources-GPU
+    [Tags]    Sanity    Tier1    ODS-2523    Resources-GPU    NVIDIA-GPUs
     [Setup]    Set Up Project    namespace=singlemodel-gpu
     ${test_namespace}=    Set Variable    singlemodel-gpu
     ${model_name}=    Set Variable    flan-t5-small-caikit
diff --git a/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_other_runtimes_UI.robot b/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_other_runtimes_UI.robot
index c0bf43106..9af2fab2d 100644
--- a/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_other_runtimes_UI.robot
+++ b/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_other_runtimes_UI.robot
@@ -57,7 +57,7 @@ Verify Non Admin Can Serve And Query A Model Using The UI    # robocop: disable
 Verify Model Can Be Served And Query On A GPU Node Using The UI    # robocop: disable
     [Documentation]    Basic tests for preparing, deploying and querying a LLM model on GPU node
     ...    using Single-model platform and TGIS Standalone runtime.
-    [Tags]    Sanity    Tier1    ODS-2612    Resources-GPU
+    [Tags]    Sanity    Tier1    ODS-2612    Resources-GPU    NVIDIA-GPUs
     [Setup]    Run    git clone https://github.com/IBM/text-generation-inference/
     ${test_namespace}=    Set Variable    ${TEST_NS}
     ${isvc__name}=    Set Variable    flan-t5-small-hf-gpu
@@ -83,7 +83,7 @@ Verify Model Can Be Served And Query On A GPU Node Using The UI    # robocop: disa
 Verify Model Can Be Served And Query On A GPU Node Using The UI For VLMM    # robocop: disable
     [Documentation]    Basic tests for preparing, deploying and querying a LLM model on GPU node
     ...    using Single-model platform with vllm runtime.
-    [Tags]    Sanity    Tier1    RHOAIENG-6344    Resources-GPU
+    [Tags]    Sanity    Tier1    RHOAIENG-6344    Resources-GPU    NVIDIA-GPUs
     ${test_namespace}=    Set Variable    ${TEST_NS}
     ${isvc__name}=    Set Variable    gpt2-gpu
     ${model_name}=    Set Variable    gpt2
@@ -105,7 +105,7 @@ Verify Model Can Be Served And Query On A GPU Node Using The UI For VLMM    # robo
 Verify Embeddings Model Can Be Served And Query On A GPU Node Using The UI For VLMM    # robocop: disable
     [Documentation]    Basic tests for preparing, deploying and querying a LLM model on GPU node
     ...    using Single-model platform with vllm runtime.
-    [Tags]    Sanity    Tier1    RHOAIENG-8832    Resources-GPU
+    [Tags]    Sanity    Tier1    RHOAIENG-8832    Resources-GPU    NVIDIA-GPUs
     ${test_namespace}=    Set Variable    ${TEST_NS}
     ${isvc__name}=    Set Variable    e5-mistral-7b-gpu
     ${model_name}=    Set Variable    e5-mistral-7b
diff --git a/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_tgis.robot b/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_tgis.robot
index 6f864d0e3..6b9be224c 100644
--- a/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_tgis.robot
+++ b/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1007__model_serving_llm_tgis.robot
@@ -381,7 +381,7 @@ Verify User Can Set Requests And Limits For A Model
 Verify Model Can Be Served And Query On A GPU Node
     [Documentation]    Basic tests for preparing, deploying and querying a LLM model on GPU node
     ...    using Kserve and Caikit+TGIS runtime
-    [Tags]    Tier1    ODS-2381    Resources-GPU
+    [Tags]    Tier1    ODS-2381    Resources-GPU    NVIDIA-GPUs
     [Setup]    Set Project And Runtime    runtime=${TGIS_RUNTIME_NAME}    namespace=singlemodel-gpu
     ${test_namespace}=    Set Variable    singlemodel-gpu
     ${model_name}=    Set Variable    flan-t5-small-caikit
diff --git a/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1008__model_serving_vllm/1008__model_serving_vllm_metrics.robot b/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1008__model_serving_vllm/1008__model_serving_vllm_metrics.robot
index 11f4eed50..067e05b67 100644
--- a/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1008__model_serving_vllm/1008__model_serving_vllm_metrics.robot
+++ b/ods_ci/tests/Tests/1000__model_serving/1007__model_serving_llm/1008__model_serving_vllm/1008__model_serving_vllm_metrics.robot
@@ -57,7 +57,7 @@ ${TEST_NS}=    vllm-gpt2
 *** Test Cases ***
 Verify User Can Deploy A Model With Vllm Via CLI
     [Documentation]    Deploy a model (gpt2) using the vllm runtime and confirm that it's running
-    [Tags]    Tier1    Sanity    Resources-GPU    RHOAIENG-6264    VLLM
+    [Tags]    Tier1    Sanity    Resources-GPU    NVIDIA-GPUs    RHOAIENG-6264    VLLM
     ${rc}    ${out}=    Run And Return Rc And Output    oc apply -f ${DL_POD_FILEPATH}
     Should Be Equal As Integers    ${rc}    ${0}
     Wait For Pods To Succeed    label_selector=gpt-download-pod=true    namespace=${TEST_NS}
@@ -77,7 +77,7 @@ Verify User Can Deploy A Model With Vllm Via CLI
 
 Verify Vllm Metrics Are Present
     [Documentation]    Confirm vLLM metrics are exposed in OpenShift metrics
-    [Tags]    Tier1    Sanity    Resources-GPU    RHOAIENG-6264    VLLM
+    [Tags]    Tier1    Sanity    Resources-GPU    NVIDIA-GPUs    RHOAIENG-6264    VLLM
     Depends On Test    Verify User Can Deploy A Model With Vllm Via CLI
     ${host}=    llm.Get KServe Inference Host Via CLI    isvc_name=vllm-gpt2-openai    namespace=${TEST_NS}
     ${rc}    ${out}=    Run And Return Rc And Output    curl -ks https://${host}/metrics/
@@ -91,7 +91,7 @@ Verify Vllm Metrics Are Present
 
 Verify Vllm Metrics Values Match Between UWM And Endpoint
     [Documentation]    Confirm the values returned by UWM and by the model endpoint match for each metric
-    [Tags]    Tier1    Sanity    Resources-GPU    RHOAIENG-6264    RHOAIENG-7687    VLLM
+    [Tags]    Tier1    Sanity    Resources-GPU    NVIDIA-GPUs    RHOAIENG-6264    RHOAIENG-7687    VLLM
     Depends On Test    Verify User Can Deploy A Model With Vllm Via CLI
     Depends On Test    Verify Vllm Metrics Are Present
     ${host}=    llm.Get KServe Inference Host Via CLI    isvc_name=vllm-gpt2-openai    namespace=${TEST_NS}
diff --git a/ods_ci/tests/Tests/400__ods_dashboard/410__ods_dashboard_projects/410__ods_dashboard_projects_additional.robot b/ods_ci/tests/Tests/400__ods_dashboard/410__ods_dashboard_projects/410__ods_dashboard_projects_additional.robot
index 22ac31374..698231018 100644
--- a/ods_ci/tests/Tests/400__ods_dashboard/410__ods_dashboard_projects/410__ods_dashboard_projects_additional.robot
+++ b/ods_ci/tests/Tests/400__ods_dashboard/410__ods_dashboard_projects/410__ods_dashboard_projects_additional.robot
@@ -83,7 +83,7 @@ Verify Notebook Tolerations Are Applied To Workbenches
 Verify User Can Add GPUs To Workbench
     [Documentation]    Verifies user can add GPUs to an already started workbench
     [Tags]    Tier1    Sanity
-    ...    ODS-2013    Resources-GPU
+    ...    ODS-2013    Resources-GPU    NVIDIA-GPUs
     Launch Data Science Project Main Page
     Create Workbench    workbench_title=${WORKBENCH_TITLE_GPU}    workbench_description=${EMPTY}
     ...    prj_title=${PRJ_TITLE}    image_name=${NB_IMAGE_GPU}    deployment_size=Small
@@ -107,7 +107,7 @@ Verify User Can Add GPUs To Workbench
 Verify User Can Remove GPUs From Workbench
     [Documentation]    Verifies user can remove GPUs from an already started workbench
     [Tags]    Tier1    Sanity
-    ...    ODS-2014    Resources-GPU
+    ...    ODS-2014    Resources-GPU    NVIDIA-GPUs
     Launch Data Science Project Main Page
     Create Workbench    workbench_title=${WORKBENCH_TITLE_GPU}    workbench_description=${EMPTY}
     ...    prj_title=${PRJ_TITLE}    image_name=${NB_IMAGE_GPU}    deployment_size=Small
diff --git a/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-cuda-test.robot b/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-cuda-test.robot
index f919d2e5e..3676daf55 100644
--- a/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-cuda-test.robot
+++ b/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-cuda-test.robot
@@ -22,35 +22,35 @@ Verify CUDA Image Can Be Spawned With GPU
     [Documentation]    Spawns CUDA image with 1 GPU and verifies that the GPU is
     ...    not available for other users.
     [Tags]    Sanity    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1141    ODS-346    ODS-1359
     Pass Execution    Passing tests, as suite setup ensures that image can be spawned
 
 Verify CUDA Image Includes Expected CUDA Version
     [Documentation]    Checks CUDA version
     [Tags]    Sanity    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1142
     Verify Installed CUDA Version    ${EXPECTED_CUDA_VERSION}
 
 Verify PyTorch Library Can See GPUs In Minimal CUDA
     [Documentation]    Installs PyTorch and verifies it can see the GPU
     [Tags]    Sanity    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1144
     Verify Pytorch Can See GPU    install=True
 
 Verify Tensorflow Library Can See GPUs In Minimal CUDA
     [Documentation]    Installs Tensorflow and verifies it can see the GPU
     [Tags]    Sanity    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1143
     Verify Tensorflow Can See GPU    install=True
 
 Verify Cuda Image Has NVCC Installed
     [Documentation]    Verifies NVCC Version in Minimal CUDA Image
     [Tags]    Sanity    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-483
     ${nvcc_version} =    Run Cell And Get Output    input=!nvcc --version
     Should Not Contain    ${nvcc_version}    /usr/bin/sh: nvcc: command not found
@@ -58,7 +58,7 @@ Verify Cuda Image Has NVCC Installed
 Verify Previous CUDA Notebook Image With GPU
     [Documentation]    Runs a workload after spawning the N-1 CUDA Notebook
     [Tags]    Tier2    LiveTesting
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-2128
     [Setup]    N-1 CUDA Setup
     Spawn Notebook With Arguments    image=${NOTEBOOK_IMAGE}    size=Small    gpus=1    version=previous
@@ -90,7 +90,7 @@ Verify CUDA Image Suite Setup
     # This will fail in case there are two nodes with the same number of GPUs
     # Since the overall available number won't change even after 1 GPU is assigned
     # However I can't think of a better way to execute this check, under the assumption that
-    # the Resources-GPU tag will always ensure there is 1 node with 1 GPU on the cluster.
+    # the Resources-GPU and NVIDIA-GPUs tags will always ensure there is 1 node with 1 GPU on the cluster.
     ${maxNo} =    Find Max Number Of GPUs In One Node
     ${maxSpawner} =    Fetch Max Number Of GPUs In Spawner Page
     # Need to continue execution even on failure or the whole suite will be failed
diff --git a/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-pytorch-test.robot b/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-pytorch-test.robot
index 7f1710888..9158b79ff 100644
--- a/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-pytorch-test.robot
+++ b/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-pytorch-test.robot
@@ -52,7 +52,7 @@ Verify Tensorboard Is Accessible
 Verify PyTorch Image Can Be Spawned With GPU
     [Documentation]    Spawns PyTorch image with 1 GPU
     [Tags]    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1145
     Clean Up Server
     Stop JupyterLab Notebook Server
@@ -63,28 +63,28 @@ Verify PyTorch Image Can Be Spawned With GPU
 Verify PyTorch Image Includes Expected CUDA Version
     [Documentation]    Checks CUDA version
     [Tags]    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1146
     Verify Installed CUDA Version    ${EXPECTED_CUDA_VERSION}
 
 Verify PyTorch Library Can See GPUs In PyTorch Image
     [Documentation]    Verifies PyTorch can see the GPU
     [Tags]    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1147
     Verify Pytorch Can See GPU
 
 Verify PyTorch Image GPU Workload
     [Documentation]    Runs a workload on GPUs in PyTorch image
     [Tags]    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1148
     Run Repo And Clean    https://github.com/lugi0/notebook-benchmarks    notebook-benchmarks/pytorch/fgsm_tutorial.ipynb
 
 Verify Previous PyTorch Notebook Image With GPU
     [Documentation]    Runs a workload after spawning the N-1 PyTorch Notebook
     [Tags]    Tier2    LiveTesting
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-2129
     [Setup]    N-1 PyTorch Setup
     Spawn Notebook With Arguments    image=${NOTEBOOK_IMAGE}    size=Small    gpus=1    version=previous
diff --git a/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-tensorflow-test.robot b/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-tensorflow-test.robot
index 432388084..75993ff64 100644
--- a/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-tensorflow-test.robot
+++ b/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/minimal-tensorflow-test.robot
@@ -51,7 +51,7 @@ Verify Tensorboard Is Accessible
 Verify Tensorflow Image Can Be Spawned With GPU
     [Documentation]    Spawns PyTorch image with 1 GPU
     [Tags]    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1151
     Close Previous Server
     Spawn Notebook With Arguments    image=${NOTEBOOK_IMAGE}    size=Small    gpus=1
@@ -59,28 +59,28 @@ Verify Tensorflow Image Can Be Spawned With GPU
 Verify Tensorflow Image Includes Expected CUDA Version
     [Documentation]    Checks CUDA version
     [Tags]    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1152
     Verify Installed CUDA Version    ${EXPECTED_CUDA_VERSION}
 
 Verify Tensorflow Library Can See GPUs In Tensorflow Image
     [Documentation]    Verifies Tensorlow can see the GPU
     [Tags]    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1153
     Verify Tensorflow Can See GPU
 
 Verify Tensorflow Image GPU Workload
     [Documentation]    Runs a workload on GPUs in Tensorflow image
     [Tags]    Tier1
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-1154
     Run Repo And Clean    https://github.com/lugi0/notebook-benchmarks    notebook-benchmarks/tensorflow/GPU-no-warnings.ipynb
 
 Verify Previous Tensorflow Notebook Image With GPU
     [Documentation]    Runs a workload after spawning the N-1 Tensorflow Notebook
     [Tags]    Tier2    LiveTesting
-    ...    Resources-GPU
+    ...    Resources-GPU    NVIDIA-GPUs
     ...    ODS-2130
     [Setup]    N-1 Tensorflow Setup
     Spawn Notebook With Arguments    image=${NOTEBOOK_IMAGE}    size=Small    gpus=1    version=previous
diff --git a/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/multiple-gpus.robot b/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/multiple-gpus.robot
index 9574f7590..6ff5f309f 100644
--- a/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/multiple-gpus.robot
+++ b/ods_ci/tests/Tests/500__ide/501__ide_jupyterhub/multiple-gpus.robot
@@ -22,7 +22,7 @@ Verify Number Of Available GPUs Is Correct
     [Documentation]    Verifies that the number of available GPUs in the
     ...    Spawner dropdown is correct; i.e., it should show the maximum
     ...    Number of GPUs available in a single node.
-    [Tags]    Tier1    Sanity    Resources-2GPUS
+    [Tags]    Tier1    Sanity    Resources-2GPUS    NVIDIA-GPUs
     ...    ODS-1256
     ${maxNo} =    Find Max Number Of GPUs In One Node
     ${maxSpawner} =    Fetch Max Number Of GPUs In Spawner Page
@@ -31,7 +31,7 @@ Verify Two Servers Can Be Spawned
     [Documentation]    Spawns two servers requesting 1 gpu each, and checks
     ...    that both can schedule and are scheduled on different nodes.
-    [Tags]    Tier1    Sanity    Resources-2GPUS
+    [Tags]    Tier1    Sanity    Resources-2GPUS    NVIDIA-GPUs
     ...    ODS-1257
     Spawn Notebook With Arguments    image=${NOTEBOOK_IMAGE}    size=Small    gpus=1
     ${serial_first} =    Get GPU Serial Number
diff --git a/ods_ci/tests/Tests/500__jupyterhub/minimal-cuda-test.robot b/ods_ci/tests/Tests/500__jupyterhub/minimal-cuda-test.robot
new file mode 100644
index 000000000..b2c30a8eb
--- /dev/null
+++ b/ods_ci/tests/Tests/500__jupyterhub/minimal-cuda-test.robot
@@ -0,0 +1,110 @@
+*** Settings ***
+Documentation       Minimal test for the CUDA image
+Resource            ../../Resources/ODS.robot
+Resource            ../../Resources/Common.robot
+Resource            ../../Resources/Page/ODH/JupyterHub/JupyterHubSpawner.robot
+Resource            ../../Resources/Page/ODH/JupyterHub/JupyterLabLauncher.robot
+Resource            ../../Resources/Page/ODH/JupyterHub/GPU.resource
+Library             JupyterLibrary
+Suite Setup         Verify CUDA Image Suite Setup
+Suite Teardown      End Web Test
+Test Tags           JupyterHub
+
+
+*** Variables ***
+${NOTEBOOK_IMAGE} =    minimal-gpu
+${EXPECTED_CUDA_VERSION} =    12.1
+${EXPECTED_CUDA_VERSION_N_1} =    11.8
+
+
+*** Test Cases ***
+Verify CUDA Image Can Be Spawned With GPU
+    [Documentation]    Spawns CUDA image with 1 GPU and verifies that the GPU is
+    ...    not available for other users.
+    [Tags]    Sanity    Tier1
+    ...    Resources-GPU    NVIDIA-GPUs
+    ...    ODS-1141    ODS-346    ODS-1359
+    Pass Execution    Passing tests, as suite setup ensures that image can be spawned
+
+Verify CUDA Image Includes Expected CUDA Version
+    [Documentation]    Checks CUDA version
+    [Tags]    Sanity    Tier1
+    ...    Resources-GPU    NVIDIA-GPUs
+    ...    ODS-1142
+    Verify Installed CUDA Version    ${EXPECTED_CUDA_VERSION}
+
+Verify PyTorch Library Can See GPUs In Minimal CUDA
+    [Documentation]    Installs PyTorch and verifies it can see the GPU
+    [Tags]    Sanity    Tier1
+    ...    Resources-GPU    NVIDIA-GPUs
+    ...    ODS-1144
+    Verify Pytorch Can See GPU    install=True
+
+Verify Tensorflow Library Can See GPUs In Minimal CUDA
+    [Documentation]    Installs Tensorflow and verifies it can see the GPU
+    [Tags]    Sanity    Tier1
+    ...    Resources-GPU    NVIDIA-GPUs
+    ...    ODS-1143
+    Verify Tensorflow Can See GPU    install=True
+
+Verify Cuda Image Has NVCC Installed
+    [Documentation]    Verifies NVCC Version in Minimal CUDA Image
+    [Tags]    Sanity    Tier1
+    ...    Resources-GPU    NVIDIA-GPUs
+    ...    ODS-483
+    ${nvcc_version} =    Run Cell And Get Output    input=!nvcc --version
+    Should Not Contain    ${nvcc_version}    /usr/bin/sh: nvcc: command not found
+
+Verify Previous CUDA Notebook Image With GPU
+    [Documentation]    Runs a workload after spawning the N-1 CUDA Notebook
+    [Tags]    Tier2    LiveTesting
+    ...    Resources-GPU    NVIDIA-GPUs
+    ...    ODS-2128
+    [Setup]    N-1 CUDA Setup
+    Spawn Notebook With Arguments    image=${NOTEBOOK_IMAGE}    size=Small    gpus=1    version=previous
+    Verify Installed CUDA Version    ${EXPECTED_CUDA_VERSION_N_1}
+    Verify PyTorch Can See GPU    install=True
+    Verify Tensorflow Can See GPU    install=True
+    ${nvcc_version} =    Run Cell And Get Output    input=!nvcc --version
+    Should Not Contain    ${nvcc_version}    /usr/bin/sh: nvcc: command not found
+    [Teardown]    End Web Test
+
+
+*** Keywords ***
+Verify CUDA Image Suite Setup
+    [Documentation]    Suite Setup, spawns CUDA img with one GPU attached
+    ...    Additionally, checks that the number of available GPUs decreases
+    ...    after the GPU is assigned.
+    Set Library Search Order    SeleniumLibrary
+    Close All Browsers
+    Begin Web Test
+    Launch JupyterHub Spawner From Dashboard
+    Spawn Notebook With Arguments    image=${NOTEBOOK_IMAGE}    size=Small    gpus=1
+    # Verifies that now there are no GPUs available for selection
+    @{old_browser} =    Get Browser Ids
+    Sleep    30s    reason=Give time to spawner to update GPU count
+    Launch Dashboard    ${TEST_USER_2.USERNAME}    ${TEST_USER_2.PASSWORD}    ${TEST_USER_2.AUTH_TYPE}
+    ...    ${ODH_DASHBOARD_URL}    ${BROWSER.NAME}    ${BROWSER.OPTIONS}
+    Launch JupyterHub Spawner From Dashboard    ${TEST_USER_2.USERNAME}    ${TEST_USER_2.PASSWORD}
+    ...    ${TEST_USER_2.AUTH_TYPE}
+    # This will fail in case there are two nodes with the same number of GPUs
+    # Since the overall available number won't change even after 1 GPU is assigned
+    # However I can't think of a better way to execute this check, under the assumption that
+    # the Resources-GPU and NVIDIA-GPUs tags will always ensure there is 1 node with 1 GPU on the cluster.
+    ${maxNo} =    Find Max Number Of GPUs In One Node
+    ${maxSpawner} =    Fetch Max Number Of GPUs In Spawner Page
+    # Need to continue execution even on failure or the whole suite will be failed
+    # And not executed at all.
+    Run Keyword And Warn On Failure    Should Be Equal    ${maxSpawner}    ${maxNo-1}
+    Close Browser
+    Switch Browser    ${old_browser}[0]
+
+N-1 CUDA Setup
+    [Documentation]    Closes the previous browser (if any) and starts a clean
+    ...    run spawning the N-1 CUDA image
+    End Web Test
+    Begin Web Test
+    Launch JupyterHub Spawner From Dashboard
+    Sleep    30s    reason=Wait for resources to become available again
+    Reload Page
+    Wait Until JupyterHub Spawner Is Ready
diff --git a/ods_ci/tests/Tests/650__distributed_workloads/test-run-codeflare-tests.robot b/ods_ci/tests/Tests/650__distributed_workloads/test-run-codeflare-tests.robot
index e8e81bae0..afe57ffa4 100644
--- a/ods_ci/tests/Tests/650__distributed_workloads/test-run-codeflare-tests.robot
+++ b/ods_ci/tests/Tests/650__distributed_workloads/test-run-codeflare-tests.robot
@@ -29,7 +29,7 @@ Run TestKueueRayCpu ODH test
 
 Run TestKueueRayGpu ODH test
     [Documentation]    Run Go ODH test: TestKueueRayGpu
-    [Tags]    Resources-GPU
+    [Tags]    Resources-GPU    NVIDIA-GPUs
     ...    Tier1
     ...    DistributedWorkloads
     ...    CodeflareOperator
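
Reviewer note: with both tags in place, the GPU suites can be selected or excluded
through Robot Framework's standard tag filters. The invocations below are only an
illustration; the --include/--exclude options and the AND combinator are stock
robot CLI syntax, while the test root path is assumed from this repo's layout:

    # run only tests that require an NVIDIA GPU (tag matching is case-insensitive)
    robot --include Resources-GPUANDNVIDIA-GPUs ods_ci/tests/Tests

    # run the Sanity tier while skipping all GPU-dependent tests
    robot --include Sanity --exclude Resources-GPU ods_ci/tests/Tests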