Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Include GPU check validation #20840

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 58 additions & 25 deletions tests/publiccloud/create_aistack_env.pm
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SUSE's openQA tests

Check failure on line 1 in tests/publiccloud/create_aistack_env.pm

View workflow job for this annotation

GitHub Actions / CI: Running static tests with perl v5.32

File tests/publiccloud/create_aistack_env.pm needs tidying
#
# Copyright 2024 SUSE LLC
# SPDX-License-Identifier: FSFAP
Expand All @@ -10,7 +10,6 @@
# - Install the required dependencies to install the aistack helm chart
# - Test access to OpenWebUI and run integration tests with Ollama and MilvusDB
# Maintainer: Yogalakshmi Arunachalam <yarunachalam@suse.com>
#

use Mojo::Base 'publiccloud::basetest';
use testapi;
Expand Down Expand Up @@ -74,9 +73,9 @@
my $ing_ver = get_var('ING_VERSION');

# Add Ingress Controller to open-webui endpoint
assert_script_run("helm repo add $ingress_repo");
assert_script_run("helm repo update");
assert_script_run("helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx --namespace ingress-nginx --set controller.service.type=ClusterIP --version $ing_ver --create-namespace", timeout => 120);
# assert_script_run("helm repo add $ingress_repo");
#assert_script_run("helm repo update");
#assert_script_run("helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx --namespace ingress-nginx --set controller.service.type=ClusterIP --version $ing_ver --create-namespace", timeout => 120);

# Add the cert-manager repo and install it
assert_script_run("helm repo add $cert_repo");
Expand Down Expand Up @@ -143,10 +142,10 @@
assert_script_run("helm registry login dp.apps.rancher.io/charts -u $docker_user_name -p $SECRET_application_collection");
if (check_var('PUBLIC_CLOUD_NVIDIA_GPU_AISTACK', 1)) {
assert_script_run("curl " . data_url("aistack/$openwebui_gpu_values") . " -o $openwebui_gpu_values", 60);
assert_script_run("helm install open-webui -f $openwebui_gpu_values -n $namespace $openwebui_helm_repo --set open-webui.ingress.class=nginx", timeout => 100);
assert_script_run("helm install open-webui -f $openwebui_gpu_values -n $namespace $openwebui_helm_repo", timeout => 100);
} else {
assert_script_run("curl " . data_url("aistack/$openwebui_values") . " -o $openwebui_values", 60);
assert_script_run("helm install open-webui -f $openwebui_values -n $namespace $openwebui_helm_repo --set open-webui.ingress.class=nginx", timeout => 100);
assert_script_run("helm install open-webui -f $openwebui_values -n $namespace $openwebui_helm_repo", timeout => 100);
}

assert_script_run("kubectl get all --namespace $namespace");
Expand All @@ -159,10 +158,16 @@
# After max_retries attempts, record the details of any pod that is still not running
my $max_retries = 15;
my @failed_pods;
my @issue_logs_pod;
my $sleep_interval = 20;
my $ollama_pod;
my @out = split(' ', script_output("kubectl get pods --namespace $namespace -o custom-columns=':metadata.name'"));
record_info("Pod names", join(" ", @out));
POD_LOOP: foreach my $pod (@out) {

if ($pod =~ /^ollama/) {
$ollama_pod = $pod;
yarunachalam marked this conversation as resolved.
Show resolved Hide resolved
}
my $counter = 0;
my $start_time = time();
while ($counter++ < $max_retries) {
Expand All @@ -177,8 +182,8 @@
next POD_LOOP;
} else {
if ($logs =~ /ERROR|FAILURE|Exception|Failed/) {
record_info("$pod failed due to error in log: $logs \n ");
push @failed_pods, {name => $pod, status => $status};
record_info("$pod has error in log: $logs \n ");
push @issue_logs_pod, {name => $pod, status => $status};
next POD_LOOP;
} # if log
sleep $sleep_interval;
Expand All @@ -188,6 +193,25 @@
} # pod loop
yarunachalam marked this conversation as resolved.
Show resolved Hide resolved

assert_script_run("kubectl get all --namespace $namespace");

# GPU check for NVIDIA_GPU_AISTACK test
if (check_var('PUBLIC_CLOUD_NVIDIA_GPU_AISTACK', 1)) {
my $ollama_log = script_output("kubectl logs $ollama_pod -n $namespace", proceed_on_failure => 1);
if ($ollama_log =~ /looking for compatible GPUs/) {
record_info("GPU compatible check in pod log $ollama_pod.");
}
if ($ollama_log =~ /no gpus found/) {
die "No GPU found for $ollama_pod\n";
}
}

# If any pod logs contained ERROR, FAILURE, Exception or Failed, record a message
# indicating that those logs have failure details and need further inspection
if (@issue_logs_pod) {
record_info("@issue_logs_pod log has ERROR|FAILURE|Exception check log for more details ");
}

# Exit if there are failed pods
if (@failed_pods) {
die "Failed pods:\n" . join("\n", map { "$_->{name}: $_->{status}" } @failed_pods) . "\n";
}
Expand Down Expand Up @@ -233,24 +257,30 @@
record_info("Added $ipaddr to /etc/hosts with hostname $host_name");

# get endpoints
assert_script_run("kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'");
my $endpoint_cmd = "kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'";
my $endpoint_result = script_output($endpoint_cmd);
record_info("Endpoint code: $endpoint_result \n");
if (!$endpoint_result) {
die "No healthy endpoints found for the open-webui service in $namespace\n";
#my $endpoint_result = script_output("kubectl get endpoints $sr_name -n $namespace -o=jsonpath='{.subsets[*].addresses[*].ip}'");
#record_info("Endpoint code: $endpoint_result \n");
#if (!$endpoint_result) {
# die "No healthy endpoints found for the open-webui service in $namespace\n";
#} else {
# connect open-webui service
assert_script_run("ping -c 5 $ipaddr");
my $curl_cmd = 'curl -v --trace --head --write-out "%{http_code}\n" -k -L https://' . $host_name;
my $curl_code = script_retry($curl_cmd, retry => 5, delay => 60);
record_info("http code: $curl_code \n");
if ($curl_code == 200) {
record_info("Successfully connected to the open-webui service at $curl_cmd \n");
} else {
# connect open-webui service
assert_script_run("curl --output /dev/null --silent --head --write-out \"%{http_code}\n\" -k -L https://$host_name");
my $curl_cmd = "curl --output /dev/null --silent --head --write-out \"%{http_code}\n\" -k -L https://$host_name";
my $curl_result = script_output($curl_cmd);
record_info("http code: $curl_result \n");
if ($curl_result == 200) {
record_info("Successfully connected to the open-webui service at $curl_cmd \n");
} else {
die "Received unexpected HTTP error code $curl_result for $curl_cmd\n";
}
}
my $opod_name = script_output("kubectl get pods -n suse-private-ai -o=jsonpath='{.items[*].metadata.name}' | tr ' ' '\n' | grep 'open-webui' | head -n 1");
record_info("Open webui name $opod_name");
assert_script_run("kubectl get pods --all-namespaces");
assert_script_run("kubectl logs $opod_name -n suse-private-ai");
assert_script_run("kubectl get ingress --all-namespaces");
assert_script_run("kubectl get pods --all-namespaces | grep ingress");
assert_script_run("kubectl get svc --all-namespaces | grep ingress");
assert_script_run("kubectl describe pod open-webui-0 -n suse-private-ai");
die "Received unexpected HTTP error code $curl_code for $curl_cmd\n";
}


# create Admin user
my $signup_url = "https://$host_name/api/v1/auths/signup";
Expand Down Expand Up @@ -304,6 +334,8 @@

my $instance = $self->{my_instance} = $args->{my_instance};
my $provider = $self->{provider} = $args->{my_provider};
my $webip = $instance->public_ip;
record_info 'Instance', join(' ', 'IP: ', $instance->public_ip);

# Install dependency packages, configure kubectl and dependency components
install_dependency_package($instance);
Expand All @@ -317,6 +349,7 @@

# Install private_ai_stack chart
install_aistack_chart($instance, $ai_ns);


# OpenWebUI service test
test_openwebui_service($instance, $ai_ns);
Expand Down
Loading