diff --git a/USAGE.md b/USAGE.md
index 1ffbc8b4d..755e2a0e7 100644
--- a/USAGE.md
+++ b/USAGE.md
@@ -159,6 +159,16 @@ Also setting the verbose option for many tasks will add extra output to help wit
 ```
 ./cnf-testsuite test_name verbose
 ```
+#### Environment Variables for Timeouts
+
+Timeouts are controlled by the following environment variables; set them if the default values aren't suitable:
+```
+CNF_TESTSUITE_GENERIC_OPERATION_TIMEOUT=60
+CNF_TESTSUITE_RESOURCE_CREATION_TIMEOUT=120
+CNF_TESTSUITE_NODE_READINESS_TIMEOUT=240
+CNF_TESTSUITE_POD_READINESS_TIMEOUT=180
+CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOUT=1800
+```
 
 #### Running The Linter in Developer Mode
 
diff --git a/spec/platform/observability_spec.cr b/spec/platform/observability_spec.cr
index 9306d5634..2ed59119b 100644
--- a/spec/platform/observability_spec.cr
+++ b/spec/platform/observability_spec.cr
@@ -32,13 +32,10 @@ describe "Platform Observability" do
     Helm.helm_repo_add("prometheus-community","https://prometheus-community.github.io/helm-charts")
     result = ShellCmd.run("#{helm} install node-exporter prometheus-community/prometheus-node-exporter", force_output: true)
 
-    pod_ready = ""
-    pod_ready_timeout = 45
-    until (pod_ready == "true" || pod_ready_timeout == 0)
-      pod_ready = KubectlClient::Get.pod_status("node-exporter-prometheus").split(",")[2]
+    repeat_with_timeout(timeout: POD_READINESS_TIMEOUT, errormsg: "Pod readiness has timed-out") do
+      pod_ready = KubectlClient::Get.pod_status("node-exporter-prometheus").split(",")[2] == "true"
       Log.info { "Pod Ready Status: #{pod_ready}" }
-      sleep 1
-      pod_ready_timeout = pod_ready_timeout - 1
+      pod_ready
     end
     result = ShellCmd.run_testsuite("platform:node_exporter poc")
     if check_containerd
diff --git a/spec/workload/operator_spec.cr b/spec/workload/operator_spec.cr
index dbd9532bd..b3d74be98 100644
--- a/spec/workload/operator_spec.cr
+++ b/spec/workload/operator_spec.cr
@@ -42,10 +42,7 @@ describe "Operator" do
       KubectlClient::Get.resource_wait_for_uninstall("Pod", "#{pod_name}", 180, "operator-lifecycle-manager")
     end
 
-    second_count = 0
-    wait_count = 20
-    delete=false
-    until delete || second_count > wait_count.to_i
+    repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Namespace uninstallation has timed-out") do
      File.write("operator.json", "#{KubectlClient::Get.namespaces("operators").to_json}")
      json = File.open("operator.json") do |file|
        JSON.parse(file)
@@ -53,16 +50,10 @@
      json.as_h.delete("spec")
      File.write("operator.json", "#{json.to_json}")
      Log.info { "Uninstall Namespace Finalizer" }
-      if KubectlClient::Replace.command("--raw '/api/v1/namespaces/operators/finalize' -f ./operator.json")[:status].success?
-        delete=true
-      end
-      sleep 3
+      KubectlClient::Replace.command("--raw '/api/v1/namespaces/operators/finalize' -f ./operator.json")[:status].success?
     end
 
-    second_count = 0
-    wait_count = 20
-    delete=false
-    until delete || second_count > wait_count.to_i
+    repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Namespace uninstallation has timed-out") do
      File.write("manager.json", "#{KubectlClient::Get.namespaces("operator-lifecycle-manager").to_json}")
      json = File.open("manager.json") do |file|
        JSON.parse(file)
@@ -70,12 +61,9 @@
      json.as_h.delete("spec")
      File.write("manager.json", "#{json.to_json}")
      Log.info { "Uninstall Namespace Finalizer" }
-      if KubectlClient::Replace.command("--raw '/api/v1/namespaces/operator-lifecycle-manager/finalize' -f ./manager.json")[:status].success?
- delete=true - end - sleep 3 + KubectlClient::Replace.command("--raw '/api/v1/namespaces/operator-lifecycle-manager/finalize' -f ./manager.json")[:status].success? end - end + end end it "'operator_test' operator should not be found", tags: ["operator_test"] do diff --git a/src/tasks/chaos_mesh_setup.cr b/src/tasks/chaos_mesh_setup.cr index e20bc4184..b5de01180 100644 --- a/src/tasks/chaos_mesh_setup.cr +++ b/src/tasks/chaos_mesh_setup.cr @@ -53,18 +53,13 @@ end module ChaosMeshSetup def self.wait_for_test(test_type, test_name) - second_count = 0 - wait_count = 60 - status = "" - until (status.empty? != true && status == "Finished") || second_count > wait_count.to_i - Log.debug { "second_count = #{second_count}" } - sleep 1 + execution_complete = repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Chaos Mesh test timed-out") do cmd = "kubectl get #{test_type} #{test_name} -o json " Log.info { cmd } status = Process.run(cmd, shell: true, - output: output = IO::Memory.new, - error: stderr = IO::Memory.new) + output: output = IO::Memory.new, + error: stderr = IO::Memory.new) Log.info { "KubectlClient.exec output: #{output.to_s}" } Log.info { "KubectlClient.exec stderr: #{stderr.to_s}" } get_status = output.to_s @@ -75,23 +70,15 @@ module ChaosMeshSetup end Log.info { "Status: #{get_status}" } status = status_data.dig?("status", "experiment", "phase").to_s - second_count = second_count + 1 Log.info { "#{get_status}" } - Log.info { "#{second_count}" } + !status.empty? && status == "Finished" end - # Did chaos mesh finish the test successfully - # (status.empty? !=true && status == "Finished") - true + execution_complete end # TODO make generate without delete? def self.wait_for_resource(resource_file) - second_count = 0 - wait_count = 60 - is_resource_created = nil - until (is_resource_created.nil? 
!= true && is_resource_created == true) || second_count > wait_count.to_i - Log.info { "second_count = #{second_count}" } - sleep 3 + execution_complete = repeat_with_timeout(timeout: RESOURCE_CREATION_TIMEOUT, errormsg: "Resource creation timed-out") do cmd = "kubectl create -f #{resource_file} 2>&1 >/dev/null" status = Process.run( cmd, @@ -103,8 +90,9 @@ module ChaosMeshSetup Log.info { "Waiting for CRD" } Log.info { "Status: #{is_resource_created}" } Log.debug { "resource file: #{resource_file}" } - second_count = second_count + 1 + is_resource_created == true end KubectlClient::Delete.file(resource_file) + execution_complete end end diff --git a/src/tasks/kind_setup.cr b/src/tasks/kind_setup.cr index a5a6283fb..68ad3aa19 100644 --- a/src/tasks/kind_setup.cr +++ b/src/tasks/kind_setup.cr @@ -127,11 +127,9 @@ class KindManager def initialize(@name : String, @kubeconfig : String) end - def wait_until_nodes_ready(wait_count : Int32 = 180) + def wait_until_nodes_ready Log.info { "wait_until_nodes_ready" } - ready = false - timeout = wait_count - until (ready == true || timeout <= 0) + execution_complete = repeat_with_timeout(timeout: NODE_READINESS_TIMEOUT, errormsg: "Node readiness timed-out") do cmd = "kubectl get nodes --kubeconfig #{kubeconfig}" result = ShellCmd.run(cmd, "wait_until_nodes_ready:all_nodes") all_nodes = result[:output] @@ -143,32 +141,27 @@ class KindManager node_count = all_nodes.size Log.info { "node_count: #{node_count}" } - ready_count = all_nodes.reduce(0) do |acc, node| + ready_count = all_nodes.reduce(0) do |acc, node| if /\s(Ready)/.match(node) acc = acc + 1 else acc end end - if node_count == ready_count Log.info { "Nodes are ready for the #{name} cluster" } - ready = true + true else - sleep 1 - timeout = timeout - 1 - Log.info { "Waiting for nodes on #{name} cluster to be ready: #{ready}" } - break if timeout <= 0 + Log.info { "Waiting for nodes on #{name} cluster to be ready..." } + false end end - ready + execution_complete end - def wait_until_pods_ready(wait_count : Int32 = 180) + def wait_until_pods_ready Log.info { "wait_until_pods_ready" } - ready = false - timeout = wait_count - until (ready == true || timeout <= 0) + execution_complete = repeat_with_timeout(timeout: POD_READINESS_TIMEOUT, errormsg: "Pod readiness timed-out") do all_pods_cmd = <<-STRING kubectl get pods -A -o go-template='{{range $index, $element := .items}}{{range .status.containerStatuses}}{{$element.metadata.name}}{{"\\n"}}{{end}}{{end}}' --kubeconfig #{kubeconfig} STRING @@ -193,16 +186,13 @@ class KindManager if pod_count.to_i == ready_count.to_i Log.info { "Pods on #{name} cluster are ready" } - ready = true + true else - sleep 1 - timeout = timeout - 1 - Log.info { "Waiting for pods on #{name} cluster to be ready: #{ready}" } - break if timeout <= 0 + Log.info { "Waiting for pods on #{name} cluster to be ready..." 
} + false end end - ready + execution_complete end - end end diff --git a/src/tasks/litmus_setup.cr b/src/tasks/litmus_setup.cr index 2a0412695..ce108019a 100644 --- a/src/tasks/litmus_setup.cr +++ b/src/tasks/litmus_setup.cr @@ -86,50 +86,40 @@ module LitmusManager end ## wait_for_test will wait for the completion of litmus test - def self.wait_for_test(test_name, chaos_experiment_name,total_chaos_duration,args, namespace : String = "default") - ## Maximum wait time is TCD (total chaos duration) + 60s (additional wait time) - delay=2 - timeout="#{total_chaos_duration}".to_i + 60 - retry=timeout/delay + def self.wait_for_test(test_name, chaos_experiment_name, args, namespace : String = "default") chaos_result_name = "#{test_name}-#{chaos_experiment_name}" - wait_count = 0 - status_code = -1 - experimentStatus = "" experimentStatus_cmd = "kubectl get chaosengine.litmuschaos.io #{test_name} -n #{namespace} -o jsonpath='{.status.engineStatus}'" Log.for("wait_for_test").info { "Checking experiment status #{experimentStatus_cmd}" } if check_verbose(args) ## Wait for completion of chaosengine which indicates the completion of chaos - until (status_code == 0 && experimentStatus == "Completed") || wait_count >= 1800 - sleep delay - experimentStatus_cmd = "kubectl get chaosengine.litmuschaos.io #{test_name} -n #{namespace} -o jsonpath='{.status.experiments[0].status}'" - Log.for("wait_for_test").info { "Checking experiment status #{experimentStatus_cmd}" } if check_verbose(args) - status_code = Process.run("#{experimentStatus_cmd}", shell: true, output: experimentStatus_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status - Log.for("wait_for_test").info { "status_code: #{status_code}" } if check_verbose(args) - Log.for("wait_for_test").info { "Checking experiment status #{experimentStatus_cmd}" } if check_verbose(args) + repeat_with_timeout(timeout: LITMUS_CHAOS_TEST_TIMEOUT, errormsg: "Litmus test has timed-out") do + status_code = Process.run("#{experimentStatus_cmd}", + shell: true, + output: experimentStatus_response = IO::Memory.new, + error: stderr = IO::Memory.new).exit_status + Log.for("wait_for_test").info { "#{chaos_experiment_name} status_code: #{status_code}" } if check_verbose(args) experimentStatus = experimentStatus_response.to_s - Log.info {"#{chaos_experiment_name} experiment status: "+experimentStatus} - - emoji_test_failed= "🗡️💀♻️" - Log.info { "experimentStatus #{experimentStatus}"} + Log.for("wait_for_test").info {"#{chaos_experiment_name} experiment status: " + experimentStatus} if (experimentStatus != "Waiting for Job Creation" && experimentStatus != "Running" && experimentStatus != "Completed") - Log.info {"#{test_name}: wait_for_test failed."} + true + else + status_code == 0 && experimentStatus == "Completed" end - wait_count = wait_count + 1 end - verdict = "" - wait_count = 0 verdict_cmd = "kubectl get chaosresults.litmuschaos.io #{chaos_result_name} -n #{namespace} -o jsonpath='{.status.experimentStatus.verdict}'" Log.for("wait_for_test").info { "Checking experiment verdict #{verdict_cmd}" } if check_verbose(args) ## Check the chaosresult verdict - until (status_code == 0 && verdict != "Awaited") || wait_count >= 30 - sleep delay - status_code = Process.run("#{verdict_cmd}", shell: true, output: verdict_response = IO::Memory.new, error: stderr = IO::Memory.new).exit_status + repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Litmus verdict acquiring has timed-out") do + status_code = Process.run("#{verdict_cmd}", + shell: true, + output:
verdict_response = IO::Memory.new, + error: stderr = IO::Memory.new).exit_status Log.for("wait_for_test").info { "status_code: #{status_code}" } if check_verbose(args) Log.for("wait_for_test").info { "verdict: #{verdict_response.to_s}" } if check_verbose(args) verdict = verdict_response.to_s - wait_count = wait_count + 1 + status_code == 0 && verdict != "Awaited" end end diff --git a/src/tasks/platform/resilience.cr b/src/tasks/platform/resilience.cr index b470ccb97..1e0a9fae6 100644 --- a/src/tasks/platform/resilience.cr +++ b/src/tasks/platform/resilience.cr @@ -34,18 +34,16 @@ namespace "platform" do KubectlClient::Apply.file("reboot_daemon_pod.yml") KubectlClient::Get.wait_for_install("node-failure-coredns") - pod_ready = "" - pod_ready_timeout = 45 begin - until (pod_ready == "true" || pod_ready_timeout == 0) - pod_ready = KubectlClient::Get.pod_status("reboot", "--field-selector spec.nodeName=#{worker_node}").split(",")[2] - pod_ready_timeout = pod_ready_timeout - 1 - if pod_ready_timeout == 0 - next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Failed to install reboot daemon") - end - sleep 1 - puts "Waiting for reboot daemon to be ready" - puts "Reboot Daemon Ready Status: #{pod_ready}" + + execution_complete = repeat_with_timeout(timeout: POD_READINESS_TIMEOUT, errormsg: "Reboot daemon installation has timed-out") do + pod_ready = KubectlClient::Get.pod_status("reboot", "--field-selector spec.nodeName=#{worker_node}").split(",")[2] == "true" + Log.info { "Waiting for reboot daemon to be ready. Current status: #{pod_ready}" } + pod_ready + end + + if !execution_complete + next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Failed to install reboot daemon") end # Find Reboot Daemon name @@ -53,40 +51,34 @@ namespace "platform" do start_reboot = KubectlClient.exec("#{reboot_daemon_pod} touch /tmp/reboot") #Watch for Node Failure. - pod_ready = "" - node_ready = "" - node_failure_timeout = 30 - until (pod_ready == "false" || node_ready == "False" || node_ready == "Unknown" || node_failure_timeout == 0) - pod_ready = KubectlClient::Get.pod_status("node-failure").split(",")[2] - node_ready = KubectlClient::Get.node_status("#{worker_node}") - Log.info { "Waiting for Node to go offline" } + execution_complete = repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Node shut-off has timed-out") do + pod_ready = KubectlClient::Get.pod_status("node-failure").split(",")[2] == "true" + node_ready = KubectlClient::Get.node_status("#{worker_node}") == "True" + Log.info { "Waiting for Node to go offline..."
} Log.info { "Pod Ready Status: #{pod_ready}" } Log.info { "Node Ready Status: #{node_ready}" } - node_failure_timeout = node_failure_timeout - 1 - if node_failure_timeout == 0 - next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Node failed to go offline") - end - sleep 1 + !pod_ready || !node_ready + end + + if !execution_complete + next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Node failed to go offline") end #Watch for Node to come back online - pod_ready = "" - node_ready = "" - node_online_timeout = 300 - until (pod_ready == "true" && node_ready == "True" || node_online_timeout == 0) - pod_ready = KubectlClient::Get.pod_status("node-failure", "").split(",")[2] - node_ready = KubectlClient::Get.node_status("#{worker_node}") - Log.info { "Waiting for Node to come back online" } + execution_complete = repeat_with_timeout(timeout: NODE_READINESS_TIMEOUT, errormsg: "Node startup has timed-out") do + pod_ready = KubectlClient::Get.pod_status("node-failure", "").split(",")[2] == "true" + node_ready = KubectlClient::Get.node_status("#{worker_node}") == "True" + Log.info { "Waiting for Node to come back online..." } Log.info { "Pod Ready Status: #{pod_ready}" } Log.info { "Node Ready Status: #{node_ready}" } - node_online_timeout = node_online_timeout - 1 - if node_online_timeout == 0 - next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Node failed to come back online") - end - sleep 1 + pod_ready && node_ready end - CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Passed, "Node came back online") + if !execution_complete + next CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Failed, "Node failed to come back online") + end + + CNFManager::TestcaseResult.new(CNFManager::ResultStatus::Passed, "Node came back online") ensure Log.info { "node_failure cleanup" } delete_reboot_daemon = KubectlClient::Delete.file("reboot_daemon_pod.yml") diff --git a/src/tasks/utils/apisnoop.cr b/src/tasks/utils/apisnoop.cr index 45d5a5851..d946362e8 100644 --- a/src/tasks/utils/apisnoop.cr +++ b/src/tasks/utils/apisnoop.cr @@ -57,7 +57,7 @@ class ApiSnoop ShellCmd.run("pwd", "apisnoop_setup_kind_dir", true) kind_config = "kind+apisnoop.yaml" cluster = kind_manager.create_cluster(name, kind_config, false, k8s_version) - cluster.wait_until_nodes_ready(240) + cluster.wait_until_nodes_ready() cluster.wait_until_pods_ready() return cluster end diff --git a/src/tasks/utils/chaos_templates.cr b/src/tasks/utils/chaos_templates.cr index 664f8fc12..e19e3644b 100644 --- a/src/tasks/utils/chaos_templates.cr +++ b/src/tasks/utils/chaos_templates.cr @@ -6,8 +6,8 @@ class ChaosTemplates @app_namespace : String, @deployment_label : String, @deployment_label_value : String, - @total_chaos_duration : String, - @target_pod_name : String + @target_pod_name : String, + @total_chaos_duration : String = "120" ) end ECR.def_to_s("src/templates/chaos_templates/pod_io_stress.yml.ecr") @@ -32,7 +32,7 @@ class ChaosTemplates @app_namespace : String, @deployment_label : String, @deployment_label_value : String, - @total_chaos_duration : String + @total_chaos_duration : String = "60" ) end ECR.def_to_s("src/templates/chaos_templates/pod_network_latency.yml.ecr") @@ -45,7 +45,7 @@ class ChaosTemplates @app_namespace : String, @deployment_label : String, @deployment_label_value : String, - @total_chaos_duration : String + @total_chaos_duration : String = "60" ) end ECR.def_to_s("src/templates/chaos_templates/pod_network_corruption.yml.ecr") @@ -58,7 +58,7 @@ class 
ChaosTemplates @app_namespace : String, @deployment_label : String, @deployment_label_value : String, - @total_chaos_duration : String + @total_chaos_duration : String = "60" ) end ECR.def_to_s("src/templates/chaos_templates/pod_network_duplication.yml.ecr") @@ -83,8 +83,9 @@ class ChaosTemplates @app_namespace : String, @deployment_label : String, @deployment_label_value : String, - @total_chaos_duration : String, - @target_pod_name : String + @target_pod_name : String, + @total_chaos_duration : String = "30" + ) end ECR.def_to_s("src/templates/chaos_templates/pod_delete.yml.ecr") @@ -97,8 +98,8 @@ class ChaosTemplates @app_namespace : String, @deployment_label : String, @deployment_label_value : String, - @total_chaos_duration : String, - @target_pod_name : String + @target_pod_name : String, + @total_chaos_duration : String = "60" ) end ECR.def_to_s("src/templates/chaos_templates/pod_memory_hog.yml.ecr") @@ -111,8 +112,8 @@ class ChaosTemplates @app_namespace : String, @deployment_label : String, @deployment_label_value : String, - @total_chaos_duration : String, - @app_nodename : String + @app_nodename : String, + @total_chaos_duration : String = "90" ) end ECR.def_to_s("src/templates/chaos_templates/node_drain.yml.ecr") @@ -125,7 +126,7 @@ class ChaosTemplates @app_namespace : String, @deployment_label : String, @deployment_label_value : String, - @total_chaos_duration : String, + @total_chaos_duration : String = "120" ) end ECR.def_to_s("src/templates/chaos_templates/pod_dns_error.yml.ecr") diff --git a/src/tasks/utils/timeouts.cr b/src/tasks/utils/timeouts.cr new file mode 100644 index 000000000..ae8dd2594 --- /dev/null +++ b/src/tasks/utils/timeouts.cr @@ -0,0 +1,28 @@ +require "./utils.cr" + + +GENERIC_OPERATION_TIMEOUT = ENV.has_key?("CNF_TESTSUITE_GENERIC_OPERATION_TIMEOUT") ? ENV["CNF_TESTSUITE_GENERIC_OPERATION_TIMEOUT"].to_i : 60 +RESOURCE_CREATION_TIMEOUT = ENV.has_key?("CNF_TESTSUITE_RESOURCE_CREATION_TIMEOUT") ? ENV["CNF_TESTSUITE_RESOURCE_CREATION_TIMEOUT"].to_i : 120 +NODE_READINESS_TIMEOUT = ENV.has_key?("CNF_TESTSUITE_NODE_READINESS_TIMEOUT") ? ENV["CNF_TESTSUITE_NODE_READINESS_TIMEOUT"].to_i : 240 +POD_READINESS_TIMEOUT = ENV.has_key?("CNF_TESTSUITE_POD_READINESS_TIMEOUT") ? ENV["CNF_TESTSUITE_POD_READINESS_TIMEOUT"].to_i : 180 +LITMUS_CHAOS_TEST_TIMEOUT = ENV.has_key?("CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOUT") ? ENV["CNF_TESTSUITE_LITMUS_CHAOS_TEST_TIMEOUT"].to_i : 1800 + +def repeat_with_timeout(timeout, errormsg, reset_on_nil=false, delay=2, &block) + start_time = Time.utc + while (Time.utc - start_time).seconds < timeout + result = yield + if result.nil? 
+ if reset_on_nil + start_time = Time.utc + else + raise "Unexpected nil result of executed block, check the return value or parameter 'reset_on_nil'" + end + elsif result + return true + end + sleep delay + Log.for("verbose").info { "Time left: #{timeout - (Time.utc - start_time).seconds} seconds" } + end + Log.error { errormsg } + false +end \ No newline at end of file diff --git a/src/tasks/utils/utils.cr b/src/tasks/utils/utils.cr index 649af196f..e19e1fa05 100644 --- a/src/tasks/utils/utils.cr +++ b/src/tasks/utils/utils.cr @@ -10,6 +10,7 @@ require "semantic_version" require "./dockerd.cr" require "./kyverno.cr" require "./http_helper.cr" +require "./timeouts.cr" require "ecr" module ShellCmd diff --git a/src/tasks/workload/compatibility.cr b/src/tasks/workload/compatibility.cr index ae705b0c4..403c4ba80 100644 --- a/src/tasks/workload/compatibility.cr +++ b/src/tasks/workload/compatibility.cr @@ -352,14 +352,6 @@ end def wait_for_scaling(resource, target_replica_count, args) VERBOSE_LOGGING.info "target_replica_count: #{target_replica_count}" if check_verbose(args) - if args.named.keys.includes? "wait_count" - wait_count_value = args.named["wait_count"] - else - wait_count_value = "45" - end - wait_count = wait_count_value.to_i - second_count = 0 - current_replicas = "0" replicas_cmd = "kubectl get #{resource["kind"]} #{resource["metadata"]["name"]} -o=jsonpath='{.status.readyReplicas}'" namespace = resource.dig("metadata", "namespace") @@ -370,11 +362,10 @@ def wait_for_scaling(resource, target_replica_count, args) output: replicas_stdout = IO::Memory.new, error: replicas_stderr = IO::Memory.new ) - previous_replicas = replicas_stdout.to_s - until current_replicas == target_replica_count || second_count > wait_count - Log.for("verbose").debug { "secound_count: #{second_count} wait_count: #{wait_count}" } if check_verbose(args) + current_replicas = replicas_stdout.to_s.empty? ? "0" : replicas_stdout.to_s + previous_replicas = current_replicas + repeat_with_timeout(timeout: GENERIC_OPERATION_TIMEOUT, errormsg: "Pod scaling has timed-out", reset_on_nil: true) do Log.for("verbose").info { "current_replicas before get #{resource["kind"]}: #{current_replicas}" } if check_verbose(args) - sleep 1 Log.for("verbose").debug { "$KUBECONFIG = #{ENV.fetch("KUBECONFIG", nil)}" } if check_verbose(args) Process.run( @@ -383,22 +374,12 @@ def wait_for_scaling(resource, target_replica_count, args) output: replicas_stdout = IO::Memory.new, error: replicas_stderr = IO::Memory.new ) - current_replicas = replicas_stdout.to_s - - Log.for("verbose").info { "current_replicas after get #{resource["kind"]}: #{current_replicas.inspect}" } if check_verbose(args) - - if current_replicas.empty? - current_replicas = "0" - previous_replicas = "0" - end - + current_replicas = replicas_stdout.to_s.empty? ? 
"0" : replicas_stdout.to_s if current_replicas.to_i != previous_replicas.to_i - second_count = 0 previous_replicas = current_replicas + next nil end - second_count = second_count + 1 - Log.for("verbose").info { "previous_replicas: #{previous_replicas}" } if check_verbose(args) - Log.for("verbose").info { "current_replicas: #{current_replicas}" } if check_verbose(args) + current_replicas == target_replica_count end current_replicas end diff --git a/src/tasks/workload/configuration.cr b/src/tasks/workload/configuration.cr index d9079f130..49ef8b04b 100644 --- a/src/tasks/workload/configuration.cr +++ b/src/tasks/workload/configuration.cr @@ -678,8 +678,6 @@ task "operator_installed" do |t, args| #TODO Warn if csv is not found for a subscription. csv_names = subscription_names.map do |subscription| - second_count = 0 - wait_count = 120 csv_created = nil resource_created = false diff --git a/src/tasks/workload/microservice.cr b/src/tasks/workload/microservice.cr index 57606fa5a..6572f5f25 100644 --- a/src/tasks/workload/microservice.cr +++ b/src/tasks/workload/microservice.cr @@ -492,8 +492,8 @@ task "sig_term_handled" do |t, args| pod_namespace = pod.dig("metadata", "namespace").as_s Log.info { "pod_name: #{pod_name}" } - # Wait for a pod to be available. Only wait for 20 seconds. - KubectlClient::Get.wait_for_resource_availability("pod", pod_name, pod_namespace, 60) + # Wait for a pod to be available. Only wait for 60 seconds. + KubectlClient::Get.wait_for_resource_availability("pod", pod_name, pod_namespace, GENERIC_OPERATION_TIMEOUT) status = pod["status"] if status["containerStatuses"]? diff --git a/src/tasks/workload/reliability.cr b/src/tasks/workload/reliability.cr index 7df39e326..618286f29 100644 --- a/src/tasks/workload/reliability.cr +++ b/src/tasks/workload/reliability.cr @@ -150,7 +150,6 @@ task "pod_network_latency", ["install_litmus"] do |t, args| KubectlClient::Annotate.run("--overwrite -n #{app_namespace} deploy/#{resource["name"]} litmuschaos.io/chaos=\"true\"") chaos_experiment_name = "pod-network-latency" - total_chaos_duration = "60" test_name = "#{resource["name"]}-#{Random::Secure.hex(4)}" chaos_result_name = "#{test_name}-#{chaos_experiment_name}" @@ -161,8 +160,7 @@ task "pod_network_latency", ["install_litmus"] do |t, args| "#{chaos_experiment_name}", app_namespace, "#{current_pod_key}", - "#{current_pod_value}", - total_chaos_duration + "#{current_pod_value}" ).to_s else template = ChaosTemplates::PodNetworkLatency.new( @@ -170,14 +168,13 @@ task "pod_network_latency", ["install_litmus"] do |t, args| "#{chaos_experiment_name}", app_namespace, "#{spec_labels.as_h.first_key}", - "#{spec_labels.as_h.first_value}", - total_chaos_duration + "#{spec_labels.as_h.first_value}" ).to_s end File.write("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml", template) KubectlClient::Apply.file("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml") - LitmusManager.wait_for_test(test_name,chaos_experiment_name,total_chaos_duration,args, namespace: app_namespace) + LitmusManager.wait_for_test(test_name, chaos_experiment_name, args, namespace: app_namespace) test_passed = LitmusManager.check_chaos_verdict(chaos_result_name,chaos_experiment_name,args, namespace: app_namespace) end end @@ -230,7 +227,6 @@ task "pod_network_corruption", ["install_litmus"] do |t, args| KubectlClient::Annotate.run("--overwrite -n #{app_namespace} deploy/#{resource["name"]} litmuschaos.io/chaos=\"true\"") chaos_experiment_name = "pod-network-corruption" - total_chaos_duration = "60" 
test_name = "#{resource["name"]}-#{Random.rand(99)}" chaos_result_name = "#{test_name}-#{chaos_experiment_name}" @@ -240,12 +236,11 @@ task "pod_network_corruption", ["install_litmus"] do |t, args| "#{chaos_experiment_name}", app_namespace, "#{spec_labels.first_key}", - "#{spec_labels.first_value}", - total_chaos_duration + "#{spec_labels.first_value}" ).to_s File.write("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml", template) KubectlClient::Apply.file("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml") - LitmusManager.wait_for_test(test_name,chaos_experiment_name,total_chaos_duration, args, namespace: app_namespace) + LitmusManager.wait_for_test(test_name, chaos_experiment_name, args, namespace: app_namespace) test_passed = LitmusManager.check_chaos_verdict(chaos_result_name,chaos_experiment_name, args, namespace: app_namespace) end end @@ -294,7 +289,6 @@ task "pod_network_duplication", ["install_litmus"] do |t, args| KubectlClient::Annotate.run("--overwrite -n #{app_namespace} deploy/#{resource["name"]} litmuschaos.io/chaos=\"true\"") chaos_experiment_name = "pod-network-duplication" - total_chaos_duration = "60" test_name = "#{resource["name"]}-#{Random.rand(99)}" chaos_result_name = "#{test_name}-#{chaos_experiment_name}" @@ -304,12 +298,11 @@ task "pod_network_duplication", ["install_litmus"] do |t, args| "#{chaos_experiment_name}", app_namespace, "#{spec_labels.first_key}", - "#{spec_labels.first_value}", - total_chaos_duration + "#{spec_labels.first_value}" ).to_s File.write("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml", template) KubectlClient::Apply.file("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml") - LitmusManager.wait_for_test(test_name,chaos_experiment_name,total_chaos_duration,args, namespace: app_namespace) + LitmusManager.wait_for_test(test_name, chaos_experiment_name, args, namespace: app_namespace) test_passed = LitmusManager.check_chaos_verdict(chaos_result_name,chaos_experiment_name,args, namespace: app_namespace) end end @@ -356,7 +349,6 @@ task "disk_fill", ["install_litmus"] do |t, args| KubectlClient::Annotate.run("--overwrite -n #{app_namespace} deploy/#{resource["name"]} litmuschaos.io/chaos=\"true\"") chaos_experiment_name = "disk-fill" - disk_fill_time = "100" test_name = "#{resource["name"]}-#{Random.rand(99)}" chaos_result_name = "#{test_name}-#{chaos_experiment_name}" @@ -372,7 +364,7 @@ task "disk_fill", ["install_litmus"] do |t, args| ).to_s File.write("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml", template) KubectlClient::Apply.file("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml") - LitmusManager.wait_for_test(test_name, chaos_experiment_name, disk_fill_time, args, namespace: app_namespace) + LitmusManager.wait_for_test(test_name, chaos_experiment_name, args, namespace: app_namespace) test_passed = LitmusManager.check_chaos_verdict(chaos_result_name, chaos_experiment_name, args, namespace: app_namespace) end test_passed @@ -451,7 +443,6 @@ task "pod_delete", ["install_litmus"] do |t, args| KubectlClient::Annotate.run("--overwrite -n #{app_namespace} deploy/#{resource["name"]} litmuschaos.io/chaos=\"true\"") chaos_experiment_name = "pod-delete" - total_chaos_duration = "30" target_pod_name = "" test_name = "#{resource["name"]}-#{Random.rand(99)}" chaos_result_name = "#{test_name}-#{chaos_experiment_name}" @@ -464,7 +455,6 @@ task "pod_delete", ["install_litmus"] do |t, args| app_namespace, "#{current_pod_key}", "#{current_pod_value}", - 
total_chaos_duration, target_pod_name ).to_s else @@ -474,7 +464,6 @@ task "pod_delete", ["install_litmus"] do |t, args| app_namespace, "#{spec_labels.as_h.first_key}", "#{spec_labels.as_h.first_value}", - total_chaos_duration, target_pod_name ).to_s end @@ -482,7 +471,7 @@ task "pod_delete", ["install_litmus"] do |t, args| Log.info { "template: #{template}" } File.write("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml", template) KubectlClient::Apply.file("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml") - LitmusManager.wait_for_test(test_name,chaos_experiment_name,total_chaos_duration,args, namespace: app_namespace) + LitmusManager.wait_for_test(test_name, chaos_experiment_name, args, namespace: app_namespace) end test_passed=LitmusManager.check_chaos_verdict(chaos_result_name,chaos_experiment_name,args, namespace: app_namespace) end @@ -531,7 +520,6 @@ task "pod_memory_hog", ["install_litmus"] do |t, args| KubectlClient::Annotate.run("--overwrite -n #{app_namespace} deploy/#{resource["name"]} litmuschaos.io/chaos=\"true\"") chaos_experiment_name = "pod-memory-hog" - total_chaos_duration = "60" target_pod_name = "" test_name = "#{resource["name"]}-#{Random.rand(99)}" chaos_result_name = "#{test_name}-#{chaos_experiment_name}" @@ -543,13 +531,12 @@ task "pod_memory_hog", ["install_litmus"] do |t, args| app_namespace, "#{spec_labels.first_key}", "#{spec_labels.first_value}", - total_chaos_duration, target_pod_name ).to_s File.write("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml", template) KubectlClient::Apply.file("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml") - LitmusManager.wait_for_test(test_name,chaos_experiment_name,total_chaos_duration,args, namespace: app_namespace) + LitmusManager.wait_for_test(test_name, chaos_experiment_name, args, namespace: app_namespace) test_passed = LitmusManager.check_chaos_verdict(chaos_result_name,chaos_experiment_name,args, namespace: app_namespace) end test_passed @@ -597,7 +584,6 @@ task "pod_io_stress", ["install_litmus"] do |t, args| KubectlClient::Annotate.run("--overwrite -n #{app_namespace} deploy/#{resource["name"]} litmuschaos.io/chaos=\"true\"") chaos_experiment_name = "pod-io-stress" - total_chaos_duration = "120" target_pod_name = "" chaos_test_name = "#{resource["name"]}-#{Random.rand(99)}" chaos_result_name = "#{chaos_test_name}-#{chaos_experiment_name}" @@ -609,13 +595,12 @@ task "pod_io_stress", ["install_litmus"] do |t, args| app_namespace, "#{spec_labels.first_key}", "#{spec_labels.first_value}", - total_chaos_duration, target_pod_name ).to_s File.write("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml", template) KubectlClient::Apply.file("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml") - LitmusManager.wait_for_test(chaos_test_name,chaos_experiment_name,total_chaos_duration,args, namespace: app_namespace) + LitmusManager.wait_for_test(chaos_test_name, chaos_experiment_name, args, namespace: app_namespace) test_passed = LitmusManager.check_chaos_verdict(chaos_result_name,chaos_experiment_name,args, namespace: app_namespace) end end @@ -670,7 +655,6 @@ task "pod_dns_error", ["install_litmus"] do |t, args| KubectlClient::Annotate.run("--overwrite -n #{app_namespace} deploy/#{resource["name"]} litmuschaos.io/chaos=\"true\"") chaos_experiment_name = "pod-dns-error" - total_chaos_duration = "120" target_pod_name = "" test_name = "#{resource["name"]}-#{Random.rand(99)}" chaos_result_name = "#{test_name}-#{chaos_experiment_name}" @@ -681,13 
+665,12 @@ task "pod_dns_error", ["install_litmus"] do |t, args| "#{chaos_experiment_name}", app_namespace, "#{spec_labels.first_key}", - "#{spec_labels.first_value}", - total_chaos_duration, + "#{spec_labels.first_value}" ).to_s File.write("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml", template) KubectlClient::Apply.file("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml") - LitmusManager.wait_for_test(test_name,chaos_experiment_name,total_chaos_duration,args, namespace: app_namespace) + LitmusManager.wait_for_test(test_name, chaos_experiment_name, args, namespace: app_namespace) test_passed = LitmusManager.check_chaos_verdict(chaos_result_name,chaos_experiment_name,args, namespace: app_namespace) end end diff --git a/src/tasks/workload/state.cr b/src/tasks/workload/state.cr index 9d5a95aeb..95585cc58 100644 --- a/src/tasks/workload/state.cr +++ b/src/tasks/workload/state.cr @@ -322,7 +322,6 @@ task "node_drain", ["install_litmus"] do |t, args| KubectlClient::Annotate.run("--overwrite -n #{app_namespace} deploy/#{resource["name"]} litmuschaos.io/chaos=\"true\"") chaos_experiment_name = "node-drain" - total_chaos_duration = "90" test_name = "#{resource["name"]}-#{Random::Secure.hex(4)}" chaos_result_name = "#{test_name}-#{chaos_experiment_name}" @@ -332,14 +331,13 @@ task "node_drain", ["install_litmus"] do |t, args| app_namespace, "#{deployment_label}", "#{deployment_label_value}", - total_chaos_duration, app_nodeName ).to_s Log.for("node_drain").info { "Chaos test name: #{test_name}; Experiment name: #{chaos_experiment_name}; Label #{deployment_label}=#{deployment_label_value}; namespace: #{app_namespace}" } File.write("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml", template) KubectlClient::Apply.file("#{destination_cnf_dir}/#{chaos_experiment_name}-chaosengine.yml") - LitmusManager.wait_for_test(test_name,chaos_experiment_name,total_chaos_duration,args, namespace: app_namespace) + LitmusManager.wait_for_test(test_name, chaos_experiment_name, args, namespace: app_namespace) test_passed = LitmusManager.check_chaos_verdict(chaos_result_name,chaos_experiment_name,args, namespace: app_namespace) end
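
For reviewers who want to try the new helper outside the suite's tasks, here is a minimal usage sketch (not part of the diff) of `repeat_with_timeout` as defined in `src/tasks/utils/timeouts.cr`. It assumes the test-suite codebase is loaded so that `KubectlClient` and the `POD_READINESS_TIMEOUT` constant are in scope; the pod name is hypothetical.

```
# Poll until the pod reports Ready, or give up after POD_READINESS_TIMEOUT seconds.
# The block's return value drives the loop: true stops polling, false retries after the
# default 2-second delay, and nil resets the timer when reset_on_nil is passed
# (without reset_on_nil, a nil result raises).
ready = repeat_with_timeout(timeout: POD_READINESS_TIMEOUT, errormsg: "Pod readiness has timed-out") do
  # "example-pod" is a hypothetical pod name used only for illustration.
  KubectlClient::Get.pod_status("example-pod").split(",")[2] == "true"
end
Log.info { "Pod readiness result: #{ready}" }
```

Because the timeout constants read their values from the environment at start-up, a run such as `CNF_TESTSUITE_POD_READINESS_TIMEOUT=300 ./cnf-testsuite node_drain` changes the limit without any code change, per the USAGE.md addition above.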