From 4a6a0e770f0bcbd599eb665c7d814e109e5f8845 Mon Sep 17 00:00:00 2001 From: Siegfried Weber Date: Thu, 5 Aug 2021 12:12:26 +0200 Subject: [PATCH] Set restart count in the container status --- Cargo.lock | 1 + Cargo.toml | 1 + src/provider/kubernetes/status.rs | 42 ++++++++++++++++++++- src/provider/states/pod/running.rs | 21 +++++++++-- src/provider/systemdmanager/service.rs | 21 +++-------- src/provider/systemdmanager/systemd1_api.rs | 4 ++ src/provider/systemdmanager/systemdunit.rs | 28 +++++++++----- 7 files changed, 89 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index efeafc7..60e6115 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2758,6 +2758,7 @@ dependencies = [ "handlebars", "hostname", "indoc", + "json-patch", "k8s-openapi", "krator", "kube", diff --git a/Cargo.toml b/Cargo.toml index 90422df..ea9b39a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ krator = { git = "https://github.com/stackabletech/krustlet.git", branch = "stac kube = { version= "0.48", default-features = false, features = ["derive", "native-tls"] } kubelet = { git = "https://github.com/stackabletech/krustlet.git", branch = "stackable_patches_v0.7.0", default-features = true, features= ["derive", "cli"] } # version = "0.7" Inflector = "0.11" +json-patch = "0.2" lazy_static = "1.4" log = "0.4" multimap = "0.8" diff --git a/src/provider/kubernetes/status.rs b/src/provider/kubernetes/status.rs index fc384de..3b356f2 100644 --- a/src/provider/kubernetes/status.rs +++ b/src/provider/kubernetes/status.rs @@ -1,7 +1,11 @@ //! Functions for patching the pod status +use anyhow::anyhow; use k8s_openapi::api::core::v1::Pod as KubePod; -use kube::{Api, Client}; +use kube::{ + api::{Patch, PatchParams}, + Api, Client, +}; use kubelet::{ container::{ContainerKey, Status}, pod::Pod, @@ -30,3 +34,39 @@ pub async fn patch_container_status( ); } } + +/// Patches the restart count of a container. 
+pub async fn patch_restart_count(
+    client: &Client,
+    pod: &Pod,
+    container_key: &ContainerKey,
+    restart_count: u32,
+) -> anyhow::Result<()> {
+    let api: Api<KubePod> = Api::namespaced(client.clone(), pod.namespace());
+
+    let index = pod
+        .container_status_index(container_key)
+        .ok_or_else(|| anyhow!("Container not found"))?;
+
+    let container_type = if container_key.is_init() {
+        "initContainer"
+    } else {
+        "container"
+    };
+
+    let patch = json_patch::Patch(vec![json_patch::PatchOperation::Replace(
+        json_patch::ReplaceOperation {
+            path: format!("/status/{}Statuses/{}/restartCount", container_type, index),
+            value: restart_count.into(),
+        },
+    )]);
+
+    api.patch_status(
+        pod.name(),
+        &PatchParams::default(),
+        &Patch::<()>::Json(patch),
+    )
+    .await?;
+
+    Ok(())
+}
diff --git a/src/provider/states/pod/running.rs b/src/provider/states/pod/running.rs
index b386c27..2785ba5 100644
--- a/src/provider/states/pod/running.rs
+++ b/src/provider/states/pod/running.rs
@@ -13,8 +13,9 @@ use tokio::time::Duration;
 
 use super::terminated::Terminated;
 use crate::provider::{
-    kubernetes::status::patch_container_status, systemdmanager::service::ServiceState, PodHandle,
-    PodState, ProviderState,
+    kubernetes::status::{patch_container_status, patch_restart_count},
+    systemdmanager::service::ServiceState,
+    PodHandle, PodState, ProviderState,
 };
 
 #[derive(Debug, TransitionTo)]
@@ -132,12 +133,26 @@ impl State<PodState> for Running {
                     container_failed = true;
                 }
 
-            for container_handle in running_containers.values() {
+            for (container_key, container_handle) in running_containers.iter() {
                 trace!(
                     "Unit [{}] of service [{}] still running ...",
                     container_handle.service_unit,
                     pod_state.service_name
                 );
+
+                match container_handle.systemd_service.restart_count().await {
+                    Ok(restart_count) => {
+                        if let Err(error) =
+                            patch_restart_count(&client, &pod, container_key, restart_count).await
+                        {
+                            warn!("Could not patch restart count: {}", error);
+                        }
+                    }
+                    Err(error) => warn!(
+                        "Could not retrieve restart count from unit [{}]: {}",
+                        container_handle.service_unit, error
+                    ),
+                }
             }
         }
diff --git a/src/provider/systemdmanager/service.rs b/src/provider/systemdmanager/service.rs
index 10da46a..2804be8 100644
--- a/src/provider/systemdmanager/service.rs
+++ b/src/provider/systemdmanager/service.rs
@@ -2,7 +2,6 @@
 use super::systemd1_api::{
     ActiveState, AsyncManagerProxy, AsyncServiceProxy, AsyncUnitProxy, SUB_STATE_SERVICE_EXITED,
 };
-use crate::provider::systemdmanager::systemd1_api::ServiceResult;
 use anyhow::anyhow;
 
 /// Represents the state of a service unit object.
@@ -67,9 +66,9 @@ impl SystemdService {
     /// Returns a coarse-grained state of the service unit object.
     ///
     /// It is assumed that RemainAfterExit is set to "yes" in the given
-    /// unit. Otherwise it would not be possible to distinguish between
-    /// "inactive and never run" and "inactive and terminated
-    /// successfully".
+    /// unit if the service can terminate. Otherwise it would not be
+    /// possible to distinguish between "inactive and never run" and
+    /// "inactive and terminated successfully".
     pub async fn service_state(&self) -> anyhow::Result<ServiceState> {
         let active_state = self.unit_proxy.active_state().await?;
 
@@ -113,19 +112,11 @@ impl SystemdService {
         Ok(service_state)
     }
 
-    /// Checks if the result is not set to success.
-    pub async fn failed(&self) -> anyhow::Result<bool> {
+    pub async fn restart_count(&self) -> anyhow::Result<u32> {
         self.service_proxy
-            .result()
+            .nrestarts()
             .await
-            .map(|state| state != ServiceResult::Success)
-            .map_err(|error| {
-                anyhow!(
-                    "Result of systemd unit [{}] cannot be retrieved: {}",
-                    self.file,
-                    error
-                )
-            })
+            .map_err(|e| anyhow!("Error receiving NRestarts of unit [{}]. {}", self.file, e))
     }
 
     /// Retrieves the current invocation ID. 
diff --git a/src/provider/systemdmanager/systemd1_api.rs b/src/provider/systemdmanager/systemd1_api.rs
index 609b0cb..c085fdb 100644
--- a/src/provider/systemdmanager/systemd1_api.rs
+++ b/src/provider/systemdmanager/systemd1_api.rs
@@ -507,6 +507,10 @@ trait Service {
     /// state (see ['ActiveState::Failed`]).
     #[dbus_proxy(property)]
     fn result(&self) -> zbus::Result<ServiceResult>;
+
+    /// Number of restarts
+    #[dbus_proxy(property, name = "NRestarts")]
+    fn nrestarts(&self) -> zbus::Result<u32>;
 }
 
 /// A systemd job object
diff --git a/src/provider/systemdmanager/systemdunit.rs b/src/provider/systemdmanager/systemdunit.rs
index 1b1ae53..906635c 100644
--- a/src/provider/systemdmanager/systemdunit.rs
+++ b/src/provider/systemdmanager/systemdunit.rs
@@ -297,7 +297,8 @@
 
         unit.set_property(Section::Service, "TimeoutStopSec", &termination_timeout);
 
-        unit.set_restart_option(RestartOption::from(restart_policy(&pod)));
+        let restart_option = RestartOption::from(restart_policy(pod));
+        unit.set_restart_option(&restart_option);
 
         // Relieve the machine a little bit on restart loops but choose
         // a moderate value so that tests are not slowed down too much.
@@ -307,10 +308,17 @@
         // number of restarts.
         unit.set_start_limit_interval_sec_option(0);
 
-        // Setting RemainAfterExit to "yes" is necessary to reliably
-        // determine the state of the service unit object, see
-        // manager::SystemdManager::service_state.
-        unit.set_remain_after_exit_option(Boolean::Yes);
+        // If the service can terminate successfully then
+        // RemainAfterExit must be set to "yes" so that the state of the
+        // service unit object can be reliably determined after
+        // termination, see manager::SystemdManager::service_state.
+        //
+        // If Restart is set to "always" then the service cannot
+        // terminate and there is no need to determine the state after
+        // termination. 
Furthermore RemainAfterExit must not be set + // because otherwise the Restart option would be ignored when + // the service returns a successful return code. + unit.set_remain_after_exit_option((restart_option != RestartOption::Always).into()); if let Some(user_name) = SystemDUnit::get_user_name_from_pod_security_context(pod)? { if !user_mode { @@ -325,7 +333,7 @@ impl SystemDUnit { /// Configures whether the service shall be restarted when the /// service process exits, is killed, or a timeout is reached. - fn set_restart_option(&mut self, setting: RestartOption) { + fn set_restart_option(&mut self, setting: &RestartOption) { self.set_property(Section::Service, "Restart", &setting.to_string()); } @@ -632,7 +640,7 @@ mod test { StartLimitIntervalSec=0 [Service] - RemainAfterExit=yes + RemainAfterExit=no Restart=always RestartSec=2 TimeoutStopSec=30 @@ -674,7 +682,7 @@ mod test { Environment="LOG_DIR=/var/log/default-stackable" Environment="LOG_LEVEL=INFO" ExecStart=start.sh arg /etc/default-stackable - RemainAfterExit=yes + RemainAfterExit=no Restart=always RestartSec=2 StandardError=journal @@ -711,7 +719,7 @@ mod test { [Service] ExecStart=start.sh - RemainAfterExit=yes + RemainAfterExit=no Restart=always RestartSec=2 StandardError=journal @@ -737,7 +745,7 @@ mod test { StartLimitIntervalSec=0 [Service] - RemainAfterExit=yes + RemainAfterExit=no Restart=always RestartSec=2 TimeoutStopSec=10"}