Skip to content
This repository has been archived by the owner on Dec 21, 2021. It is now read-only.

Commit

Permalink
Set restart count in the container status
Browse files Browse the repository at this point in the history
  • Loading branch information
siegfriedweber committed Aug 19, 2021
1 parent edf7346 commit 4a6a0e7
Show file tree
Hide file tree
Showing 7 changed files with 89 additions and 29 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ krator = { git = "https://github.com/stackabletech/krustlet.git", branch = "stac
kube = { version= "0.48", default-features = false, features = ["derive", "native-tls"] }
kubelet = { git = "https://github.com/stackabletech/krustlet.git", branch = "stackable_patches_v0.7.0", default-features = true, features= ["derive", "cli"] } # version = "0.7"
Inflector = "0.11"
json-patch = "0.2"
lazy_static = "1.4"
log = "0.4"
multimap = "0.8"
Expand Down
42 changes: 41 additions & 1 deletion src/provider/kubernetes/status.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
//! Functions for patching the pod status
use anyhow::anyhow;
use k8s_openapi::api::core::v1::Pod as KubePod;
use kube::{Api, Client};
use kube::{
api::{Patch, PatchParams},
Api, Client,
};
use kubelet::{
container::{ContainerKey, Status},
pod::Pod,
Expand Down Expand Up @@ -30,3 +34,39 @@ pub async fn patch_container_status(
);
}
}

/// Patches the restart count in the status of the given container.
pub async fn patch_restart_count(
    client: &Client,
    pod: &Pod,
    container_key: &ContainerKey,
    restart_count: u32,
) -> anyhow::Result<()> {
    // API handle scoped to the pod's namespace
    let pods: Api<KubePod> = Api::namespaced(client.clone(), pod.namespace());

    // Position of the container within the corresponding status array
    let index = pod
        .container_status_index(container_key)
        .ok_or_else(|| anyhow!("Container not found"))?;

    // Init containers are tracked in `initContainerStatuses`,
    // regular containers in `containerStatuses`.
    let container_type = match container_key.is_init() {
        true => "initContainer",
        false => "container",
    };

    // JSON patch replacing only the restartCount field of this container
    let replace_operation = json_patch::ReplaceOperation {
        path: format!("/status/{}Statuses/{}/restartCount", container_type, index),
        value: restart_count.into(),
    };
    let patch = json_patch::Patch(vec![json_patch::PatchOperation::Replace(
        replace_operation,
    )]);

    pods.patch_status(
        pod.name(),
        &PatchParams::default(),
        &Patch::<()>::Json(patch),
    )
    .await?;

    Ok(())
}
21 changes: 18 additions & 3 deletions src/provider/states/pod/running.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@ use tokio::time::Duration;

use super::terminated::Terminated;
use crate::provider::{
kubernetes::status::patch_container_status, systemdmanager::service::ServiceState, PodHandle,
PodState, ProviderState,
kubernetes::status::{patch_container_status, patch_restart_count},
systemdmanager::service::ServiceState,
PodHandle, PodState, ProviderState,
};

#[derive(Debug, TransitionTo)]
Expand Down Expand Up @@ -132,12 +133,26 @@ impl State<PodState> for Running {
container_failed = true;
}

for container_handle in running_containers.values() {
for (container_key, container_handle) in running_containers.iter() {
    trace!(
        "Unit [{}] of service [{}] still running ...",
        container_handle.service_unit,
        pod_state.service_name
    );

    // Propagate the restart counter of the systemd unit into the
    // Kubernetes container status so that it is visible via the API.
    match container_handle.systemd_service.restart_count().await {
        Ok(restart_count) => {
            if let Err(error) =
                patch_restart_count(&client, &pod, container_key, restart_count).await
            {
                warn!("Could not patch restart count: {}", error);
            }
        }
        // Fix: the original message read "Could retrieve ..." — the
        // missing "not" inverted the meaning of the warning.
        Err(error) => warn!(
            "Could not retrieve restart count from unit [{}]: {}",
            container_handle.service_unit, error
        ),
    }
}
}

Expand Down
21 changes: 6 additions & 15 deletions src/provider/systemdmanager/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
use super::systemd1_api::{
ActiveState, AsyncManagerProxy, AsyncServiceProxy, AsyncUnitProxy, SUB_STATE_SERVICE_EXITED,
};
use crate::provider::systemdmanager::systemd1_api::ServiceResult;
use anyhow::anyhow;

/// Represents the state of a service unit object.
Expand Down Expand Up @@ -67,9 +66,9 @@ impl SystemdService {
/// Returns a coarse-grained state of the service unit object.
///
/// It is assumed that RemainAfterExit is set to "yes" in the given
/// unit. Otherwise it would not be possible to distinguish between
/// "inactive and never run" and "inactive and terminated
/// successfully".
/// unit if the service can terminate. Otherwise it would not be
/// possible to distinguish between "inactive and never run" and
/// "inactive and terminated successfully".
pub async fn service_state(&self) -> anyhow::Result<ServiceState> {
let active_state = self.unit_proxy.active_state().await?;

Expand Down Expand Up @@ -113,19 +112,11 @@ impl SystemdService {
Ok(service_state)
}

/// Checks if the result is not set to success.
pub async fn failed(&self) -> anyhow::Result<bool> {
pub async fn restart_count(&self) -> anyhow::Result<u32> {
self.service_proxy
.result()
.nrestarts()
.await
.map(|state| state != ServiceResult::Success)
.map_err(|error| {
anyhow!(
"Result of systemd unit [{}] cannot be retrieved: {}",
self.file,
error
)
})
.map_err(|e| anyhow!("Error receiving NRestarts of unit [{}]. {}", self.file, e))
}

/// Retrieves the current invocation ID.
Expand Down
4 changes: 4 additions & 0 deletions src/provider/systemdmanager/systemd1_api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,10 @@ trait Service {
/// state (see [`ActiveState::Failed`]).
#[dbus_proxy(property)]
fn result(&self) -> zbus::Result<ServiceResult>;

/// Returns the number of restarts of the service
/// (the value of the `NRestarts` property of the service unit).
#[dbus_proxy(property, name = "NRestarts")]
fn nrestarts(&self) -> zbus::Result<u32>;
}

/// A systemd job object
Expand Down
28 changes: 18 additions & 10 deletions src/provider/systemdmanager/systemdunit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,8 @@ impl SystemDUnit {

unit.set_property(Section::Service, "TimeoutStopSec", &termination_timeout);

unit.set_restart_option(RestartOption::from(restart_policy(&pod)));
let restart_option = RestartOption::from(restart_policy(pod));
unit.set_restart_option(&restart_option);

// Relieve the machine a little bit on restart loops but choose
// a moderate value so that tests are not slowed down too much.
Expand All @@ -307,10 +308,17 @@ impl SystemDUnit {
// number of restarts.
unit.set_start_limit_interval_sec_option(0);

// Setting RemainAfterExit to "yes" is necessary to reliably
// determine the state of the service unit object, see
// manager::SystemdManager::service_state.
unit.set_remain_after_exit_option(Boolean::Yes);
// If the service can terminate successfully then
// RemainAfterExit must be set to "yes" so that the state of the
// service unit object can be reliably determined after
// termination, see manager::SystemdManager::service_state.
//
// If Restart is set to "always" then the service cannot
// terminate and there is no need to determine the state after
// termination. Furthermore RemainAfterExit must not be set
// because otherwise the Restart option would be ignored when
// the service returns a successful return code.
unit.set_remain_after_exit_option((restart_option != RestartOption::Always).into());

if let Some(user_name) = SystemDUnit::get_user_name_from_pod_security_context(pod)? {
if !user_mode {
Expand All @@ -325,7 +333,7 @@ impl SystemDUnit {

/// Configures whether the service shall be restarted when the
/// service process exits, is killed, or a timeout is reached.
fn set_restart_option(&mut self, setting: RestartOption) {
fn set_restart_option(&mut self, setting: &RestartOption) {
self.set_property(Section::Service, "Restart", &setting.to_string());
}

Expand Down Expand Up @@ -632,7 +640,7 @@ mod test {
StartLimitIntervalSec=0
[Service]
RemainAfterExit=yes
RemainAfterExit=no
Restart=always
RestartSec=2
TimeoutStopSec=30
Expand Down Expand Up @@ -674,7 +682,7 @@ mod test {
Environment="LOG_DIR=/var/log/default-stackable"
Environment="LOG_LEVEL=INFO"
ExecStart=start.sh arg /etc/default-stackable
RemainAfterExit=yes
RemainAfterExit=no
Restart=always
RestartSec=2
StandardError=journal
Expand Down Expand Up @@ -711,7 +719,7 @@ mod test {
[Service]
ExecStart=start.sh
RemainAfterExit=yes
RemainAfterExit=no
Restart=always
RestartSec=2
StandardError=journal
Expand All @@ -737,7 +745,7 @@ mod test {
StartLimitIntervalSec=0
[Service]
RemainAfterExit=yes
RemainAfterExit=no
Restart=always
RestartSec=2
TimeoutStopSec=10"}
Expand Down

0 comments on commit 4a6a0e7

Please sign in to comment.