Skip to content

Commit

Permalink
sched_ext: Enable the ops breather and eject BPF scheduler on softlockup
Browse files Browse the repository at this point in the history
On 2 x Intel Sapphire Rapids machines with 224 logical CPUs, a poorly
behaving BPF scheduler can live-lock the system by making multiple CPUs bang
on the same DSQ to the point where soft-lockup detection triggers before
SCX's own watchdog can take action. It also seems possible that the machine
can be live-locked enough to prevent scx_ops_helper, which is an RT task,
from running in a timely manner.

Implement scx_softlockup(), which is called when three quarters of the
soft-lockup threshold has passed. The function immediately enables the ops
breather and triggers an ops error to initiate ejection of the BPF
scheduler.

The previous and this patch combined enable the kernel to reliably recover
the system from live-lock conditions that can be triggered by a poorly
behaving BPF scheduler on Intel dual socket systems.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
  • Loading branch information
htejun committed Nov 8, 2024
1 parent 62dcbab commit e32c260
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 0 deletions.
2 changes: 2 additions & 0 deletions include/linux/sched/ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,11 +205,13 @@ struct sched_ext_entity {

void sched_ext_free(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p);
/* invoked by the soft-lockup watchdog; @dur_s is the stuck duration in seconds */
void scx_softlockup(u32 dur_s);

#else /* !CONFIG_SCHED_CLASS_EXT */

/* CONFIG_SCHED_CLASS_EXT is disabled: the hooks collapse to empty inline stubs */
static inline void sched_ext_free(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {}

#endif /* CONFIG_SCHED_CLASS_EXT */
#endif /* _LINUX_SCHED_EXT_H */
45 changes: 45 additions & 0 deletions kernel/sched/ext.c
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,7 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
/*
 * Bit 0 is set by scx_softlockup() so that only one soft-lockup handler
 * instance proceeds, and cleared by scx_clear_softlockup(), which then drops
 * the breather reference the handler took.
 */
static unsigned long scx_in_softlockup;
static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
static int scx_ops_bypass_depth;
static bool scx_ops_init_task_enabled;
Expand Down Expand Up @@ -4614,6 +4615,49 @@ bool task_should_scx(struct task_struct *p)
return p->policy == SCHED_EXT;
}

/**
 * scx_softlockup - sched_ext softlockup handler
 * @dur_s: number of seconds the CPU has been stuck, as passed in by the
 *	   soft-lockup watchdog
 *
 * A BPF scheduler that makes many CPUs target the same DSQ can live-lock the
 * system (seen on some multi-socket setups, e.g. 2x Intel 8480c) to the point
 * where soft-lockup detection triggers. The soft-lockup watchdog calls this
 * function when its triggering point is close; the function then tries to
 * unjam the system by enabling the breather and aborting the BPF scheduler.
 */
void scx_softlockup(u32 dur_s)
{
	int state = scx_ops_enable_state();
	int cpu;

	/* only act while a BPF scheduler is being enabled or is enabled */
	if (state != SCX_OPS_ENABLING && state != SCX_OPS_ENABLED)
		return;

	/*
	 * Admit a single instance. The bit stays set until it is cleared at
	 * the end of scx_ops_bypass().
	 */
	if (test_and_set_bit(0, &scx_in_softlockup))
		return;

	cpu = smp_processor_id();

	printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
			cpu, dur_s, scx_ops.name);

	/*
	 * Some CPUs may be trapped in the dispatch paths. Enable breather
	 * immediately; otherwise, we might not even be able to get to
	 * scx_ops_bypass().
	 */
	atomic_inc(&scx_ops_breather_depth);

	scx_ops_error("soft lockup - CPU#%d stuck for %us", cpu, dur_s);
}

/*
 * Undo scx_softlockup(): if the in-softlockup bit was set, clear it and drop
 * the breather reference that the handler took. No-op otherwise.
 */
static void scx_clear_softlockup(void)
{
	if (!test_and_clear_bit(0, &scx_in_softlockup))
		return;

	atomic_dec(&scx_ops_breather_depth);
}

/**
* scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
*
Expand Down Expand Up @@ -4724,6 +4768,7 @@ static void scx_ops_bypass(bool bypass)
atomic_dec(&scx_ops_breather_depth);
unlock:
raw_spin_unlock_irqrestore(&bypass_lock, flags);
scx_clear_softlockup();
}

static void free_exit_info(struct scx_exit_info *ei)
Expand Down
8 changes: 8 additions & 0 deletions kernel/watchdog.c
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,14 @@ static int is_softlockup(unsigned long touch_ts,
need_counting_irqs())
start_counting_irqs();

/*
* A poorly behaving BPF scheduler can live-lock the system into
* soft lockups. Tell sched_ext to try ejecting the BPF
* scheduler when close to a soft lockup.
*/
if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
scx_softlockup(now - touch_ts);

/* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts;
Expand Down
2 changes: 2 additions & 0 deletions tools/sched_ext/scx_show_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def ops_state_str(state):
print(f'switching_all : {read_int("scx_switching_all")}')
print(f'switched_all : {read_static_key("__scx_switched_all")}')
print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
print(f'in_softlockup : {prog["scx_in_softlockup"].value_()}')
print(f'breather_depth: {read_atomic("scx_ops_breather_depth")}')
print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}')
print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
print(f'enable_seq : {read_atomic("scx_enable_seq")}')

0 comments on commit e32c260

Please sign in to comment.