From d828413318b3f4bc5e31c4f036d65172bf9793ed Mon Sep 17 00:00:00 2001 From: chenyichong Date: Mon, 24 Oct 2022 14:45:51 +0800 Subject: [PATCH] Add function: TT-sched info:This feature replaces CFS-sched Signed-off-by: chenyichong --- arch/x86/configs/hwe_desktop_defconfig | 17 +- include/linux/sched.h | 24 +- include/linux/sched/sysctl.h | 7 + init/Kconfig | 18 +- kernel/Kconfig.preempt | 1 + kernel/sched/Makefile | 4 + kernel/sched/bs.c | 1877 +++++++++++++++++++++++ kernel/sched/bs.h | 310 ++++ kernel/sched/bs_nohz.h | 966 ++++++++++++ kernel/sched/core.c | 65 +- kernel/sched/debug.c | 37 +- kernel/sched/fair_numa.h | 1960 ++++++++++++++++++++++++ kernel/sched/idle.c | 12 +- kernel/sched/sched.h | 50 + kernel/sched/tt_stats.h | 831 ++++++++++ kernel/sysctl.c | 41 + 16 files changed, 6198 insertions(+), 22 deletions(-) create mode 100644 kernel/sched/bs.c create mode 100644 kernel/sched/bs.h create mode 100644 kernel/sched/bs_nohz.h create mode 100644 kernel/sched/fair_numa.h create mode 100644 kernel/sched/tt_stats.h diff --git a/arch/x86/configs/hwe_desktop_defconfig b/arch/x86/configs/hwe_desktop_defconfig index a682f029e6acc..62cb58b881e5f 100644 --- a/arch/x86/configs/hwe_desktop_defconfig +++ b/arch/x86/configs/hwe_desktop_defconfig @@ -21,24 +21,22 @@ CONFIG_IKHEADERS=m CONFIG_LOG_BUF_SHIFT=18 CONFIG_UCLAMP_TASK=y CONFIG_NUMA_BALANCING=y +CONFIG_CGROUPS=y CONFIG_MEMCG=y CONFIG_BLK_CGROUP=y -CONFIG_CFS_BANDWIDTH=y -CONFIG_UCLAMP_TASK_GROUP=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y +# CONFIG_PROC_PID_CPUSET is not set CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y -CONFIG_CGROUP_MISC=y CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y -CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y CONFIG_KALLSYMS_ALL=y @@ -149,11 +147,7 @@ CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_MODULE_SIG_SHA512=y 
CONFIG_BLK_DEV_ZONED=y -CONFIG_BLK_DEV_THROTTLING=y CONFIG_BLK_WBT=y -CONFIG_BLK_CGROUP_IOLATENCY=y -CONFIG_BLK_CGROUP_IOCOST=y -CONFIG_BLK_CGROUP_IOPRIO=y CONFIG_BLK_SED_OPAL=y CONFIG_PARTITION_ADVANCED=y CONFIG_AIX_PARTITION=y @@ -174,7 +168,6 @@ CONFIG_SYSV68_PARTITION=y CONFIG_CMDLINE_PARTITION=y CONFIG_MQ_IOSCHED_KYBER=m CONFIG_IOSCHED_BFQ=m -CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y @@ -348,7 +341,6 @@ CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m CONFIG_NETFILTER_XT_MATCH_BPF=m -CONFIG_NETFILTER_XT_MATCH_CGROUP=m CONFIG_NETFILTER_XT_MATCH_CLUSTER=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m @@ -584,7 +576,6 @@ CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_RSVP=m CONFIG_NET_CLS_RSVP6=m CONFIG_NET_CLS_FLOW=m -CONFIG_NET_CLS_CGROUP=m CONFIG_NET_CLS_BPF=m CONFIG_NET_CLS_FLOWER=m CONFIG_NET_CLS_MATCHALL=m @@ -637,8 +628,6 @@ CONFIG_QRTR_SMD=m CONFIG_QRTR_TUN=m CONFIG_NET_NCSI=y CONFIG_NCSI_OEM_CMD_GET_MAC=y -CONFIG_CGROUP_NET_PRIO=y -CONFIG_BPF_STREAM_PARSER=y CONFIG_NET_PKTGEN=m CONFIG_NET_DROP_MONITOR=y CONFIG_HAMRADIO=y @@ -2114,8 +2103,6 @@ CONFIG_TCG_TIS_ST33ZP24_SPI=m CONFIG_TELCLOCK=m CONFIG_XILLYBUS=m CONFIG_XILLYBUS_PCIE=m -CONFIG_RANDOM_TRUST_CPU=y -CONFIG_RANDOM_TRUST_BOOTLOADER=y CONFIG_I2C_CHARDEV=y CONFIG_I2C_MUX_GPIO=m CONFIG_I2C_MUX_LTC4306=m diff --git a/include/linux/sched.h b/include/linux/sched.h index 9ab98ec6122cf..6bf53335f78d2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -535,6 +535,23 @@ struct sched_statistics { #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; +#ifdef CONFIG_TT_SCHED +struct tt_node { + struct tt_node* next; + struct tt_node* prev; + unsigned int task_type; + u64 vruntime; + u64 start_time; + + u64 prev_wait_time; + u64 wait_time; + u64 prev_burst; + u64 curr_burst; + u64 burst; + unsigned int rt_sticky; +}; +#endif + struct sched_entity 
{ /* For load-balancing: */ struct load_weight load; @@ -544,9 +561,14 @@ struct sched_entity { u64 exec_start; u64 sum_exec_runtime; - u64 vruntime; u64 prev_sum_exec_runtime; +#ifdef CONFIG_TT_SCHED + struct tt_node tt_node; +#endif + + u64 vruntime; + u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c1076b5e17fb1..dc896594c8d64 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -16,6 +16,13 @@ enum { sysctl_hung_task_timeout_secs = 0 }; extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_TT_SCHED +extern unsigned int tt_balancer_opt; +extern unsigned int tt_grq_balance_ms; +extern unsigned int tt_max_lifetime; +extern int tt_rt_prio; +#endif + enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, SCHED_TUNABLESCALING_LOG, diff --git a/init/Kconfig b/init/Kconfig index fa63cc019ebfc..bb1374d526322 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -114,6 +114,19 @@ config THREAD_INFO_IN_TASK One subtle change that will be needed is to use try_get_task_stack() and put_task_stack() in save_thread_stack_tsk() and get_wchan(). +config TT_SCHED + bool "TT Scheduler" + default y + +config TT_ACCOUNTING_STATS + bool "TT include all accounting and statistics" + depends on TT_SCHED + default y + help + This will include all CFS tasks' load accounting and statistics. + If you are using 'performance' governor and do not depend/care + about tasks statistics, then choose N. Otherwise say Y. + menu "General setup" config BROKEN @@ -825,7 +838,7 @@ menu "Scheduler features" config UCLAMP_TASK bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL + depends on CPU_FREQ_GOV_SCHEDUTIL && TT_ACCOUNTING_STATS help This feature enables the scheduler to track the clamped utilization of each CPU based on RUNNABLE tasks scheduled on that CPU. 
@@ -1005,6 +1018,7 @@ config CGROUP_WRITEBACK menuconfig CGROUP_SCHED bool "CPU controller" + depends on !TT_SCHED default n help This feature lets CPU scheduler recognize task groups and control CPU @@ -1282,6 +1296,8 @@ config CHECKPOINT_RESTORE config SCHED_AUTOGROUP bool "Automatic process group scheduling" + default n + depends on !TT_SCHED select CGROUPS select CGROUP_SCHED select FAIR_GROUP_SCHED diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index c2f1fd95a8214..92d9bbbd78b20 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -118,6 +118,7 @@ config PREEMPT_DYNAMIC config SCHED_CORE bool "Core Scheduling for SMT" depends on SCHED_SMT + depends on !TT_SCHED help This option permits Core Scheduling, a means of coordinated task selection across SMT siblings. When enabled -- see diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 976092b7bd452..e32dc2a7ceb83 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,6 +29,10 @@ endif # build parallelizes well and finishes roughly at once: # obj-y += core.o +ifeq ($(CONFIG_TT_SCHED),y) +obj-y += bs.o +else obj-y += fair.o +endif obj-y += build_policy.o obj-y += build_utility.o diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c new file mode 100644 index 0000000000000..0dc73e664b6e7 --- /dev/null +++ b/kernel/sched/bs.c @@ -0,0 +1,1877 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * TT Scheduler Class (SCHED_NORMAL/SCHED_BATCH) + * + * Copyright (C) 2021, Hamad Al Marri + */ +#include "sched.h" +#include "pelt.h" +#include "tt_stats.h" +#include "fair_numa.h" +#include "bs.h" + +#define MAX_HRRN 0xFFFFFFFFFFFFFFFULL + +struct global_candidate { + struct rq *rq; + struct tt_node *candidate; + u64 hrrn; + + // for update + raw_spinlock_t lock; +}; + +struct global_candidate global_candidate = {0, 0, MAX_HRRN}; + +unsigned int __read_mostly tt_balancer_opt = 0; +unsigned int __read_mostly tt_grq_balance_ms = 6; + +unsigned int __read_mostly tt_max_lifetime = 
22000; // in ms +int __read_mostly tt_rt_prio = -20; + +#define LOCK_GRQ(grf) ({ \ + rq_lock_irqsave(grq, &(grf)); \ + update_rq_clock(grq); \ +}) + +#define UNLOCK_GRQ(grf) ({ \ + rq_unlock(grq, &(grf)); \ + local_irq_restore((grf).flags); \ +}) + +#define INTERACTIVE_HRRN 2U +#define RT_WAIT_DELTA 800000U +#define RT_BURST_DELTA 2000000U +#define RT_BURST_MAX 4000000U + +#define HZ_PERIOD (1000000000 / HZ) +#define RACE_TIME 40000000 +#define FACTOR (RACE_TIME / HZ_PERIOD) + +#define IS_REALTIME(ttn) ((ttn)->task_type == TT_REALTIME) +#define IS_INTERACTIVE(ttn) ((ttn)->task_type == TT_INTERACTIVE) +#define IS_NO_TYPE(ttn) ((ttn)->task_type == TT_NO_TYPE) +#define IS_CPU_BOUND(ttn) ((ttn)->task_type == TT_CPU_BOUND) +#define IS_BATCH(ttn) ((ttn)->task_type == TT_BATCH) + +#define GEQ(a, b) ((s64)((a) - (b)) >= 0) // is a >= b +#define LEQ(a, b) ((s64)((a) - (b)) <= 0) // is a <= b +#define LES(a, b) ((s64)((a) - (b)) < 0) // is a < b +#define EQ_D(a, b, d) (LEQ(a, b + d) && GEQ(a, b - d)) + +#define HRRN_PERCENT(ttn, now) \ + (((ttn)->vruntime * 1000ULL) / (((now) - (ttn)->start_time) | 1)) + +static inline bool is_interactive(struct tt_node *ttn, u64 now, u64 _hrrn) +{ + u64 wait; + + if (LES(_hrrn, (u64) INTERACTIVE_HRRN)) + return false; + + wait = now - se_of(ttn)->exec_start; + if (wait && EQ_D(wait, ttn->prev_wait_time, RT_WAIT_DELTA)) + return false; + + return true; +} + +static inline bool is_realtime(struct tt_node *ttn, u64 now, int flags) +{ + u64 life_time, wait; + + // it has slept at least once + if (!ttn->wait_time) + return false; + + // life time >= 0.5s + life_time = now - task_of(se_of(ttn))->start_time; + if (LES(life_time, 500000000ULL)) + return false; + + // don't check wait time for migrated tasks + if (!(flags & ENQUEUE_MIGRATED)) { + /* it has relatively equal sleeping/waiting times + * (ex. 
it sleeps for ~10ms and run repeatedly) + */ + wait = now - se_of(ttn)->exec_start; + if (wait && !EQ_D(wait, ttn->prev_wait_time, RT_WAIT_DELTA)) + return false; + } + + // bursts before sleep are relatively equal (delta 2ms) + if (!EQ_D(ttn->burst, ttn->prev_burst, RT_BURST_DELTA)) + return false; + + // burst before sleep is <= 4ms + if (LEQ(ttn->burst, RT_BURST_MAX) && + LEQ(ttn->curr_burst, RT_BURST_MAX)) + return true; + + return false; +} + +static inline bool is_cpu_bound(struct tt_node *ttn) +{ + u64 _hrrn_percent; + + _hrrn_percent = ttn->vruntime * 100ULL; + _hrrn_percent /= ttn->wait_time + ttn->vruntime; + + // HRRN >= 80% + return (GEQ(_hrrn_percent, 80ULL)); +} + +static inline bool is_batch(struct tt_node *ttn, u64 _hrrn) +{ + // HRRN > 50% + return (LES(_hrrn, 2ULL)); +} + +/* + * Re-classify @ttn from its wait/run history and keep the per-cpu + * nr_lat_sensitive counter in step when the task enters or leaves + * TT_INTERACTIVE. + * + * @now and @flags (enqueue flags, 0 when called from update_curr) + * are forwarded to the is_realtime()/is_interactive() checks. + */ +static void detect_type(struct tt_node *ttn, u64 now, int flags) +{ + unsigned int new_type = TT_NO_TYPE; + unsigned int old_type = ttn->task_type; + u64 _hrrn; + unsigned int cpu; + + /* + * No usable run history yet. Using <= also covers vruntime == 0, + * which would otherwise be a division by zero just below. + */ + if (ttn->vruntime <= 1) { + ttn->task_type = TT_NO_TYPE; + return; + } + + _hrrn = (ttn->wait_time + ttn->vruntime) / ttn->vruntime; + + if (is_realtime(ttn, now, flags)) + new_type = TT_REALTIME; + else if (is_interactive(ttn, now, _hrrn)) + new_type = TT_INTERACTIVE; + else if (is_cpu_bound(ttn)) + new_type = TT_CPU_BOUND; + else if (is_batch(ttn, _hrrn)) + new_type = TT_BATCH; + + // a realtime task keeps its type for a few more rounds (rt_sticky) + if (new_type == TT_REALTIME) { + ttn->rt_sticky = 4; + } else if (IS_REALTIME(ttn) && ttn->rt_sticky) { + ttn->rt_sticky--; + return; + } + + if (new_type != old_type) { + cpu = task_cpu(task_of(se_of(ttn))); + + // new_type != old_type here, so at most one side + // of the transition is TT_INTERACTIVE + if (new_type == TT_INTERACTIVE) + per_cpu(nr_lat_sensitive, cpu)++; + else if (old_type == TT_INTERACTIVE) + dec_nr_lat_sensitive(cpu); + } + + ttn->task_type = new_type; +} + +static void normalize_lifetime(u64 now, struct tt_node *ttn) +{ + u64 max_life_ns, life_time, old_hrrn_x; + s64 
diff; + + /* + * left shift 20 bits is approximately = * 1000000 + * we don't need the precision of life time + * Ex. for 22s, with left shift (20bits) == 23.06s + */ + max_life_ns = ((u64) tt_max_lifetime) << 20; + life_time = now - ttn->start_time; + diff = life_time - max_life_ns; + + if (likely(diff < 0)) + return; + + // unmark YIELD. No need to check or remark since + // this normalize action doesn't happen very often + YIELD_UNMARK(ttn); + + // multiply life_time by 1024 for more precision + old_hrrn_x = (life_time << 7) / ((ttn->vruntime >> 3) | 1); + + // reset life to half max_life (i.e ~11.5s with the default 22000ms) + ttn->start_time = now - (max_life_ns >> 1); + + // avoid division by zero + if (old_hrrn_x == 0) old_hrrn_x = 1; + + // reset vruntime based on old hrrn ratio + ttn->vruntime = ((max_life_ns << 9) / old_hrrn_x) | 1; +} + +static u64 convert_to_vruntime(u64 delta, struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + s64 prio_diff; + int prio = IS_REALTIME(&se->tt_node) ? tt_rt_prio : PRIO_TO_NICE(p->prio); + + if (prio == 0) + return delta; + + prio_diff = prio * 1000000; + prio_diff /= FACTOR; + + if ((s64)(delta + prio_diff) < 0) + return 1; + + return delta + prio_diff; +} + +static void update_candidate(struct cfs_rq *cfs_rq); + +/* + * Charge the time elapsed since curr->exec_start to the entity that is + * currently running on @cfs_rq (runtime, burst and vruntime), then + * re-evaluate its task type and lifetime. Does nothing when there is + * no current entity or no time has passed. + */ +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + struct tt_node *ttn; + u64 now = sched_clock(); + u64 delta_exec; +#ifdef CONFIG_TT_ACCOUNTING_STATS + struct task_struct *curtask; +#endif + + if (unlikely(!curr)) + return; + + // derive member/container pointers only after the NULL check + ttn = &curr->tt_node; +#ifdef CONFIG_TT_ACCOUNTING_STATS + curtask = task_of(curr); +#endif + + delta_exec = now - curr->exec_start; + if (unlikely((s64)delta_exec <= 0)) + return; + + curr->exec_start = now; + +#ifdef CONFIG_TT_ACCOUNTING_STATS + if (schedstat_enabled()) { + struct sched_statistics *stats; + + stats = __schedstats_from_se(curr); + __schedstat_set(stats->exec_max, + max(delta_exec, stats->exec_max)); + } +#endif + curr->sum_exec_runtime += delta_exec; + +#ifdef CONFIG_TT_ACCOUNTING_STATS + 
schedstat_add(cfs_rq->exec_clock, delta_exec); +#endif + ttn->curr_burst += delta_exec; + ttn->vruntime += convert_to_vruntime(delta_exec, curr); + detect_type(ttn, now, 0); + normalize_lifetime(now, &curr->tt_node); + + if (IS_CAND_BL_ENABLED) { + cfs_rq->local_cand_hrrn = HRRN_PERCENT(&curr->tt_node, now); + update_candidate(cfs_rq); + } + +#ifdef CONFIG_TT_ACCOUNTING_STATS + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); + cgroup_account_cputime(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); +#endif +} + +static void update_curr_fair(struct rq *rq) +{ + update_curr(cfs_rq_of(&rq->curr->se)); +} + +/** + * Should `a` preempt `b`? + */ +static inline bool +entity_before(struct tt_node *a, struct tt_node *b) +{ + u64 now = sched_clock(); + + return (s64)(HRRN_PERCENT(a, now) - HRRN_PERCENT(b, now)) < 0; +} + +static void __enqueue_entity_port(struct tt_node **port, struct sched_entity *se) +{ + struct tt_node *ttn = &se->tt_node; + + ttn->next = ttn->prev = NULL; + + // if empty + if (!(*port)) { + (*port) = ttn; + } + else { + ttn->next = (*port); + (*port)->prev = ttn; + (*port) = ttn; + } +} + +/* + * Unlink @se's node from the doubly linked list headed at *@port. + * The list must not be empty. Mirrors __dequeue_entity(): the new + * head gets its prev cleared and non-head nodes are really unlinked + * instead of being left on the list. + */ +static void __dequeue_entity_port(struct tt_node **port, struct sched_entity *se) +{ + struct tt_node *ttn = &se->tt_node; + struct tt_node *prev, *next; + + // if only one se in rq + if ((*port)->next == NULL) { + (*port) = NULL; + } + // if it is the head + else if (ttn == (*port)) { + (*port) = (*port)->next; + (*port)->prev = NULL; + } + // if in the middle (or the tail) + else { + prev = ttn->prev; + next = ttn->next; + + // a linked non-head node always has a prev; stay a + // no-op (as before) for a node that is not linked + if (prev) { + prev->next = next; + if (next) + next->prev = prev; + } + } +} + +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct tt_node *ttn = &se->tt_node; + + ttn->next = ttn->prev = NULL; + + // if empty + if (!cfs_rq->head) { + cfs_rq->head = ttn; + } + else { + ttn->next = cfs_rq->head; + cfs_rq->head->prev = ttn; + cfs_rq->head = ttn; + } +} + +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct tt_node *ttn = &se->tt_node; + struct tt_node *prev, *next; + + // if only one se in rq + if (cfs_rq->head->next == NULL) { + cfs_rq->head = NULL; + } + // 
if it is the head + else if (ttn == cfs_rq->head) { + cfs_rq->head = cfs_rq->head->next; + cfs_rq->head->prev = NULL; + } + // if in the middle + else { + prev = ttn->prev; + next = ttn->next; + + prev->next = next; + if (next) + next->prev = prev; + } +} + +static void +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + struct tt_node *ttn = &se->tt_node; + bool curr = cfs_rq->curr == se; + bool wakeup = (flags & ENQUEUE_WAKEUP); + u64 now = sched_clock(); + u64 wait; + + if (wakeup) { + wait = now - se->exec_start; + ttn->wait_time += wait; + detect_type(ttn, now, flags); + + ttn->prev_wait_time = wait; + } else { + detect_type(ttn, now, flags); + } + + update_curr(cfs_rq); + + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its weight to reflect the new share of + * its group cfs_rq + * - Add its new weight to cfs_rq->load.weight + */ + update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); + account_entity_enqueue(cfs_rq, se); + check_schedstat_required(); + update_stats_enqueue_fair(cfs_rq, se, flags); + + if (!curr) + __enqueue_entity(cfs_rq, se); + + se->on_rq = 1; +} + +static inline int clear_this_candidate(struct sched_entity *se) +{ + struct tt_node *ttn = &se->tt_node; + struct tt_node *curr_can = READ_ONCE(global_candidate.candidate); + + if (ttn != curr_can) + return 0; + + WRITE_ONCE(global_candidate.candidate, NULL); + WRITE_ONCE(global_candidate.rq, NULL); + WRITE_ONCE(global_candidate.hrrn, MAX_HRRN); + + return 1; +} + + +static inline void clear_rq_candidate(struct cfs_rq *cfs_rq) +{ + struct rq *rq = READ_ONCE(global_candidate.rq); + + if (rq != rq_of(cfs_rq)) + return; + + WRITE_ONCE(global_candidate.candidate, NULL); + WRITE_ONCE(global_candidate.rq, NULL); + WRITE_ONCE(global_candidate.hrrn, MAX_HRRN); +} + +static inline bool +can_be_candidate(struct tt_node *ttn, int this_cpu) 
+{ + struct task_struct *p = task_of(se_of(ttn)); + + if (kthread_is_per_cpu(p)) + return 0; + + /* + * only realtime and interactive can + * be candidates + */ + if (ttn->task_type > TT_INTERACTIVE) + return 0; + + // just migrated + if (p->se.avg.last_update_time == 0) + return 0; + + if (task_running(cpu_rq(this_cpu), p)) + return 0; + + // some tasks are pinned to this cpu + if (p->nr_cpus_allowed <= 1) + return 0; + + if (is_migration_disabled(p)) + return 0; + + return 1; +} + +static void __update_candidate(struct cfs_rq *cfs_rq, struct tt_node *ttn) +{ + unsigned long flags; + u64 hrrn, curr_can_hrrn; + + curr_can_hrrn = READ_ONCE(global_candidate.hrrn); + hrrn = HRRN_PERCENT(ttn, sched_clock()); + + if ((s64)(hrrn - curr_can_hrrn) < 0) { + raw_spin_lock_irqsave(&global_candidate.lock, flags); + global_candidate.rq = rq_of(cfs_rq); + global_candidate.candidate = ttn; + global_candidate.hrrn = hrrn; + raw_spin_unlock_irqrestore(&global_candidate.lock, flags); + } +} + +static void update_candidate(struct cfs_rq *cfs_rq) +{ + struct tt_node *ttn = cfs_rq->head; + struct tt_node *next; + int this_cpu = cpu_of(rq_of(cfs_rq)); + + while (ttn && !can_be_candidate(ttn, this_cpu)) + ttn = ttn->next; + + if (!ttn) { + clear_rq_candidate(cfs_rq); + return; + } + + next = ttn->next; + while (next) { + if (can_be_candidate(next, this_cpu) && entity_before(next, ttn)) + ttn = next; + + next = next->next; + } + + __update_candidate(cfs_rq, ttn); +} + +static void +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + struct tt_node *ttn = &se->tt_node; + bool sleep = (flags & DEQUEUE_SLEEP); + u64 avg_wait; + + if (sleep) { + ttn->prev_burst = ttn->burst; + ttn->burst = ttn->curr_burst; + ttn->curr_burst = 0; + + if (IS_CPU_BOUND(ttn)) + ttn->task_type = TT_BATCH; + else if (IS_REALTIME(ttn)) { + avg_wait = ttn->prev_wait_time; + avg_wait += ttn->wait_time; + avg_wait /= 2ULL; + + if (LEQ(avg_wait, HZ_PERIOD)) + per_cpu(nr_lat_sensitive, 
cpu_of(rq_of(cfs_rq)))++; + } + } + + update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ + update_load_avg(cfs_rq, se, UPDATE_TG); + update_stats_dequeue_fair(cfs_rq, se, flags); + + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + + if (IS_CAND_BL_ENABLED && clear_this_candidate(se)) + update_candidate(cfs_rq); + + se->on_rq = 0; + account_entity_dequeue(cfs_rq, se); +} + +static void +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); + + /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. + */ + util_est_enqueue(&rq->cfs, p); + + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. 
+ */ + if (p->in_iowait) + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); + + if (!se->on_rq) { + enqueue_entity(cfs_rq, se, flags); + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; + } + + if (IS_CAND_BL_ENABLED) update_candidate(cfs_rq); + + add_nr_running(rq, 1); + + if (!task_new) + update_overutilized_status(rq); +} + +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + int idle_h_nr_running = task_has_idle_policy(p); + int task_sleep = flags & DEQUEUE_SLEEP; + + util_est_dequeue(&rq->cfs, p); + + dequeue_entity(cfs_rq, se, flags); + + cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + sub_nr_running(rq, 1); + util_est_update(&rq->cfs, p, task_sleep); +} + +static void yield_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + + /* + * Are we the only task in the tree? + */ + if (unlikely(rq->nr_running == 1)) + return; + + if (cfs_rq->h_nr_running > 1) + YIELD_MARK(&curr->se.tt_node); + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq_clock_skip_update(rq); + } +} + +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) +{ + yield_task_fair(rq); + return true; +} + +static void +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (IS_CAND_BL_ENABLED) clear_this_candidate(se); + + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. 
+ */ + update_stats_wait_end_fair(cfs_rq, se); + __dequeue_entity(cfs_rq, se); + update_load_avg(cfs_rq, se, UPDATE_TG); + } + + if (IS_CAND_BL_ENABLED) { + update_candidate(cfs_rq); + cfs_rq->local_cand_hrrn = HRRN_PERCENT(&se->tt_node, sched_clock()); + } + + se->exec_start = sched_clock(); + cfs_rq->curr = se; + +#ifdef CONFIG_TT_ACCOUNTING_STATS + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. dont track it + * when there are only lesser-weight tasks around): + */ + if (schedstat_enabled() && + rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { + struct sched_statistics *stats; + + stats = __schedstats_from_se(se); + __schedstat_set(stats->slice_max, + max((u64)stats->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime)); + } +#endif + se->prev_sum_exec_runtime = se->sum_exec_runtime; +} + +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + struct tt_node *ttn = cfs_rq->head; + struct tt_node *next; + + if (!ttn) + return curr; + + next = ttn->next; + while (next) { + if (entity_before(next, ttn)) + ttn = next; + + next = next->next; + } + + if (curr && entity_before(&curr->tt_node, ttn)) + return curr; + + return se_of(ttn); +} + +struct task_struct * +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se; + struct task_struct *p; + int new_tasks; + + if (IS_CAND_BL_ENABLED) { + /* + * to cpu0, don't push any + * candidates to this rq + */ + cfs_rq->local_cand_hrrn = 0; + clear_rq_candidate(cfs_rq); + } + +again: + if (!sched_fair_runnable(rq)) + goto idle; + + if (prev) + put_prev_task(rq, prev); + + se = pick_next_entity(cfs_rq, NULL); + set_next_entity(cfs_rq, se); + + p = task_of(se); + + if (prev) + YIELD_UNMARK(&prev->se.tt_node); + +done: __maybe_unused; +#ifdef CONFIG_SMP + /* + * Move the next running task to the front of + * the list, 
so our cfs_tasks list becomes MRU + * one. + */ + list_move(&p->se.group_node, &rq->cfs_tasks); +#endif + + update_misfit_status(p, rq); + + return p; + +idle: + if (IS_CAND_BL_ENABLED) + cfs_rq->local_cand_hrrn = MAX_HRRN; + + if (!rf) + return NULL; + + new_tasks = newidle_balance(rq, rf); + + /* + * Because newidle_balance() releases (and re-acquires) rq->lock, it is + * possible for any higher priority task to appear. In that case we + * must re-start the pick_next_entity() loop. + */ + if (new_tasks < 0) + return RETRY_TASK; + + if (new_tasks > 0) + goto again; + + /* + * rq is about to be idle, check if we need to update the + * lost_idle_time of clock_pelt + */ + update_idle_rq_clock_pelt(rq); + + return NULL; +} + +static struct task_struct *__pick_next_task_fair(struct rq *rq) +{ + return pick_next_task_fair(rq, NULL, NULL); +} + +#ifdef CONFIG_SMP +static struct task_struct *pick_task_fair(struct rq *rq) +{ + struct sched_entity *se; + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *curr = cfs_rq->curr; + + if (!cfs_rq->nr_running) + return NULL; + + /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ + if (curr) { + if (curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + } + + se = pick_next_entity(cfs_rq, curr); + + return task_of(se); +} +#endif + +static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) +{ + /* + * If still on the runqueue then deactivate_task() + * was not called and update_curr() has to be done: + */ + if (prev->on_rq) { + update_curr(cfs_rq); + update_stats_wait_start_fair(cfs_rq, prev); + __enqueue_entity(cfs_rq, prev); + update_load_avg(cfs_rq, prev, 0); + } + + cfs_rq->curr = NULL; +} + +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +{ + struct sched_entity *se = &prev->se; + + put_prev_entity(cfs_rq_of(se), se); +} + +static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) +{ + struct sched_entity *se = &p->se; + 
struct cfs_rq *cfs_rq = cfs_rq_of(se); + +#ifdef CONFIG_SMP + if (task_on_rq_queued(p)) { + /* + * Move the next running task to the front of the list, so our + * cfs_tasks list becomes MRU one. + */ + list_move(&se->group_node, &rq->cfs_tasks); + } +#endif + + set_next_entity(cfs_rq, se); +} + +static void +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + struct sched_entity *next = pick_next_entity(cfs_rq, curr); + + if (next != curr) { + if (IS_CAND_BL_ENABLED) { + clear_this_candidate(next); + cfs_rq->local_cand_hrrn = HRRN_PERCENT(&next->tt_node, sched_clock()); + } + + resched_curr(rq_of(cfs_rq)); + } else if (IS_CAND_BL_ENABLED) { + clear_this_candidate(curr); + } +} + +static void +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +{ + update_curr(cfs_rq); + + /* + * Ensure that runnable average is periodically updated. + */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + + if (cfs_rq->nr_running > 1) + check_preempt_tick(cfs_rq, curr); + else if (IS_CAND_BL_ENABLED) + clear_rq_candidate(cfs_rq); +} + +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se, *wse = &p->se; + + if (unlikely(se == wse)) + return; + + if (test_tsk_need_resched(curr)) + return; + + /* Idle tasks are by definition preempted by non-idle tasks. 
*/ + if (unlikely(task_has_idle_policy(curr)) && + likely(!task_has_idle_policy(p))) + goto preempt; + + /* + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): + */ + if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) + return; + + update_curr(cfs_rq_of(se)); + + if (entity_before(&wse->tt_node, &se->tt_node)) + goto preempt; + + return; + +preempt: + resched_curr(rq); +} + +#ifdef CONFIG_SMP +static int +balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +{ + if (rq->nr_running) + return 1; + + return newidle_balance(rq, rf) != 0; +} + +static void record_wakee(struct task_struct *p) +{ + /* + * Only decay a single time; tasks that have less than 1 wakeup per + * jiffy will not have built up many flips. + */ + if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { + current->wakee_flips >>= 1; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} + +/* + * Detect M:N waker/wakee relationships via a switching-frequency heuristic. + * + * A waker of many should wake a different task than the one last awakened + * at a frequency roughly N times higher than one of its wakees. + * + * In order to determine whether we should let the load spread vs consolidating + * to shared cache, we look for a minimum 'flip' frequency of llc_size in one + * partner, and a factor of llc_size higher frequency in the other. + * + * With both conditions met, we can be relatively sure that the relationship is + * non-monogamous, with partner count exceeding socket size. + * + * Waker/wakee being client/server, worker/dispatcher, interrupt source or + * whatever is irrelevant, spread criteria is apparent partner count exceeds + * socket size. 
+ */ +static int wake_wide(struct task_struct *p) +{ + unsigned int master = current->wakee_flips; + unsigned int slave = p->wakee_flips; + int factor = __this_cpu_read(sd_llc_size); + + if (master < slave) + swap(master, slave); + if (slave < factor || master < slave * factor) + return 0; + return 1; +} + +/* + * The purpose of wake_affine() is to quickly determine on which CPU we can run + * soonest. For the purpose of speed we only consider the waking and previous + * CPU. + * + * wake_affine_idle() - only considers 'now', it checks if the waking CPU is + * cache-affine and is (or will be) idle. + */ +static int +wake_affine_idle(int this_cpu, int prev_cpu, int sync) +{ + /* + * If this_cpu is idle, it implies the wakeup is from interrupt + * context. Only allow the move if cache is shared. Otherwise an + * interrupt intensive workload could force all tasks onto one + * node depending on the IO topology or IRQ affinity settings. + * + * If the prev_cpu is idle and cache affine then avoid a migration. + * There is no guarantee that the cache hot data from an interrupt + * is more important than cache hot data on the prev_cpu and from + * a cpufreq perspective, it's better to have higher utilisation + * on one CPU. + */ + if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) + return available_idle_cpu(prev_cpu) ? 
prev_cpu : this_cpu; + + if (sync && cpu_rq(this_cpu)->nr_running == 1) + return this_cpu; + + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + + return nr_cpumask_bits; +} + +static int +wake_affine(struct task_struct *p, int this_cpu, int prev_cpu, int sync) +{ + int target = nr_cpumask_bits; + + target = wake_affine_idle(this_cpu, prev_cpu, sync); + + if (target == nr_cpumask_bits) + return prev_cpu; + + return target; +} + +static int find_energy_efficient_cpu(struct rq *rq, struct task_struct *p) +{ + int target = -1, cpu; + struct tt_node *ttn = &p->se.tt_node; + unsigned int min = ~0; + bool all_non_idle = true; + + /* + * If type is realtime, interactive, or no type, + * find non idle cpu. Otherwise, use normal balancing + */ + if (ttn->vruntime > 1 && ttn->task_type > TT_NO_TYPE) + return -1; + + for_each_online_cpu(cpu) { + if (unlikely(!cpumask_test_cpu(cpu, p->cpus_ptr))) + continue; + + if (idle_cpu(cpu)) { + all_non_idle = false; + continue; + } + + if (cpu_rq(cpu)->nr_running < min) { + target = cpu; + min = cpu_rq(cpu)->nr_running; + } + } + + /* + * If all cpus are non-idle, then fallback + * to normal TT balancing. Since no energy + * saving at this point, at least try to + * use cpu affinity. 
+	 */
+	if (all_non_idle)
+		return -1;
+
+	return target;
+}
+
+static int
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
+{
+	struct rq *rq = cpu_rq(prev_cpu);
+	unsigned int min_prev = rq->nr_running;
+	unsigned int min = rq->nr_running;
+	int cpu = smp_processor_id();
+	int this_cpu = smp_processor_id();
+	int new_cpu = prev_cpu;
+	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+	int want_affine = 0;
+
+	if (IS_PWR_BL_ENABLED) {
+		int pe_cpu = find_energy_efficient_cpu(rq, p);
+		if (pe_cpu != -1)
+			return pe_cpu;
+	}
+
+	/* Required for stable ->cpus_ptr. NOTE(review): the IS_PWR_BL_ENABLED
+	 * path above already reads ->cpus_ptr before this assertion runs.
+	 */
+	lockdep_assert_held(&p->pi_lock);
+	if (wake_flags & WF_TTWU) {
+		record_wakee(p);
+		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
+	}
+
+	for_each_cpu_wrap(cpu, cpu_online_mask, this_cpu) {
+		if (unlikely(!cpumask_test_cpu(cpu, p->cpus_ptr)))
+			continue;
+
+		if (want_affine) {
+			if (cpu != prev_cpu)
+				new_cpu = wake_affine(p, cpu, prev_cpu, sync);
+
+			return new_cpu;	/* decided on the first allowed CPU visited */
+		}
+
+		if (IS_GRQ_BL_ENABLED)
+			return smp_processor_id();
+
+		if (cpu_rq(cpu)->nr_running < min) {
+			new_cpu = cpu;
+			min = cpu_rq(cpu)->nr_running;
+		}
+	}
+
+	if (IS_GRQ_BL_ENABLED)
+		return smp_processor_id();
+
+	if (min == min_prev)
+		return prev_cpu;	/* no allowed CPU had fewer tasks than prev_cpu */
+
+	return new_cpu;
+}
+
+/*
+ * Is this task likely cache-hot? Returns 1 if hot, else 0.
+ */
+static int task_hot(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq)
+{
+	s64 delta;
+
+	lockdep_assert_rq_held(src_rq);
+
+	if (p->sched_class != &fair_sched_class)
+		return 0;
+
+	if (unlikely(task_has_idle_policy(p)))
+		return 0;
+
+	/* dst and src CPUs share a cache, so the task is not treated as hot */
+	if (cpus_share_cache(cpu_of(dst_rq), cpu_of(src_rq)))
+		return 0;
+
+	if (sysctl_sched_migration_cost == -1)
+		return 1;	/* -1: every task counts as hot */
+
+	if (sysctl_sched_migration_cost == 0)
+		return 0;	/* 0: no task counts as hot */
+
+	delta = sched_clock() - p->se.exec_start;
+
+	return delta < (s64)sysctl_sched_migration_cost;
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns 1, if task migration degrades locality
+ * 
Returns 0, if task migration improves locality i.e migration preferred. + * Returns -1, if task migration is not affected by locality. + */ +static int +migrate_degrades_locality(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; + + if (!static_branch_likely(&sched_numa_balancing)) + return -1; + + src_nid = cpu_to_node(cpu_of(src_rq)); + dst_nid = cpu_to_node(cpu_of(dst_rq)); + + if (src_nid == dst_nid) + return -1; + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { + if (src_rq->nr_running > src_rq->nr_preferred_running) + return 1; + else + return -1; + } + + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) + return 0; + + /* Leaving a core idle is often worse than degrading locality. */ + if (dst_rq->idle_balance) + return -1; + + dist = node_distance(src_nid, dst_nid); + if (numa_group) { + src_weight = group_weight(p, src_nid, dist); + dst_weight = group_weight(p, dst_nid, dist); + } else { + src_weight = task_weight(p, src_nid, dist); + dst_weight = task_weight(p, dst_nid, dist); + } + + return dst_weight < src_weight; +} + +#else +static inline int migrate_degrades_locality(struct task_struct *p, + struct rq *dst_rq, struct rq *src_rq) +{ + return -1; +} +#endif + +static int +can_migrate_task_powersave(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + int tsk_cache_hot; + + /* Disregard pcpu kthreads; they are where they need to be. 
*/ + if (kthread_is_per_cpu(p)) + return 0; + + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) + return 0; + + if (task_running(src_rq, p)) + return 0; + + tsk_cache_hot = migrate_degrades_locality(p, dst_rq, src_rq); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot(p, dst_rq, src_rq); + + if (tsk_cache_hot > 0) + return 0; + + if (p->se.tt_node.task_type < TT_CPU_BOUND) + return 0; + + return 1; +} + +static int +can_migrate_task(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + int tsk_cache_hot; + + /* Disregard pcpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; + + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) + return 0; + + if (task_running(src_rq, p)) + return 0; + + tsk_cache_hot = migrate_degrades_locality(p, dst_rq, src_rq); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot(p, dst_rq, src_rq); + + if (tsk_cache_hot > 0) + return 0; + + return 1; +} + +static void pull_from(struct rq *dist_rq, + struct rq *src_rq, + struct rq_flags *src_rf, + struct task_struct *p) +{ + struct rq_flags rf; + + // detach task + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, cpu_of(dist_rq)); + + // unlock src rq + rq_unlock(src_rq, src_rf); + + // lock dist rq + rq_lock(dist_rq, &rf); + update_rq_clock(dist_rq); + + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(dist_rq, p, 0); + + // unlock dist rq + rq_unlock(dist_rq, &rf); + + local_irq_restore(src_rf->flags); +} + +static int move_task_powersave(struct rq *dist_rq, struct rq *src_rq, + struct rq_flags *src_rf) +{ + struct cfs_rq *src_cfs_rq = &src_rq->cfs; + struct task_struct *p; + struct tt_node *ttn = src_cfs_rq->head; + + while (ttn) { + p = task_of(se_of(ttn)); + if (can_migrate_task_powersave(p, dist_rq, src_rq)) { + pull_from(dist_rq, src_rq, src_rf, p); + return 1; + } + + ttn = ttn->next; + } + + /* + * Here we know we have not migrated any task, + * thus, we need to unlock and return 0 + * Note: the 
pull_from does the unlocking for us. + */ + rq_unlock(src_rq, src_rf); + local_irq_restore(src_rf->flags); + + return 0; +} + +static int move_task(struct rq *dist_rq, struct rq *src_rq, + struct rq_flags *src_rf) +{ + struct cfs_rq *src_cfs_rq = &src_rq->cfs; + struct task_struct *p; + struct tt_node *ttn = src_cfs_rq->head; + + while (ttn) { + p = task_of(se_of(ttn)); + if (can_migrate_task(p, dist_rq, src_rq)) { + pull_from(dist_rq, src_rq, src_rf, p); + return 1; + } + + ttn = ttn->next; + } + + /* + * Here we know we have not migrated any task, + * thus, we need to unlock and return 0 + * Note: the pull_from does the unlocking for us. + */ + rq_unlock(src_rq, src_rf); + local_irq_restore(src_rf->flags); + + return 0; +} + +static int +can_migrate_candidate(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + /* Disregard pcpu kthreads; they are where they need to be. */ + if (kthread_is_per_cpu(p)) + return 0; + + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) + return 0; + + if (task_running(src_rq, p)) + return 0; + + return 1; +} + +int idle_pull_global_candidate(struct rq *dist_rq) +{ + struct rq *src_rq; + struct task_struct *p; + struct rq_flags rf, src_rf; + struct tt_node *cand = READ_ONCE(global_candidate.candidate); + + if (!cand) + return 0; + + src_rq = READ_ONCE(global_candidate.rq); + if (!src_rq || src_rq == dist_rq) + return 0; + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + raw_spin_lock(&global_candidate.lock); + cand = global_candidate.candidate; + if (!cand) + goto fail_unlock; + + p = task_of(se_of(cand)); + if (task_rq(p) != src_rq || + !can_migrate_candidate(p, dist_rq, src_rq)) + goto fail_unlock; + + global_candidate.rq = NULL; + global_candidate.candidate = NULL; + global_candidate.hrrn = MAX_HRRN; + raw_spin_unlock(&global_candidate.lock); + + // detach task + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, cpu_of(dist_rq)); + // unlock src rq + rq_unlock(src_rq, &src_rf); + + // 
lock dist rq + rq_lock(dist_rq, &rf); + update_rq_clock(dist_rq); + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(dist_rq, p, 0); + // unlock dist rq + rq_unlock(dist_rq, &rf); + + local_irq_restore(src_rf.flags); + + return 1; + +fail_unlock: + raw_spin_unlock(&global_candidate.lock); + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); + return 0; +} + +static int +can_migrate_task_grq(struct tt_node *ttn, struct rq *dst_rq) +{ + struct task_struct *p = task_of(se_of(ttn)); + + if (kthread_is_per_cpu(p)) + return 0; + + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) + return 0; + + if (task_running(grq, p)) + return 0; + + if (task_hot(p, dst_rq, grq)) + return 0; + + return 1; +} + +static struct sched_entity * +pick_next_entity_from_grq(struct rq *dist_rq, struct sched_entity *local) +{ + struct tt_node *ttn = grq->cfs.head; + struct tt_node *next; + + while (ttn && !can_migrate_task_grq(ttn, dist_rq)) + ttn = ttn->next; + + if (!ttn) + return local; + + next = ttn->next; + while (next) { + if (can_migrate_task_grq(next, dist_rq) && entity_before(next, ttn)) + ttn = next; + + next = next->next; + } + + if (local && entity_before(&local->tt_node, ttn)) + return local; + + return se_of(ttn); +} + +static int pull_from_grq(struct rq *dist_rq) +{ + struct rq_flags rf; + struct rq_flags grf; + struct sched_entity *se; + struct task_struct *p = NULL; + + if (dist_rq == grq) + return 0; + + /* if no tasks to pull, exit */ + if (!grq->cfs.head) + return 0; + + rq_lock_irqsave(grq, &grf); + update_rq_clock(grq); + + se = pick_next_entity_from_grq(dist_rq, NULL); + + if (!se) { + rq_unlock(grq, &grf); + local_irq_restore(grf.flags); + return 0; + } + + p = task_of(se); + + // detach task + deactivate_task(grq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, cpu_of(dist_rq)); + + // unlock src rq + rq_unlock(grq, &grf); + + // lock dist rq + rq_lock(dist_rq, &rf); + update_rq_clock(dist_rq); + + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); + 
check_preempt_curr(dist_rq, p, 0); + + // unlock dist rq + rq_unlock(dist_rq, &rf); + local_irq_restore(grf.flags); + + return 1; +} + +static void active_pull_global_candidate(struct rq *dist_rq) +{ + struct cfs_rq *cfs_rq = &dist_rq->cfs; + u64 cand_hrrn = READ_ONCE(global_candidate.hrrn); + u64 local_hrrn = READ_ONCE(cfs_rq->local_cand_hrrn); + struct rq *src_rq; + struct task_struct *p; + struct rq_flags rf, src_rf; + struct tt_node *cand; + + cand = READ_ONCE(global_candidate.candidate); + + if (!cand) + return; + + if ((s64)(local_hrrn - cand_hrrn) <= 0) + return; + + src_rq = READ_ONCE(global_candidate.rq); + if (!src_rq || src_rq == dist_rq) + return; + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + raw_spin_lock(&global_candidate.lock); + cand = global_candidate.candidate; + cand_hrrn = global_candidate.hrrn; + + if (!cand) + goto fail_unlock; + + p = task_of(se_of(cand)); + if (task_rq(p) != src_rq || + !can_migrate_candidate(p, dist_rq, src_rq)) + goto fail_unlock; + + if ((s64)(local_hrrn - cand_hrrn) <= 0) + goto fail_unlock; + + global_candidate.rq = NULL; + global_candidate.candidate = NULL; + global_candidate.hrrn = MAX_HRRN; + raw_spin_unlock(&global_candidate.lock); + + // detach task + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, cpu_of(dist_rq)); + // unlock src rq + rq_unlock(src_rq, &src_rf); + + // lock dist rq + rq_lock(dist_rq, &rf); + update_rq_clock(dist_rq); + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(dist_rq, p, 0); + // unlock dist rq + rq_unlock(dist_rq, &rf); + + local_irq_restore(src_rf.flags); + + return; + +fail_unlock: + raw_spin_unlock(&global_candidate.lock); + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); +} + +static inline int on_null_domain(struct rq *rq) +{ + return unlikely(!rcu_dereference_sched(rq->sd)); +} + +#include "bs_nohz.h" + +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +{ + int this_cpu = this_rq->cpu; + struct rq 
*src_rq;
+	int src_cpu = -1, cpu;
+	int pulled_task = 0;
+	unsigned int max = 0;
+	struct rq_flags src_rf;
+
+	/*
+	 * We must set idle_stamp _before_ calling idle_balance(), such that we
+	 * measure the duration of idle_balance() as idle time.
+	 */
+	this_rq->idle_stamp = rq_clock(this_rq);
+
+	/*
+	 * Do not pull tasks towards !active CPUs, or when IS_PWR_BL_ENABLED is set.
+	 */
+	if (IS_PWR_BL_ENABLED || !cpu_active(this_cpu))
+		return 0;
+
+	rq_unpin_lock(this_rq, rf);
+	raw_spin_rq_unlock(this_rq);	/* use the rq lock wrapper, not rq->__lock */
+
+	update_blocked_averages(this_cpu);
+
+	if (IS_CAND_BL_ENABLED) {
+		pulled_task = idle_pull_global_candidate(this_rq);
+		if (pulled_task)
+			goto out;
+	} else if (IS_GRQ_BL_ENABLED) {
+		pulled_task = pull_from_grq(this_rq);
+		goto out;
+	}
+
+	for_each_online_cpu(cpu) {
+		/*
+		 * Stop searching for tasks to pull if there are
+		 * now runnable tasks on this rq.
+		 */
+		if (this_rq->nr_running > 0)
+			goto out;
+
+		if (cpu == this_cpu)
+			continue;
+
+		src_rq = cpu_rq(cpu);
+
+		if (src_rq->nr_running < 2)
+			continue;
+
+		if (src_rq->nr_running > max) {
+			max = src_rq->nr_running;
+			src_cpu = cpu;
+		}
+	}
+
+	if (src_cpu != -1) {
+		src_rq = cpu_rq(src_cpu);
+
+		rq_lock_irqsave(src_rq, &src_rf);
+		update_rq_clock(src_rq);
+
+		if (src_rq->nr_running < 2) {	/* re-check now that src_rq is locked */
+			rq_unlock(src_rq, &src_rf);
+			local_irq_restore(src_rf.flags);
+		} else {
+			pulled_task = move_task(this_rq, src_rq, &src_rf);	/* unlocks src_rq */
+		}
+	}
+
+out:
+	raw_spin_rq_lock(this_rq);
+
+	/*
+	 * While scanning the other CPUs, we released the rq lock, a task could
+	 * have been enqueued in the meantime. Since we're not going idle,
+	 * pretend we pulled a task.
+	 */
+	if (this_rq->cfs.h_nr_running && !pulled_task)
+		pulled_task = 1;
+
+	/* Is there a task of a high priority class? 
*/ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) + this_rq->idle_stamp = 0; + else + nohz_newidle_balance(this_rq); + + rq_repin_lock(this_rq, rf); + + return pulled_task; +} + +void trigger_load_balance(struct rq *this_rq) +{ + int this_cpu = cpu_of(this_rq); + int cpu; + unsigned int max, min; + struct rq *max_rq, *min_rq, *c_rq; + struct rq_flags src_rf; + + if (unlikely(on_null_domain(this_rq) || !cpu_active(cpu_of(this_rq)))) + return; + + if (this_cpu != 0) + goto out; + + if (IS_CAND_BL_ENABLED) { + nohz_try_pull_from_candidate(); + } else if (IS_GRQ_BL_ENABLED) { + nohz_try_pull_from_grq(); + goto out; + } + + max = min = this_rq->nr_running; + max_rq = min_rq = this_rq; + + for_each_online_cpu(cpu) { + c_rq = cpu_rq(cpu); + + /* + * Don't need to rebalance while attached to NULL domain or + * runqueue CPU is not active + */ + if (unlikely(on_null_domain(c_rq) || !cpu_active(cpu))) + continue; + + if (c_rq->nr_running < min) { + min = c_rq->nr_running; + min_rq = c_rq; + } + + if (c_rq->nr_running > max) { + max = c_rq->nr_running; + max_rq = c_rq; + } + } + + if (min_rq == max_rq || max - min < 2) + goto out; + + rq_lock_irqsave(max_rq, &src_rf); + update_rq_clock(max_rq); + + if (max_rq->nr_running < 2) { + rq_unlock(max_rq, &src_rf); + local_irq_restore(src_rf.flags); + goto out; + } + + if (IS_PWR_BL_ENABLED && idle_cpu(cpu_of(min_rq)) && max - min == 2) + move_task_powersave(min_rq, max_rq, &src_rf); + else + move_task(min_rq, max_rq, &src_rf); + +out: +#ifdef CONFIG_TT_ACCOUNTING_STATS + if (time_after_eq(jiffies, this_rq->next_balance)) { + this_rq->next_balance = jiffies + msecs_to_jiffies(19); + update_blocked_averages(this_rq->cpu); + } +#endif + if (time_after_eq(jiffies, this_rq->lat_decay)) { + this_rq->lat_decay = jiffies + msecs_to_jiffies(4); + dec_nr_lat_sensitive(this_rq->cpu); + } + + nohz_balancer_kick(this_rq); +} + +void update_group_capacity(struct sched_domain *sd, int cpu) {} 
+#endif /* CONFIG_SMP */ + +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) +{ + struct sched_entity *se = &curr->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + entity_tick(cfs_rq, se, queued); + + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); + update_overutilized_status(task_rq(curr)); +} + +static void task_fork_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *curr; + struct rq *rq = this_rq(); + struct rq_flags rf; + struct tt_node *ttn = &p->se.tt_node; + + ttn->task_type = TT_NO_TYPE; + ttn->vruntime = 1; + ttn->prev_wait_time = 0; + ttn->wait_time = 0; + ttn->prev_burst = 0; + ttn->burst = 0; + ttn->curr_burst = 0; + ttn->rt_sticky = 0; + + rq_lock(rq, &rf); + update_rq_clock(rq); + + cfs_rq = task_cfs_rq(current); + + curr = cfs_rq->curr; + if (curr) { + update_curr(cfs_rq); + + if (sysctl_sched_child_runs_first) + resched_curr(rq); + } + + rq_unlock(rq, &rf); +} + +/* + * All the scheduling class methods: + */ +DEFINE_SCHED_CLASS(fair) = { + + .enqueue_task = enqueue_task_fair, + .dequeue_task = dequeue_task_fair, + .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, + + .check_preempt_curr = check_preempt_wakeup, + + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, + +#ifdef CONFIG_SMP + .balance = balance_fair, + .pick_task = pick_task_fair, + .select_task_rq = select_task_rq_fair, + .migrate_task_rq = migrate_task_rq_fair, + + .rq_online = rq_online_fair, + .rq_offline = rq_offline_fair, + + .task_dead = task_dead_fair, + .set_cpus_allowed = set_cpus_allowed_common, +#endif + + .task_tick = task_tick_fair, + .task_fork = task_fork_fair, + + .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, + .switched_to = switched_to_fair, + + .get_rr_interval = get_rr_interval_fair, + + .update_curr = update_curr_fair, + 
+#ifdef CONFIG_UCLAMP_TASK
+	.uclamp_enabled = 1,
+#endif
+};
+
+__init void init_sched_fair_class(void)
+{
+#ifdef CONFIG_SMP
+	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+
+#ifdef CONFIG_NO_HZ_COMMON
+	nohz.next_balance = jiffies;
+	nohz.next_blocked = jiffies;
+	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+#endif
+#endif /* SMP */
+
+}
diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h
new file mode 100644
index 0000000000000..6c645b0dd7607
--- /dev/null
+++ b/kernel/sched/bs.h
@@ -0,0 +1,310 @@
+#define YIELD_MARK(ttn) ((ttn)->vruntime |= 0x8000000000000000ULL)	/* set bit 63 */
+#define YIELD_UNMARK(ttn) ((ttn)->vruntime &= 0x7FFFFFFFFFFFFFFFULL)	/* clear bit 63 */
+
+/*
+ * After fork, child runs first (this file initialises it to 1). If set
+ * to 0 then parent will (try to) run first.
+ */
+unsigned int sysctl_sched_child_runs_first __read_mostly = 1;
+
+const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+
+void __init sched_init_granularity(void) {}
+
+#ifdef CONFIG_SMP
+/*
+ * For asym packing, by default the lower numbered CPU has higher priority.
+ */
+int __weak arch_asym_cpu_priority(int cpu)
+{
+	return -cpu;
+}
+
+/* Empty stub. NOTE(review): presumably kept because code outside this file calls it. */
+void update_max_interval(void) {}
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
+{
+#ifdef CONFIG_TT_ACCOUNTING_STATS
+	if (p->on_rq == TASK_ON_RQ_MIGRATING) {
+		/*
+		 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
+		 * rq->lock and can modify state directly.
+		 */
+		lockdep_assert_rq_held(task_rq(p));
+		detach_entity_cfs_rq(&p->se);
+
+	} else {
+		/*
+		 * We are supposed to update the task to "current" time, then
+		 * it's up to date and ready to go to new CPU/cfs_rq. But we
+		 * have difficulty in getting what current time is, so simply
+		 * throw away the out-of-date time. This will result in the
+		 * wakee task being less decayed, but giving the wakee more load
+		 * sounds not bad.
+ */ + remove_entity_load_avg(&p->se); + } +#endif + /* Tell new CPU we are migrated */ + p->se.avg.last_update_time = 0; + + /* We have migrated, no longer consider this task hot */ + p->se.exec_start = 0; + + YIELD_UNMARK(&p->se.tt_node); + + update_scan_period(p, new_cpu); +} + +static void rq_online_fair(struct rq *rq) {} +static void rq_offline_fair(struct rq *rq) {} +static void task_dead_fair(struct task_struct *p) +{ +#ifdef CONFIG_TT_ACCOUNTING_STATS + remove_entity_load_avg(&p->se); +#else + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); + unsigned long flags; + + raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); + ++cfs_rq->removed.nr; + raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); +#endif +} + +#endif /** CONFIG_SMP */ + +void init_cfs_rq(struct cfs_rq *cfs_rq) +{ + cfs_rq->tasks_timeline = RB_ROOT_CACHED; +#ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); +#endif +} + +#ifdef CONFIG_TT_ACCOUNTING_STATS +static void update_curr(struct cfs_rq *cfs_rq); + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + update_load_sub(&cfs_rq->load, se->load.weight); + } + dequeue_load_avg(cfs_rq, se); + + update_load_set(&se->load, weight); + +#ifdef CONFIG_SMP + do { + u32 divider = get_pelt_divider(&se->avg); + + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); + } while (0); +#endif + + enqueue_load_avg(cfs_rq, se); + if (se->on_rq) + update_load_add(&cfs_rq->load, 
se->load.weight); + +} +#endif + +void reweight_task(struct task_struct *p, int prio) +{ +#ifdef CONFIG_TT_ACCOUNTING_STATS + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct load_weight *load = &se->load; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + reweight_entity(cfs_rq, se, weight); + load->inv_weight = sched_prio_to_wmult[prio]; +#endif +} + +static inline struct sched_entity *se_of(struct tt_node *ttn) +{ + return container_of(ttn, struct sched_entity, tt_node); +} + +#ifdef CONFIG_SCHED_SMT +DEFINE_STATIC_KEY_FALSE(sched_smt_present); +EXPORT_SYMBOL_GPL(sched_smt_present); + +static inline void set_idle_cores(int cpu, int val) +{ + struct sched_domain_shared *sds; + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + WRITE_ONCE(sds->has_idle_cores, val); +} + +static inline bool test_idle_cores(int cpu, bool def) +{ + struct sched_domain_shared *sds; + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) + return READ_ONCE(sds->has_idle_cores); + + return def; +} + +void __update_idle_core(struct rq *rq) +{ + int core = cpu_of(rq); + int cpu; + + rcu_read_lock(); + if (test_idle_cores(core, true)) + goto unlock; + + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (cpu == core) + continue; + + if (!available_idle_cpu(cpu)) + goto unlock; + } + + set_idle_cores(core, 1); +unlock: + rcu_read_unlock(); +} +#endif + +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct rq *rq = rq_of(cfs_rq); + + update_load_add(&cfs_rq->load, se->load.weight); +#ifdef CONFIG_SMP + account_numa_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); +#endif + cfs_rq->nr_running++; +} + +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); +#ifdef CONFIG_SMP + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); +#endif + 
cfs_rq->nr_running--; +} + +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +{ + if (!task_on_rq_queued(p)) + return; + + if (rq->cfs.nr_running == 1) + return; + + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (task_current(rq, p)) { + if (p->prio > oldprio) + resched_curr(rq); + } else + check_preempt_curr(rq, p, 0); +} + +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + detach_task_cfs_rq(p); +} + +static void switched_to_fair(struct rq *rq, struct task_struct *p) +{ + attach_task_cfs_rq(p); + + if (task_on_rq_queued(p)) { + /* + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. + */ + if (task_current(rq, p)) + resched_curr(rq); + else + check_preempt_curr(rq, p, 0); + } +} + +static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +#ifdef CONFIG_SCHED_DEBUG +#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ + for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos) + +void print_cfs_stats(struct seq_file *m, int cpu) +{ + struct cfs_rq *cfs_rq, *pos; + + rcu_read_lock(); + for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos) + print_cfs_rq(m, cpu, cfs_rq); + rcu_read_unlock(); +} + +#ifdef CONFIG_NUMA_BALANCING +void show_numa_stats(struct task_struct *p, struct seq_file *m) +{ + int node; + unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0; + struct numa_group *ng; + + rcu_read_lock(); + ng = rcu_dereference(p->numa_group); + for_each_online_node(node) { + if (p->numa_faults) { + tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; + tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + if (ng) { + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], + gpf = 
ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; + } + print_numa_stats(m, node, tsf, tpf, gsf, gpf); + } + rcu_read_unlock(); +} +#endif /* CONFIG_NUMA_BALANCING */ +#endif diff --git a/kernel/sched/bs_nohz.h b/kernel/sched/bs_nohz.h new file mode 100644 index 0000000000000..01f6c6771ebbf --- /dev/null +++ b/kernel/sched/bs_nohz.h @@ -0,0 +1,966 @@ + +#ifdef CONFIG_NO_HZ_COMMON + +static struct { + cpumask_var_t idle_cpus_mask; + atomic_t nr_cpus; + int has_blocked; /* Idle CPUS has blocked load */ + int needs_update; /* Newly idle CPUs need their next_balance collated */ + unsigned long next_balance; /* in jiffy units */ + unsigned long next_blocked; /* Next update of blocked load in jiffies */ +} nohz ____cacheline_aligned; + +#endif /* CONFIG_NO_HZ_COMMON */ + +#ifdef CONFIG_NO_HZ_COMMON +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->avg.load_avg) + return true; + + if (cfs_rq->avg.util_avg) + return true; + + return false; +} + +static inline bool others_have_blocked(struct rq *rq) +{ + if (READ_ONCE(rq->avg_rt.util_avg)) + return true; + + if (READ_ONCE(rq->avg_dl.util_avg)) + return true; + + if (thermal_load_avg(rq)) + return true; + +#ifdef CONFIG_HAVE_SCHED_AVG_IRQ + if (READ_ONCE(rq->avg_irq.util_avg)) + return true; +#endif + + return false; +} + +static inline void update_blocked_load_tick(struct rq *rq) +{ + WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies); +} + +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +{ + if (!has_blocked) + rq->has_blocked_load = 0; +} +#else +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool others_have_blocked(struct rq *rq) { return false; } +static inline void update_blocked_load_tick(struct rq *rq) {} +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +#endif + +#ifdef CONFIG_TT_ACCOUNTING_STATS +static bool __update_blocked_others(struct rq *rq, bool *done) +{ + const 
struct sched_class *curr_class; + u64 now = rq_clock_pelt(rq); + unsigned long thermal_pressure; + bool decayed; + + /* + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, + * DL and IRQ signals have been updated before updating CFS. + */ + curr_class = rq->curr->sched_class; + + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | + update_irq_load_avg(rq, 0); + + if (others_have_blocked(rq)) + *done = false; + + return decayed; +} + +static bool __update_blocked_fair(struct rq *rq, bool *done) +{ + struct cfs_rq *cfs_rq = &rq->cfs; + bool decayed; + + decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + if (cfs_rq_has_blocked(cfs_rq)) + *done = false; + + return decayed; +} + +static void update_blocked_averages(int cpu) +{ + bool decayed = false, done = true; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + update_blocked_load_tick(rq); + update_rq_clock(rq); + + decayed |= __update_blocked_others(rq, &done); + decayed |= __update_blocked_fair(rq, &done); + + update_blocked_load_status(rq, !done); + if (decayed) + cpufreq_update_util(rq, 0); + rq_unlock_irqrestore(rq, &rf); +} +#else +static void update_blocked_averages(int cpu) {} +#endif + +#ifdef CONFIG_NO_HZ_COMMON +/* + * idle load balancing details + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. + * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + * anywhere yet. 
+ */ + +static inline int find_new_ilb(void) +{ + int ilb; + const struct cpumask *hk_mask; + + hk_mask = housekeeping_cpumask(HK_TYPE_MISC); + + for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { + + if (ilb == smp_processor_id()) + continue; + + if (idle_cpu(ilb)) + return ilb; + } + + return nr_cpu_ids; +} + +/* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick any + * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). + */ +static void kick_ilb(unsigned int flags) +{ + int ilb_cpu; + + /* + * Increase nohz.next_balance only when if full ilb is triggered but + * not if we only update stats. + */ + if (flags & NOHZ_BALANCE_KICK) + nohz.next_balance = jiffies+1; + + ilb_cpu = find_new_ilb(); + + if (ilb_cpu >= nr_cpu_ids) + return; + + /* + * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets + * the first flag owns it; cleared by nohz_csd_func(). + */ + flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); + if (flags & NOHZ_KICK_MASK) + return; + + /* + * This way we generate an IPI on the target CPU which + * is idle. And the softirq performing nohz idle load balance + * will be run before returning from the IPI. + */ + smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); +} + +/* + * Check whether the capacity of the rq has been noticeably reduced by side + * activity. The imbalance_pct is used for the threshold. + * Return true is the capacity is reduced + */ +static inline int +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) +{ + return ((rq->cpu_capacity * sd->imbalance_pct) < + (rq->cpu_capacity_orig * 100)); +} + +/* + * Check whether a rq has a misfit task and if it looks like we can actually + * help that task: we can migrate the task to a CPU of higher capacity, or + * the task's current CPU is heavily pressured. 
+ */ +static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) +{ + return rq->misfit_task_load && + (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + check_cpu_capacity(rq, sd)); +} + +/* + * Current decision point for kicking the idle load balancer in the presence + * of idle CPUs in the system. + */ +static void nohz_balancer_kick(struct rq *rq) +{ + unsigned long now = jiffies; + struct sched_domain_shared *sds; + struct sched_domain *sd; + int nr_busy, i, cpu = rq->cpu; + unsigned int flags = 0; + + if (unlikely(rq->idle_balance)) + return; + + /* + * We may be recently in ticked or tickless idle mode. At the first + * busy tick after returning from idle, we will update the busy stats. + */ + nohz_balance_exit_idle(rq); + + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing. + */ + if (likely(!atomic_read(&nohz.nr_cpus))) + return; + + if (READ_ONCE(nohz.has_blocked) && + time_after(now, READ_ONCE(nohz.next_blocked))) + flags = NOHZ_STATS_KICK; + + if (time_before(now, nohz.next_balance)) + goto out; + + if (rq->nr_running >= 2) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto out; + } + + rcu_read_lock(); + + sd = rcu_dereference(rq->sd); + if (sd) { + /* + * If there's a CFS task and the current CPU has reduced + * capacity; kick the ILB to see if there's a better CPU to run + * on. + */ + if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto unlock; + } + } + + sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); + if (sd) { + /* + * When ASYM_PACKING; see if there's a more preferred CPU + * currently idle; in which case, kick the ILB to move tasks + * around. 
+ */ + for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { + if (sched_asym_prefer(i, cpu)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto unlock; + } + } + } + + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); + if (sd) { + /* + * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU + * to run the misfit task on. + */ + if (check_misfit_status(rq, sd)) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto unlock; + } + + /* + * For asymmetric systems, we do not want to nicely balance + * cache use, instead we want to embrace asymmetry and only + * ensure tasks have enough CPU capacity. + * + * Skip the LLC logic because it's not relevant in that case. + */ + goto unlock; + } + + sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); + if (sds) { + /* + * If there is an imbalance between LLC domains (IOW we could + * increase the overall cache use), we need some less-loaded LLC + * domain to pull some load. Likewise, we may need to spread + * load within the current LLC domain (e.g. packed SMT cores but + * other CPUs are idle). We can't really know from here how busy + * the others are - so just get a nohz balance going if it looks + * like this LLC domain has tasks we could move. 
+ */ + nr_busy = atomic_read(&sds->nr_busy_cpus); + if (nr_busy > 1) { + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; + goto unlock; + } + } +unlock: + rcu_read_unlock(); +out: + if (READ_ONCE(nohz.needs_update)) + flags |= NOHZ_NEXT_KICK; + + if (flags) + kick_ilb(flags); +} + +static void set_cpu_sd_state_busy(int cpu) +{ + struct sched_domain *sd; + + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + atomic_inc(&sd->shared->nr_busy_cpus); +unlock: + rcu_read_unlock(); +} + +void nohz_balance_exit_idle(struct rq *rq) +{ + SCHED_WARN_ON(rq != this_rq()); + + if (likely(!rq->nohz_tick_stopped)) + return; + + rq->nohz_tick_stopped = 0; + cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + + set_cpu_sd_state_busy(rq->cpu); +} + +static void set_cpu_sd_state_idle(int cpu) +{ + struct sched_domain *sd; + + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + atomic_dec(&sd->shared->nr_busy_cpus); +unlock: + rcu_read_unlock(); +} + +/* + * This routine will record that the CPU is going idle with tick stopped. + * This info will be used in performing idle load balancing in the future. + */ +void nohz_balance_enter_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + SCHED_WARN_ON(cpu != smp_processor_id()); + + /* If this CPU is going down, then nothing needs to be done: */ + if (!cpu_active(cpu)) + return; + + /* Spare idle load balancing on CPUs that don't want to be disturbed: */ + if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) + return; + + /* + * Can be set safely without rq->lock held + * If a clear happens, it will have evaluated last additions because + * rq->lock is held during the check and the clear + */ + rq->has_blocked_load = 1; + + /* + * The tick is still stopped but load could have been added in the + * meantime. 
We set the nohz.has_blocked flag to trig a check of the + * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear + * of nohz.has_blocked can only happen after checking the new load + */ + if (rq->nohz_tick_stopped) + goto out; + + /* If we're a completely isolated CPU, we don't play: */ + if (on_null_domain(rq)) + return; + + rq->nohz_tick_stopped = 1; + + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + atomic_inc(&nohz.nr_cpus); + + /* + * Ensures that if nohz_idle_balance() fails to observe our + * @idle_cpus_mask store, it must observe the @has_blocked + * and @needs_update stores. + */ + smp_mb__after_atomic(); + + set_cpu_sd_state_idle(cpu); + + WRITE_ONCE(nohz.needs_update, 1); +out: + /* + * Each time a cpu enter idle, we assume that it has blocked load and + * enable the periodic update of the load of idle cpus + */ + WRITE_ONCE(nohz.has_blocked, 1); +} + +static bool update_nohz_stats(struct rq *rq) +{ + unsigned int cpu = rq->cpu; + + if (!rq->has_blocked_load) + return false; + + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return false; + + if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) + return true; + + update_blocked_averages(cpu); + + return rq->has_blocked_load; +} + +static void idle_balance(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu; + struct rq *src_rq; + int src_cpu = -1, cpu; + unsigned int max = 0; + struct rq_flags src_rf; + + if (IS_CAND_BL_ENABLED) { + if (idle_pull_global_candidate(this_rq)) + return; + } else if (IS_GRQ_BL_ENABLED) { + pull_from_grq(this_rq); + return; + } else if (IS_PWR_BL_ENABLED) + return; + + for_each_online_cpu(cpu) { + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. 
+ */ + if (this_rq->nr_running > 0) + return; + + if (cpu == this_cpu) + continue; + + src_rq = cpu_rq(cpu); + + if (src_rq->nr_running < 2) + continue; + + if (src_rq->nr_running > max) { + max = src_rq->nr_running; + src_cpu = cpu; + } + } + + if (src_cpu == -1) + return; + + src_rq = cpu_rq(src_cpu); + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + + if (src_rq->nr_running < 2) { + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); + } else { + move_task(this_rq, src_rq, &src_rf); + } +} + +/* + * Internal function that runs load balance for all idle cpus. The load balance + * can be a simple update of blocked load or a complete load balance with + * tasks movement depending of flags. + */ +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, + enum cpu_idle_type idle) +{ + /* Earliest time when we have to do rebalance again */ + unsigned long now = jiffies; + unsigned long next_balance = now + 60*HZ; + bool has_blocked_load = false; + int update_next_balance = 0; + int this_cpu = this_rq->cpu; + int balance_cpu; + struct rq *rq; + + SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); + + /* + * We assume there will be no idle load after this update and clear + * the has_blocked flag. If a cpu enters idle in the mean time, it will + * set the has_blocked flag and trigger another update of idle load. + * Because a cpu that becomes idle, is added to idle_cpus_mask before + * setting the flag, we are sure to not clear the state and not + * check the load of an idle cpu. + * + * Same applies to idle_cpus_mask vs needs_update. + */ + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 0); + + /* + * Ensures that if we miss the CPU, we must see the has_blocked + * store from nohz_balance_enter_idle(). 
+ */ + smp_mb(); + + /* + * Start with the next CPU after this_cpu so we will end with this_cpu and let a + * chance for other idle cpu to pull load. + */ + for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) { + if (!idle_cpu(balance_cpu)) + continue; + + /* + * If this CPU gets work to do, stop the load balancing + * work being done for other CPUs. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + if (flags & NOHZ_STATS_KICK) + has_blocked_load = true; + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 1); + goto abort; + } + + rq = cpu_rq(balance_cpu); + + if (flags & NOHZ_STATS_KICK) + has_blocked_load |= update_nohz_stats(rq); + + /* + * If time for next balance is due, + * do the balance. + */ + if (time_after_eq(jiffies, rq->next_balance)) { + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + rq_unlock_irqrestore(rq, &rf); + + if (flags & NOHZ_BALANCE_KICK) + idle_balance(rq); + } + + if (time_after(next_balance, rq->next_balance)) { + next_balance = rq->next_balance; + update_next_balance = 1; + } + } + + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; + + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.next_blocked, + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); + +abort: + /* There is still blocked load, enable periodic update */ + if (has_blocked_load) + WRITE_ONCE(nohz.has_blocked, 1); +} + +/* + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. 
+ */ +static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +{ + unsigned int flags = this_rq->nohz_idle_balance; + + if (!flags) + return false; + + this_rq->nohz_idle_balance = 0; + + if (idle != CPU_IDLE) + return false; + + _nohz_idle_balance(this_rq, flags, idle); + + return true; +} + +/* + * Check if we need to run the ILB for updating blocked load before entering + * idle state. + */ +void nohz_run_idle_balance(int cpu) +{ + unsigned int flags; + + flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu)); + + /* + * Update the blocked load only if no SCHED_SOFTIRQ is about to happen + * (ie NOHZ_STATS_KICK set) and will do the same. + */ + if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE); +} + +static void nohz_newidle_balance(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu; + + /* + * This CPU doesn't want to be disturbed by scheduler + * housekeeping + */ + if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) + return; + + /* Will wake up very soon. No time for doing anything else*/ + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + + /* Don't need to update blocked load of idle CPUs*/ + if (!READ_ONCE(nohz.has_blocked) || + time_before(jiffies, READ_ONCE(nohz.next_blocked))) + return; + + /* + * Set the need to trigger ILB in order to update blocked load + * before entering idle state. 
+ */ + atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); +} + +#else /* !CONFIG_NO_HZ_COMMON */ +static inline void nohz_balancer_kick(struct rq *rq) { } + +static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +{ + return false; +} + +static inline void nohz_newidle_balance(struct rq *this_rq) { } +#endif /* CONFIG_NO_HZ_COMMON */ + +static void update_curr_lightweight(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + struct tt_node *ttn = &curr->tt_node; + u64 now = sched_clock(); + u64 delta_exec; + + if (!curr) + return; + + delta_exec = now - curr->exec_start; + if (unlikely((s64)delta_exec <= 0)) + return; + + curr->exec_start = now; + curr->sum_exec_runtime += delta_exec; + + ttn->curr_burst += delta_exec; + ttn->vruntime += convert_to_vruntime(delta_exec, curr); + cfs_rq->local_cand_hrrn = HRRN_PERCENT(&curr->tt_node, now); +} + +static void nohz_try_pull_from_candidate(void) +{ + int cpu; + struct rq *rq; + struct cfs_rq *cfs_rq; +#ifdef CONFIG_NO_HZ_FULL + struct rq_flags rf; +#endif + + /* first, push to grq*/ + for_each_online_cpu(cpu) { + rq = cpu_rq(cpu); +#ifdef CONFIG_NO_HZ_FULL + cfs_rq = &rq->cfs; + + if (idle_cpu(cpu) || cfs_rq->nr_running > 1) + goto out; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + update_curr_lightweight(cfs_rq); + rq_unlock_irqrestore(rq, &rf); +out: +#endif + if (idle_cpu(cpu) || !sched_fair_runnable(rq)) + idle_pull_global_candidate(rq); + else + active_pull_global_candidate(rq); + } +} + +static int task_can_move_to_grq(struct task_struct *p, struct rq *src_rq) +{ + if (task_running(task_rq(p), p)) + return 0; + + if (kthread_is_per_cpu(p)) + return 0; + + if (is_migration_disabled(p)) + return 0; + + if (p->nr_cpus_allowed <= 1) + return 0; + + if (task_hot(p, grq, src_rq)) + return 0; + + return 1; +} + +void push_to_grq(struct rq *rq) +{ + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se; + struct tt_node *ttn, *next, *port = NULL; + struct 
task_struct *p; + struct rq_flags rf, grf; + + if (rq == grq) + return; + + if (!cfs_rq->head) + return; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + + /// dequeue tasks from this rq + ttn = cfs_rq->head; + while (ttn) { + next = ttn->next; + + se = se_of(ttn); + p = task_of(se); + + if (!task_can_move_to_grq(p, rq)) + goto next; + + // deactivate + deactivate_task(rq, p, DEQUEUE_NOCLOCK); + // enqueue to port + __enqueue_entity_port(&port, se); + + set_task_cpu(p, cpu_of(grq)); + +next: + ttn = next; + } + + rq_unlock_irqrestore(rq, &rf); + + if (!port) + return; + + LOCK_GRQ(grf); + + /// enqueue tasks to grq + while (port) { + se = se_of(port); + p = task_of(se); + // enqueue to port + __dequeue_entity_port(&port, se); + + // activate + activate_task(grq, p, ENQUEUE_NOCLOCK); + } + + UNLOCK_GRQ(grf); +} + +static int try_pull_from_grq(struct rq *dist_rq) +{ + struct rq_flags rf; + struct rq_flags grf; + struct cfs_rq *cfs_rq = &dist_rq->cfs; + struct sched_entity *se_global = NULL, *se_local = NULL; + struct task_struct *p = NULL; + struct tt_node *ttn; + + if (dist_rq == grq) + return 0; + + /* if no tasks to pull, exit */ + if (!grq->cfs.head) + return 0; + + rq_lock_irqsave(dist_rq, &rf); + update_rq_clock(dist_rq); + update_curr_lightweight(cfs_rq); + se_local = pick_next_entity(cfs_rq, cfs_rq->curr); + rq_unlock_irqrestore(dist_rq, &rf); + + rq_lock_irqsave(grq, &grf); + update_rq_clock(grq); + se_global = pick_next_entity_from_grq(dist_rq, se_local); + + if (se_global == se_local) { + rq_unlock(grq, &grf); + local_irq_restore(grf.flags); + return 0; + } + + ttn = &se_global->tt_node; + p = task_of(se_global); + + // detach task + deactivate_task(grq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, cpu_of(dist_rq)); + + // unlock src rq + rq_unlock(grq, &grf); + + // lock dist rq + rq_lock(dist_rq, &rf); + update_rq_clock(dist_rq); + + activate_task(dist_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(dist_rq, p, 0); + + // unlock dist rq + 
rq_unlock(dist_rq, &rf); + local_irq_restore(grf.flags); + return 1; +} + +static inline void +update_grq_next_balance(struct rq *rq, int pulled) +{ + /* + * if not pulled any, keep eager, + * otherwise set next balance + */ + if (tt_grq_balance_ms && pulled) + rq->grq_next_balance = jiffies + msecs_to_jiffies(tt_grq_balance_ms); +} + +static void nohz_try_pull_from_grq(void) +{ + int cpu; + struct rq *rq; + struct cpumask idle_mask; + struct cpumask non_idle_mask; + bool balance_time; + int pulled = 0; + + cpumask_clear(&non_idle_mask); + + /* first, push to grq*/ + for_each_online_cpu(cpu) { + if (cpu == 0) continue; + if (!idle_cpu(cpu)) { + push_to_grq(cpu_rq(cpu)); + cpumask_set_cpu(cpu, &non_idle_mask); + } else { + cpumask_set_cpu(cpu, &idle_mask); + } + } + + /* second, idle cpus pull first */ + for_each_cpu(cpu, &idle_mask) { + if (cpu == 0 || !idle_cpu(cpu)) + continue; + + if (grq->cfs.nr_running <= 1) + return; + + rq = cpu_rq(cpu); + pulled = pull_from_grq(rq); + update_grq_next_balance(rq, pulled); + } + + /* last, non idle pull */ + for_each_cpu(cpu, &non_idle_mask) { + rq = cpu_rq(cpu); + balance_time = time_after_eq(jiffies, rq->grq_next_balance); + pulled = 0; + + if (grq->cfs.nr_running <= 1) + return; + + /* mybe it is idle now */ + if (idle_cpu(cpu)) + pulled = pull_from_grq(cpu_rq(cpu)); + else if (tt_grq_balance_ms == 0 || balance_time) + /* if not idle, try pull every grq_next_balance */ + pulled = try_pull_from_grq(rq); + + update_grq_next_balance(rq, pulled); + } +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ +static __latent_entropy void run_rebalance_domains(struct softirq_action *h) +{ + struct rq *this_rq = this_rq(); + enum cpu_idle_type idle = this_rq->idle_balance ? 
+ CPU_IDLE : CPU_NOT_IDLE; + + /* + * If this CPU has a pending nohz_balance_kick, then do the + * balancing on behalf of the other idle CPUs whose ticks are + * stopped. Do nohz_idle_balance *before* rebalance_domains to + * give the idle CPUs a chance to load balance. Else we may + * load balance only within the local sched_domain hierarchy + * and abort nohz_idle_balance altogether if we pull some load. + */ + nohz_idle_balance(this_rq, idle); +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2aa76b20ade23..3e9b2effa5aec 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -109,6 +109,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#ifdef CONFIG_TT_SCHED +struct rq *grq = NULL; +#endif + #ifdef CONFIG_SCHED_DEBUG /* * Debugging: various feature bits @@ -3088,6 +3092,14 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) kfree(user_mask); } +#ifdef CONFIG_TT_SCHED +inline void dec_nr_lat_sensitive(unsigned int cpu) +{ + if (per_cpu(nr_lat_sensitive, cpu)) + per_cpu(nr_lat_sensitive, cpu)--; +} +#endif + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -3133,6 +3145,12 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { +#ifdef CONFIG_TT_SCHED + if (task_is_lat_sensitive(p)) { + dec_nr_lat_sensitive(task_cpu(p)); + per_cpu(nr_lat_sensitive, new_cpu)++; + } +#endif if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; @@ -4588,7 +4606,9 @@ void wake_up_new_task(struct task_struct *p) { struct rq_flags rf; struct rq *rq; - +#ifdef CONFIG_TT_SCHED + int target_cpu = 0; +#endif raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); #ifdef CONFIG_SMP @@ -4602,12 +4622,25 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); 
rseq_migrate(p); +#ifdef CONFIG_TT_SCHED + target_cpu = select_task_rq(p, task_cpu(p), WF_FORK); + __set_task_cpu(p, target_cpu); +#else __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); +#endif #endif rq = __task_rq_lock(p, &rf); + +#ifdef CONFIG_TT_SCHED + if (task_is_lat_sensitive(p)) + per_cpu(nr_lat_sensitive, target_cpu)++; +#endif + update_rq_clock(rq); post_init_entity_util_avg(p); - +#ifdef CONFIG_TT_SCHED + p->se.tt_node.start_time = sched_clock(); +#endif activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -5446,7 +5479,9 @@ static void sched_tick_remote(struct work_struct *work) struct rq *rq = cpu_rq(cpu); struct task_struct *curr; struct rq_flags rf; +#ifndef CONFIG_TT_SCHED u64 delta; +#endif int os; /* @@ -5465,7 +5500,7 @@ static void sched_tick_remote(struct work_struct *work) goto out_unlock; update_rq_clock(rq); - +#ifndef CONFIG_TT_SCHED if (!is_idle_task(curr)) { /* * Make sure the next tick runs within a reasonable @@ -5474,6 +5509,7 @@ static void sched_tick_remote(struct work_struct *work) delta = rq_clock_task(rq) - curr->se.exec_start; WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); } +#endif curr->sched_class->task_tick(rq, curr, 0); calc_load_nohz_remote(rq); @@ -5974,6 +6010,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) for_each_cpu(i, smt_mask) { rq_i = cpu_rq(i); +#ifdef CONFIG_TT_SCHED + if (task_is_lat_sensitive(prev)) + dec_nr_lat_sensitive(prev->cpu); +#endif + /* * An online sibling might have gone offline before a task * could be picked for it, or it might be offline but later @@ -9497,6 +9538,9 @@ static struct kmem_cache *task_group_cache __read_mostly; DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); +#ifdef CONFIG_TT_SCHED +DEFINE_PER_CPU(int, nr_lat_sensitive); +#endif void __init sched_init(void) { @@ -9513,6 +9557,10 @@ void __init sched_init(void) wait_bit_init(); +#ifdef 
CONFIG_TT_SCHED + printk(KERN_INFO "TT CPU scheduler v5.16 by Hamad Al Marri."); +#endif + #ifdef CONFIG_FAIR_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -9617,8 +9665,16 @@ void __init sched_init(void) rq->balance_callback = &balance_push_callback; rq->active_balance = 0; rq->next_balance = jiffies; + rq->lat_decay = jiffies; + rq->grq_next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; +#ifdef CONFIG_TT_SCHED + if (!grq) { + grq = rq; + printk(KERN_INFO "Global runqueue is on cpu %d", cpu_of(grq)); + } +#endif rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; @@ -9641,6 +9697,9 @@ void __init sched_init(void) #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); +#ifdef CONFIG_TT_SCHED + per_cpu(nr_lat_sensitive, i) = 0; +#endif #ifdef CONFIG_SCHED_CORE rq->core = rq; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index bb3d63bdf4ae8..5feac5279765f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -172,7 +172,9 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[16]; +#ifndef CONFIG_TT_SCHED unsigned int scaling; +#endif if (cnt > 15) cnt = 15; @@ -181,6 +183,7 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = '\0'; +#ifndef CONFIG_TT_SCHED if (kstrtouint(buf, 10, &scaling)) return -EINVAL; @@ -190,6 +193,7 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, sysctl_sched_tunable_scaling = scaling; if (sched_update_scaling()) return -EINVAL; +#endif *ppos += cnt; return cnt; @@ -197,7 +201,9 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, static int sched_scaling_show(struct seq_file *m, void *v) { +#ifndef CONFIG_TT_SCHED seq_printf(m, "%d\n", sysctl_sched_tunable_scaling); +#endif return 0; } @@ -308,11 +314,12 @@ static __init int sched_init_debug(void) 
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif +#ifndef CONFIG_TT_SCHED debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); - +#endif debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); @@ -579,8 +586,10 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { +#ifndef CONFIG_TT_SCHED s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, spread, rq0_min_vruntime, spread0; + struct rq *rq = cpu_rq(cpu); struct sched_entity *last; unsigned long flags; @@ -618,6 +627,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); +#endif + SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", @@ -773,11 +784,13 @@ do { \ SEQ_printf(m, "\n"); } +#ifndef CONFIG_TT_SCHED static const char *sched_tunable_scaling_names[] = { "none", "logarithmic", "linear" }; +#endif static void sched_debug_header(struct seq_file *m) { @@ -816,19 +829,25 @@ static void sched_debug_header(struct seq_file *m) SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + +#ifndef CONFIG_TT_SCHED PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_idle_min_granularity); 
PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); +#endif + #undef PN #undef P +#ifndef CONFIG_TT_SCHED SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", sysctl_sched_tunable_scaling, sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); +#endif SEQ_printf(m, "\n"); } @@ -956,6 +975,22 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #define P_SCHEDSTAT(F) __PS(#F, schedstat_val(p->stats.F)) #define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->stats.F)) + +#ifdef CONFIG_TT_SCHED +#define PN_TT(F, S) SEQ_printf(m, "%-45s: %20s\n", #F, #S) + + if (p->se.tt_node.task_type == TT_NO_TYPE) + PN_TT(task_type, NO_TYPE); + else if (p->se.tt_node.task_type == TT_INTERACTIVE) + PN_TT(task_type, INTERACTIVE); + else if (p->se.tt_node.task_type == TT_REALTIME) + PN_TT(task_type, REALTIME); + else if (p->se.tt_node.task_type == TT_CPU_BOUND) + PN_TT(task_type, CPU_BOUND); + else if (p->se.tt_node.task_type == TT_BATCH) + PN_TT(task_type, BATCH); +#endif + PN(se.exec_start); PN(se.vruntime); PN(se.sum_exec_runtime); diff --git a/kernel/sched/fair_numa.h b/kernel/sched/fair_numa.h new file mode 100644 index 0000000000000..a19bb38163078 --- /dev/null +++ b/kernel/sched/fair_numa.h @@ -0,0 +1,1960 @@ +#include +#include +#ifdef CONFIG_NUMA_BALANCING +/* + * Approximate time to scan a full NUMA task in ms. The task scan period is + * calculated based on the tasks virtual memory size and + * numa_balancing_scan_size. 
+ */ +unsigned int sysctl_numa_balancing_scan_period_min = 1000; +unsigned int sysctl_numa_balancing_scan_period_max = 60000; + +/* Portion of address space to scan in MB */ +unsigned int sysctl_numa_balancing_scan_size = 256; + +/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ +unsigned int sysctl_numa_balancing_scan_delay = 1000; + +struct numa_group { + refcount_t refcount; + + spinlock_t lock; /* nr_tasks, tasks */ + int nr_tasks; + pid_t gid; + int active_nodes; + + struct rcu_head rcu; + unsigned long total_faults; + unsigned long max_faults_cpu; + /* + * faults[] array is split into two regions: faults_mem and faults_cpu. + * + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ + unsigned long faults[]; +}; + +/* + * For functions that can be called in multiple contexts that permit reading + * ->numa_group (see struct task_struct for locking rules). 
+ */
+static struct numa_group *deref_task_numa_group(struct task_struct *p)
+{
+	return rcu_dereference_check(p->numa_group, p == current ||
+		(lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
+}
+
+static struct numa_group *deref_curr_numa_group(struct task_struct *p)
+{
+	return rcu_dereference_protected(p->numa_group, p == current);	/* only valid for p == current */
+}
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+	unsigned long rss = 0;
+	unsigned long nr_scan_pages;
+
+	/*
+	 * Calculations based on RSS as non-present and empty pages are skipped
+	 * by the PTE scanner and NUMA hinting faults should be trapped based
+	 * on resident pages
+	 */
+	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);	/* scan size: MB -> pages */
+	rss = get_mm_rss(p->mm);
+	if (!rss)
+		rss = nr_scan_pages;	/* empty RSS counts as one window */
+
+	rss = round_up(rss, nr_scan_pages);
+	return rss / nr_scan_pages;	/* scan-size windows needed to cover the RSS, at least 1 */
+}
+
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
+{
+	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
+	unsigned int scan, floor;
+	unsigned int windows = 1;
+
+	if (scan_size < MAX_SCAN_WINDOW)
+		windows = MAX_SCAN_WINDOW / scan_size;	/* NOTE(review): assumes the sysctl is never 0 -- confirm its bounds */
+	floor = 1000 / windows;
+
+	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+	return max_t(unsigned int, floor, scan);	/* never below the MAX_SCAN_WINDOW-derived floor */
+}
+
+static unsigned int task_scan_start(struct task_struct *p)
+{
+	unsigned long smin = task_scan_min(p);
+	unsigned long period = smin;
+	struct numa_group *ng;
+
+	/* Scale the starting scan period with the amount of shared memory. */
+	rcu_read_lock();
+	ng = rcu_dereference(p->numa_group);
+	if (ng) {
+		unsigned long shared = group_faults_shared(ng);
+		unsigned long private = group_faults_priv(ng);
+
+		period *= refcount_read(&ng->refcount);
+		period *= shared + 1;
+		period /= private + shared + 1;	/* +1 keeps the divisor non-zero */
+	}
+	rcu_read_unlock();
+
+	return max(smin, period);
+}
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+	unsigned long smin = task_scan_min(p);
+	unsigned long smax;
+	struct numa_group *ng;
+
+	/* Watch for min being higher than max due to floor calculations */
+	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+	/* Scale the maximum scan period with the amount of shared memory. */
+	ng = deref_curr_numa_group(p);
+	if (ng) {
+		unsigned long shared = group_faults_shared(ng);
+		unsigned long private = group_faults_priv(ng);
+		unsigned long period = smax;
+
+		period *= refcount_read(&ng->refcount);
+		period *= shared + 1;
+		period /= private + shared + 1;	/* +1 keeps the divisor non-zero */
+
+		smax = max(smax, period);	/* group scaling may only raise the maximum */
+	}
+
+	return max(smin, smax);
+}
+
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+	rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);	/* p has a preferred node */
+	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));	/* p is on its preferred node */
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+	rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);	/* mirrors account_numa_enqueue() */
+	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers.
*/
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
+pid_t task_numa_group_id(struct task_struct *p)
+{
+	struct numa_group *ng;
+	pid_t gid = 0;	/* returned as-is when p has no numa_group */
+
+	rcu_read_lock();
+	ng = rcu_dereference(p->numa_group);
+	if (ng)
+		gid = ng->gid;
+	rcu_read_unlock();
+
+	return gid;
+}
+
+/*
+ * The averaged statistics, shared & private, memory & CPU,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
+{
+	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;	/* layout: [stat][node][priv] */
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+	if (!p->numa_faults)
+		return 0;	/* no fault array yet */
+
+	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];	/* both priv slots of NUMA_MEM for nid */
+}
+
+static inline unsigned long group_faults(struct task_struct *p, int nid)
+{
+	struct numa_group *ng = deref_task_numa_group(p);
+
+	if (!ng)
+		return 0;	/* p is not in a numa_group */
+
+	return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+		ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];	/* both priv slots of NUMA_MEM for nid */
+}
+
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+	return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
+		group->faults[task_faults_idx(NUMA_CPU, nid, 1)];	/* both priv slots of NUMA_CPU for nid */
+}
+
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+	unsigned long faults = 0;
+	int node;
+
+	for_each_online_node(node) {
+		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];	/* priv == 1 slot */
+	}
+
+	return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+	unsigned long faults = 0;
+	int node;
+
+	for_each_online_node(node) {
+		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];	/* priv == 0 slot */
+	}
+
+	return faults;
+}
+
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set.
Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+					int maxdist, bool task)
+{
+	unsigned long score = 0;
+	int node;
+
+	/*
+	 * All nodes are directly connected, and the same distance
+	 * from each other. No need for fancy placement algorithms.
+	 */
+	if (sched_numa_topology_type == NUMA_DIRECT)
+		return 0;
+
+	/*
+	 * This code is called for each node, introducing N^2 complexity,
+	 * which should be ok given the number of nodes rarely exceeds 8.
+	 */
+	for_each_online_node(node) {
+		unsigned long faults;
+		int dist = node_distance(nid, node);
+
+		/*
+		 * The furthest away nodes in the system are not interesting
+		 * for placement; nid was already counted.
+		 */
+		if (dist == sched_max_numa_distance || node == nid)
+			continue;
+
+		/*
+		 * On systems with a backplane NUMA topology, compare groups
+		 * of nodes, and move tasks towards the group with the most
+		 * memory accesses. When comparing two nodes at distance
+		 * "maxdist", only nodes closer by than "maxdist" are part
+		 * of each group. Skip other nodes.
+		 */
+		if (sched_numa_topology_type == NUMA_BACKPLANE &&
+					dist >= maxdist)
+			continue;
+
+		/* Add up the faults from nearby nodes. */
+		if (task)	/* true: per-task faults, false: numa_group faults */
+			faults = task_faults(p, node);
+		else
+			faults = group_faults(p, node);
+
+		/*
+		 * On systems with a glueless mesh NUMA topology, there are
+		 * no fixed "groups of nodes". Instead, nodes that are not
+		 * directly connected bounce traffic through intermediate
+		 * nodes; a numa_group can occupy any set of nodes.
+		 * The further away a node is, the less the faults count.
+		 * This seems to result in good task placement.
+		 */
+		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+			faults *= (sched_max_numa_distance - dist);
+			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+		}
+
+		score += faults;
+	}
+
+	return score;
+}
+
+/*
+ * These return the fraction of accesses done by a particular task, or
+ * task group, on a particular numa node.  The group weight is given a
+ * larger multiplier, in order to group tasks together that are almost
+ * evenly spread out between numa nodes.
+ */
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+					int dist)
+{
+	unsigned long faults, total_faults;
+
+	if (!p->numa_faults)
+		return 0;
+
+	total_faults = p->total_numa_faults;
+
+	if (!total_faults)
+		return 0;	/* also avoids dividing by zero below */
+
+	faults = task_faults(p, nid);
+	faults += score_nearby_nodes(p, nid, dist, true);
+
+	return 1000 * faults / total_faults;	/* fraction scaled by 1000 */
+}
+
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+					 int dist)
+{
+	struct numa_group *ng = deref_task_numa_group(p);
+	unsigned long faults, total_faults;
+
+	if (!ng)
+		return 0;
+
+	total_faults = ng->total_faults;
+
+	if (!total_faults)
+		return 0;	/* also avoids dividing by zero below */
+
+	faults = group_faults(p, nid);
+	faults += score_nearby_nodes(p, nid, dist, false);
+
+	return 1000 * faults / total_faults;	/* fraction scaled by 1000 */
+}
+
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+				int src_nid, int dst_cpu)
+{
+	struct numa_group *ng = deref_curr_numa_group(p);
+	int dst_nid = cpu_to_node(dst_cpu);
+	int last_cpupid, this_cpupid;
+
+	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);	/* stores this_cpupid on the page, returns the previous value */
+
+	/*
+	 * Allow first faults or private faults to migrate immediately early in
+	 * the lifetime of a task. The magic number 4 is based on waiting for
+	 * two full passes of the "multi-stage node selection" test that is
+	 * executed below.
+	 */
+	if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
+	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+		return true;
+
+	/*
+	 * Multi-stage node selection is used in conjunction with a periodic
+	 * migration fault to build a temporal task<->page relation. By using
+	 * a two-stage filter we remove short/unlikely relations.
+	 *
+	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+	 * a task's usage of a particular page (n_p) per total usage of this
+	 * page (n_t) (in a given time-span) to a probability.
+	 *
+	 * Our periodic faults will sample this probability and getting the
+	 * same result twice in a row, given these samples are fully
+	 * independent, is then given by P(p)^2, provided our sample period
+	 * is sufficiently short compared to the usage pattern.
+	 *
+	 * This quadratic squishes small probabilities, making it less likely we
+	 * act on an unlikely task<->page relation.
+	 */
+	if (!cpupid_pid_unset(last_cpupid) &&
+				cpupid_to_nid(last_cpupid) != dst_nid)
+		return false;
+
+	/* Always allow migrate on private faults */
+	if (cpupid_match_pid(p, last_cpupid))
+		return true;
+
+	/* A shared fault, but p->numa_group has not been set up yet. */
+	if (!ng)
+		return true;
+
+	/*
+	 * Destination node is much more heavily used than the source
+	 * node? Allow migration.
+	 */
+	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+					ACTIVE_NODE_FRACTION)
+		return true;
+
+	/*
+	 * Distribute memory according to CPU & memory use on each node,
+	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
+	 *
+	 * faults_cpu(dst)   3   faults_cpu(src)
+	 * --------------- * - > ---------------
+	 * faults_mem(dst)   4   faults_mem(src)
+	 */
+	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;	/* cross-multiplied form of the inequality above */
+}
+
+/*
+ * 'numa_type' describes the node at the moment of load balancing.
+ */
+enum numa_type {
+	/* The node has spare capacity that can be used to run more tasks.  */
+	node_has_spare = 0,
+	/*
+	 * The node is fully used and the tasks don't compete for more CPU
+	 * cycles. Nevertheless, some tasks might wait before running.
+	 */
+	node_fully_busy,
+	/*
+	 * The node is overloaded and can't provide expected CPU cycles to all
+	 * tasks.
+	 */
+	node_overloaded
+};
+
+/* Cached statistics for all CPUs within a node */
+struct numa_stats {
+	unsigned long load;		/* sum of cpu_load() */
+	unsigned long runnable;		/* sum of cpu_runnable() */
+	unsigned long util;		/* sum of cpu_util() */
+	/* Total compute capacity of CPUs on a node */
+	unsigned long compute_capacity;
+	unsigned int nr_running;	/* sum of rq->cfs.h_nr_running */
+	unsigned int weight;		/* cpumask_weight() of the node's CPU mask */
+	enum numa_type node_type;	/* result of numa_classify() */
+	int idle_cpu;			/* -1 when update_numa_stats() found none */
+};
+
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+	int sibling;
+
+	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+		if (cpu == sibling)
+			continue;	/* @cpu itself is not checked here */
+
+		if (!idle_cpu(sibling))
+			return false;
+	}
+#endif
+
+	return true;	/* no busy sibling found; always true without CONFIG_SCHED_SMT */
+}
+
+struct task_numa_env {
+	struct task_struct *p;	/* task being placed */
+
+	int src_cpu, src_nid;
+	int dst_cpu, dst_nid;	/* candidate currently being evaluated */
+
+	struct numa_stats src_stats, dst_stats;
+
+	int imbalance_pct;
+	int dist;	/* node_distance() used for the current candidate's weights */
+
+	struct task_struct *best_task;	/* swap partner; NULL means a plain move */
+	long best_imp;	/* improvement recorded by task_numa_assign() */
+	int best_cpu;	/* -1 until task_numa_assign() records a CPU */
+};
+
+static unsigned long cpu_load(struct rq *rq)
+{
+	return cfs_rq_load_avg(&rq->cfs);
+}
+
+static unsigned long cpu_runnable(struct rq *rq)
+{
+	return cfs_rq_runnable_avg(&rq->cfs);
+}
+
+/*
+ * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
+ * This is an approximation as the number of running tasks may not be
+ * related to the number of busy CPUs due to sched_setaffinity.
+ */
+static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+{
+	return (dst_running < (dst_weight >> 2));	/* dst_weight >> 2 is a quarter of dst_weight */
+}
+
+#define NUMA_IMBALANCE_MIN 2	/* imbalance up to this value may be ignored, see adjust_numa_imbalance() */
+
+static inline long adjust_numa_imbalance(int imbalance,
+				int dst_running, int dst_weight)
+{
+	if (!allow_numa_imbalance(dst_running, dst_weight))
+		return imbalance;	/* destination too busy: report the imbalance unchanged */
+
+	/*
+	 * Allow a small imbalance based on a simple pair of communicating
+	 * tasks that remain local when the destination is lightly loaded.
+	 */
+	if (imbalance <= NUMA_IMBALANCE_MIN)
+		return 0;
+
+	return imbalance;
+}
+
+static inline enum
+numa_type numa_classify(unsigned int imbalance_pct,
+			 struct numa_stats *ns)
+{
+	if ((ns->nr_running > ns->weight) &&
+	    (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
+	     ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
+		return node_overloaded;	/* more tasks than CPUs and util or runnable exceeds the capacity margin */
+
+	if ((ns->nr_running < ns->weight) ||
+	    (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
+	     ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
+		return node_has_spare;	/* fewer tasks than CPUs, or both util and runnable within the margin */
+
+	return node_fully_busy;	/* neither of the two cases above */
+}
+
+#ifdef CONFIG_SCHED_SMT
+/* Forward declarations of select_idle_sibling helpers */
+static inline bool test_idle_cores(int cpu, bool def);
+static inline int numa_idle_core(int idle_core, int cpu)
+{
+	if (!static_branch_likely(&sched_smt_present) ||
+	    idle_core >= 0 || !test_idle_cores(cpu, false))
+		return idle_core;	/* keep the value passed in (an idle core may already be recorded) */
+
+	/*
+	 * Prefer cores instead of packing HT siblings
+	 * and triggering future load balancing.
+	 */
+	if (is_core_idle(cpu))
+		idle_core = cpu;
+
+	return idle_core;
+}
+#else
+static inline int numa_idle_core(int idle_core, int cpu)
+{
+	return idle_core;	/* without SMT the value is passed through unchanged */
+}
+#endif
+
+/*
+ * Gather all necessary information to make NUMA balancing placement
+ * decisions that are compatible with standard load balancer. This
+ * borrows code and logic from update_sg_lb_stats but sharing a
+ * common implementation is impractical.
+ */
+static void update_numa_stats(struct task_numa_env *env,
+			      struct numa_stats *ns, int nid,
+			      bool find_idle)
+{
+	int cpu, idle_core = -1;
+
+	memset(ns, 0, sizeof(*ns));
+	ns->idle_cpu = -1;
+
+	rcu_read_lock();
+	for_each_cpu(cpu, cpumask_of_node(nid)) {
+		struct rq *rq = cpu_rq(cpu);
+
+		ns->load += cpu_load(rq);
+		ns->runnable += cpu_runnable(rq);
+		ns->util += cpu_util(cpu);
+		ns->nr_running += rq->cfs.h_nr_running;
+		ns->compute_capacity += capacity_of(cpu);
+
+		if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+			if (READ_ONCE(rq->numa_migrate_on) ||
+			    !cpumask_test_cpu(cpu, env->p->cpus_ptr))
+				continue;	/* already claimed by a NUMA migration, or not allowed for env->p */
+
+			if (ns->idle_cpu == -1)
+				ns->idle_cpu = cpu;	/* remember the first usable idle CPU */
+
+			idle_core = numa_idle_core(idle_core, cpu);
+		}
+	}
+	rcu_read_unlock();
+
+	ns->weight = cpumask_weight(cpumask_of_node(nid));
+
+	ns->node_type = numa_classify(env->imbalance_pct, ns);
+
+	if (idle_core >= 0)
+		ns->idle_cpu = idle_core;	/* an idle core overrides the first idle CPU found */
+}
+
+static void task_numa_assign(struct task_numa_env *env,
+			     struct task_struct *p, long imp)
+{
+	struct rq *rq = cpu_rq(env->dst_cpu);
+
+	/* Check if the run-queue is already part of an active NUMA balance. */
+	if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
+		int cpu;
+		int start = env->dst_cpu;
+
+		/* Find alternative idle CPU. */
+		for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+			if (cpu == env->best_cpu || !idle_cpu(cpu) ||
+			    !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
+				continue;
+			}
+
+			env->dst_cpu = cpu;
+			rq = cpu_rq(env->dst_cpu);
+			if (!xchg(&rq->numa_migrate_on, 1))	/* flag was clear: this CPU is now claimed */
+				goto assign;
+		}
+
+		/* Failed to find an alternative idle CPU */
+		return;
+	}
+
+assign:
+	/*
+	 * Clear previous best_cpu/rq numa-migrate flag, since task now
+	 * found a better CPU to move/swap.
+	 */
+	if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
+		rq = cpu_rq(env->best_cpu);
+		WRITE_ONCE(rq->numa_migrate_on, 0);
+	}
+
+	if (env->best_task)
+		put_task_struct(env->best_task);	/* drop the reference held on the previous best task */
+	if (p)
+		get_task_struct(p);	/* hold a reference while p is recorded as best_task */
+
+	env->best_task = p;
+	env->best_imp = imp;
+	env->best_cpu = env->dst_cpu;
+}
+
+static bool load_too_imbalanced(long src_load, long dst_load,
+				struct task_numa_env *env)
+{
+	long imb, old_imb;
+	long orig_src_load, orig_dst_load;
+	long src_capacity, dst_capacity;
+
+	/*
+	 * The load is corrected for the CPU capacity available on each node.
+	 *
+	 *   src_load        dst_load
+	 * ------------ vs ------------
+	 * src_capacity    dst_capacity
+	 */
+	src_capacity = env->src_stats.compute_capacity;
+	dst_capacity = env->dst_stats.compute_capacity;
+
+	imb = abs(dst_load * src_capacity - src_load * dst_capacity);	/* cross-multiplied, so no division is needed */
+
+	orig_src_load = env->src_stats.load;
+	orig_dst_load = env->dst_stats.load;
+
+	old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
+
+	/* Would this change make things worse? */
+	return (imb > old_imb);
+}
+
+/*
+ * Maximum NUMA importance can be 1998 (2*999);
+ * SMALLIMP @ 30 would be close to 1998/64.
+ * Used to deter task migration.
+ */
+#define SMALLIMP	30
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source task was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task. Returns true to stop the CPU search.
+ */
+static bool task_numa_compare(struct task_numa_env *env,
+			    long taskimp, long groupimp, bool maymove)
+{
+	struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
+	struct rq *dst_rq = cpu_rq(env->dst_cpu);
+	long imp = p_ng ? groupimp : taskimp;
+	struct task_struct *cur;
+	long src_load, dst_load;
+	int dist = env->dist;
+	long moveimp = imp;
+	long load;
+	bool stopsearch = false;
+
+	if (READ_ONCE(dst_rq->numa_migrate_on))
+		return false;
+
+	rcu_read_lock();
+	cur = rcu_dereference(dst_rq->curr);
+	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
+		cur = NULL;	/* exiting or idle tasks are not swap candidates */
+
+	/*
+	 * Because we have preemption enabled we can get migrated around and
+	 * end up trying to select ourselves (current == env->p) as a swap candidate.
+	 */
+	if (cur == env->p) {
+		stopsearch = true;
+		goto unlock;
+	}
+
+	if (!cur) {
+		if (maymove && moveimp >= env->best_imp)
+			goto assign;
+		else
+			goto unlock;
+	}
+
+	/* Skip this swap candidate if cannot move to the source cpu. */
+	if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
+		goto unlock;
+
+	/*
+	 * Skip this swap candidate if it is not moving to its preferred
+	 * node and the best task is.
+	 */
+	if (env->best_task &&
+	    env->best_task->numa_preferred_nid == env->src_nid &&
+	    cur->numa_preferred_nid != env->src_nid) {
+		goto unlock;
+	}
+
+	/*
+	 * "imp" is the fault differential for the source task between the
+	 * source and destination node. Calculate the total differential for
+	 * the source task and potential destination task. The more negative
+	 * the value is, the more remote accesses that would be expected to
+	 * be incurred if the tasks were swapped.
+	 *
+	 * If dst and source tasks are in the same NUMA group, or not
+	 * in any group then look only at task weights.
+	 */
+	cur_ng = rcu_dereference(cur->numa_group);
+	if (cur_ng == p_ng) {
+		imp = taskimp + task_weight(cur, env->src_nid, dist) -
+		      task_weight(cur, env->dst_nid, dist);
+		/*
+		 * Add some hysteresis to prevent swapping the
+		 * tasks within a group over tiny differences.
+		 */
+		if (cur_ng)
+			imp -= imp / 16;
+	} else {
+		/*
+		 * Compare the group weights. If a task is all by itself
+		 * (not part of a group), use the task weight instead.
+		 */
+		if (cur_ng && p_ng)
+			imp += group_weight(cur, env->src_nid, dist) -
+			       group_weight(cur, env->dst_nid, dist);
+		else
+			imp += task_weight(cur, env->src_nid, dist) -
+			       task_weight(cur, env->dst_nid, dist);
+	}
+
+	/* Discourage picking a task already on its preferred node */
+	if (cur->numa_preferred_nid == env->dst_nid)
+		imp -= imp / 16;
+
+	/*
+	 * Encourage picking a task that moves to its preferred node.
+	 * This potentially makes imp larger than its maximum of
+	 * 1998 (see SMALLIMP and task_weight for why) but in this
+	 * case, it does not matter.
+	 */
+	if (cur->numa_preferred_nid == env->src_nid)
+		imp += imp / 8;
+
+	if (maymove && moveimp > imp && moveimp > env->best_imp) {
+		imp = moveimp;
+		cur = NULL;	/* a plain move beats the swap: assign without a swap partner */
+		goto assign;
+	}
+
+	/*
+	 * Prefer swapping with a task moving to its preferred node over a
+	 * task that is not.
+	 */
+	if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
+	    env->best_task->numa_preferred_nid != env->src_nid) {
+		goto assign;
+	}
+
+	/*
+	 * If the NUMA importance is less than SMALLIMP,
+	 * task migration might only result in ping pong
+	 * of tasks and also hurt performance due to cache
+	 * misses.
+	 */
+	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
+		goto unlock;
+
+	/*
+	 * In the overloaded case, try and keep the load balanced.
+	 */
+	load = task_h_load(env->p) - task_h_load(cur);
+	if (!load)
+		goto assign;
+
+	dst_load = env->dst_stats.load + load;
+	src_load = env->src_stats.load - load;
+
+	if (load_too_imbalanced(src_load, dst_load, env))
+		goto unlock;
+
+assign:
+	/* Evaluate an idle CPU for a task numa move. */
+	if (!cur) {
+		int cpu = env->dst_stats.idle_cpu;
+
+		/* Nothing cached so current CPU went idle since the search. */
+		if (cpu < 0)
+			cpu = env->dst_cpu;
+
+		/*
+		 * If the CPU is no longer truly idle and the previous best CPU
+		 * is, keep using it.
+		 */
+		if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
+		    idle_cpu(env->best_cpu)) {
+			cpu = env->best_cpu;
+		}
+
+		env->dst_cpu = cpu;
+	}
+
+	task_numa_assign(env, cur, imp);
+
+	/*
+	 * If a move to idle is allowed because there is capacity or load
+	 * balance improves then stop the search. While a better swap
+	 * candidate may exist, a search is not free.
+	 */
+	if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
+		stopsearch = true;
+
+	/*
+	 * If a swap candidate must be identified and the current best task
+	 * moves to its preferred node then stop the search.
+	 */
+	if (!maymove && env->best_task &&
+	    env->best_task->numa_preferred_nid == env->src_nid) {
+		stopsearch = true;
+	}
+unlock:
+	rcu_read_unlock();
+
+	return stopsearch;
+}
+
+static void task_numa_find_cpu(struct task_numa_env *env,
+				long taskimp, long groupimp)
+{
+	bool maymove = false;
+	int cpu;
+
+	/*
+	 * If dst node has spare capacity, then check if there is an
+	 * imbalance that would be overruled by the load balancer.
+	 */
+	if (env->dst_stats.node_type == node_has_spare) {
+		unsigned int imbalance;
+		int src_running, dst_running;
+
+		/*
+		 * Would movement cause an imbalance? Note that if src has
+		 * more running tasks then the imbalance is ignored as the
+		 * move improves the imbalance from the perspective of the
+		 * CPU load balancer.
+		 */
+		src_running = env->src_stats.nr_running - 1;
+		dst_running = env->dst_stats.nr_running + 1;
+		imbalance = max(0, dst_running - src_running);
+		imbalance = adjust_numa_imbalance(imbalance, dst_running,
+							env->dst_stats.weight);
+
+		/* Use idle CPU if there is no imbalance */
+		if (!imbalance) {
+			maymove = true;
+			if (env->dst_stats.idle_cpu >= 0) {
+				env->dst_cpu = env->dst_stats.idle_cpu;
+				task_numa_assign(env, NULL, 0);
+				return;
+			}
+		}
+	} else {
+		long src_load, dst_load, load;
+		/*
+		 * If the improvement from just moving env->p directly is better
+		 * than swapping tasks around, check if a move is possible.
+		 */
+		load = task_h_load(env->p);
+		dst_load = env->dst_stats.load + load;
+		src_load = env->src_stats.load - load;
+		maymove = !load_too_imbalanced(src_load, dst_load, env);
+	}
+
+	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
+		/* Skip this CPU if the source task cannot migrate */
+		if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
+			continue;
+
+		env->dst_cpu = cpu;
+		if (task_numa_compare(env, taskimp, groupimp, maymove))
+			break;	/* task_numa_compare() asked to stop the search */
+	}
+}
+
+static int task_numa_migrate(struct task_struct *p)
+{
+	struct task_numa_env env = {
+		.p = p,
+
+		.src_cpu = task_cpu(p),
+		.src_nid = task_node(p),
+
+		.imbalance_pct = 112,
+
+		.best_task = NULL,
+		.best_imp = 0,
+		.best_cpu = -1,
+	};
+	unsigned long taskweight, groupweight;
+	struct sched_domain *sd;
+	long taskimp, groupimp;
+	struct numa_group *ng;
+	struct rq *best_rq;
+	int nid, ret, dist;
+
+	/*
+	 * Pick the lowest SD_NUMA domain, as that would have the smallest
+	 * imbalance and would be the first to start moving tasks about.
+	 *
+	 * And we want to avoid any moving of tasks about, as that would create
+	 * random movement of tasks -- counter the numa conditions we're trying
+	 * to satisfy here.
+	 */
+	rcu_read_lock();
+	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+	if (sd)
+		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;	/* half of the domain's margin above 100 */
+	rcu_read_unlock();
+
+	/*
+	 * Cpusets can break the scheduler domain tree into smaller
+	 * balance domains, some of which do not cross NUMA boundaries.
+	 * Tasks that are "trapped" in such domains cannot be migrated
+	 * elsewhere, so there is no point in (re)trying.
+	 */
+	if (unlikely(!sd)) {	/* sd is only tested for NULL here, after rcu_read_unlock() */
+		sched_setnuma(p, task_node(p));
+		return -EINVAL;
+	}
+
+	env.dst_nid = p->numa_preferred_nid;
+	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+	taskweight = task_weight(p, env.src_nid, dist);
+	groupweight = group_weight(p, env.src_nid, dist);
+	update_numa_stats(&env, &env.src_stats, env.src_nid, false);
+	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
+	update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
+
+	/* Try to find a spot on the preferred nid. */
+	task_numa_find_cpu(&env, taskimp, groupimp);
+
+	/*
+	 * Look at other nodes in these cases:
+	 * - there is no space available on the preferred_nid
+	 * - the task is part of a numa_group that is interleaved across
+	 *   multiple NUMA nodes; in order to better consolidate the group,
+	 *   we need to check other locations.
+	 */
+	ng = deref_curr_numa_group(p);
+	if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
+		for_each_online_node(nid) {
+			if (nid == env.src_nid || nid == p->numa_preferred_nid)
+				continue;
+
+			dist = node_distance(env.src_nid, env.dst_nid);	/* NOTE(review): env.dst_nid is the previously evaluated node, not nid -- confirm intended */
+			if (sched_numa_topology_type == NUMA_BACKPLANE &&
+						dist != env.dist) {
+				taskweight = task_weight(p, env.src_nid, dist);
+				groupweight = group_weight(p, env.src_nid, dist);
+			}
+
+			/* Only consider nodes where both task and groups benefit */
+			taskimp = task_weight(p, nid, dist) - taskweight;
+			groupimp = group_weight(p, nid, dist) - groupweight;
+			if (taskimp < 0 && groupimp < 0)	/* skipped only when both are negative */
+				continue;
+
+			env.dist = dist;
+			env.dst_nid = nid;
+			update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
+			task_numa_find_cpu(&env, taskimp, groupimp);
+		}
+	}
+
+	/*
+	 * If the task is part of a workload that spans multiple NUMA nodes,
+	 * and is migrating into one of the workload's active nodes, remember
+	 * this node as the task's preferred numa node, so the workload can
+	 * settle down.
+	 * A task that migrated to a second choice node will be better off
+	 * trying for a better one later. Do not set the preferred node here.
+	 */
+	if (ng) {
+		if (env.best_cpu == -1)
+			nid = env.src_nid;
+		else
+			nid = cpu_to_node(env.best_cpu);
+
+		if (nid != p->numa_preferred_nid)
+			sched_setnuma(p, nid);
+	}
+
+	/* No better CPU than the current one was found. */
+	if (env.best_cpu == -1) {
+		trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
+		return -EAGAIN;
+	}
+
+	best_rq = cpu_rq(env.best_cpu);
+	if (env.best_task == NULL) {
+		ret = migrate_task_to(p, env.best_cpu);
+		WRITE_ONCE(best_rq->numa_migrate_on, 0);	/* release the claim taken in task_numa_assign() */
+		if (ret != 0)
+			trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
+		return ret;
+	}
+
+	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
+	WRITE_ONCE(best_rq->numa_migrate_on, 0);	/* release the claim taken in task_numa_assign() */
+
+	if (ret != 0)
+		trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
+	put_task_struct(env.best_task);	/* drop the reference taken in task_numa_assign() */
+	return ret;
+}
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
+{
+	unsigned long interval = HZ;
+
+	/* This task has no NUMA fault statistics yet */
+	if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
+		return;
+
+	/* Periodically retry migrating the task to the preferred node */
+	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+	p->numa_migrate_retry = jiffies + interval;
+
+	/* Success if task is already running on the preferred node */
+	if (task_node(p) == p->numa_preferred_nid)
+		return;
+
+	/* Otherwise, try migrate to a CPU on the preferred node */
+	task_numa_migrate(p);	/* return value is not checked */
+}
+
+/*
+ * Find out how many nodes the workload is actively running on. Do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ */
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
+{
+	unsigned long faults, max_faults = 0;
+	int nid, active_nodes = 0;
+
+	for_each_online_node(nid) {	/* pass 1: find the largest per-node CPU fault count */
+		faults = group_faults_cpu(numa_group, nid);
+		if (faults > max_faults)
+			max_faults = faults;
+	}
+
+	for_each_online_node(nid) {	/* pass 2: count nodes above max_faults / ACTIVE_NODE_FRACTION */
+		faults = group_faults_cpu(numa_group, nid);
+		if (faults * ACTIVE_NODE_FRACTION > max_faults)
+			active_nodes++;
+	}
+
+	numa_group->max_faults_cpu = max_faults;
+	numa_group->active_nodes = active_nodes;
+}
+
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/(local+remote) ratio is
+ * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
+ * the scan period will decrease. Aim for 70% local accesses.
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 7
+
+/*
+ * Increase the scan period (slow down scanning) if the private share of the
+ * faults (ps_ratio) or, failing that, the local share (lr_ratio) reaches
+ * NUMA_PERIOD_THRESHOLD, or if no faults were recorded or a migration failed.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+			unsigned long shared, unsigned long private)
+{
+	unsigned int period_slot;
+	int lr_ratio, ps_ratio;
+	int diff;
+
+	unsigned long remote = p->numa_faults_locality[0];
+	unsigned long local = p->numa_faults_locality[1];
+
+	/*
+	 * If there were no recorded hinting faults then either the task is
+	 * completely idle or all activity is in areas that are not of interest
+	 * to automatic numa balancing. Related to that, if there were failed
+	 * migration then it implies we are migrating too quickly or the local
+	 * node is overloaded. In either case, scan slower
+	 */
+	if (local + shared == 0 || p->numa_faults_locality[2]) {
+		p->numa_scan_period = min(p->numa_scan_period_max,
+			p->numa_scan_period << 1);	/* double the period, capped at the maximum */
+
+		p->mm->numa_next_scan = jiffies +
+			msecs_to_jiffies(p->numa_scan_period);
+
+		return;	/* NOTE(review): numa_faults_locality[] is not cleared on this path -- confirm intended */
+	}
+
+	/*
+	 * Prepare to scale scan period relative to the current period.
+	 *	 == NUMA_PERIOD_THRESHOLD scan period grows by one period_slot
+	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+	 */
+	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+	lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);	/* local share, 0..NUMA_PERIOD_SLOTS */
+	ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);	/* private share, 0..NUMA_PERIOD_SLOTS */
+
+	if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+		/*
+		 * Most faults are private (ps_ratio reached the threshold).
+		 * Slow down NUMA scanning by at least one period_slot.
+		 */
+		int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+		if (!slot)
+			slot = 1;
+		diff = slot * period_slot;
+	} else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+		/*
+		 * Most faults are node-local (lr_ratio reached the threshold).
+		 * There is no need to do fast NUMA scanning, since memory
+		 * is already local; slow down by at least one period_slot.
+		 */
+		int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
+		if (!slot)
+			slot = 1;
+		diff = slot * period_slot;
+	} else {
+		/*
+		 * Neither ratio reached the threshold. Speed up NUMA
+		 * scanning, shrinking the period in proportion to how far
+		 * the larger of the two ratios falls short of it.
+		 */
+		int ratio = max(lr_ratio, ps_ratio);
+		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+	}
+
+	p->numa_scan_period = clamp(p->numa_scan_period + diff,
+			task_scan_min(p), task_scan_max(p));
+	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle.
The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+	u64 runtime, delta, now;
+	/* Use the start of this time slice to avoid calculations. */
+	now = p->se.exec_start;
+	runtime = p->se.sum_exec_runtime;
+
+	if (p->last_task_numa_placement) {
+		delta = runtime - p->last_sum_exec_runtime;
+		*period = now - p->last_task_numa_placement;
+
+		/* Avoid time going backwards, prevent potential divide error: */
+		if (unlikely((s64)*period < 0))
+			*period = 0;
+	} else {
+		delta = p->se.avg.load_sum;	/* no previous placement recorded: fall back to scheduler stats */
+		*period = LOAD_AVG_MAX;
+	}
+
+	p->last_sum_exec_runtime = runtime;	/* remembered for the next call */
+	p->last_task_numa_placement = now;
+
+	return delta;
+}
+
+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+	nodemask_t nodes;
+	int dist;
+
+	/* Direct connections between all NUMA nodes. */
+	if (sched_numa_topology_type == NUMA_DIRECT)
+		return nid;
+
+	/*
+	 * On a system with glueless mesh NUMA topology, group_weight
+	 * scores nodes according to the number of NUMA hinting faults on
+	 * both the node itself, and on nearby nodes.
+	 */
+	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+		unsigned long score, max_score = 0;
+		int node, max_node = nid;	/* @nid is kept when no node scores above 0 */
+
+		dist = sched_max_numa_distance;
+
+		for_each_online_node(node) {
+			score = group_weight(p, node, dist);
+			if (score > max_score) {
+				max_score = score;
+				max_node = node;
+			}
+		}
+		return max_node;
+	}
+
+	/*
+	 * Finding the preferred nid in a system with NUMA backplane
+	 * interconnect topology is more involved. The goal is to locate
+	 * tasks from numa_groups near each other in the system, and
+	 * untangle workloads from different sides of the system. This requires
+	 * searching down the hierarchy of node groups, recursively searching
+	 * inside the highest scoring group of nodes. The nodemask tricks
+	 * keep the complexity of the search down.
+	 */
+	nodes = node_online_map;
+	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+		unsigned long max_faults = 0;
+		nodemask_t max_group = NODE_MASK_NONE;
+		int a, b;
+
+		/* Are there nodes at this distance from each other? */
+		if (!find_numa_distance(dist))
+			continue;
+
+		for_each_node_mask(a, nodes) {
+			unsigned long faults = 0;
+			nodemask_t this_group;
+			nodes_clear(this_group);
+
+			/* Sum group's NUMA faults; includes a==b case. */
+			for_each_node_mask(b, nodes) {
+				if (node_distance(a, b) < dist) {
+					faults += group_faults(p, b);
+					node_set(b, this_group);
+					node_clear(b, nodes);	/* b is consumed: it joins no later group in this round */
+				}
+			}
+
+			/* Remember the top group. */
+			if (faults > max_faults) {
+				max_faults = faults;
+				max_group = this_group;
+				/*
+				 * subtle: at the smallest distance there is
+				 * just one node left in each "group", the
+				 * winner is the preferred nid.
+				 */
+				nid = a;
+			}
+		}
+		/* Next round, evaluate the nodes within max_group. */
+		if (!max_faults)	/* no group recorded any faults at this distance */
+			break;
+		nodes = max_group;
+	}
+	return nid;
+}
+
+static void task_numa_placement(struct task_struct *p)
+{
+	int seq, nid, max_nid = NUMA_NO_NODE;
+	unsigned long max_faults = 0;
+	unsigned long fault_types[2] = { 0, 0 };
+	unsigned long total_faults;
+	u64 runtime, period;
+	spinlock_t *group_lock = NULL;
+	struct numa_group *ng;
+
+	/*
+	 * The p->mm->numa_scan_seq field gets updated without
+	 * exclusive access.
Use READ_ONCE() here to ensure + * that the field is read in a single access: + */ + seq = READ_ONCE(p->mm->numa_scan_seq); + if (p->numa_scan_seq == seq) + return; + p->numa_scan_seq = seq; + p->numa_scan_period_max = task_scan_max(p); + + total_faults = p->numa_faults_locality[0] + + p->numa_faults_locality[1]; + runtime = numa_get_avg_runtime(p, &period); + + /* If the task is part of a group prevent parallel updates to group stats */ + ng = deref_curr_numa_group(p); + if (ng) { + group_lock = &ng->lock; + spin_lock_irq(group_lock); + } + + /* Find the node with the highest number of faults */ + for_each_online_node(nid) { + /* Keep track of the offsets in numa_faults array */ + int mem_idx, membuf_idx, cpu_idx, cpubuf_idx; + unsigned long faults = 0, group_faults = 0; + int priv; + + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { + long diff, f_diff, f_weight; + + mem_idx = task_faults_idx(NUMA_MEM, nid, priv); + membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv); + cpu_idx = task_faults_idx(NUMA_CPU, nid, priv); + cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv); + + /* Decay existing window, copy faults since last scan */ + diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2; + fault_types[priv] += p->numa_faults[membuf_idx]; + p->numa_faults[membuf_idx] = 0; + + /* + * Normalize the faults_from, so all tasks in a group + * count according to CPU use, instead of by the raw + * number of faults. Tasks with little runtime have + * little over-all impact on throughput, and thus their + * faults are less important. 
+ */ + f_weight = div64_u64(runtime << 16, period + 1); + f_weight = (f_weight * p->numa_faults[cpubuf_idx]) / + (total_faults + 1); + f_diff = f_weight - p->numa_faults[cpu_idx] / 2; + p->numa_faults[cpubuf_idx] = 0; + + p->numa_faults[mem_idx] += diff; + p->numa_faults[cpu_idx] += f_diff; + faults += p->numa_faults[mem_idx]; + p->total_numa_faults += diff; + if (ng) { + /* + * safe because we can only change our own group + * + * mem_idx represents the offset for a given + * nid and priv in a specific region because it + * is at the beginning of the numa_faults array. + */ + ng->faults[mem_idx] += diff; + ng->faults[cpu_idx] += f_diff; + ng->total_faults += diff; + group_faults += ng->faults[mem_idx]; + } + } + + if (!ng) { + if (faults > max_faults) { + max_faults = faults; + max_nid = nid; + } + } else if (group_faults > max_faults) { + max_faults = group_faults; + max_nid = nid; + } + } + + if (ng) { + numa_group_count_active_nodes(ng); + spin_unlock_irq(group_lock); + max_nid = preferred_group_nid(p, max_nid); + } + + if (max_faults) { + /* Set the new preferred node */ + if (max_nid != p->numa_preferred_nid) + sched_setnuma(p, max_nid); + } + + update_task_scan_period(p, fault_types[0], fault_types[1]); +} + +static inline int get_numa_group(struct numa_group *grp) +{ + return refcount_inc_not_zero(&grp->refcount); +} + +static inline void put_numa_group(struct numa_group *grp) +{ + if (refcount_dec_and_test(&grp->refcount)) + kfree_rcu(grp, rcu); +} + +static void task_numa_group(struct task_struct *p, int cpupid, int flags, + int *priv) +{ + struct numa_group *grp, *my_grp; + struct task_struct *tsk; + bool join = false; + int cpu = cpupid_to_cpu(cpupid); + int i; + + if (unlikely(!deref_curr_numa_group(p))) { + unsigned int size = sizeof(struct numa_group) + + NR_NUMA_HINT_FAULT_STATS * + nr_node_ids * sizeof(unsigned long); + + grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!grp) + return; + + refcount_set(&grp->refcount, 1); + grp->active_nodes = 
1; + grp->max_faults_cpu = 0; + spin_lock_init(&grp->lock); + grp->gid = p->pid; + + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) + grp->faults[i] = p->numa_faults[i]; + + grp->total_faults = p->total_numa_faults; + + grp->nr_tasks++; + rcu_assign_pointer(p->numa_group, grp); + } + + rcu_read_lock(); + tsk = READ_ONCE(cpu_rq(cpu)->curr); + + if (!cpupid_match_pid(tsk, cpupid)) + goto no_join; + + grp = rcu_dereference(tsk->numa_group); + if (!grp) + goto no_join; + + my_grp = deref_curr_numa_group(p); + if (grp == my_grp) + goto no_join; + + /* + * Only join the other group if its bigger; if we're the bigger group, + * the other task will join us. + */ + if (my_grp->nr_tasks > grp->nr_tasks) + goto no_join; + + /* + * Tie-break on the grp address. + */ + if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp) + goto no_join; + + /* Always join threads in the same process. */ + if (tsk->mm == current->mm) + join = true; + + /* Simple filter to avoid false positives due to PID collisions */ + if (flags & TNF_SHARED) + join = true; + + /* Update priv based on whether false sharing was detected */ + *priv = !join; + + if (join && !get_numa_group(grp)) + goto no_join; + + rcu_read_unlock(); + + if (!join) + return; + + BUG_ON(irqs_disabled()); + double_lock_irq(&my_grp->lock, &grp->lock); + + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { + my_grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] += p->numa_faults[i]; + } + my_grp->total_faults -= p->total_numa_faults; + grp->total_faults += p->total_numa_faults; + + my_grp->nr_tasks--; + grp->nr_tasks++; + + spin_unlock(&my_grp->lock); + spin_unlock_irq(&grp->lock); + + rcu_assign_pointer(p->numa_group, grp); + + put_numa_group(my_grp); + return; + +no_join: + rcu_read_unlock(); + return; +} + +/* + * Get rid of NUMA statistics associated with a task (either current or dead). 
+ * If @final is set, the task is dead and has reached refcount zero, so we can
+ * safely free all relevant data structures. Otherwise, there might be
+ * concurrent reads from places like load balancing and procfs, and we should
+ * reset the data back to default state without freeing ->numa_faults.
+ */
+void task_numa_free(struct task_struct *p, bool final)
+{
+	/* safe: p either is current or is being freed by current */
+	struct numa_group *grp = rcu_dereference_raw(p->numa_group);
+	unsigned long *numa_faults = p->numa_faults;
+	unsigned long flags;
+	int i;
+
+	if (!numa_faults)	/* no per-task fault array: return before touching the group */
+		return;
+
+	if (grp) {	/* back p's counts out of its group, then detach from it */
+		spin_lock_irqsave(&grp->lock, flags);
+		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+			grp->faults[i] -= p->numa_faults[i];
+		grp->total_faults -= p->total_numa_faults;
+
+		grp->nr_tasks--;
+		spin_unlock_irqrestore(&grp->lock, flags);
+		RCU_INIT_POINTER(p->numa_group, NULL);
+		put_numa_group(grp);	/* called after grp->lock has been released */
+	}
+
+	if (final) {
+		p->numa_faults = NULL;	/* clear the task's pointer before freeing the saved copy */
+		kfree(numa_faults);
+	} else {	/* keep the array allocated; only zero the counters */
+		p->total_numa_faults = 0;
+		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+			numa_faults[i] = 0;
+	}
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */ +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) +{ + struct task_struct *p = current; + bool migrated = flags & TNF_MIGRATED; + int cpu_node = task_node(current); + int local = !!(flags & TNF_FAULT_LOCAL); + struct numa_group *ng; + int priv; + + if (!static_branch_likely(&sched_numa_balancing)) + return; + + /* for example, ksmd faulting in a user's mm */ + if (!p->mm) + return; + + /* Allocate buffer to track faults on a per-node basis */ + if (unlikely(!p->numa_faults)) { + int size = sizeof(*p->numa_faults) * + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; + + p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults) + return; + + p->total_numa_faults = 0; + memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); + } + + /* + * First accesses are treated as private, otherwise consider accesses + * to be private if the accessing pid has not changed + */ + if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) { + priv = 1; + } else { + priv = cpupid_match_pid(p, last_cpupid); + if (!priv && !(flags & TNF_NO_GROUP)) + task_numa_group(p, last_cpupid, flags, &priv); + } + + /* + * If a workload spans multiple NUMA nodes, a shared fault that + * occurs wholly within the set of nodes that the workload is + * actively using should be counted as local. This allows the + * scan rate to slow down when a workload has settled down. + */ + ng = deref_curr_numa_group(p); + if (!priv && !local && ng && ng->active_nodes > 1 && + numa_is_active_node(cpu_node, ng) && + numa_is_active_node(mem_node, ng)) + local = 1; + + /* + * Retry to migrate task to preferred node periodically, in case it + * previously failed, or the scheduler moved us. 
+ */ + if (time_after(jiffies, p->numa_migrate_retry)) { + task_numa_placement(p); + numa_migrate_preferred(p); + } + + if (migrated) + p->numa_pages_migrated += pages; + if (flags & TNF_MIGRATE_FAIL) + p->numa_faults_locality[2] += pages; + + p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; + p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; + p->numa_faults_locality[local] += pages; +} + +static void reset_ptenuma_scan(struct task_struct *p) +{ + /* + * We only did a read acquisition of the mmap sem, so + * p->mm->numa_scan_seq is written to without exclusive access + * and the update is not guaranteed to be atomic. That's not + * much of an issue though, since this is just used for + * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not + * expensive, to avoid any form of compiler optimizations: + */ + WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1); + p->mm->numa_scan_offset = 0; +} + +/* + * The expensive part of numa migration is done from task_work context. + * Triggered from task_tick_numa(). + */ +static void task_numa_work(struct callback_head *work) +{ + unsigned long migrate, next_scan, now = jiffies; + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + u64 runtime = p->se.sum_exec_runtime; + struct vm_area_struct *vma; + unsigned long start, end; + unsigned long nr_pte_updates = 0; + long pages, virtpages; + + SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); + + work->next = work; + /* + * Who cares about NUMA placement when they're dying. + * + * NOTE: make sure not to dereference p->mm before this check, + * exit_task_work() happens _after_ exit_mm() so we could be called + * without p->mm even though we still had it when we enqueued this + * work. 
+ */ + if (p->flags & PF_EXITING) + return; + + if (!mm->numa_next_scan) { + mm->numa_next_scan = now + + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + } + + /* + * Enforce maximal scan/migration frequency.. + */ + migrate = mm->numa_next_scan; + if (time_before(now, migrate)) + return; + + if (p->numa_scan_period == 0) { + p->numa_scan_period_max = task_scan_max(p); + p->numa_scan_period = task_scan_start(p); + } + + next_scan = now + msecs_to_jiffies(p->numa_scan_period); + if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) + return; + + /* + * Delay this task enough that another task of this mm will likely win + * the next time around. + */ + p->node_stamp += 2 * TICK_NSEC; + + start = mm->numa_scan_offset; + pages = sysctl_numa_balancing_scan_size; + pages <<= 20 - PAGE_SHIFT; /* MB in pages */ + virtpages = pages * 8; /* Scan up to this much virtual space */ + if (!pages) + return; + + + if (!mmap_read_trylock(mm)) + return; + vma = find_vma(mm, start); + if (!vma) { + reset_ptenuma_scan(p); + start = 0; + vma = mm->mmap; + } + for (; vma; vma = vma->vm_next) { + if (!vma_migratable(vma) || !vma_policy_mof(vma) || + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + continue; + } + + /* + * Shared library pages mapped by multiple processes are not + * migrated as it is expected they are cache replicated. Avoid + * hinting faults in read-only file-backed mappings or the vdso + * as migrating the pages will be of marginal benefit. 
+ */ + if (!vma->vm_mm || + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + continue; + + /* + * Skip inaccessible VMAs to avoid any confusion between + * PROT_NONE and NUMA hinting ptes + */ + if (!vma_is_accessible(vma)) + continue; + + do { + start = max(start, vma->vm_start); + end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); + end = min(end, vma->vm_end); + nr_pte_updates = change_prot_numa(vma, start, end); + + /* + * Try to scan sysctl_numa_balancing_size worth of + * hpages that have at least one present PTE that + * is not already pte-numa. If the VMA contains + * areas that are unused or already full of prot_numa + * PTEs, scan up to virtpages, to skip through those + * areas faster. + */ + if (nr_pte_updates) + pages -= (end - start) >> PAGE_SHIFT; + virtpages -= (end - start) >> PAGE_SHIFT; + + start = end; + if (pages <= 0 || virtpages <= 0) + goto out; + + cond_resched(); + } while (end != vma->vm_end); + } + +out: + /* + * It is possible to reach the end of the VMA list but the last few + * VMAs are not guaranteed to the vma_migratable. If they are not, we + * would find the !migratable VMA on the next scan but not reset the + * scanner to the start so check it now. + */ + if (vma) + mm->numa_scan_offset = start; + else + reset_ptenuma_scan(p); + mmap_read_unlock(mm); + + /* + * Make sure tasks use at least 32x as much time to run other code + * than they used here, to limit NUMA PTE scanning overhead to 3% max. + * Usually update_task_scan_period slows down scanning enough; on an + * overloaded system we need to limit overhead on a per task basis. 
+ */ + if (unlikely(p->se.sum_exec_runtime != runtime)) { + u64 diff = p->se.sum_exec_runtime - runtime; + p->node_stamp += 32 * diff; + } +} + +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ + int mm_users = 0; + struct mm_struct *mm = p->mm; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) { + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_scan_seq = 0; + } + } + p->node_stamp = 0; + p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + /* Protect against double add, see task_tick_numa and task_numa_work */ + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + RCU_INIT_POINTER(p->numa_group, NULL); + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + init_task_work(&p->numa_work, task_numa_work); + + /* New address space, reset the preferred nid */ + if (!(clone_flags & CLONE_VM)) { + p->numa_preferred_nid = NUMA_NO_NODE; + return; + } + + /* + * New thread, keep existing numa_preferred_nid which should be copied + * already by arch_dup_task_struct but stagger when scans start. + */ + if (mm) { + unsigned int delay; + + delay = min_t(unsigned int, task_scan_max(current), + current->numa_scan_period * mm_users * NSEC_PER_MSEC); + delay += 2 * TICK_NSEC; + p->node_stamp = delay; + } +} + +/* + * Drive the periodic memory faults.. + */ +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ + struct callback_head *work = &curr->numa_work; + u64 period, now; + + /* + * We don't care about NUMA placement if we don't have memory. + */ + if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) + return; + + /* + * Using runtime rather than walltime has the dual advantage that + * we (mostly) drive the selection from busy threads and that the + * task needs to have done some actual work before we bother with + * NUMA placement. 
+ */ + now = curr->se.sum_exec_runtime; + period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; + + if (now > curr->node_stamp + period) { + if (!curr->node_stamp) + curr->numa_scan_period = task_scan_start(curr); + curr->node_stamp += period; + + if (!time_before(jiffies, curr->mm->numa_next_scan)) + task_work_add(curr, work, TWA_RESUME); + } +} + +static void update_scan_period(struct task_struct *p, int new_cpu) +{ + int src_nid = cpu_to_node(task_cpu(p)); + int dst_nid = cpu_to_node(new_cpu); + + if (!static_branch_likely(&sched_numa_balancing)) + return; + + if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING)) + return; + + if (src_nid == dst_nid) + return; + + /* + * Allow resets if faults have been trapped before one scan + * has completed. This is most likely due to a new task that + * is pulled cross-node due to wakeups or load balancing. + */ + if (p->numa_scan_seq) { + /* + * Avoid scan adjustments if moving to the preferred + * node or if the task was not previously running on + * the preferred node. 
+ */ + if (dst_nid == p->numa_preferred_nid || + (p->numa_preferred_nid != NUMA_NO_NODE && + src_nid != p->numa_preferred_nid)) + return; + } + + p->numa_scan_period = task_scan_start(p); +} + +#else +static void task_tick_numa(struct rq *rq, struct task_struct *curr) +{ +} + +static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) +{ +} + +static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) +{ +} + +static inline void update_scan_period(struct task_struct *p, int new_cpu) +{ +} + +#endif /* CONFIG_NUMA_BALANCING */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ecb0d70528775..8e1195995e0b0 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -258,6 +258,12 @@ static void cpuidle_idle_call(void) static void do_idle(void) { int cpu = smp_processor_id(); +#ifdef CONFIG_TT_SCHED + int pm_disabled = per_cpu(nr_lat_sensitive, cpu); + + if (IS_PWR_BL_ENABLED) + pm_disabled = 0; +#endif /* * Check if we need to update blocked load @@ -296,7 +302,11 @@ static void do_idle(void) * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. 
*/ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + if (cpu_idle_force_poll || tick_check_broadcast_expired() +#ifdef CONFIG_TT_SCHED + || pm_disabled > 0 +#endif + ) { tick_nohz_idle_restart_tick(); cpu_idle_poll(); } else { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 08fdb9ccd14dc..d921b32b45619 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -7,6 +7,9 @@ #include #include +#include +#include +#include #include #include #include @@ -96,6 +99,24 @@ # define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif +#ifdef CONFIG_TT_SCHED +#define TT_REALTIME 0 +#define TT_INTERACTIVE 1 +#define TT_NO_TYPE 2 +#define TT_CPU_BOUND 3 +#define TT_BATCH 4 + +#define TT_BL_NORM 0 +#define TT_BL_CAND 1 +#define TT_BL_GRQ 2 +#define TT_BL_PWR 3 +extern struct rq *grq; + +#define IS_CAND_BL_ENABLED (tt_balancer_opt == TT_BL_CAND) +#define IS_GRQ_BL_ENABLED (tt_balancer_opt == TT_BL_GRQ) +#define IS_PWR_BL_ENABLED (tt_balancer_opt == TT_BL_PWR) +#endif + struct rq; struct cpuidle_state; @@ -209,6 +230,15 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } +#ifdef CONFIG_TT_SCHED +static inline int task_is_lat_sensitive(struct task_struct *p) +{ + unsigned int tt = p->se.tt_node.task_type; + + return (tt == TT_INTERACTIVE); +} +#endif + #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) static inline void update_avg(u64 *avg, u64 sample) @@ -539,9 +569,14 @@ struct cfs_rq { * It is set to NULL otherwise (i.e when none are currently running). 
*/ struct sched_entity *curr; +#ifdef CONFIG_TT_SCHED + struct tt_node *head; + u64 local_cand_hrrn; +#else struct sched_entity *next; struct sched_entity *last; struct sched_entity *skip; +#endif /* CONFIG_TT_SCHED */ #ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; @@ -966,6 +1001,10 @@ struct rq { struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; +#ifdef CONFIG_TT_SCHED + unsigned long lat_decay; + unsigned long grq_next_balance; +#endif struct mm_struct *prev_mm; unsigned int clock_update_flags; @@ -1755,6 +1794,9 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); +#ifdef CONFIG_TT_SCHED +DECLARE_PER_CPU(int, nr_lat_sensitive); +#endif extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { @@ -2246,6 +2288,10 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); +#ifdef CONFIG_TT_SCHED +extern int idle_pull_global_candidate(struct rq *dist_rq); +#endif + extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); static inline struct task_struct *get_push_task(struct rq *rq) @@ -2389,6 +2435,10 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +#ifdef CONFIG_TT_SCHED +extern inline void dec_nr_lat_sensitive(unsigned int cpu); +#endif + extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; diff --git a/kernel/sched/tt_stats.h b/kernel/sched/tt_stats.h new file mode 100644 index 0000000000000..64fe6363641e3 --- /dev/null +++ b/kernel/sched/tt_stats.h @@ -0,0 +1,831 @@ +#ifdef CONFIG_TT_ACCOUNTING_STATS +/* + * 
Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)	/* the subtraction wrapped around */	\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
+static inline void
+update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct sched_statistics *stats;
+	struct task_struct *p = NULL;
+
+	if (!schedstat_enabled())
+		return;
+
+	stats = __schedstats_from_se(se);
+
+	if (entity_is_task(se))	/* p stays NULL for a non-task entity */
+		p = task_of(se);
+
+	__update_stats_wait_start(rq_of(cfs_rq), p, stats);
+}
+
+static inline void
+update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct sched_statistics *stats;
+	struct task_struct *p = NULL;
+
+	if (!schedstat_enabled())
+		return;
+
+	stats = __schedstats_from_se(se);
+
+	/*
+	 * When sched_schedstat changes from 0 to 1, some sched entities
+	 * may already be on the runqueue with stats->wait_start still 0,
+	 * which would make the computed delta wrong. Skip the update in
+	 * that case.
+	 */
+	if (unlikely(!schedstat_val(stats->wait_start)))
+		return;
+
+	if (entity_is_task(se))	/* p stays NULL for a non-task entity */
+		p = task_of(se);
+
+	__update_stats_wait_end(rq_of(cfs_rq), p, stats);
+}
+
+static inline void
+update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct sched_statistics *stats;
+	struct task_struct *tsk = NULL;
+
+	if (!schedstat_enabled())
+		return;
+
+	stats = __schedstats_from_se(se);
+
+	if (entity_is_task(se))	/* tsk stays NULL for a non-task entity */
+		tsk = task_of(se);
+
+	__update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
+}
+
+/*
+ * Task is being enqueued - update stats:
+ */
+static inline void
+update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+	if (!schedstat_enabled())
+		return;
+
+	/*
+	 * Are we enqueueing a waiting task? (for current tasks
+	 * a dequeue/enqueue event is a NOP)
+	 */
+	if (se != cfs_rq->curr)
+		update_stats_wait_start_fair(cfs_rq, se);
+
+	if (flags & ENQUEUE_WAKEUP)	/* wakeups additionally get the sleeper accounting */
+		update_stats_enqueue_sleeper_fair(cfs_rq, se);
+}
+
+static inline void
+update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+
+	if (!schedstat_enabled())
+		return;
+
+	/*
+	 * Mark the end of the wait period if dequeueing a
+	 * waiting task:
+	 */
+	if (se != cfs_rq->curr)
+		update_stats_wait_end_fair(cfs_rq, se);
+
+	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
+		struct task_struct *tsk = task_of(se);
+		unsigned int state;
+
+		/* XXX racy against TTWU */
+		state = READ_ONCE(tsk->__state);
+		if (state & TASK_INTERRUPTIBLE)	/* stamp the start of the sleep with the rq clock */
+			__schedstat_set(tsk->stats.sleep_start,
+				      rq_clock(rq_of(cfs_rq)));
+		if (state & TASK_UNINTERRUPTIBLE)	/* stamp the start of the block with the rq clock */
+			__schedstat_set(tsk->stats.block_start,
+				      rq_clock(rq_of(cfs_rq)));
+	}
+}
+#else	/* !CONFIG_TT_ACCOUNTING_STATS: empty stubs */
+static inline void
+update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+static inline void
+update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
+#endif /* CONFIG_TT_ACCOUNTING_STATS */
+
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
+{
+	struct rq *rq = rq_of(cfs_rq);
+
+	if (&rq->cfs == cfs_rq) {	/* only when this is the cfs_rq embedded in its rq */
+		/*
+		 * There are a few boundary cases this might miss but it should
+		 * get called often enough that that should (hopefully) not be
+		 * a real problem.
+		 *
+		 * It will not get called when we go idle, because the idle
+		 * thread is a different class (!fair), nor will the utilization
+		 * number include things like RT tasks.
+		 *
+		 * As is, the util number is not freq-invariant (we'd have to
+		 * implement arch_scale_freq_capacity() for that).
+		 *
+		 * See cpu_util().
+		 */
+		cpufreq_update_util(rq, flags);
+	}
+}
+
+#if defined(CONFIG_NUMA_BALANCING) || (defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS))
+static unsigned long capacity_of(int cpu)
+{
+	return cpu_rq(cpu)->cpu_capacity;
+}
+
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->avg.load_avg;
+}
+
+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->avg.runnable_avg;
+}
+
+static inline unsigned long cpu_util(int cpu)
+{
+	struct cfs_rq *cfs_rq;
+	unsigned int util;
+
+	cfs_rq = &cpu_rq(cpu)->cfs;
+	util = READ_ONCE(cfs_rq->avg.util_avg);
+
+	if (sched_feat(UTIL_EST))	/* with UTIL_EST, take the larger of util_avg and the enqueued estimate */
+		util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+	return min_t(unsigned long, util, capacity_orig_of(cpu));	/* capped at capacity_orig_of(cpu) */
+}
+
+static unsigned long task_h_load(struct task_struct *p)
+{
+	return p->se.avg.load_avg;	/* here simply the task entity's own load_avg */
+}
+#endif	/* CONFIG_NUMA_BALANCING || (CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS) */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS)
+/*
+ * The margin used when comparing utilization with CPU capacity.
+ *
+ * (default: ~20%)
+ */
+#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)	/* true when cap * 1.25 < max */
+
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	cfs_rq->avg.load_avg += se->avg.load_avg;
+	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;	/* the sum is scaled by the entity's weight */
+}
+
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	u32 divider = get_pelt_divider(&se->avg);
+	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);	/* clamps at 0 rather than wrapping */
+	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;	/* rebuilt from the new load_avg, not decremented */
+}
+#else	/* !(CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS): empty stubs */
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+#endif	/* CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}	/* empty stub: does nothing */
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+	return 0;	/* stub: unconditionally 0, @se is not examined */
+}
+
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}	/* empty stub: arguments unused */
+
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_pelt()
+ * @cfs_rq: cfs_rq to update
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed load.
+ *
+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
+ * call update_tg_load_avg() when this function returns true.
+ */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+{
+	unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
+	struct sched_avg *sa = &cfs_rq->avg;
+	int decayed = 0;
+
+	if (cfs_rq->removed.nr) {
+		unsigned long r;
+		u32 divider = get_pelt_divider(&cfs_rq->avg);
+
+		raw_spin_lock(&cfs_rq->removed.lock);
+		swap(cfs_rq->removed.util_avg, removed_util);
+		swap(cfs_rq->removed.load_avg, removed_load);
+		swap(cfs_rq->removed.runnable_avg, removed_runnable);
+		cfs_rq->removed.nr = 0;
+		raw_spin_unlock(&cfs_rq->removed.lock);
+
+		r = removed_load;
+		sub_positive(&sa->load_avg, r);
+		sa->load_sum = sa->load_avg * divider;
+
+		r = removed_util;
+		sub_positive(&sa->util_avg, r);
+		sa->util_sum = sa->util_avg * divider;
+
+		r = removed_runnable;
+		sub_positive(&sa->runnable_avg, r);
+		sa->runnable_sum = sa->runnable_avg * divider;
+
+		/*
+		 * removed_runnable is the unweighted version of removed_load so we
+		 * can use it to estimate removed_load_sum.
+		 */
+		add_tg_cfs_propagate(cfs_rq,
+			-(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
+
+		decayed = 1;
+	}
+
+	decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
+
+#ifndef CONFIG_64BIT
+	smp_wmb(); /* pairs with smp_rmb() in cfs_rq_last_update_time() */
+	cfs_rq->load_last_update_time_copy = sa->last_update_time;
+#endif /* !CONFIG_64BIT */
+
+	return decayed;
+}
+
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	/*
+	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+	 * See ___update_load_avg() for details.
+	 */
+	u32 divider = get_pelt_divider(&cfs_rq->avg);
+
+	/*
+	 * When we attach the @se to the @cfs_rq, we must align the decay
+	 * window because without that, really weird and wonderful things can
+	 * happen.
+	 *
+	 * XXX illustrate
+	 */
+	se->avg.last_update_time = cfs_rq->avg.last_update_time;
+	se->avg.period_contrib = cfs_rq->avg.period_contrib;
+
+	/*
+	 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
+	 * period_contrib. This isn't strictly correct, but since we're
+	 * entirely outside of the PELT hierarchy, nobody cares if we truncate
+	 * _sum a little.
+	 */
+	se->avg.util_sum = se->avg.util_avg * divider;
+
+	se->avg.runnable_sum = se->avg.runnable_avg * divider;
+
+	se->avg.load_sum = divider;
+	if (se_weight(se)) {
+		se->avg.load_sum =
+			div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
+	}
+
+	enqueue_load_avg(cfs_rq, se);
+	cfs_rq->avg.util_avg += se->avg.util_avg;
+	cfs_rq->avg.util_sum += se->avg.util_sum;
+	cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
+	cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
+
+	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
+
+	cfs_rq_util_change(cfs_rq, 0);
+
+	trace_pelt_cfs_tp(cfs_rq);
+}
+
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
+static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	/*
+	 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
+	 * See ___update_load_avg() for details.
+	 */
+	u32 divider = get_pelt_divider(&cfs_rq->avg);
+
+	dequeue_load_avg(cfs_rq, se);
+	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+	sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
+	cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+
+	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
+
+	cfs_rq_util_change(cfs_rq, 0);
+
+	trace_pelt_cfs_tp(cfs_rq);
+}
+
+/*
+ * Optional actions to be done while updating the load average
+ */
+#define UPDATE_TG	0x1	/* call update_tg_load_avg() if the cfs_rq decayed */
+#define SKIP_AGE_LOAD	0x2	/* skip the __update_load_avg_se() step */
+#define DO_ATTACH	0x4	/* attach an entity whose last_update_time is 0 */
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+	u64 now = cfs_rq_clock_pelt(cfs_rq);
+	int decayed;
+
+	/*
+	 * Track task load average for carrying it to the new CPU after migration,
+	 * and track group sched_entity load average for task_h_load calc in migration
+	 */
+	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+		__update_load_avg_se(now, cfs_rq, se);
+
+	decayed = update_cfs_rq_load_avg(now, cfs_rq);
+	decayed |= propagate_entity_load_avg(se);
+
+	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
+
+		/*
+		 * DO_ATTACH means we're here from enqueue_entity().
+		 * !last_update_time means we've passed through
+		 * migrate_task_rq_fair() indicating we migrated.
+		 *
+		 * IOW we're enqueueing a task on a new CPU.
+		 */
+		attach_entity_load_avg(cfs_rq, se);
+		update_tg_load_avg(cfs_rq);
+
+	} else if (decayed) {
+		cfs_rq_util_change(cfs_rq, 0);
+
+		if (flags & UPDATE_TG)
+			update_tg_load_avg(cfs_rq);
+	}
+}
+
+#ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+	u64 last_update_time_copy;
+	u64 last_update_time;
+
+	do {
+		last_update_time_copy = cfs_rq->load_last_update_time_copy;
+		smp_rmb(); /* pairs with smp_wmb() in update_cfs_rq_load_avg() */
+		last_update_time = cfs_rq->avg.last_update_time;
+	} while (last_update_time != last_update_time_copy);
+
+	return last_update_time;
+}
+#else /* CONFIG_64BIT */
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->avg.last_update_time;
+}
+#endif /* !CONFIG_64BIT */
+
+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+static void sync_entity_load_avg(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 last_update_time;
+
+	last_update_time = cfs_rq_last_update_time(cfs_rq);
+	__update_load_avg_blocked_se(last_update_time, se);
+}
+
+/*
+ * Task first catches up with cfs_rq, and then subtracts
+ * itself from the cfs_rq (task must be off the queue now).
+ */
+static void remove_entity_load_avg(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	unsigned long flags;
+
+	/*
+	 * tasks cannot exit without having gone through wake_up_new_task() ->
+	 * post_init_entity_util_avg() which will have added things to the
+	 * cfs_rq, so we can remove unconditionally.
+	 */
+
+	sync_entity_load_avg(se);
+
+	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+	++cfs_rq->removed.nr;
+	cfs_rq->removed.util_avg += se->avg.util_avg;
+	cfs_rq->removed.load_avg += se->avg.load_avg;
+	cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
+	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
+}
+
+static inline unsigned long task_util(struct task_struct *p)
+{
+	return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+	struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+	return max(task_util(p), _task_util_est(p));
+}
+
+#ifdef CONFIG_UCLAMP_TASK
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+	return clamp(task_util_est(p),
+		     uclamp_eff_value(p, UCLAMP_MIN),
+		     uclamp_eff_value(p, UCLAMP_MAX));
+}
+#else /* !CONFIG_UCLAMP_TASK */
+static inline unsigned long uclamp_task_util(struct task_struct *p)
+{
+	return task_util_est(p);
+}
+#endif /* CONFIG_UCLAMP_TASK */
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+				    struct task_struct *p)
+{
+	unsigned int enqueued;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/* Add the task's estimate to the cfs_rq's estimated utilization */
+	enqueued = cfs_rq->avg.util_est.enqueued;
+	enqueued += _task_util_est(p);
+	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+	trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
+				    struct task_struct *p)
+{
+	unsigned int enqueued;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/* Subtract the task's estimate, clamped so the sum cannot underflow */
+	enqueued = cfs_rq->avg.util_est.enqueued;
+	enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
+	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+	trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
+
+/*
+ * Check if a (signed) value is within a specified
(unsigned) margin,
+ * based on the observation that:
+ *
+ *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + margin < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static inline void util_est_update(struct cfs_rq *cfs_rq,
+				   struct task_struct *p,
+				   bool task_sleep)
+{
+	long last_ewma_diff, last_enqueued_diff;
+	struct util_est ue;
+
+	if (!sched_feat(UTIL_EST))
+		return;
+
+	/*
+	 * Skip update of task's estimated utilization when the task has not
+	 * yet completed an activation, e.g. being migrated.
+	 */
+	if (!task_sleep)
+		return;
+
+	/*
+	 * If the PELT values haven't changed since enqueue time,
+	 * skip the util_est update.
+	 */
+	ue = p->se.avg.util_est;
+	if (ue.enqueued & UTIL_AVG_UNCHANGED)
+		return;
+
+	last_enqueued_diff = ue.enqueued;
+
+	/*
+	 * Reset EWMA on utilization increases, the moving average is used only
+	 * to smooth utilization decreases.
+	 */
+	ue.enqueued = task_util(p);
+	if (sched_feat(UTIL_EST_FASTUP)) {
+		if (ue.ewma < ue.enqueued) {
+			ue.ewma = ue.enqueued;
+			goto done;
+		}
+	}
+
+	/*
+	 * Skip update of task's estimated utilization when its members are
+	 * already ~1% close to its last activation value.
+	 */
+	last_ewma_diff = ue.enqueued - ue.ewma;
+	last_enqueued_diff -= ue.enqueued;
+	if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
+		if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
+			goto done;
+
+		return;
+	}
+
+	/*
+	 * To avoid overestimation of actual task utilization, skip updates if
+	 * we cannot guarantee there is idle time in this CPU.
+	 */
+	if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
+		return;
+
+	/*
+	 * Update Task's estimated utilization
+	 *
+	 * When *p completes an activation we can consolidate another sample
+	 * of the task size. This is done by storing the current PELT value
+	 * as ue.enqueued and by using this value to update the Exponential
+	 * Weighted Moving Average (EWMA):
+	 *
+	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
+	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
+	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
+	 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
+	 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
+	 *
+	 * Where 'w' is the weight of new samples, which is configured to be
+	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+	 */
+	ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+	ue.ewma += last_ewma_diff;
+	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+done:
+	ue.enqueued |= UTIL_AVG_UNCHANGED;
+	WRITE_ONCE(p->se.avg.util_est, ue);
+
+	trace_sched_util_est_se_tp(&p->se);
+}
+
+static inline int task_fits_capacity(struct task_struct *p, long capacity)
+{
+	return fits_capacity(uclamp_task_util(p), capacity);
+}
+
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
+{
+	if (!static_branch_unlikely(&sched_asym_cpucapacity))
+		return;
+
+	if (!p || p->nr_cpus_allowed == 1) {
+		rq->misfit_task_load = 0;
+		return;
+	}
+
+	if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+		rq->misfit_task_load = 0;
+		return;
+	}
+
+	/*
+	 * Make sure that misfit_task_load will not be zero even if
+	 * task_h_load() returns 0.
+	 */
+	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
+}
+
+#else /* CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS */
+
+static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
+{
+	return true;
+}
+
+#define UPDATE_TG	0x0
+#define SKIP_AGE_LOAD	0x0
+#define DO_ATTACH	0x0
+
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
+{
+	cfs_rq_util_change(cfs_rq, 0);
+}
+
+static inline void remove_entity_load_avg(struct sched_entity *se) {}
+
+static inline void
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+static inline void
+detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
+		bool task_sleep) {}
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
+
+#endif /* CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS)
+static inline bool cpu_overutilized(int cpu)
+{
+	return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
+}
+
+static inline void update_overutilized_status(struct rq *rq)
+{
+	if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
+		WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+		trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
+	}
+}
+
+#else /* !(CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS) */
+static inline void update_overutilized_status(struct rq *rq) { }
+#endif /* CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS */
+
+#ifdef CONFIG_TT_ACCOUNTING_STATS
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	/* Catch up with the cfs_rq and remove our load when we leave */
+	update_load_avg(cfs_rq, se, 0);
+	detach_entity_load_avg(cfs_rq, se);
+}
+#endif /* CONFIG_TT_ACCOUNTING_STATS */
+
+#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS)
+/* Give new sched_entity start runnable
values to heavy its load in infant time */
+void init_entity_runnable_average(struct sched_entity *se)
+{
+	struct sched_avg *sa = &se->avg;
+
+	memset(sa, 0, sizeof(*sa));
+
+	/*
+	 * Tasks are initialized with full load to be seen as heavy tasks until
+	 * they get a chance to stabilize to their real load level.
+	 * Group entities are initialized with zero load to reflect the fact that
+	 * nothing has been attached to the task group yet.
+	 */
+	if (entity_is_task(se))
+		sa->load_avg = scale_load_down(se->load.weight);
+
+	/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
+}
+
+static void attach_entity_cfs_rq(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	/* Synchronize entity with its cfs_rq */
+	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+	attach_entity_load_avg(cfs_rq, se);
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+
+	detach_entity_cfs_rq(se);
+}
+
+static void attach_task_cfs_rq(struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+
+	attach_entity_cfs_rq(se);
+}
+
+void post_init_entity_util_avg(struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	struct sched_avg *sa = &se->avg;
+	long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
+	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
+
+	if (cap > 0) {
+		if (cfs_rq->avg.util_avg != 0) {
+			sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+			if (sa->util_avg > cap)
+				sa->util_avg = cap;
+		} else {
+			sa->util_avg = cap;
+		}
+	}
+
+	sa->runnable_avg = sa->util_avg;
+
+	if (p->sched_class != &fair_sched_class) {
+		/*
+		 * For !fair tasks do:
+		 *
+		update_cfs_rq_load_avg(now, cfs_rq);
+		attach_entity_load_avg(cfs_rq, se);
+		switched_from_fair(rq, p);
+		 *
+		 * such that the next switched_to_fair() has the
+		 * expected state.
+		 */
+		se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
+		return;
+	}
+
+	attach_entity_cfs_rq(se);
+}
+#else /* !(CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS) */
+static void detach_task_cfs_rq(struct task_struct *p) {}
+static void attach_task_cfs_rq(struct task_struct *p) {}
+void init_entity_runnable_average(struct sched_entity *se) {}
+void post_init_entity_util_avg(struct task_struct *p) {}
+#endif /* CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c42ba2d669dcc..9d770f5e89376 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -106,6 +106,13 @@ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
 static const int ngroups_max = NGROUPS_MAX;
 static const int cap_last_cap = CAP_LAST_CAP;
 
+#ifdef CONFIG_TT_SCHED
+/* proc_dointvec_minmax() reads .extra1/.extra2 as int *, so bounds are int */
+static int neg_twenty = -20;
+static int nineteen = 19;
+static int three = 3;
+#endif
+
 #ifdef CONFIG_PROC_SYSCTL
 
 /**
@@ -1669,6 +1676,40 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_TT_SCHED
+	{
+		.procname	= "sched_tt_balancer_opt",
+		.data		= &tt_balancer_opt,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &three,
+	},
+	{
+		.procname	= "sched_tt_grq_balance_ms",
+		.data		= &tt_grq_balance_ms,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec,
+	},
+	{
+		.procname	= "sched_tt_max_lifetime",
+		.data		= &tt_max_lifetime,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec,
+	},
+	{
+		.procname	= "sched_tt_rt_prio",
+		.data		= &tt_rt_prio,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &neg_twenty,
+		.extra2		= &nineteen,
+	},
+#endif
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",