From 8cd1bdcfe0b8131c17c43991c321d7004ebd0c14 Mon Sep 17 00:00:00 2001
From: Yuan Liu
Date: Tue, 1 Nov 2016 16:10:12 -0700
Subject: [PATCH] lkl: Direct irq and fix direct syscall degradation

There are two major issues in the current direct syscall implementation:

1. When there is already a thread pending in a syscall, a direct syscall
   degrades to waking up the idle thread, and performance is poor in that
   case. This is actually common in applications that have a traffic-less
   control connection.
2. An IRQ is not direct if LKL is in idle.

Both issues stem from the same limitation: LKL cannot reschedule while it
is in idle. This patch adds that support.

There are two downsides to this patch:

1. It needs to change kernel/sched/idle.c to expose cpu_idle_loop.
2. lkl_idle_tail_schedule must be kept in sync with idle.c.

These downsides seem acceptable given the performance gained from this
patch. For the common case it saves one context switch (direct irq), and I
observe a 10% TCP_RR improvement on my desktop.

Signed-off-by: Yuan Liu
---
 arch/lkl/include/asm/cpu.h         |  5 +-
 arch/lkl/include/asm/thread_info.h |  1 +
 arch/lkl/kernel/cpu.c              | 94 +++++++++++++++++++++++++-----
 arch/lkl/kernel/syscalls.c         | 12 ----
 arch/lkl/kernel/threads.c          | 35 +++++++++--
 5 files changed, 115 insertions(+), 32 deletions(-)
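Note (illustration only, not part of the patch to be applied): the reschedule
paths in this series rely on thread_set_sched_jmp() recording a jump point
before calling schedule() or lkl_idle_tail_schedule(), so that __switch_to()
can later longjmp the host thread straight back instead of parking it on its
scheduling semaphore. A minimal standalone sketch of that pattern, with plain
setjmp()/longjmp() standing in for the lkl_ops jump-buffer helpers
(fake_schedule() and sched_jb are made-up names for illustration):

/*
 * Hypothetical standalone demo: save a jump point, enter the "scheduler",
 * and return to the caller via longjmp instead of blocking.
 */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf sched_jb;

static void fake_schedule(void)
{
	/* the "scheduler" decides to resume the saved context directly */
	printf("fake_schedule(): jumping back to the saved point\n");
	longjmp(sched_jb, 1);
}

int main(void)
{
	if (!setjmp(sched_jb)) {
		/* first pass: jump point saved, call into the scheduler */
		fake_schedule();
	} else {
		/* second pass: resumed via longjmp, as if schedule() returned */
		printf("resumed after fake_schedule(), host thread keeps running\n");
	}
	return 0;
}

The point is only that control comes back to the saved point without the
calling host thread ever blocking, which is what makes the direct syscall
and direct irq paths below possible.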
diff --git a/arch/lkl/include/asm/cpu.h b/arch/lkl/include/asm/cpu.h
index 1bffb16a51f467..67436cd72f43de 100644
--- a/arch/lkl/include/asm/cpu.h
+++ b/arch/lkl/include/asm/cpu.h
@@ -7,8 +7,11 @@ int lkl_cpu_try_run_irq(int irq);
 int lkl_cpu_init(void);
 void lkl_cpu_shutdown(void);
 void lkl_cpu_wait_shutdown(void);
-void lkl_cpu_wakeup(void);
+void lkl_cpu_wakeup_idle(void);
 void lkl_cpu_change_owner(lkl_thread_t owner);
 void lkl_cpu_set_irqs_pending(void);
+void lkl_idle_tail_schedule(void);
+int lkl_cpu_idle_pending(void);
+extern void cpu_idle_loop(void);
 
 #endif /* _ASM_LKL_CPU_H */
diff --git a/arch/lkl/include/asm/thread_info.h b/arch/lkl/include/asm/thread_info.h
index cd4b91dd1464b6..2202be67b7bce7 100644
--- a/arch/lkl/include/asm/thread_info.h
+++ b/arch/lkl/include/asm/thread_info.h
@@ -60,6 +60,7 @@ void threads_cleanup(void);
 #define TIF_SCHED_JB		7
 #define TIF_SCHED_EXIT		8
 #define TIF_HOST_THREAD		9
+#define TIF_IDLE		10
 
 static inline void set_ti_thread_flag(struct thread_info *ti, int flag);
diff --git a/arch/lkl/kernel/cpu.c b/arch/lkl/kernel/cpu.c
index c99db15abe3f17..7bf282526a02e1 100644
--- a/arch/lkl/kernel/cpu.c
+++ b/arch/lkl/kernel/cpu.c
@@ -1,5 +1,8 @@
+#include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -50,6 +53,10 @@ struct lkl_cpu {
 	struct lkl_sem *sem;
 	/* semaphore for the idle thread */
 	struct lkl_sem *idle_sem;
+	/* if the idle thread is pending */
+	bool idle_pending;
+	/* jmp_buf used for the idle thread to restart */
+	struct lkl_jmp_buf idle_jb;
 	/* semaphore used for shutdown */
 	struct lkl_sem *shutdown_sem;
 } cpu;
@@ -126,18 +133,19 @@ void lkl_cpu_put(void)
 		lkl_ops->mutex_lock(cpu.lock);
 	}
 
-	if (need_resched()) {
+	if (need_resched() && cpu.count == 1) {
+		if (in_interrupt())
+			lkl_bug("%s: in interrupt\n", __func__);
+		lkl_ops->mutex_unlock(cpu.lock);
 		if (test_thread_flag(TIF_HOST_THREAD)) {
-			if (cpu.count == 1 && !in_interrupt()) {
-				lkl_ops->mutex_unlock(cpu.lock);
-				set_current_state(TASK_UNINTERRUPTIBLE);
-				if (!thread_set_sched_jmp())
-					schedule();
-				return;
-			}
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!thread_set_sched_jmp())
+				schedule();
 		} else {
-			lkl_cpu_wakeup();
+			if (!thread_set_sched_jmp())
+				lkl_idle_tail_schedule();
 		}
+		return;
 	}
 
 	if (--cpu.count > 0) {
@@ -210,20 +218,36 @@ void arch_cpu_idle(void)
 		lkl_ops->thread_exit();
 	}
 
-	/* enable irqs now to allow direct irqs to run */
 	local_irq_enable();
+	if (need_resched())
+		return;
+
+	cpu.idle_pending = true;
 	lkl_cpu_put();
 
 	lkl_ops->sem_down(cpu.idle_sem);
 
-	lkl_cpu_get();
+	cpu.idle_pending = false;
+	lkl_ops->jmp_buf_longjmp(&cpu.idle_jb, 1);
+}
+
-	run_irqs();
+void arch_cpu_idle_prepare(void)
+{
+	set_ti_thread_flag(current_thread_info(), TIF_IDLE);
+	/*
+	 * We hijack the idle loop here so that we can let the idle thread
+	 * jump back to the beginning.
+	 */
+	while (1) {
+		if (!lkl_ops->jmp_buf_set(&cpu.idle_jb))
+			cpu_idle_loop();
+	}
 }
 
-void lkl_cpu_wakeup(void)
+void lkl_cpu_wakeup_idle(void)
 {
 	lkl_ops->sem_up(cpu.idle_sem);
 }
@@ -242,3 +266,47 @@ int lkl_cpu_init(void)
 
 	return 0;
 }
+
+/*
+ * Simulate the exit path of the idle loop so that we can schedule when LKL
+ * is in idle.
+ * It is just a duplication of the code in idle.c, so a better way would be
+ * to refactor idle.c to expose such a function.
+ */
+void lkl_idle_tail_schedule(void)
+{
+
+	if (!cpu.idle_pending ||
+	    !test_bit(TIF_IDLE, &current_thread_info()->flags))
+		lkl_bug("%s: not in idle\n", __func__);
+
+	start_critical_timings();
+	__current_set_polling();
+
+	if (WARN_ON_ONCE(irqs_disabled()))
+		local_irq_enable();
+
+	rcu_idle_exit();
+	arch_cpu_idle_exit();
+	preempt_set_need_resched();
+	tick_nohz_idle_exit();
+	__current_clr_polling();
+
+	/*
+	 * memory barrier copied from idle.c
+	 */
+	smp_mb__after_atomic();
+
+	/*
+	 * Did not find a way to include kernel/sched/sched.h for
+	 * sched_ttwu_pending().
+	 * Anyway, it is a no-op when CONFIG_SMP is not set.
+	 */
+
+	schedule_preempt_disabled();
+}
+
+int lkl_cpu_idle_pending(void)
+{
+	return cpu.idle_pending;
+}
diff --git a/arch/lkl/kernel/syscalls.c b/arch/lkl/kernel/syscalls.c
index 790d6c9d4c42c8..ba733b8a8e4030 100644
--- a/arch/lkl/kernel/syscalls.c
+++ b/arch/lkl/kernel/syscalls.c
@@ -93,15 +93,12 @@ static unsigned int task_key;
 long lkl_syscall(long no, long *params)
 {
 	struct task_struct *task = host0;
-	static int count;
 	long ret;
 
 	ret = lkl_cpu_get();
 	if (ret < 0)
 		return ret;
 
-	count++;
-
 	if (lkl_ops->tls_get) {
 		task = lkl_ops->tls_get(task_key);
 		if (!task) {
@@ -116,16 +113,7 @@ long lkl_syscall(long no, long *params)
 
 	ret = run_syscall(no, params);
 
-	if (count > 1) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!thread_set_sched_jmp())
-			schedule();
-		count--;
-		return ret;
-	}
-
 out:
-	count--;
 	lkl_cpu_put();
 
 	return ret;
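Note (illustration only, not part of the patch): the restart path above works
because arch_cpu_idle_prepare() wraps cpu_idle_loop() in a jump point, and
arch_cpu_idle() longjmps back to it after being woken, so the idle loop always
re-enters from a clean state once lkl_idle_tail_schedule() has replayed the
loop's exit path. A standalone sketch of that control flow, with plain
setjmp()/longjmp() standing in for lkl_ops->jmp_buf_set()/jmp_buf_longjmp()
and a counter standing in for cpu.idle_sem wake-ups (fake_idle_loop() is a
made-up name):

/*
 * Hypothetical standalone demo: an outer loop records a jump point and the
 * idle body longjmps back to it on "wakeup", so each entry starts clean.
 */
#include <setjmp.h>
#include <stdio.h>

static jmp_buf idle_jb;
static int wakeups;

static void fake_idle_loop(void)
{
	printf("idle loop entry %d\n", wakeups);
	if (++wakeups < 3) {
		/* "woken up": restart the idle loop from the very top */
		longjmp(idle_jb, 1);
	}
	/* third entry: fall through and let the demo finish */
}

int main(void)
{
	/* stands in for the while (1) in arch_cpu_idle_prepare() */
	while (wakeups < 3) {
		if (!setjmp(idle_jb))
			fake_idle_loop();
		/* a longjmp lands here and the loop re-enters the idle body */
	}
	return 0;
}

Every longjmp lands just after setjmp() returned non-zero, so the next
iteration re-enters the idle body from its first line rather than resuming
in the middle of its exit path.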
diff --git a/arch/lkl/kernel/threads.c b/arch/lkl/kernel/threads.c
index 049344b585209a..5480ecdc806d8d 100644
--- a/arch/lkl/kernel/threads.c
+++ b/arch/lkl/kernel/threads.c
@@ -84,25 +84,48 @@ struct thread_info *_current_thread_info = &init_thread_union.thread_info;
  */
 static struct task_struct *abs_prev = &init_task;
 
+/*
+ * Reimplement to make sure there is no atomic op.
+ * Copied from include/asm-generic/bitops/non-atomic.h
+ */
+static inline int test_bit_no_atomic(int nr, const unsigned long *addr)
+{
+	return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
 struct task_struct *__switch_to(struct task_struct *prev,
 				struct task_struct *next)
 {
 	struct thread_info *_prev = task_thread_info(prev);
 	struct thread_info *_next = task_thread_info(next);
 	unsigned long _prev_flags = _prev->flags;
+	bool wakeup_idle = test_bit_no_atomic(TIF_IDLE, &_next->flags) &&
+			   lkl_cpu_idle_pending();
 
 	_current_thread_info = task_thread_info(next);
 	_next->prev_sched = prev;
 	abs_prev = prev;
 
 	BUG_ON(!_next->tid);
-	lkl_cpu_change_owner(_next->tid);
 
-	lkl_ops->sem_up(_next->sched_sem);
-	if (test_bit(TIF_SCHED_JB, &_prev_flags)) {
+	if (test_bit_no_atomic(TIF_SCHED_JB, &_prev_flags)) {
+		/* Atomic. Must be done before waking up next */
 		clear_ti_thread_flag(_prev, TIF_SCHED_JB);
+	}
+	if (wakeup_idle)
+		schedule_tail(abs_prev);
+	lkl_cpu_change_owner(_next->tid);
+
+	/* No kernel code is allowed after waking up next */
+	if (wakeup_idle)
+		lkl_cpu_wakeup_idle();
+	else
+		lkl_ops->sem_up(_next->sched_sem);
+
+	if (test_bit_no_atomic(TIF_SCHED_JB, &_prev_flags)) {
 		lkl_ops->jmp_buf_longjmp(&_prev->sched_jb, 1);
-	} else if (test_bit(TIF_SCHED_EXIT, &_prev_flags)) {
+	} else if (test_bit_no_atomic(TIF_SCHED_EXIT, &_prev_flags)) {
 		lkl_ops->thread_exit();
 	} else {
 		lkl_ops->sem_down(_prev->sched_sem);
@@ -132,8 +155,8 @@ void switch_to_host_task(struct task_struct *task)
 		if (!thread_set_sched_jmp())
 			schedule();
 	} else {
-		lkl_cpu_wakeup();
-		lkl_cpu_put();
+		if (!thread_set_sched_jmp())
+			lkl_idle_tail_schedule();
 	}
 
 	lkl_ops->sem_down(task_thread_info(task)->sched_sem);
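Note (illustration only, not part of the patch): the "No kernel code is
allowed after waking up next" comment in __switch_to() encodes an ordering
rule: once the next host thread is released (lkl_cpu_wakeup_idle() or
sem_up()), it may immediately own the CPU, so the owner change and every flag
update must already be done, which is also why the flags are snapshotted with
the non-atomic test_bit_no_atomic() beforehand. A standalone sketch of that
handoff discipline with POSIX threads, where sem_post() stands in for
lkl_ops->sem_up() and cpu_owner is a made-up stand-in for the CPU
bookkeeping:

/*
 * Hypothetical standalone demo: finish all handoff bookkeeping, then post
 * the semaphore; nothing that touches shared state may come after the post.
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

static sem_t next_sem;
static int cpu_owner = 1;	/* updated by the previous thread before handoff */

static void *next_thread(void *arg)
{
	(void)arg;
	sem_wait(&next_sem);
	/* by the time this runs, the handoff bookkeeping must be complete */
	printf("next thread running, cpu_owner=%d\n", cpu_owner);
	return NULL;
}

int main(void)
{
	pthread_t t;

	sem_init(&next_sem, 0, 0);
	pthread_create(&t, NULL, next_thread, NULL);

	cpu_owner = 2;		/* bookkeeping first (owner change, flags) */
	sem_post(&next_sem);	/* then wake the next thread; no shared-state
				 * access is allowed after this point */

	pthread_join(t, NULL);
	sem_destroy(&next_sem);
	return 0;
}

Compile with -pthread; the only point is that the post happens strictly after
the bookkeeping write, mirroring the order enforced in __switch_to().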