forked from nikitadanilov/usched
-
Notifications
You must be signed in to change notification settings - Fork 0
/
usched.c
341 lines (328 loc) · 13.3 KB
/
usched.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
/*
* Overview
* --------
*
* A simple dispatcher for cooperative user-space threads.
*
* A typical implementation of user-space threads allocates a separate stack for
* each thread when the thread is created and then dispatches threads (as
* decided by the scheduler) through some context switching mechanism, for
* example, longjmp().
*
* In usched all threads (represented by struct ustack) are executed on the same
* "native" stack. When a thread is about to block (usched_block()), a memory
* buffer for the stack used by this thread is allocated and the stack is copied
* to the buffer. After that the part of the stack used by the blocking thread
* is discarded (by longjmp()-ing to the base of the stack) and a new thread is
* selected. The stack of the selected thread is restored from its buffer and
* the thread is resumed by longjmp()-ing to the usched_block() that blocked it.
*
* The focus of this implementation is simplicity: the total size of usched.[ch]
* is less than 120LOC, as measured by SLOCCount.
*
* Advantages:
*
* - no need to allocate maximal possible stack at thread initialisation:
* stack buffer is allocated as needed. It is also possible to free the
* buffer when the thread is resumed (not currently implemented);
*
* - thread that doesn't block has 0 overhead: it is executed as a native
* function call (through a function pointer) without any context
* switching;
*
* - because the threads are executed on the stack of the same native
* underlying thread, native synchronisation primitives (mutices, etc.)
* work, although the threads share underlying TLS. Of course one cannot
* use native primitives to synchronise between usched threads running on
* the same native thread.
*
* Disadvantages:
*
* - stack copying introduces overhead (memcpy()) in each context switch;
*
* - because stacks are moved around, addresses on a thread stack are only
* valid while the thread is running. This invalidates certain common
* programming idioms: other threads and heap cannot store pointers to the
* stacks, at least to the stacks of the blocked threads. Note that Go
* language, and probably other run-times, maintains a similar invariant.
*
* Usage
* -----
*
* usched is only a dispatcher and not a scheduler: it blocks and resumes
* threads but
*
* - it does not keep track of threads (specifically allocation and freeing
* of struct ustack instances is done elsewhere),
*
* - it implements no scheduling policies.
*
* These things are left to the user, together with stack buffers allocation and
* freeing. The user supplies 3 call-backs:
*
* - usched::s_next(): the scheduling function. This call-backs returns the
* next thread to execute. This can be either a new (never before
* executed) thread initialised with ustack_init(), or it can be a blocked
* thread. The user must keep track of blocked and runnable threads,
* presumably by providing wrappers to ustack_init() and ustack_block()
* that would record thread state changes. It is up to usched::s_next() to
* block and wait for events if there are no runnable threads and all
* threads are waiting for something. If usched::s_next() returns NULL,
* the dispatcher loop usched_run() exits. It is in general unsafe to
* re-start it again (unless you can guarantee that it will be restarted
* with exactly the same stack pointer, see the anchor assertion in
* usched_run()) so usched::s_next() should return NULL only when it is
* time to shutdown;
*
* - usched::s_alloc(): allocates new stack buffer of at least the specified
* size. The user have full control over stack buffer allocation. It is
* possible to pre-allocate the buffer when the thread is initialised
* (reducing the cost of usched_block()), it is possible to cache buffers,
* etc.;
*
* - usched::s_free(): frees the previously allocated stack buffer.
*
* An implementation of these call-backs must satisfy the following
* requirements:
*
* - usched::s_next() returns only new or blocked threads. That is, if a
* thread completed by returning from its startup function ustack::u_f()
* or aborted by calling usched_abort(), usched::s_next() never returns it
* again;
*
* - usched::s_alloc() always returns with the allocated stack buffer. That
* is, usched::s_alloc() has the following options if it fails to allocate
* the buffer of the specified size:
*
* + do some cleanup and then abort the current thread by calling
* ustack_abort(), which longjmp-s to usched_run(). The aborted
* thread must never be returned by usched::s_next();
*
* + do some cleanup and then longjmp to some point within the thread
* stack, so that the thread continues the execution (and may re-try
* blocking later again);
*
* + longjmp out of usched_run() completely or kill the current native
* thread.
*
* After installing the above call-backs in an instance of struct usched, the
* user calls usched_run() either from the main thread or form a dedicated
* native thread.
*
* rr.[ch] provides a simple "round-robin" scheduler implementing all the
* call-backs. Use it carefully, it was only tested with rmain.c benchmark.
*
* Pictures!
* ---------
*
* The following diagrams show stack management by usched. The stack grows from
* right to left.
*
* At the entrance to the dispatcher loop. usched_run(S):
*
* usched_run()
* ----------------------------------------------+--------------+-------+
* | buf | anchor | ... |
* ----------------------------------------------+--------------+-------+
* ^
* |
* sp = S->s_buf
*
* A new (never before executed) thread U is selected by S->s_next(), launch()
* calls the thread startup function U->u_f():
*
* U->u_f() launch() usched_run()
* -----------------------------+---------+-----+--------------+-------+
* | | pad | buf | anchor | ... |
* -----------------------------+---------+-----+--------------+-------+
* ^ ^
* | |
* sp U->u_bottom
*
* Thread executes as usual on the stack, until it blocks by calling
* usched_block():
*
*
* usched_block() bar() U->u_f() launch() usched_run()
* ----------+------+-----+-----+---------+-----+--------------+-------+
* | here | ... | | | pad | buf | anchor | ... |
* ----------+------+-----+-----+---------+-----+--------------+-------+
* ^ ^ ^
* | +-- sp = U->u_cont |
* | U->u_bottom
* U->u_top
*
* The stack from U->u_top to U->u_bottom is copied into the stack buffer
* U->u_stack, and control returns to usched_run() by longjmp(S->s_buf):
*
* usched_run()
* ----------------------------------------------+--------------+-------+
* | buf | anchor | ... |
* ----------------------------------------------+--------------+-------+
* ^
* |
* sp = S->s_buf
*
* Next, suppose S->s_next() selects a previously blocked thread V ready to be
* resumed. usched_run() calls cont(V).
*
* cont() usched_run()
* ----------------------------------------+-----+--------------+-------+
* | | buf | anchor | ... |
* ----------------------------------------+-----+--------------+-------+
* ^
* |
* sp
*
* cont() copies the stack from the buffer to [V->u_top, V->u_bottom]
* range. It's important that this memcpy() operation does not overwrite
* cont()'s own stack frame, this is why pad[] array is needed in launch(): it
* advances V->u_bottom and gives cont() some space to operate.
*
* usched_block() foo() V->u_f() cont() usched_run()
* ---------+------+-----+-----+--------+--+-----+--------------+-------+
* | here | ... | | | | | buf | anchor | ... |
* ---------+------+-----+-----+--------+--+-----+--------------+-------+
* ^ ^ ^ ^
* | +-- V->u_cont | +-- sp
* | |
* V->u_top V->u_bottom
*
* Then cont() longjmp()-s to V->u_cont, restoring V execution context:
*
* usched_block() foo() V->u_f() cont() usched_run()
* ---------+------+-----+-----+--------+--+-----+--------------+-------+
* | here | ... | | | | | buf | anchor | ... |
* ---------+------+-----+-----+--------+--+-----+--------------+-------+
* ^
* +-- sp = V->u_cont
*
* V continues its execution as if it returned from usched_block().
*
* Multiprocessing
* ---------------
*
* By design, a single instance of struct usched cannot take advantage of
* multiple processors, because all its threads are executing within a single
* native thread. Multiple instances of struct usched can co-exist within a
* single process address space, but a ustack thread created for one instance
* cannot be migrated to another. One possible strategy to add support for
* multiple processors is to create multiple instances of struct usched and
* schedule them (that is, schedule the threads running respective
* usched_run()-s) to processors via pthread_setaffinity_np() or similar.
*
* Current limitations
* -------------------
*
* - the stack is assumed to grow toward lower addresses. This is easy to fix,
* if necessary;
*
* - the implementation is not signal-safe. Fixing this can be as easy as
* replacing *jmp() calls with their sig*jmp() counterparts. At the moment
* signal-based code, like gperf -lprofiler library, would most likely crash
* usched;
*
* - usched.c must be compiled without optimisations and with
* -fno-stack-protector option (gcc);
*
* - usched threads are cooperative: a thread will continue to run until it
* completes of blocks. Adding preemption (via signal-based timers) is
* relatively easy, the actual preemption decision will be relegated to the
* external "scheduler" via a new usched::s_preempt() call-back invoked from
* a signal handler.
*
*/
/* cc -fno-stack-protector */
#include "usched.h"
#include <setjmp.h>
#include <stdlib.h>
#include <errno.h>
#include <stddef.h>
#include <assert.h>
#include <string.h>
static void launch(struct ustack *u);
static void cont (struct ustack *u);
static void stack_in (struct ustack *u);
static void stack_out(struct ustack *u);
static __thread struct ustack *current = NULL;
void usched_run(struct usched *s)
{
int anchor;
uintptr_t buf[5];
struct ustack *u;
assert(s->s_anchor == NULL || s->s_anchor == &anchor);
assert(s->s_next != NULL);
assert(s->s_alloc != NULL);
assert(s->s_free != NULL);
s->s_anchor = &anchor;
s->s_buf = buf;
__builtin_setjmp((void **)buf);
while ((u = s->s_next(s)) != NULL) {
current = u;
u->u_bottom != NULL ? cont(u) : launch(u);
current = NULL;
}
}
void ustack_init (struct ustack *u, struct usched *s,
void (*f)(void *), void *arg,
void *stack, int len)
{
*u = (struct ustack) {
.u_sched = s,
.u_stack = stack,
.u_len = len,
.u_f = f,
.u_arg = arg
};
}
void ustack_block(void)
{
uintptr_t here[5];
assert((void *)&here < current->u_bottom);
if (__builtin_setjmp((void **)here) == 0) {
current->u_cont = here;
current->u_top = current->u_cont - 32;
stack_out(current);
__builtin_longjmp((void **)current->u_sched->s_buf, 1);
}
}
void ustack_abort(void)
{
__builtin_longjmp((void **)current->u_sched->s_buf, 1);
}
enum { PAD = 300 };
static void launch(struct ustack *u)
{
char pad[PAD]; /* Prevents stack_in() from smashing cont() frame. */
u->u_bottom = pad;
(*u->u_f)(u->u_arg);
}
static void cont(struct ustack *u)
{
stack_in(u);
__builtin_longjmp((void **)u->u_cont, 1);
}
static void stack_in(struct ustack *u)
{
memcpy(u->u_top, u->u_stack, u->u_bottom - u->u_top);
}
static void stack_out(struct ustack *u)
{
struct usched *s = u->u_sched;
int used = u->u_bottom - u->u_top;
if (u->u_stack != NULL && u->u_len < used) {
s->s_free(s, u->u_stack, u->u_len);
u->u_stack = NULL;
}
if (u->u_stack == NULL) {
u->u_stack = s->s_alloc(s, used);
u->u_len = used;
assert(u->u_stack != NULL);
}
memcpy(u->u_stack, u->u_top, used);
}
struct ustack *ustack_self()
{
assert(current != NULL);
return current;
}