diff --git a/Documentation/virt/uml/nommu-uml.rst b/Documentation/virt/uml/nommu-uml.rst index a98bfd9d2f38c3..3194b6ff88770a 100644 --- a/Documentation/virt/uml/nommu-uml.rst +++ b/Documentation/virt/uml/nommu-uml.rst @@ -30,6 +30,27 @@ called under nommu/UML environment. works. - return to userspace +When users enable the zpoline syscall hook (configured with boot +parameter ``zpoline=1``), the code path is as follows: + +- boot kernel, set up zpoline trampoline code (detailed later) at address 0x0 +- (userspace starts) +- calls ``vfork``/``execve`` syscalls +- during execve, more specifically during ``load_elf_fdpic_binary()`` + function, the kernel replaces ``syscall``/``sysenter`` instructions with ``call + *%rax``; the call target (the value in ``%rax``) usually falls between address 0 and ``NR_syscalls`` (around + 512), where the trampoline code was installed during startup. +- when userspace issues a syscall, execution jumps to ``*%rax``, slides + through the ``nop`` instructions, and jumps to the hooked function, + ``__kernel_vsyscall``, which is the entrypoint for syscalls under the nommu + UML environment. +- call handler function in ``sys_call_table[]`` and follow how UML syscall + works. +- return to userspace + +With the zpoline syscall hook, syscall latency is greatly improved, while +process startup takes somewhat longer. See the +Benchmark section for more detail. What are the differences from MMU-full UML ? ============================================ @@ -42,7 +63,9 @@ MMU-full UML doesn't have: - generic implementation of memcpy/strcpy/futex is also used - alternate syscall entrypoint without ptrace - alternate syscall hook - - hook syscall by seccomp filter + - hook syscall by seccomp filter (when zpoline isn't used) + - translation of ``syscall``/``sysenter`` instructions to a trampoline + code and syscall hooks (when zpoline is used) With those modifications, it allows us to use unmodified userspace binaries with nommu UML. 
@@ -128,23 +151,27 @@ lmbench and (self-crafted) getpid benchmark (with v6.12-rc2 uml/next tree). .. csv-table:: lmbench (usec) - :header: ,native,um,um-nommu(s) - - select-10 ,0.5544,29.7143,2.8920 - select-100 ,2.3992,27.7262,3.7794 - select-1000 ,20.4708,42.0885,12.6920 - syscall ,0.1734,26.2471,2.6070 - read ,0.3433,29.8828,2.6923 - write ,0.2866,25.9753,2.6925 - stat ,1.9195,40.1164,3.1813 - open/close ,3.8657,63.4730,6.2049 - fork+sh ,1161.1111,5216.5000,462.3077 - fork+execve ,536.5263,2117.0000,131.0633 + :header: ,native,um,um-nommu(s),um-nommu(z) + + select-10 ,0.5544,29.7143,2.8920,0.2834 + select-100 ,2.3992,27.7262,3.7794,1.1732 + select-1000 ,20.4708,42.0885,12.6920,10.0434 + syscall ,0.1734,26.2471,2.6070,0.0999 + read ,0.3433,29.8828,2.6923,0.1327 + write ,0.2866,25.9753,2.6925,0.1325 + stat ,1.9195,40.1164,3.1813,0.4642 + open/close ,3.8657,63.4730,6.2049,0.7283 + fork+sh ,1161.1111,5216.5000,462.3077,18744.0000 + fork+execve ,536.5263,2117.0000,131.0633,4840.6667 .. csv-table:: do_getpid bench (nsec) - :header: ,native,um,um-nommu(s) + :header: ,native,um,um-nommu(s),um-nommu(z) - getpid, 172 , 26807 , 2614 + getpid, 172 , 26807 , 2614, 104 + + +(um-nommu(z) is nommu with the zpoline syscall hook; um-nommu(s) is nommu with the +seccomp syscall hook) Limitations =========== @@ -164,14 +191,40 @@ implementation inherits the characteristics of other nommu kernels Thus, we have limited options to userspace programs. We have tested Alpine Linux with musl-libc, which has a support nommu kernel. +access to mmap_min_addr (if zpoline enabled) +-------------------------------------------- +As the syscall translation mechanism relies on the ability to +read/write memory at address zero (0x0), the host kernel must be configured +with the following command:: + +% sh -c "echo 0 > /proc/sys/vm/mmap_min_addr" + supported architecture ---------------------- The current implementation of nommu UML only works on x86_64 SUBARCH. 
We have not tested with 32-bit environment. +target of syscall translation (if zpoline enabled) +-------------------------------------------------- +For the moment, the syscall translation only applies to the executable and the interpreter +of ELF binary files that are processed by the execve(2) syscall: +other libraries, such as linked or dlopen-ed ones, +aren't translated; we may be able to trigger the translation by +LD_PRELOAD. JIT-compiled code is likewise generated after execve, +so it is not currently translated. + +Note that with musl-libc in Alpine Linux, which we have tested, most +syscalls are implemented in the interpreter file +(ld-musl-x86_64.so), so syscall/sysenter instructions issued from +linked/loaded libraries should be rare. It is still possible, however, +so a workaround with LD_PRELOAD is effective. + Further readings about NOMMU UML ================================ - NOMMU UML (original code by Ricardo Koller) - https://static.sched.com/hosted_files/ossna2020/ec/kollerr_linux_um_nommu.pdf + +- zpoline: syscall translation mechanism + - https://www.usenix.org/conference/atc23/presentation/yasukata diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 92d9d9b96d1a35..738715a6d6b3fa 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -351,6 +351,9 @@ static inline int os_setup_seccomp(void) } #else extern int os_setup_seccomp(void); + +/* zpoline.c */ +extern int um_zpoline_enabled; #endif #endif diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h index 33f69f1eac10d6..6f5977ff0d213a 100644 --- a/arch/x86/um/asm/elf.h +++ b/arch/x86/um/asm/elf.h @@ -188,6 +188,9 @@ do { \ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); +struct elf_fdpic_params; +extern int elf_arch_finalize_exec(struct elf_fdpic_params *exec_params, + struct elf_fdpic_params *interp_params); extern unsigned long um_vdso_addr; #define AT_SYSINFO_EHDR 33 
diff --git a/arch/x86/um/nommu/Makefile b/arch/x86/um/nommu/Makefile
index 068e44ad4c4d71..20df59790e7a5b 100644
--- a/arch/x86/um/nommu/Makefile
+++ b/arch/x86/um/nommu/Makefile
@@ -6,3 +6,17 @@ else
 endif
 
 obj-y = do_syscall_$(BITS).o entry_$(BITS).o process.o signal.o syscalls_$(BITS).o os-Linux/
+obj-y += zpoline.o
+
+# zpoline.c decodes user code with the x86 insn decoder (insn.o, inat.o and the
+# generated inat-tables.c) to find syscall/sysenter; x86_64 w/ !CONFIG_MMU only
+inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
+inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
+quiet_cmd_inat_tables = GEN $@
+      cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@
+$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
+	$(call cmd,inat_tables)
+
+targets += inat-tables.c
+$(obj)/../../lib/inat.o: $(obj)/inat-tables.c
+obj-y += ../../lib/insn.o ../../lib/inat.o
diff --git a/arch/x86/um/nommu/zpoline.c b/arch/x86/um/nommu/zpoline.c
new file mode 100644
index 00000000000000..2e30ee7193625e
--- /dev/null
+++ b/arch/x86/um/nommu/zpoline.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * zpoline.c
+ *
+ * Replace syscall/sysenter instructions with `call *%rax` to hook syscalls.
+ *
+ */
+//#define DEBUG
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+int um_zpoline_enabled;
+/* start of trampoline code area */
+static char *__zpoline_start;
+
+/*
+ * Rewrite syscall/sysenter in the executable PT_LOAD segments of @params
+ * into `call *%rax`.  Returns the number of rewritten instructions, 0 if
+ * @params has no ELF header, or -EINVAL if a segment hits the trampoline.
+ */
+static int __zpoline_translate_syscalls(struct elf_fdpic_params *params)
+{
+	int count = 0, loop;
+	struct insn insn;
+	unsigned long addr, end;
+	struct elf_fdpic_loadseg *seg;
+	struct elf_phdr *phdr;
+	struct elfhdr *ehdr = (struct elfhdr *)params->elfhdr_addr;
+
+	if (!ehdr)
+		return 0;
+
+	seg = params->loadmap->segs;
+	phdr = params->phdrs;
+	for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
+		if (phdr->p_type != PT_LOAD)
+			continue;
+		addr = seg->addr;
+		end = seg->addr + seg->p_memsz;
+		/* refuse segments placed in the trampoline code area */
+		if (addr <= (unsigned long)(&__zpoline_start[0] + 0x1000 + 0x0100)) {
+			pr_warn("%lx: address is in the range of trampoline\n", addr);
+			return -EINVAL;
+		}
+
+		/* translate only segment with Executable flag */
+		if (!(phdr->p_flags & PF_X)) {
+			seg++;
+			continue;
+		}
+
+		pr_debug("translation 0x%lx-0x%lx\n", addr, end);
+		while (addr < end) {
+			unsigned char *code = (unsigned char *)addr;
+			/* never let the decoder read past the segment end */
+			int len = end - addr < MAX_INSN_SIZE ? end - addr : MAX_INSN_SIZE;
+
+			insn_init(&insn, code, len, 1);
+			if (insn_get_length(&insn) || !insn.length) {
+				pr_debug("%lx: length zero with byte %x. skip ?\n",
+					 addr, code[0]);
+				addr += 1;
+				continue;
+			}
+
+			/* 2-byte patch: only an unprefixed 0f 05 / 0f 34 fits */
+			if (insn.length == 2 && insn.opcode.bytes[0] == 0x0f &&
+			    (insn.opcode.bytes[1] == 0x05 ||	/* syscall */
+			     insn.opcode.bytes[1] == 0x34)) {	/* sysenter */
+				pr_debug("%lx: found syscall/sysenter\n", addr);
+				code[0] = 0xff;	/* callq */
+				code[1] = 0xd0;	/* *%rax */
+				count++;
+			}
+			addr += insn.length;
+		}
+		seg++;
+	}
+	return count;
+}
+
+/**
+ * elf_arch_finalize_exec() - architecture hook to translate syscall/sysenter
+ *
+ * translate syscall/sysenter instruction upon loading ELF binary file
+ * on execve(2)&co syscall.
+ *
+ * suppose we have those instructions:
+ *
+ *    mov $sysnr, %rax
+ *    syscall                 0f 05
+ *
+ * this will be translated into:
+ *
+ *    mov $sysnr, %rax (<= untouched)
+ *    call *%rax              ff d0
+ *
+ * which finally calls the hook function, guided by the trampoline code
+ * installed at setup_zpoline_trampoline().
+ *
+ * @exec_params: ELF meta data for executable file
+ * @interp_params: ELF meta data for the interpreter file
+ *
+ * Return: 0 on success or when zpoline is disabled, -EINTR if mmap_lock
+ * could not be taken, or the negative error from the translation.
+ */
+int elf_arch_finalize_exec(struct elf_fdpic_params *exec_params,
+			   struct elf_fdpic_params *interp_params)
+{
+	int err = 0, count = 0;
+	struct mm_struct *mm = current->mm;
+
+	/* zpoline disabled */
+	if (!um_zpoline_enabled)
+		return 0;
+
+	if (down_write_killable(&mm->mmap_lock))
+		return -EINTR;
+
+	/* translate for the executable */
+	err = __zpoline_translate_syscalls(exec_params);
+	if (err < 0)
+		goto out;
+	count += err;
+	pr_debug("zpoline: rewritten (exec) %d syscalls\n", count);
+
+	/* translate for the interpreter */
+	err = __zpoline_translate_syscalls(interp_params);
+	if (err < 0)
+		goto out;
+	count += err;
+	err = 0;
+	pr_debug("zpoline: rewritten (exec+interp) %d syscalls\n", count);
+
+out:
+	if (err < 0)
+		pr_info("zpoline: xlate error %d\n", err);
+	up_write(&mm->mmap_lock);
+	return err;
+}
+
+/**
+ * setup_zpoline_trampoline() - install trampoline code for zpoline
+ *
+ * setup trampoline code for syscall hooks
+ *
+ * the trampoline code guides to call hooked function, __kernel_vsyscall
+ * in this case, via nop slides at the memory address zero (thus, zpoline).
+ *
+ * loaded binary by exec(2) is translated to call the function.
+ */
+static int __init setup_zpoline_trampoline(void)
+{
+	/*
+	 * pretending to be syscall instruction by putting return
+	 * address in %rcx, then jump to the address loaded in %r11.
+	 */
+	const unsigned char tail[] = {
+		0x48, 0x8b, 0x0c, 0x24,	/* mov (%rsp),%rcx */
+		0x41, 0xff, 0xe3,	/* jmp *%r11 */
+	};
+	/* entry point that every rewritten call finally reaches */
+	const uint64_t hook = (uint64_t) __kernel_vsyscall;
+	int pos, k, err;
+
+	if (!um_zpoline_enabled)
+		return 0;
+
+	/*
+	 * zpoline: map area of trampoline code started from addr 0x0
+	 * (one page; the panic below tells what the host must allow)
+	 */
+	__zpoline_start = 0x0;
+
+	err = os_map_memory((void *) 0, -1, 0, PAGE_SIZE, 1, 1, 1);
+	if (err)
+		panic("map failed\n NOTE: /proc/sys/vm/mmap_min_addr should be set 0\n");
+
+	/* fill nop instructions until the trampoline code */
+	for (k = 0; k < NR_syscalls; k++)
+		__zpoline_start[k] = 0x90;
+
+	/*
+	 * optimization to skip old syscalls: a short jmp (opcode eb) whose
+	 * 8-bit displacement ranges over -128 ~ +127
+	 */
+	__zpoline_start[214 /* __NR_epoll_ctl_old */] = 0xeb;
+	__zpoline_start[215 /* __NR_epoll_wait_old */] = 127;
+
+	/**
+	 * FIXME: shift red zone area to properly handle the case
+	 */
+
+	/*
+	 * put code for jumping to __kernel_vsyscall right behind the nops.
+	 * here we embed the following code.
+	 *
+	 * movabs [$addr],%r11
+	 * mov    (%rsp),%rcx
+	 * jmpq   *%r11
+	 */
+	pos = NR_syscalls;
+	/* 49 bb [64-bit addr (8-byte)]    movabs [64-bit addr (8-byte)],%r11 */
+	__zpoline_start[pos++] = 0x49;
+	__zpoline_start[pos++] = 0xbb;
+	/* the 8 address bytes, least significant byte first */
+	for (k = 0; k < 8; k++)
+		__zpoline_start[pos++] = hook >> (8 * k);
+
+	/* followed by the %rcx fixup and the indirect jump */
+	for (k = 0; k < (int)sizeof(tail); k++)
+		__zpoline_start[pos++] = tail[k];
+
+	/* permission: XOM (PROT_EXEC only) */
+	err = os_protect_memory(0, PAGE_SIZE, 0, 0, 1);
+	if (err)
+		panic("failed: can't configure permission on trampoline code");
+
+	pr_info("zpoline: setting up trampoline code done\n");
+	return 0;
+}
+arch_initcall(setup_zpoline_trampoline);
+
+/*
+ * "zpoline=" boot parameter: the parsed integer is stored in
+ * um_zpoline_enabled, so any non-zero value enables zpoline.
+ */
+static int __init zpoline_set(char *str)
+{
+	int parsed = 0;
+
+	get_option(&str, &parsed);
+	um_zpoline_enabled = parsed;
+	return 1;
+}
+__setup("zpoline=", zpoline_set);
diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c
index cbae2584124fd0..b3866550738616 100644
--- a/arch/x86/um/vdso/um_vdso.c
+++ b/arch/x86/um/vdso/um_vdso.c
@@ -19,15 +19,31 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz);
 __kernel_old_time_t __vdso_time(__kernel_old_time_t *t);
 long __vdso_getcpu(unsigned int *cpu, unsigned int *node, struct getcpu_cache *unused);
 
+/* XXX: FIXME, always trap SIGSYS on nommu, cannot use zpoline path as
+ * we don't know how to retrieve um_zpoline_enabled from vdso object ???
+ */
+/* ret = syscall(sysnr, a0); volatile so the compiler never drops the trap */
+#define __VDSO_SYSCALL1(sysnr, ret, a0) \
+	do { \
+		asm volatile("syscall" \
+			     : "=a" (ret) \
+			     : "0" (sysnr), "D" (a0) \
+			     : "rcx", "r11", "memory"); \
+	} while (0)
+/* ret = syscall(sysnr, a0, a1); same contract as __VDSO_SYSCALL1 */
+#define __VDSO_SYSCALL2(sysnr, ret, a0, a1) \
+	do { \
+		asm volatile("syscall" \
+			     : "=a" (ret) \
+			     : "0" (sysnr), "D" (a0), "S" (a1) \
+			     : "rcx", "r11", "memory"); \
+	} while (0)
+
 int __vdso_clock_gettime(clockid_t clock, struct __kernel_old_timespec *ts)
 {
 	long ret;
 
-	asm("syscall"
-		: "=a" (ret)
-		: "0" (__NR_clock_gettime), "D" (clock), "S" (ts)
-		: "rcx", "r11", "memory");
-
+	__VDSO_SYSCALL2(__NR_clock_gettime, ret, clock, ts);
 	return ret;
 }
 int clock_gettime(clockid_t, struct __kernel_old_timespec *)
@@ -37,11 +53,7 @@ int __vdso_gettimeofday(struct __kernel_old_timeval *tv, struct timezone *tz)
 {
 	long ret;
 
-	asm("syscall"
-		: "=a" (ret)
-		: "0" (__NR_gettimeofday), "D" (tv), "S" (tz)
-		: "rcx", "r11", "memory");
-
+	__VDSO_SYSCALL2(__NR_gettimeofday, ret, tv, tz);
 	return ret;
 }
 int gettimeofday(struct __kernel_old_timeval *, struct timezone *)
@@ -51,10 +63,7 @@ __kernel_old_time_t __vdso_time(__kernel_old_time_t *t)
 {
 	long secs;
 
-	asm volatile("syscall"
-		: "=a" (secs)
-		: "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
-
+	__VDSO_SYSCALL1(__NR_time, secs, t);
 	return secs;
 }
 __kernel_old_time_t time(__kernel_old_time_t *t) __attribute__((weak, alias("__vdso_time")));